#'--- #' title: "Minnesota Statewide Aquatic Plants and Associated Data" #' author: "Mike Verhoeven & Dan Larkin" #' output: #' html_document: #' toc: true #' theme: default #' toc_depth: 3 #' toc_float: #' collapsed: false #'--- #' This script will pull in plant observation data from PI surveys, Secchi #' clarity, lake/watershed geodata, species statuses, and collaborator feedback. #' Then we'll sync them into our dataset, creating a full macrophyte obs and env #' dataset for MN Lakes #' #' #' # Outstanding Work: #' # Document Preamble knitr::opts_chunk$set(warning = FALSE, message = FALSE) strttime <- Sys.time() getwd() # load libraries ------------------------------------------------------------------ #' ## Libraries library(data.table) # update_dev_pkg()# remotes::install_github("Rdatatable/data.table") library(ggplot2) library(stringr) library(sf) library(vegan) library(gridExtra) library(dplyr) library(tidyr) library(janitor) # library(lme4) # library(sjPlot) # library(mediation) library(ggpubr) # library(EnvStats) # library(lmerTest) # library(merTools) # library(rstanarm) # library(ggsn) library(ggpmisc) library(cowplot) # load in functions ------------------------------------------------------- #' ## Functions f_dowle3natozeros = function(DT, x) { # or by number (slightly faster than by name) : for (j in x) set(DT,which(is.na(DT[[j]])),j,"0") } # load in data ------------------------------------------------- #' ## Data #' #' The plants observations data and collaborator corrections are datasets that #' we have generated. #' #' Secchi data have been aggregated from public sources by #' Kelsey Vitense for https://aslopubs.onlinelibrary.wiley.com/doi/full/10.1002/lol2.10323#lol210323-bib-0034 #' #' #' # #plants observation dataset: # plants <- fread(input = "data&scripts/data/input/plant_surveys_mn.csv", drop = 1:2) #import, dropping the exported row numbers # #collaborator corrections and feedback: # coll_edits <- fread(input = "data&scripts/data/input/Edited_post_contrib_feedback.csv") #secchi data: secchi <- fread(input = "data&scripts/data/input/AllSecchi_plus_ShallowLakesSecchi.csv", drop = 1) #import, dropping the exported row numbers #' The MN DNR does not allow publication of copies of their datasets, and thus the following datasets must be downloaded by a user in order to run this code. #' #' Hydrography (https://gisdata.mn.gov/dataset/water-dnr-hydrography; 5April2022) and #' watershed (https://gisdata.mn.gov/dataset/geos-dnr-watersheds; 10Aug2022) data were #' retrieved from the MN Geospatial commons. #' #' Species statuses were retrieved from the MN DNR website #' (https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html; 5April2022). #' #'Citations: #'DNR Hydrography Dataset. (2012). Retrieved 5April2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/water_dnr_hydrography/metadata/metadata.html #'DNR Watersheds—DNR Level 04—HUC 08—Majors. (2023). Retrieved 10Aug2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/geos_dnr_watersheds/metadata/dnr_watersheds_dnr_level_04_huc_08_majors.html #'MNTaxa: The State of Minnesota Vascular Plant Checklist. (2013). 
Retrieved 5April2022, from https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html #' ## MN DNR Datasets #hydrography & watersheds pwi_l <- st_read(dsn = "data&scripts/data/input/shp_water_dnr_hydrography", layer = "dnr_hydro_features_all") watersheds_huc8 <- st_read(dsn = "data&scripts/data/input/shp_geos_dnr_watersheds", layer = "dnr_watersheds_dnr_level_04_huc_08_majors") #species statuses rte <- fread(input = "data&scripts/data/input/2013_dnr_plant_checklist_web.csv") # Strip Rare Species ------------------------------------------------------ #' ### Strip Rare Species Idents #' Due to legal protections on rare, threatened, and endangered species, the following section strips the identities of rare species in the dataset. The identities can be recovered by requesting either the full data imported above (currently commented out), or the key to the protected species names below (note that only the former will get you any location info that was associated with those data). # de-identify rare species: rte <- clean_names(rte)#tidy names on status dataset # #Strip species ID from protected species # namesstripkey <- plants[ TAXON %in% rte[rarity_status != "" , mn_dnr_scientific_name,], .N , TAXON] # others <- plants[ !(TAXON %in% rte[, mn_dnr_scientific_name,]), .N , TAXON] # rm(others) #none of these trigger me to need to deident them # # namesstripkey[ , new_name := paste("ProtectedSpecies", .I, sep = "_") , ,] # # plants[ namesstripkey , on = .(TAXON=TAXON) , new_name := new_name] # plants[ TAXON %in% namesstripkey[,TAXON], new_name ] # # plants[ , .N , new_name] # # plants[ !is.na(new_name), TAXON := new_name ] # # plants[ , new_name := NULL] # colnames(plants) # # #strip locs from all points with a protected species # # ps_points <- plants[str_detect(TAXON, "ProtectedSpecies" ), unique(POINT_ID) , ]#give me the point ids for all points with a protected species # # #any loc data there? # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY")] # # #delete that data # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY"):= NA, ] # # #export rte key: # fwrite(namesstripkey, file = "data&scripts/data/output/rte_namestrip_key.csv" ) # #export rte stripped product: # fwrite(plants, file = "data&scripts/data/output/plants_input_data_rtestrip.csv") #import rte stripped product: # plants <- fread(input = "data&scripts/data/output/plants_input_data_rtestrip.csv") #import, dropping the exported row numbers # stripped out peoples personally identifiable information ------------------------------------------------ # #collaborator corrections and feedback: coll_edits <- fread(input = "data&scripts/data/input/Edited_post_contrib_feedback_noPII.csv") #import PLANTS INPUT product: plants <- fread(input = "data&scripts/data/output/plants_input_data_rtestrip_noPII.csv") #import, dropping the exported row numbers plants[TAXON == "", .N] plants[TAXON == "", TAXON:= NA] # *****DatasetUpdates***** ------------------------------------------------ #' # Dataset Updates # collaborator corrections ------------------------------------------------ #' ## Collaborator Corrections #' This section uses the collaborator feedback to revise the dataset. 
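#' Most of the corrections below follow one pattern: look up a survey-level
#' edit in coll_edits and write it into every matching row of plants. A minimal
#' toy sketch of that pattern (invented values, not part of these data), for
#' readers unfamiliar with match()-based lookups:
# toy_plants <- data.table(SURVEY_ID = c(1, 1, 2), LAKE_NAME = c("old", "old", "keep"))
# toy_edits  <- data.table(SURVEY_ID = 1, EDIT_LAKE_NAME = "new")
# toy_plants[ , NEW_LAKE_NAME := toy_edits[match(toy_plants$SURVEY_ID, toy_edits$SURVEY_ID), EDIT_LAKE_NAME] ]
# toy_plants[!is.na(NEW_LAKE_NAME), LAKE_NAME := NEW_LAKE_NAME][ , NEW_LAKE_NAME := NULL]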
# check survey ID alignment #sum(!coll_edits[, SURVEY_ID, ] %in% plants[ , SURVEY_ID]) #100% of collaborator input has a match in plants names(coll_edits)[1] <- "feedback" coll_edits[ , .N , feedback ] # deletions # coll_edits[ str_detect(feedback, "delete",), SURVEY_ID , ]#which are marked for deletion? plants <- plants[ !SURVEY_ID %in% coll_edits[ str_detect(feedback, "delete",), SURVEY_ID , ]#this drops about 10k observations from the dataset , , ] # tag for reimport # these surveys need to be reimported and any current data deleted. We have these data in our files. # coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ] # plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], .N , SURVEY_ID] sel <- plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], , ] #peel off those reimport data sel <- sel[!duplicated(sel[ , SURVEY_ID , ]),] #compress to one row where botched import produced some data # plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], .N , SURVEY_ID] # drop or modify some cols to reflect bad import sel[ , c("STA_NBR_DATASOURCE", "DEPTH_FT", "NO_VEG_FOUND", "REL_ABUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURVEYOR", "TAXON", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "X", "Y", "NORTHING", "EASTING", "LATITUDE", "LONGITUDE", "UTMX", "UTMY", "POINT_ID", "OBS_ID") := NA ,]#in sel, dump these columns sel[ , INDATABASE := FALSE]#mark these as not in database plants <- plants[ !SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], , ] #drops ~900 obs plants <- rbind(plants, sel) plants[ SURVEY_ID %in% sel[, SURVEY_ID] , SURVEY_FEEDBACK := "reimport required" , ] # data available from collaborator # coll_edits[ feedback %in% c("data available", "missing metadata") , SURVEY_ID , ] plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("data available", "missing metadata") , SURVEY_ID , ] , SURVEY_FEEDBACK := "data available from collaborator" , ] plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("rake density data available") , SURVEY_ID , ] , SURVEY_FEEDBACK := "rake density data available from collaborator" , ] # no data available plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("no data available") , SURVEY_ID , ] , SURVEY_FEEDBACK := "no data available" , ] nrow(plants[SURVEY_FEEDBACK == "no data available" , .N , SURVEY_ID] )# how many cases with data unavailable/ not known where raw data are? 
# one-offs # coll_edits[feedback == "Point 69 has Lmin entered as 5 - change to 2", SURVEY_ID] plants[SURVEY_ID == coll_edits[feedback == "Point 69 has Lmin entered as 5 - change to 2", SURVEY_ID] & STA_NBR_DATASOURCE == 69 & REL_ABUND == 5, REL_ABUND := 2] # Taxa naming problem: plants[TAXON == "Mitellopsis", TAXON := "Nitellopsis"] # preferred datasource name # coll_edits[ , .N , EDIT_DATASOURCE] plants[ , SURVEY_DATASOURCE := coll_edits[match(plants$DATASOURCE, coll_edits$DATASOURCE), EDIT_DATASOURCE ] , ] # plants[ , .N , SURVEY_DATASOURCE] # plants[SURVEY_DATASOURCE == "", .N , DATASOURCE ] plants[DATASOURCE == "DNR Lakes and Rivers", SURVEY_DATASOURCE := "DNR Lakes and Rivers"] plants[DATASOURCE == "DNR Fisheries", SURVEY_DATASOURCE := "DNR Fisheries"] plants[DATASOURCE == "Rantala TIP", SURVEY_DATASOURCE := "DNR Fisheries"] plants[DATASOURCE == "Muthukrishnan Et al", SURVEY_DATASOURCE := "DNR Shallow Lakes" , ] plants[SURVEY_DATASOURCE == "DNR Fisheries Research" , SURVEY_DATASOURCE := "DNR Fisheries"] # check contribution # of surveys by new named datasources plants[ , length(unique(SURVEY_ID)) , SURVEY_DATASOURCE ] # lake name corrections plants[ , NEW_LAKE_NAME := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_LAKE_NAME ] , ] plants[NEW_LAKE_NAME %in% c("lake of the isles", "clear", "bde maka ska"), LAKE_NAME := NEW_LAKE_NAME ] plants[ , NEW_LAKE_NAME := NULL ,] # surveyor corrections plants[ , NEW_SURVEYOR := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_SURVEYOR ] , ] # plants[, .N, NEW_SURVEYOR ] plants[!NEW_SURVEYOR == "" & !is.na(NEW_SURVEYOR), SURVEYOR := NEW_SURVEYOR , ] plants[ , NEW_SURVEYOR := NULL ,] # a <- plants[ , length(unique(SURVEY_ID)) , SURVEYOR] # date corrections plants[ , NEW_DATE := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_DATE ] , ] # plants[, .N, NEW_DATE ] # plants[!NEW_DATE == "" & !is.na(NEW_DATE), .N , NEW_DATE ] plants[!NEW_DATE == "" & !is.na(NEW_DATE) , SURVEY_DATE := as.Date(NEW_DATE, format = "%d%b%Y") ,] plants[ , NEW_DATE := NULL ,] # input rake density scales #overwrite any bad rake scales: coll_edits[!is.na(`EDITED_SCALE_RAKE_DENS (0-X)`) , `SCALE_RAKE_DENS (0-X)` := `EDITED_SCALE_RAKE_DENS (0-X)` ] # coll_edits[ ,.N , `SCALE_RAKE_DENS (0-X)` ] coll_edits[ `SCALE_RAKE_DENS (0-X)` %in% c(1,2) , `SCALE_RAKE_DENS (0-X)` := NA ]# these aren't real abundance scales--they should be marked as NA, to indicate only pres-abs data are useable. 
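# optional sanity check (not part of the original workflow): after the recode
# above, only NA and the 0-3, 0-4, and 0-5 scales should remain
# coll_edits[ , .N , keyby = `SCALE_RAKE_DENS (0-X)` ]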
#push over to plants DB:
plants[ , RAKE_SCALE_USED := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), `SCALE_RAKE_DENS (0-X)` ] , ]

# for these surveys, we can see that our collaborators' input on rake scale was not correct:
plants[REL_ABUND>RAKE_SCALE_USED, .N , .(SURVEY_ID) ]

#two more left now to manually change to max scale observed rather than reported:
plants[SURVEY_ID == 1418 , RAKE_SCALE_USED := 5]
plants[SURVEY_ID == 3128 , RAKE_SCALE_USED := 5]

# check process:
plants[ , .("max_observed_in_data" = max(REL_ABUND, na.rm = T)) , RAKE_SCALE_USED]

#there are 29 unique surveys where no rake scale data are provided, and no easy inference exists -- swap these to presence/absence
plants[SURVEY_ID %in% plants[is.na(RAKE_SCALE_USED) & REL_ABUND >1, SURVEY_ID], .N , RAKE_SCALE_USED]
plants[is.na(RAKE_SCALE_USED), .N , REL_ABUND ]

#drop the abundance data from these
plants[is.na(RAKE_SCALE_USED), REL_ABUND := NA ]

# clean up WS
rm(coll_edits, sel)

# summarize plants dataset ---------------------------------------------------

#' ## Review Data Summaries
#'
#' Review the current data status and outline any changes needed.
#'

str(plants) #what data formats?
names(plants) #field names

plants[ , length(unique(SURVEY_ID)) , ] #how many surveys in all?
plants[ INDATABASE == T , length(unique(SURVEY_ID))] #how many surveys do we have the data in our db for?
plants[ , length((unique(DOW))) , ] #how many lakes in all?
plants[ , length(unique(YEAR)) , ] #how many years of data?
plants[ , length(unique(POINT_ID)),] #how many samples were pulled from the lakes?
plants[!is.na(TAXON) , length(unique(OBS_ID))] # how many times was a plant identified in these data?

#' Let's see how many surveys (then number of points) we have been given by each contributor:
plants[ , unique(SURVEY_DATASOURCE) ,]

# survey contribution viz
ggplot(plants[ , .N, .(SURVEY_ID, SURVEY_DATASOURCE, INDATABASE)], aes(SURVEY_DATASOURCE, fill = INDATABASE))+
  geom_bar(stat = "count", position = "stack" )+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle(label = "n surveys by contributor")+
  scale_y_log10()

# point contributions
ggplot(plants[INDATABASE==T , .N, .(POINT_ID, SURVEY_DATASOURCE, INDATABASE)], aes(SURVEY_DATASOURCE))+
  geom_bar(stat = "count", position = "stack" )+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle(label = "n points by contributor")+
  scale_y_log10()

#' The database has all the surveys we know exist for MN in it, including those
#' for which we do not have the data. It is often useful to snip those no-data
#' ones off right away to avoid running any calcs using all those rows w/o any
#' species data.

missing_data_surveys <- plants[ INDATABASE == F]
plants <- plants[INDATABASE == T]

# drop zeros --------------------------------------------------------------

#' ### Zero Depths
#' We dropped surveys with no depth data in an early cleaning step. This
#' happened before we merged datasets from the MN DNR into the database, meaning
#' that we've still got to do a purge of 0 and NA depths to be sure we've
#' handled DNR and other collaborator data consistently:

#any remaining points with depth == NA or 0?
# They need to be dropped so that points with no sampled depth are handled consistently across sources (currently these remain only in MN DNR data):
plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , ][ , .N , .(SURVEY_ID, DATASOURCE)]
plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , .N, .(SURVEY_ID, DATASOURCE)][, unique(DATASOURCE)] #these only still remain in the DNR data--that's because we did the DNR data merge after cleaning up the other data

sum(plants[ SURVEY_ID %in% plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , .N, .(SURVEY_ID, DATASOURCE)][,SURVEY_ID], .N , .(SURVEY_ID, DATASOURCE) ][ , N]) #counts all points in those surveys

#drop them (both NA and 0 depths):
plants <- plants[!(is.na(DEPTH_FT) | DEPTH_FT == 0) ]

# duplicated entries ------------------------------------------------------

#' ## Duplicated Records
#' It's become apparent to me that when we cast the data to long format in the
#' survey collation project, we ended up with many cases of multiple
#' "observations" of the same thing from within a single point. Here we clean
#' up this issue. I found the cause of it by opening up the surveycollation
#' project.

#drop duplicated entries:
names(plants)
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 # proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , hist(N) , ]

sum(duplicated(plants$OBS_ID)) #obs ID is unique, but it was generated on row value, not on a unique key set

names(plants[ , .SD , .SDcols = !c("OBS_ID") ])
sum(duplicated(plants[ , .SD , .SDcols = !c("OBS_ID") ])) #here we see the unique key set (everything BUT obs ID) tells us to drop 500k observations!

plants <- plants[!duplicated(plants[ , .SD , .SDcols = !c("OBS_ID") ]) , , ]

# still a bunch of dups leftover where we've got:
# two abunds for one species or two of something for one species...
# with a little sleuthing, I can see that these are a whole mix of things. For
# example, James Johnson submitted one survey with two samples for point 213...
# the solution I'll use is to allow these obs to stay (assuming that both obs
# are real, and the data entry resulted in a bad point ID for one of them).
# because of this, when we agg to the point level, we'll have to choose an obs
# to use for that taxon. You'll see this play out in the species matrix
# construction below:
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 #proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , .N , ]
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 #proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , unique(POINT_ID) , ]

#' ## Review Data Format
#'
#' Now "plants" is only those surveys for which we were able to gather and
#' collate the data. Below we organize these data 3 ways:
#'
#' 1. As long format: each row is a species observation within a point (multiple
#' rows per point) including all fields retained through cleaning processes
#' 2. As a wide format of occurrences: each row is a point record, and the columns include a
#' species observation (presence/absence) matrix. Here we keep only fields that retain their meaning at the point level.
#' 3. As a wide format with species abundances (a subset of "2.") where each row
#' is a point record, and the columns include a species abundance matrix

#how many surveys and how many points were sampled in each?
hist(plants[ ,length(unique(POINT_ID)) , SURVEY_ID]$V1, breaks= 100, main = "N points per survey", xlab = "Npoints")

#how many unique TAXA?
unique(plants$TAXON)

# N taxa per survey:
plants[ , .("Ntaxa" = length(unique(TAXON))) , SURVEY_ID] #if you want to name cols on the fly you need to wrap in .() which makes a list from them

hist(plants[ , length(unique(TAXON)) , SURVEY_ID][ , V1], main = "N taxa per survey", xlab = "N taxa")
hist(plants[ , length(unique(TAXON)) , POINT_ID][ , V1], main = "N taxa per point", xlab = "N taxa")

# rake scale normalization -------------------------------------------

#' ## Rake Abundance Normalization
#' This code will clean the relative rake density data from the whole PI dataset,
#' shifting all to a 0,1,2,3 scale. This code was developed in the
#' surveycollation project, but is implemented here (post-collaborator feedback)
#' to allow collaborators to specify which rake scale they used.

#drop surveys with max vals of 1s and 1-2s
rakes1 <- plants[RAKE_SCALE_USED %in% c(3,4,5), ]
# rakes1[ , .N , REL_ABUND] #how many surveys in these categories?
# rakes1[ , .N , SURVEY_ID] #982

#Now shift/ realign data per discussion above
# mapping applied: 1-4 scale {1->1, 2->2, 3->2, 4->3}; 1-5 scale {1->1, 2->2, 3->2, 4->2, 5->3}

#1-4 survey shifted to 1-3
rakes1[RAKE_SCALE_USED == 4 & REL_ABUND == 3 , REL_ABUND := 2 ]
rakes1[RAKE_SCALE_USED == 4 & REL_ABUND == 4 , REL_ABUND := 3 ]

#1-5 surveys shifted to 1-3
rakes1[RAKE_SCALE_USED == 5 & (REL_ABUND == 3 |REL_ABUND == 4) , REL_ABUND := 2 ]
rakes1[RAKE_SCALE_USED == 5 & REL_ABUND == 5 , REL_ABUND := 3 ]

# #check that the max vals are all 3's
# hist(rakes1[ !is.na(REL_ABUND) , max(REL_ABUND) , SURVEY_ID ][,V1])
#
# #and all data are distributed in 1-3 rake density framework
# hist(rakes1[ !is.na(REL_ABUND) , REL_ABUND , ])
#
# # count the number of surveys we've got
# rakes1[ , .N , SURVEY_ID ] # N points per survey (includes NA's--points where no species were observed)

# put the corrected rake scales back into the plants db
# plants[ , , ]
# rakes1[ , .N , OBS_ID][N>1]
# rakes1[is.na(OBS_ID) , ,]

#remove any cases where people told us the rake scale but data to-date are not in db:
rakes1 <- rakes1[!is.na(OBS_ID)]

#pop these corrected rake scale data into the plants dataset
plants[OBS_ID %in% rakes1$OBS_ID , REL_ABUND_CORRECTED := rakes1$REL_ABUND , ]

#clean out intermediates
rm(rakes1)

#which surveys used rake abunds
plants[ , .N , RAKE_SCALE_USED]

#why are there NA abunds for species in these surveys?
plants[!is.na(RAKE_SCALE_USED)& !is.na(TAXON), .N , REL_ABUND_CORRECTED ]
#expect some NAs to come through on these. May be worth looking back at the
# DNR data and evaluating how these plant obs were recorded and how they were
# reported and then carried through our workflow. Seems to me that these surveys
# ought to have numbers assigned to them -- like how could a record show a
# species present but not have a numeric indicator on that? Were they 0/1?
plants[!is.na(RAKE_SCALE_USED) & is.na(REL_ABUND) & !is.na(TAXON), .N , SURVEY_ID ]

bad_abund_surveys <- plants[!is.na(RAKE_SCALE_USED) & is.na(REL_ABUND) & !is.na(TAXON), unique(SURVEY_ID), ]

#force pres/abs on those
plants[SURVEY_ID %in% bad_abund_surveys, c("RAKE_SCALE_USED", "REL_ABUND", "REL_ABUND_CORRECTED"):= NA , ]

# georeference data -------------------------------------------------------

#' ## Georeference Data
#' This section uses MN hydrography geodata to add direct geodata into the
#' dataset.
#' After this section runs, pwi_l and plants can be linked on the shared "order_ID"
#' column, and HUC-8 level watersheds are included in the dataset

# merge geospatial files
#change sf data.frame to a data.table
setDT(pwi_l)

# linking plants db to spatial reference:
#shapefile dows need to be made numeric (drops leading zeros)
# pwi_l[ , dowlknum , ]
pwi_l[ , dow_main := round(as.numeric(dowlknum)/100,0)*100 , ]

#there's a lot of junk in there, work towards a 1:1 of plants dows and pwi_l dows
pwi_l <- pwi_l[!is.na(dowlknum)]# drops many polygons that aren't lakes (islands, rivers, etc)
pwi_l[ , order_ID:= .I , ]#adds a key

#drop non-mn shapes
pwi_l <- pwi_l[!outside_mn == "Y"]

#which dows are duplicated in the shapes?
pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum]

#let's review those data and see if we can devise any cleaning ideas
#pwi_l[dowlknum %in% pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum],]

# we can just use the first instance of these duplicated waterbodies:
# we'll do that by dropping the subsequent duplicates!
pwi_l <- pwi_l[!pwi_l[, duplicated(dowlknum),], , ]

# missing matches in the plants data to shapefile dows
sum(is.na(match(plants[ , unique(DOW) ,], unique(pwi_l[ , dowlknum , ]))))
# missing matches in the plants data to shapefile mainlake dows
sum(is.na(match(plants[ , unique(DOW) ,], unique(pwi_l[ , dow_main , ]))))

#append a polygon value to the plants data (here we'll use our order_ID from above)
plants[ , order_ID := pwi_l[ match( plants[ , DOW ,], pwi_l[ , as.numeric(dow_main) , ]) , order_ID , ] ]
#and any that didn't match on that, try the basin specific
plants[ is.na(order_ID) , order_ID := pwi_l[ match(plants[ is.na(order_ID) , DOW ,], pwi_l[ , as.numeric(dowlknum) , ]), order_ID , ] ]

# now to navigate these last non-compliant ones...
plants[is.na(order_ID) & !is.na(DOW), .N , .(LAKE_NAME, DOW, DATASOURCE)]

pwi_l[ dowlknum == "40000200", order_ID]
plants[ DOW == 40000201, order_ID := pwi_l[ dowlknum == "40000200", order_ID] ] #Upper Sakatah polygon

pwi_l[ dowlknum == "68000500", order_ID]
plants[ DOW %in% c(68000501,68000502), order_ID := pwi_l[ dowlknum == "68000500", order_ID] ] #Roseau River WMA

# pwi_l[ dowlknum == "70005000", order_ID]
# plants[ DOW == 70050000, ] #Carls Lake
# pwi_l[ dowlknum == "82011800", order_ID]
plants[ DOW == 82009999, order_ID := pwi_l[ dowlknum == "82011800", order_ID] ] #Katherine Abbott Pond

# the plants dataset lakes with no geodata in the hydrography layer we used:
plants[is.na(order_ID) , .N , .(LAKE_NAME, DOW, DATASOURCE)]
# in total, this is 18 surveys and 3611 observations without ANY geolocation
plants[, summary(order_ID)]

# fix up local geospatial info
#check for weird X,Y vals in the UTM-looking columns
plants[!is.na(X), summary(X) ,]
plants[!is.na(Y), summary(Y) ,]

#some are clearly lat/longs
plants[X < 4600, X ,]
plants[X < 4600, LATITUDE := X ,]
plants[X < 4600, X := NA ,]

plants[Y<10000, summary(Y) ,]
plants[Y<10000 & Y>0, LATITUDE := Y ,]
plants[Y<10000 & Y>0, Y := NA ,]
plants[Y<10000, LONGITUDE := Y ,]
plants[Y<10000, Y := NA ,]

#whatever is left over here is weird and muddled.
plants[!is.na(X), summary(X) ,]
plants[!is.na(Y), summary(Y) ,]

# we need to delete these non-UTM vals from X & Y
# plants[Y<4800000, .N, DATASOURCE ]
plants[ Y < 4800000, c("X","Y") := NA, ]

#looks clean, now move into the UTM slots?
plants[!is.na(UTMY) , summary(UTMY) ,]
plants[!is.na(UTMX) , summary(UTMX) ,]

#any conflicts with UTM loc data?
plants[!is.na(UTMX) & !is.na(X)]
plants[!is.na(UTMY) & !is.na(Y)]

#move X, Y to UTMs
plants[!is.na(X) , UTMX := X , ]
plants[!is.na(Y) , UTMY := Y , ]
plants[ , c("X", "Y") := NULL , ]

#now Northing Easting, which happen to look like clean UTM data
plants[!is.na(NORTHING), summary(NORTHING) ,]
plants[!is.na(EASTING), summary(EASTING) ,]

#overlap/ conflict?
plants[!is.na(UTMX) & !is.na(NORTHING)]
plants[!is.na(UTMY) & !is.na(EASTING)]

#move Northing and Easting to UTMs
plants[!is.na(NORTHING), UTMY := NORTHING ,]
plants[!is.na(EASTING), UTMX := EASTING ,]
plants[ , c("NORTHING", "EASTING") := NULL , ]

#now get all into same CRS:
#conflicts?
plants[!is.na(UTMX) & !is.na(LONGITUDE)]
plants[!is.na(UTMY) & !is.na(LATITUDE)]
plants[!is.na(LONGITUDE), UTMX:=NA]
plants[!is.na(LATITUDE), UTMY:=NA]

#here we'll split into non-georeferenced, UTM, and lat/long sets, then convert the georeferenced ones to sf objects, then merge all back together
# plants complete x,y in one CRS or another? NOPE... Oh well. moving on.
# plants[!is.na(UTMX) & is.na(UTMY)]
#
# plants[]
#
# plants[!is.na(LATITUDE) & is.na(LONGITUDE)]
# plants[is.na(LATITUDE) & !is.na(LONGITUDE)]

#Conversion of data frame to sf object (note we've assumed NAD1983, Z15N for UTMs)
plants_UTMS <- st_as_sf(x = plants[!is.na(UTMX)],
                        coords = c("UTMX", "UTMY"),
                        crs = "+proj=utm +zone=15")
#Projection transformation
plants_U_LL = st_transform(plants_UTMS, crs = "+proj=longlat +datum=WGS84")
setDT(plants_U_LL)

#Conversion of data frame to sf object
plants_LLS <- st_as_sf(x = plants[!is.na(LONGITUDE)],
                       coords = c("LONGITUDE", "LATITUDE"),
                       crs = "+proj=longlat +datum=WGS84")
setDT(plants_LLS)

#drop unused CRS cols from each:
plants_U_LL[ , c("LATITUDE", "LONGITUDE") := NULL , ]
plants_LLS[ , c("UTMX","UTMY") := NULL, ]

plants2 <- rbindlist(list(plants_LLS, plants_U_LL))
plants2 <- cbind(plants2, st_coordinates(st_as_sf(plants2)))
plants2[ , geometry := NULL ,]
names(plants2)[names(plants2)%in% c("X","Y")] <- c("Longitude","Latitude")

#merge back to plants (check dims to ensure no duplications or overlaps):
dim(plants)
plants[is.na(UTMX) & is.na(LONGITUDE) , .N , ]+ plants2[ , .N ,]

plants1 <- plants[is.na(UTMX) & is.na(LONGITUDE), ]
plants1[ , c("UTMX", "UTMY", "LATITUDE", "LONGITUDE") := NULL ,]
plants1[ , c("Longitude", "Latitude") := NA, ]

nrow(plants1)+nrow(plants2)

plants <- rbindlist(list(plants1, plants2))

rm(plants_LLS,plants_U_LL, plants1, plants2)

# label all pwi_l with watershed names
# st_join(pwi_l, watersheds_huc8)
#
# st_crs(plants_UTMS) <- st_crs(watersheds_huc8)
#
# plantsUTMS <- st_join(plants_UTMS, left = TRUE, watersheds_huc8)

pwi_l <- st_sf(pwi_l)
st_crs(pwi_l) <- st_crs(watersheds_huc8) #ignore warning, no re-projection needed in this case; we do this because I lost the crs in some of my data manipulation
pwi_l <- st_join(pwi_l, left = TRUE, watersheds_huc8)
setDT(pwi_l)

rm( plants_UTMS)

# add in secchi data ------------------------------------------------------

#' ## Secchi Data Join
#'
#' This code will conduct an eval of the fuzzy join of Secchi to plants data,
#' calculate Secchi metrics based on the chosen fuzzy join, then execute the join.
#' The code includes a solution adapted from a script written by Dan Larkin for
#' the niches project (https://conservancy.umn.edu/handle/11299/218009).
#'
#' Assign a Secchi to each observation: use the closest Secchi temporally.
#' For each plant observation, we'll append the Secchi observation from that DOW
#' that was closest in time to the plant obs.
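#' A toy illustration (invented values, not from these datasets) of the
#' data.table rolling join used below: joining on DOW and date with
#' roll = "nearest" pulls, for each plant survey date, the Secchi record from
#' the same lake that is closest in time.
# toy_secchi <- data.table(DOW = 1, Date = as.IDate(c("2010-06-01", "2010-08-15")), Secchi_m = c(2.1, 1.4))
# toy_plants <- data.table(DOW = 1, date = as.IDate("2010-08-01"))
# toy_secchi[toy_plants, on = .(DOW, Date = date), roll = "nearest"]
# # returns Secchi_m = 1.4 -- the 2010-08-15 record is nearest to 2010-08-01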
#' # number of observations hist(secchi[,year(Date)]) hist(secchi[,month(Date)]) secchi[,.N,Source] # and for the plants data? plants[ is.na(INDATABASE) , YEAR:= year(as.IDate(DATESURVEYSTART)) , ] #new data imports need to move date in from the chr strings plants[ is.na(INDATABASE) , SURVEY_DATE:= as.IDate(DATESURVEYSTART) , ] #new data imports need to move date in from the chr strings hist(plants[ ,.N , .(SURVEY_ID,YEAR) ][ , YEAR,]) #clean some data in prep for join secchi[, YEAR := year(Date)] secchi[, MONTH := month(Date) ] secchi[ , old_DOW := DOW] secchi[, DOW := as.integer(DOW)] secchi[ is.na(DOW) , old_DOW ] #how many survey DOW's have a secchi for the lake (ever)? summary(plants[ , unique(DOW) , ]%in%secchi[ ,DOW ,]) #how many surveys have a secchi for that year? summary(plants[ , .N ,.(DOW,YEAR) ][,paste(DOW,YEAR, sep = "_"),] %in% secchi[ ,paste(DOW,YEAR, sep = "_") ,]) #how many surveys have a secchi for that month? summary(plants[ , .N ,.(DOW,YEAR,MONTH) ][,paste(paste(DOW,YEAR, sep = "_"),MONTH, sep = "_"),] %in% secchi[ ,paste(paste(DOW,YEAR, sep = "_"),MONTH, sep = "_") ,]) plants[ , date := SURVEY_DATE] #consolidate to the DOW-Date level secchi <- secchi[ , .("Secchi_m" = mean(Secchi_m)) , .(DOW, Date) ] secchi[ , SECCHI_DATE := Date] secchi <- secchi[!is.na(DOW)] plants <- secchi[plants, , on = .(DOW, Date = date), roll='nearest' ] #drop the Date field (now a dup of SURVEY_DATE) plants[ , Date := NULL] #how far apart are plant and secchi obs? hist(plants[,SURVEY_DATE-SECCHI_DATE,]) #keep only Secchi obs within a month (date+/-30d) hist(plants[ , abs(yday(SECCHI_DATE) - yday(SURVEY_DATE)), ]) plants[abs(yday(SECCHI_DATE) - yday(SURVEY_DATE))<30 & abs(year(SECCHI_DATE) - year(SURVEY_DATE))<1, SECCHI_m_ACCEPTED := Secchi_m ] #cleanup: rm(secchi) #' ### Calc Light Availability # calculate point level light avail plants[ , proplight := exp(-(log(10)/SECCHI_m_ACCEPTED)*(DEPTH_FT/3.2804)) ] nrow(plants[!is.na(proplight) , .N , POINT_ID])/ #how many points can we do this for? nrow(plants[, .N , POINT_ID])#out of total n points plants[ ,hist(proplight, breaks = 100, main = "78% coverage for light availability")] # clean taxa names here --------------------------------------------------- #' ## Tidy Taxa Names #these names ought to be pretty close already. plants[ , .N , TAXON] plants[ , sort(unique(tolower(gsub("_", " ", TAXON)))) , ] plants[ , TAXON := tolower(gsub("_", " ", TAXON)) , ] # clean up taxonomy using macroniche (see paper or git repo) taxonomy ------------------------------------------------------ #' Verhoeven, M. R., Glisson, W. J., & Larkin, D. J. (2020). Niche models #' differentiate potential impacts of two aquatic invasive plant species on #' native macrophytes. Diversity, 12, 162. https://doi.org/10.3390/d12040162 # pull in taxonomy corrections: tnrs <- fread( file = "data&scripts/data/input/tnrs.final.csv", drop = 1) tnrs[ , submittedname := tolower(gsub("\\.", "", submittedname)) ,]#make format match sum(plants$TAXON %in% tnrs$submittedname) tnrs[match(plants$TAXON, tnrs$submittedname), "species"] # implement name changes -------------------------------------------------------- # Add in a Taxon corrected column plants[, TAXONC := tnrs[match(plants$TAXON, tnrs$submittedname), "species"], ] #these are all looking good to me. 
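# The capitalization fix a few lines below uses a perl regex to title-case the
# genus in a two-word binomial; a quick illustration on an example string:
# sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", "potamogeton crispus", perl = TRUE)
# # returns "Potamogeton crispus" -- sub() only alters the first word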
print(plants[ , .N , TAXONC], max.levels = 200)

#tidy up names
plants[is.na(TAXONC), .N , TAXON ] #review unmatched

plants[is.na(TAXONC) & str_detect(TAXON, "\\(", negate = T ) & sapply(strsplit(TAXON, " "), length) == 2, .N , sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE) ]

#fix first cap in binomials:
plants[is.na(TAXONC) & str_detect(TAXON, "\\(", negate = T ) & sapply(strsplit(TAXON, " "), length) == 2, TAXON := sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE) ]

# overwrite TAXON with corrected names
plants[!is.na(TAXONC), TAXON := TAXONC]

#delete correction col:
plants[ , TAXONC := NULL ,]

#review taxa
plants[ , sort(unique(TAXON)) ,]

#only one species of nitellopsis:
plants[TAXON == "nitellopsis", TAXON := "Nitellopsis obtusa" , ]

# Make sure there aren't NA'd taxa that should be marked no-veg-found (are all NA taxa marked for NO_VEG_FOUND?) Yes, looks good
plants[ is.na(TAXON) , .N, NO_VEG_FOUND , ]

# #now we need to ensure we retain rows where no useable taxa were found
# plants[ NO_VEG_FOUND == TRUE, , ]
#
# #now delete all rows where TAXON == DELETE & NO_VEG_FOUND == F
#
# plants[ TAXON == "DELETE" & NO_VEG_FOUND == F, ]
# plants <- plants[ !(TAXON == "DELETE" & NO_VEG_FOUND == F), ]

# samples per taxon
plants[ , .N , TAXON][order(-N), ]

#and finally a new unique ID for each observation in the dataset
plants[, OBS_ID := .I]

#update SURVEY_ID
plants[ , length(unique(DATASOURCE)) , .(SURVEY_ID)][V1>2] #not needed unless this call is non-empty
# plants[ , SURVEY_ID_NEW:= .GRP , .(SURVEY_ID, DATASOURCE) ]

# plant status information ------------------------------------------------

#' ## Plant Status Key

# native diversity must exclude invasives/introduced species, so generate a df that can be used to select cols in these categories
plants[ , sort(unique(TAXON)) , ][!(plants[ , unique(TAXON) , ] %in% c(rte[native_status == "I", mn_dnr_scientific_name], "Nitellopsis obtusa", "Typha glauca"))]

#we'll use this to select columns as we calculate diversity metrics!
natcols <- plants[ !is.na(TAXON), unique(TAXON) , ][!(plants[ !is.na(TAXON), unique(TAXON) , ] %in% c(rte[native_status == "I", mn_dnr_scientific_name], "Nitellopsis obtusa", "Typha glauca"))] taxacols <- plants[!is.na(TAXON) , unique(TAXON) , ] # Prep Data Products ------------------------------------------------------ # **plants_db ------------------------------------------------------------- #' # Prep Data Products #' ## Observations Long names(plants) plants[ , .N , month(SURVEY_DATE) ] plants[ , unique(DATESURVEYSTART), , ][1:100] plants[ POINT_ID == 151207 , ] #one of these has a substrate, one not any(duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")])) plants[duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")]), , ] plants <- plants[!duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")]), , ] #check has orderID field plants[ , .N , is.na(order_ID) ] plants[is.na(order_ID) , .N , LAKE_NAME ] #I've not been able to resolve the location on these WBs, so we'll leave them un-georeferenced names(plants) # **point level p/a ------------------------------------------------- #' ## Point Occurrences Wide plants[ ,.N , REL_ABUND] plants[ , .N , INDATABASE] # In case of desired fill 1/0 rather than T/F # plants[ , FILLFIELD := as.numeric(INDATABASE) ,] # plants[ , .N , FILLFIELD ] plants_occurrence_wide <- dcast(plants, SURVEY_ID+ POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT + SUBSTRATE + SURVEYOR ~ TAXON, fun.aggregate = last, value.var = "INDATABASE", fill = FALSE) #Specify a logical var for all included data (INDATABASE) so that this species matrix is all T/F; see previous lines for a 0/1 fill #diversity metrics (only have richness with p/a, no "evenness", no "diversity"): # point_natcols <- names(plants_occurrence_wide)[names(plants_occurrence_wide)%in%natcols] # names(plants_occurrence_wide) plants_occurrence_wide[ , richness := rowSums(.SD > 0), .SDcols = taxacols ] plants_occurrence_wide[ , nat_richness := rowSums(.SD > 0), .SDcols = natcols ] #bring all survey level variables back into the dataset #check join # nrow(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ]) # names(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ]) # plants_occurrence_wide <- plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight, DEPTH_FT, SUBSTRATE, SURVEYOR), mult = "last" , ] #and drop unneeded cols & those with loss of meaning through munging: names(plants_occurrence_wide) plants_occurrence_wide[ , c("STA_NBR_DATASOURCE","SURVEY_ID_DATASOURCE", "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OLD_SURVEY_ID", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES", "USEABLE", "CLEANED", "INDATABASE", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK", "DATASOURCE", "RAKE_SCALE_USED", "NA", "TAXON", "SUBSTRATE", "OBS_ID", "NO_VEG_FOUND") := NULL , ] setcolorder(plants_occurrence_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude")) #check to make sure I didn't dump something critical for an ident: 
plants_occurrence_wide[duplicated(plants_occurrence_wide)]

# **point level rake abund --------------------------------------------------

#' ## Point Abundances Wide

plants_rakeabund_wide <- dcast(plants[!is.na(RAKE_SCALE_USED)], SURVEY_ID+ POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT + SUBSTRATE + SURVEYOR ~ TAXON, value.var = "REL_ABUND_CORRECTED", fun.aggregate = last, fill = 0)

plants_rakeabund_wide[ ,.N , NO_VEG_FOUND]

#calculate diversity metrics for each rake throw
rake_taxacols <- names(plants_rakeabund_wide)[names(plants_rakeabund_wide)%in%taxacols]
rake_natcols <- names(plants_rakeabund_wide)[names(plants_rakeabund_wide)%in%natcols]
# names(plants_rakeabund_wide)

# diversity metrics
plants_rakeabund_wide[ , shannon_div := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_taxacols],index = "shannon") ]
plants_rakeabund_wide[ , simpsons_div := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_taxacols],index = "invsimpson") ]
plants_rakeabund_wide[ , shannon_div_nat := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_natcols],index = "shannon") ]
plants_rakeabund_wide[ , simpsons_div_nat := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_natcols],index = "invsimpson") ]

# richness
plants_rakeabund_wide[ , richness := rowSums(.SD > 0), .SDcols = rake_taxacols ]
plants_rakeabund_wide[ , nat_richness := rowSums(.SD > 0), .SDcols = rake_natcols ]

summary(plants_rakeabund_wide$`Potamogeton crispus`)#check that max rakeabund is 3

#bring all survey level variables back into the dataset
#check join
# nrow(plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
# names(plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
# plants_rakeabund_wide <- plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SURVEYOR, SUBSTRATE), mult = "last" , ]

#and drop unneeded cols & those with loss of meaning through munging:
names(plants_rakeabund_wide)
plants_rakeabund_wide[ , c("STA_NBR_DATASOURCE","SURVEY_ID_DATASOURCE", "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OLD_SURVEY_ID", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES", "USEABLE", "CLEANED", "INDATABASE", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK", "DATASOURCE", "RAKE_SCALE_USED", "NA","TAXON", "SUBSTRATE", "OBS_ID", "NO_VEG_FOUND") := NULL , ]

setcolorder(plants_rakeabund_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude"))

#cleanup:
rm(rake_taxacols,rake_natcols)

# **survey level stats -------------------------------------------

#' ## Survey Level Aggregation

surveys <- plants[ ,
                   .(tot_n_samp = length(unique(POINT_ID))) , SURVEY_ID ]

#add richness to the surveys dataset
surveys[ , taxa_richness := #take the "taxon count" and subtract one if the survey includes NAs (see next two lines)
           plants[ , length(unique(TAXON)) , SURVEY_ID ][ , V1]-# ("total richness", but counts NAs as a taxon) minus
           plants[ , ifelse(sum(is.na(TAXON))== 0, 0, 1), SURVEY_ID][,V1],]# (each survey gets a 0 if no NAs or a 1 if it contains NA's)

# extent of vegetation in survey (proportion vegetated)
surveys <- merge(surveys,plants[!is.na(TAXON), .(n_points_vegetated=length(unique(POINT_ID))) , SURVEY_ID ], by = "SURVEY_ID", all.x = TRUE)[is.na(n_points_vegetated), n_points_vegetated := 0 ]
surveys[ , prop_veg := n_points_vegetated/tot_n_samp ,]

#create a plant observation matrix (species abund by survey)
survey_species_matrix <- dcast(plants[!is.na(TAXON) , .("count" = length(unique(POINT_ID))) , .(SURVEY_ID,TAXON)], SURVEY_ID ~ TAXON, value.var = "count", fill = 0) #note that this line creates the matrix ONLY for surveys that had species observations (~70 surveys had no species observed)

#diversity indices:
# species names:
natcols <- names(survey_species_matrix)[names(survey_species_matrix) %in% natcols]

# total diversity
survey_species_matrix[ , shannon_div := diversity(.SD,index = "shannon"), .SDcols = taxacols ]
survey_species_matrix[ , simpsons_div := diversity(.SD,index = "invsimpson"), .SDcols = taxacols ]
# native diversity
survey_species_matrix[ , shannon_div_nat := diversity(.SD,index = "shannon"), .SDcols = natcols ]
survey_species_matrix[ , simpsons_div_nat := diversity(.SD,index = "invsimpson"), .SDcols = natcols ]
survey_species_matrix[simpsons_div_nat == Inf, simpsons_div_nat := 0]
# native richness
survey_species_matrix[ , nat_richness := rowSums(survey_species_matrix[ , .SD, .SDcols = natcols] > 0), ]

# depth stats
# depth surveyed stats:
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("max_depth_surveyed" = max(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("min_depth_surveyed" = min(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("mean_depth_surveyed" = mean(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("median_depth_surveyed" = median(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("IQR_depth_surveyed" = IQR(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]

#vegetated depths data
#max depth vegetated within survey:
#some of these might warrant removal, depending on what's being done with the data
plants[NO_VEG_FOUND == F & DEPTH_FT>50, length(POINT_ID) , .(SURVEY_DATASOURCE, DOW, SUBBASIN, SURVEY_DATE, LAKE_NAME)]
plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("max_depth_vegetated" = max(DEPTH_FT)) , SURVEY_ID]
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("max_depth_vegetated" = max(DEPTH_FT, na.rm = T)) , SURVEY_ID] , by = "SURVEY_ID" , all.x =TRUE )

#other depth vegetated stats:
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("min_depth_vegetated" = min(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("mean_depth_vegetated" = mean(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("median_depth_vegetated" = median(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("IQR_depth_vegetated" = IQR(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )

# species matrix into survey data
#species matrix for surveys
surveys <- merge(surveys, survey_species_matrix, by = "SURVEY_ID", all.x = T)
f_dowle3natozeros(surveys, names(survey_species_matrix)) #the merge incorrectly assigns NAs for non obs... here we replace those with 0s

# check work:
# summary(surveys[,1:17])

#append survey data (basic data from plants db) to these
# names(plants)
surveys <- merge(plants[order(SURVEY_DATE) , .("nobs" = .N, "SURVEY_DATE" = first(SURVEY_DATE)) , .(SURVEY_ID, SURVEY_DATASOURCE, LAKE_NAME, DOW, SUBBASIN, MULTIPARTSURVEY, order_ID) ],surveys, by = "SURVEY_ID")

# summary(surveys)
# names(surveys) <- gsub(" ", "_", gsub( "\\(", "_", gsub( "\\)", "_", names(surveys))))

# secchi data metrics
# rescue the secchi data from the plants db for these surveys
surveys[plants, Secchi_m := Secchi_m, on = "SURVEY_ID"]
surveys[plants, Secchi_m_date := SECCHI_DATE, on = "SURVEY_ID"]

#OPTIONAL: merge in the geodata + lake data to the survey work
# surveys <- pwi_l[surveys, on = .(order_ID), mult = "first" ]
#have to strip off the geometry to prevent failure in write to csv
# surveys[ ,geometry := NULL ,]

# n samples within historical max depth
# surveys <- merge(surveys,plants[!is.na(TAXON), .(alltime_maxvegdep = max(DEPTH_FT)) , .(DOW, SUBBASIN) ], by = c("DOW", "SUBBASIN"), all.x = TRUE) [is.na(alltime_maxvegdep), alltime_maxvegdep := 0 ]
# summary(surveys$alltime_maxvegdep)
# surveys[ , hist(alltime_maxvegdep) , ]

#for records with plants present, what's the max vegetated depth by lake?
#remove these cols if they exist (added this in troubleshooting, should kick a warning, but has no effect on the product)
plants[ ,alltime_maxvegdep := NULL]
plants[ ,survey_maxvegdep := NULL]

# Calculate max depth for non-NA TAXON records
max_depth <- plants[!is.na(TAXON) & DEPTH_FT < 50, .(alltime_maxvegdep = max(DEPTH_FT, na.rm = T)), by = .(DOW, SUBBASIN)]
# Merge the result back into the original data
plants <- merge(plants, max_depth, by = c("DOW", "SUBBASIN"), all.x = TRUE)

# Calculate max depth for non-NA TAXON records
max_depth <- plants[!is.na(TAXON) & DEPTH_FT < 50, .(survey_maxvegdep = max(DEPTH_FT, na.rm = T)), by = .(SURVEY_ID)]
# Merge the result back into the original data
plants <- merge(plants, max_depth, by = "SURVEY_ID", all.x = TRUE)

plants[ , .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ]
summary(plants[ , .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ])
plants[is.na(survey_maxvegdep), .N , NO_VEG_FOUND ]
plot(data = plants[, .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ],alltime_maxvegdep~survey_maxvegdep )

# n_points within all time max vegetated depth (na.omit drops the NA placeholder so points deeper than the max don't inflate the count)
plants[, .(alltime_maxvegdep_n_samp = fifelse(is.na(first(alltime_maxvegdep)) , NA , length(POINT_ID)) ) , SURVEY_ID ]
surveys <- merge(surveys,
                 plants[, .(alltime_maxvegdep_n_samp = fifelse(is.na(first(alltime_maxvegdep)) , NA ,
                                                               length( na.omit(unique(ifelse(DEPTH_FT <= alltime_maxvegdep, POINT_ID, NA))) ) ),
                            alltime_maxvegdep = first(alltime_maxvegdep)) , SURVEY_ID ],
                 by = "SURVEY_ID", all.x = TRUE )

# n_points within survey specific max vegetated depth
surveys <- merge(surveys,
                 plants[, .(survey_maxvegdep_n_samp = fifelse( is.na( first(survey_maxvegdep)) , NA ,
                                                               length( na.omit(unique( ifelse(DEPTH_FT <= survey_maxvegdep, POINT_ID, NA) )) ) ),
                            survey_maxvegdep = first(survey_maxvegdep) ) ,
                            SURVEY_ID ], by = "SURVEY_ID", all.x = TRUE )

plot( data = surveys, alltime_maxvegdep_n_samp ~ n_points_vegetated, xlab = "n points vegetated in this survey", ylab = "n points within all time max vegetated depth")
plot( data = surveys[ , .("propsamples_in_histmaxveg_depth" = (alltime_maxvegdep_n_samp/tot_n_samp), "prop_samples_vegetated" = (n_points_vegetated/tot_n_samp) ) , ], propsamples_in_histmaxveg_depth ~ prop_samples_vegetated)
plot(data = surveys, alltime_maxvegdep_n_samp ~ survey_maxvegdep_n_samp, xlab = "n points within survey specific vegetated depth", ylab = "n points within all time max vegetated depth")
plot(data = surveys, alltime_maxvegdep ~ survey_maxvegdep)

summary(surveys[ , .(alltime_maxvegdep, survey_maxvegdep, max_depth_vegetated, min_depth_vegetated)])

surveys[taxa_richness == 0, .N , alltime_maxvegdep]

# **species pools & watershed metrics ----------------------------------------------------

#' ## Watersheds and Species Pools Across Scales
#' We have super awesome species pool data because we've got species abunds
#' across multiple scales:
#' From the smallest (point -- plants_rakeabund_wide or plants_occurrence_wide)
#' scale we have a species abundance matrix that can be treated as a product of
#' the species pool above it (whole survey/lake), which we also have an
#' abundance matrix for! We can also move up to the landscape scale, building
#' species abundance matrices by aggregating these next-lower-scale data.
#'
#' For example, we can do as described above (compressing matrices to richness
#' for viz) and aggregate to the HUC-8 watershed level.
#'

#first add the lake level richness from each survey to the point rake abund data
plants_rakeabund_wide[ , surveyrichness := surveys[match(plants_rakeabund_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), taxa_richness] ,]
plants_occurrence_wide[ , surveyrichness := surveys[match(plants_occurrence_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), taxa_richness] ,]

ggplot( data = plants_rakeabund_wide, aes(jitter(surveyrichness), jitter(nat_richness)))+
  geom_point(alpha = .05)+
  ylab("point level native richness")+
  xlab("survey level native richness")

# now we need to create a watershed level species matrix:
#check keys
# pwi_l$order_ID
# plants$order_ID

plants[ , watershed := pwi_l[match(plants[ , order_ID ,],pwi_l[ , order_ID ,]), major , ],]
# plants[ , length(unique(POINT_ID)) , watershed]

watersheds <- plants[ , .("n_points" = length(unique(POINT_ID))) , watershed]
watersheds <- merge(watersheds, plants[!is.na(TAXON) , .("n_species" = length(unique(TAXON))) , watershed], by = "watershed", all.x = T)

watershed_occurrence_wide <- dcast(plants, watershed ~ TAXON, value.var = "INDATABASE", fun.aggregate = sum, fill = 0)
watershed_occurrence_wide <- merge(watersheds, watershed_occurrence_wide, by = "watershed", all.x = T)

plants_rakeabund_wide[ , watershed := plants[match(plants_rakeabund_wide[ ,SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,]
plants_rakeabund_wide[ , watershedrichness := watershed_occurrence_wide[match(plants_rakeabund_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,]

# add to occurrence wide set:
plants_occurrence_wide[ , watershed := plants[match(plants_occurrence_wide[ ,SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,]
plants_occurrence_wide[ , watershedrichness := watershed_occurrence_wide[match(plants_occurrence_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,]

# watershed richness as the predictor of point scale richness:
ggplot( data =
plants_rakeabund_wide, aes(jitter(watershedrichness), jitter(nat_richness)))+ geom_point()+ ylab("point level native richness")+ xlab("watershed level native richness") surveys[ , watershed := plants[match(surveys[ , SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,] surveys[ , watershedrichness := watershed_occurrence_wide[match(surveys[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,] surveys[is.na(watershedrichness), watershedrichness := 0] ggplot( data = surveys, aes(watershedrichness, nat_richness))+ geom_point()+ geom_smooth(method = "loess")+ ylab("Survey Richness")+ xlab("HUC-8 Watershed Richness")+ theme_bw() # Get watershed Diversity names(watershed_occurrence_wide) %in% natcols watershed_occurrence_wide[ , simpson_div_nat := diversity(.SD,index = "invsimpson" ) , .SDcols = c(names(watershed_occurrence_wide) %in% natcols)] hist(watershed_occurrence_wide$simpson_div_nat) plants_rakeabund_wide[ , watershedsimpson_nat := watershed_occurrence_wide[match(plants_rakeabund_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] plants_occurrence_wide[ , watershedsimpson_nat := watershed_occurrence_wide[match(plants_occurrence_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] plants_rakeabund_wide[ , surveysimpson_nat := surveys[match(plants_rakeabund_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), simpsons_div_nat] ,] plants_occurrence_wide[ , surveysimpson_nat := surveys[match(plants_occurrence_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), simpsons_div_nat] ,] surveys[ , watershedsimpson_nat := watershed_occurrence_wide[match(surveys[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] #redo species pool plots: point_pools <- ggplot( data = plants_rakeabund_wide[simpsons_div_nat != Inf], aes(surveysimpson_nat, simpsons_div_nat))+ geom_point()+ geom_smooth(method = "lm")+ ylab("Point-scale ENSpie")+ xlab("Lake-scale ENSpie")+ theme_bw() lake_pools <- ggplot( data = surveys, aes(watershedsimpson_nat, simpsons_div_nat))+ geom_point()+ geom_smooth(method = "lm")+ ylab("Lake-scale ENSpie")+ xlab("Watershed-scale ENSpie")+ theme_bw() ggarrange( point_pools, lake_pools ) #bring simpsons div back to HUc8 table setDT(watersheds_huc8) watersheds_huc8[watershed_occurrence_wide, on = .(major = watershed) , simpson_div_nat := simpson_div_nat ] #clean up intermediates: rm(lake_pools, point_pools, survey_species_matrix) # **data products ----------------------------------------------------------- #' # Export Datasets #' #' We have 6 datasets to export: #' #' #' #' 1. 
plants --> plants_env_data.csv -- As long format: each row is a species observation within a point (multiple #' rows per point) including all fields retained through cleaning processes #' # names(plants) plants[ , c("DATASOURCE", "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "SAMPLE_NOTES", "OLD_SURVEY_ID", "DATESURVEYSTART", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "USEABLE", "CLEANED", "INDATABASE", "INVENTORY_NOTES", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK") := NULL , ] setcolorder(plants, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed","alltime_maxvegdep", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR","RAKE_SCALE_USED","survey_maxvegdep", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "NO_VEG_FOUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OBS_ID", "TAXON", "REL_ABUND", "REL_ABUND_CORRECTED" )) # export_names_plants <- tolower(names(plants)) # metadata for column names in this file # dow: MN Dept of Waters Ident. # lake_name: Name of the lake. # order_ID: key used to link to MN Hydrography dataset # subbasin: Sub-basin where the observation was made. # watershed: Watershed associated with the observation. # alltime_maxvegdep: Maximum vegetation depth ever observed in the lake (excludes any depth observation >50ft). # survey_id: Identification number for the survey. # survey_datasource: Name of the source of the survey data. # survey_date: Date when the survey was conducted, if multiple dates uses the first day of the survey. # multipartsurvey: Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART # surveyor: Person or entity conducting the survey if known. # rake_scale_used: Scale used for rake abundance measurements. # survey_maxvegdep: Maximum vegetation depth observed during the survey. # secchi_m: Nearest temporal Secchi depth measured in meters. # secchi_date: Date when Secchi depth was measured. # secchi_m_accepted: Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # point_id: Identification number for the observation point. # depth_ft: Depth in feet. # proplight: Proportion of surface light remaining at DEPTH_FT. # longitude: Longitude coordinate of the observation point. # latitude: Latitude coordinate of the observation point. # no_veg_found: Indicator if no vegetation was found at point. # whole_rake_rel_abund: Relative abundance rating assigned to the whole rake (all species), if assigned. # substrate: Substrate type. # surface_growth: Indicator variable for plant growth reached surface of water. # point_lvl_secchi: Secchi level at the observation point if recorded. # obs_id: Identification number for the observation. # taxon: Name of taxon observed. # rel_abund: Relative abundance observed (see RAKE_SCALE_USED for possible values). # rel_abund_corrected: Corrected relative abundance (fixes all relative abunds to scale of 1,2,3). # fwrite(plants, file = "data&scripts/data/output/DRUM/plants_env_data.csv") #' 2. plants_occurrence_wide --> plants_env_data_wide.csv -- As a wide format of occurrences: each row is a point record, and the columns include a #' species observation (presence/absence) matrix. 
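#' A sketch (not run here) of how a downstream user might split the exported
#' wide file back into its metadata columns and its species matrix; the file
#' name matches the fwrite() call below and the metadata names match the
#' setcolorder() call below:
# wide <- fread("data&scripts/data/output/DRUM/plants_occurrence_wide.csv")
# meta_cols <- c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed",
#                "watershedrichness", "watershedsimpson_nat", "SURVEY_ID",
#                "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR",
#                "surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE",
#                "SECCHI_m_ACCEPTED", "POINT_ID", "DEPTH_FT", "proplight",
#                "Longitude", "Latitude", "richness", "nat_richness")
# species_matrix <- wide[ , .SD, .SDcols = setdiff(names(wide), meta_cols)]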
# names(plants_occurrence_wide) setcolorder(plants_occurrence_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR","surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "richness", "nat_richness")) # export_names_plants_occurrence_wide <- tolower(names(plants_occurrence_wide)) names(plants_occurrence_wide) # I ran glimpse() and then added in metadata: # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # SURVEYOR Person or entity conducting the survey if known. # surveyrichness Taxonomic richness of survey # surveysimpson_nat Inverse simpsons diversity of survey # Secchi_m Nearest temporal Secchi depth measured in meters. # SECCHI_DATE Date when Secchi depth was measured. # SECCHI_m_ACCEPTED Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # POINT_ID Identification number for the observation point. # DEPTH_FT Depth to substrate in feet. # proplight Proportion of surface light remaining at DEPTH_FT. # Longitude Longitude coordinate of the observation point. # Latitude Latitude coordinate of the observation point. # richness Number of unique taxa observed at the point. # nat_richness Number of unique native taxa observed at the point. # the remaining columns are species occurrence columns, indicating whether a taxon was observed, along with it's relative rake abundance (1-3) or not observed (0) at a sample point # fwrite(plants_occurrence_wide, file = "data&scripts/data/output/DRUM/plants_occurrence_wide.csv") #' #' 3. plants_rakeabund_wide --> plants_abund_env_data_wide.csv -- As a wide #' format with species abundances (a subset of "2.") where each row #' is a point record , and the columns include a species abundance matrix # names(plants_rakeabund_wide) setcolorder(plants_rakeabund_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat", "richness", "nat_richness" )) # export_names_plants_rakeabund_wide <- tolower(names(plants_rakeabund_wide)) names(plants_rakeabund_wide) # I ran glimpse() and then added in metadata: # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. 
numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # SURVEYOR Person or entity conducting the survey if known. # surveyrichness Taxonomic richness of survey # surveysimpson_nat Inverse simpsons diversity of survey # Secchi_m Nearest temporal Secchi depth measured in meters. # SECCHI_DATE Date when Secchi depth was measured. # SECCHI_m_ACCEPTED Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # POINT_ID Identification number for the observation point. # DEPTH_FT Depth to substrate in feet. # proplight Proportion of surface light remaining at DEPTH_FT. # Longitude Longitude coordinate of the observation point. # Latitude Latitude coordinate of the observation point. # shannon_div Shannon diversity of taxa observed at the point # simpsons_div Inverse simpsons diversity of taxa observed at the point # shannon_div_nat Shannon diversity of native taxa observed at the point # simpsons_div_nat Inverse simpsons diversity of native taxa observed at the point # richness Number of unique taxa observed at the point. # nat_richness Number of unique native taxa observed at the point. # the remaining columns are species occurrence columns, indicating whether a taxon was observed (1) or not observed (0) at a sample point # fwrite(plants_rakeabund_wide, file = "data&scripts/data/output/DRUM/plants_abund_env_data_wide.csv") #' 4. surveys --> surveys_aqplants.csv -- aggregated plants data at the survey #' level. Each row #' is a set of survey-level summary stats and abundances (number of obs) for all #' species in the dataset. setcolorder(surveys, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "Secchi_m", "Secchi_m_date", "nobs", "tot_n_samp", "max_depth_surveyed", "min_depth_surveyed", "mean_depth_surveyed", "median_depth_surveyed", "IQR_depth_surveyed", "max_depth_vegetated", "min_depth_vegetated", "mean_depth_vegetated", "median_depth_vegetated", "IQR_depth_vegetated", "alltime_maxvegdep", "alltime_maxvegdep_n_samp", "survey_maxvegdep", "survey_maxvegdep_n_samp", "n_points_vegetated", "prop_veg", "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat", "taxa_richness", "nat_richness" )) glimpse(surveys) # export_names_surveys <- tolower(names(surveys)) # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. 
# MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # Secchi_m Nearest temporal Secchi depth measured in meters. # Secchi_m_date Date when Secchi depth was measured. # nobs number of observations in this survey (5 taxa observed at a point = 5 observations--thus this metric can exceed the tot_n_samp value) # tot_n_samp total number of samples taken/points sampled # max_depth_surveyed max depth that surveyors sampled (ALL DEPTHS IN FEET) # min_depth_surveyed min depth that surveyors sampled (ALL DEPTHS IN FEET) # mean_depth_surveyed mean depth that surveyors sampled (ALL DEPTHS IN FEET) # median_depth_surveyed median depth that surveyors sampled (ALL DEPTHS IN FEET) # IQR_depth_surveyed inter-quartile range depth that surveyors sampled (ALL DEPTHS IN FEET) # max_depth_vegetated maximum depth where vegetation was observed (ALL DEPTHS IN FEET) # min_depth_vegetated min depth where vegetation was observed (ALL DEPTHS IN FEET) # mean_depth_vegetated mean depth where vegetation was observed (ALL DEPTHS IN FEET) # median_depth_vegetated median depth where vegetation was observed (ALL DEPTHS IN FEET) # IQR_depth_vegetated inter-quartile range depth where vegetation was observed (ALL DEPTHS IN FEET) # alltime_maxvegdep the max depth of plants ever observed in this lake (across all surveys in this db) # alltime_maxvegdep_n_samp Number of samples taken from points shallower than alltime_maxvegdep during this survey # survey_maxvegdep Survey maximum vegetation depth. # survey_maxvegdep_n_samp Number of samples for survey maximum vegetation depth. # n_points_vegetated Number of points with veg present # prop_veg n_points_vegetated/tot_n_samp # shannon_div Shannon diversity index for this survey # simpsons_div survey inverse Simpson's diversity index. # shannon_div_nat survey Shannon diversity index including native taxa only. # simpsons_div_nat survey inverse Simpson's diversity index including native taxa only # taxa_richness count of taxa in this survey # nat_richness native species taxon count in this survey # the remaining columns are species occurrence count columns, indicating the number of points at which a taxon was observed (1+) or not observed (0) during a survey. # fwrite(surveys, file = "data&scripts/data/output/DRUM/surveys_aqplants.csv") #' 5. missing surveys --> missing_data_surveys.csv -- Surveys that were identified, but for #' which plant data were not successfully collated for this project. Each #' row is a survey. # names(missing_data_surveys) missing_data_surveys[ , c( "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "SAMPLE_NOTES", "OLD_SURVEY_ID", "DATESURVEYSTART", "COHORT","POINT_ID" ,"DEPTH_FT", "NO_VEG_FOUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURFACE_GROWTH", "POINT_LVL_SECCHI","OBS_ID", "TAXON", "REL_ABUND","RAKE_SCALE_USED", "X", "Y", "NORTHING", "EASTING", "LATITUDE", "LONGITUDE", "UTMX", "UTMY" ) := NULL , ] setcolorder(missing_data_surveys, c("DOW", "LAKE_NAME", "SUBBASIN", "DATASOURCE", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "DATEINFO", "MONTH", "DAY", "YEAR", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "USEABLE", "CLEANED", "INDATABASE", "INVENTORY_NOTES", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK" )) # export_missing_data_surveys <- tolower(names(missing_data_surveys)) glimpse(missing_data_surveys) # DOW MN Dept of Waters Ident.
# LAKE_NAME Name of the lake # SUBBASIN Subbasin where the survey was conducted, if applicable # DATASOURCE Internal listing for source that identified the survey # SURVEY_ID Unique identifier for the survey # SURVEY_DATASOURCE Source or authority for the survey data (could be contacted to try to acquire these data) # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART # SURVEYOR Surveyor name(s) if known # DATEINFO Date information that may help in identifying the survey # MONTH Month of the survey # DAY Day of the survey. # YEAR Year of the survey # INVENTORY_STAFF Inventory staff name # INVENTORY_STAFFDATE Date of inventory by staff # USEABLE Indicator for data usability as submitted to project team # CLEANED Indicator for successful pre-cleaning of the data # INDATABASE Indicator for successful processing into database # INVENTORY_NOTES Inventory notes from project staff # SUBMISSION_STAFF staff name that processed the original submission # SUBMISSION_STAFFDATE Date of submission processing # SUBMISSION_NOTES Submission notes from project staff # SURVEY_FEEDBACK Feedback from the survey of data contributors # fwrite(missing_data_surveys, file = "data&scripts/data/output/DRUM/missing_data_surveys.csv") #' 6. watershed level summaries --> watershed_occurrence_wide.csv -- Watershed-level (HUC-8) summaries of the plants data joined to #' watershed attributes from the DNR watersheds layer. Each #' row is a watershed, with its attributes, summary statistics, and counts of observations for each taxon. # watersheds_huc8[ , simpson_div_nat := NULL , ] watershed_occurrence_wide <- watershed_occurrence_wide[as.data.table(watersheds_huc8)[ ,.SD , .SDcols = !c("simpson_div_nat")], on = .(watershed=major) , ] watershed_occurrence_wide[ , c("NA", "HUC_8", "Shape_Leng", "Shape_Area") := NULL , ] # colnames(watershed_occurrence_wide) names <- c("watershed","major_name", "acres", "sq_miles", "prod_year", "source", "geometry", "n_points", "n_species", "simpson_div_nat") setcolorder(watershed_occurrence_wide, names ) glimpse(watershed_occurrence_wide ) # A. Name: watershed # Description: numeric code for watershed; matches the watershed field in the other exported datasets. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # B. Name: major_name # Description: name of major watershed that corresponds to the watershed code. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # C. Name: acres # Description: acres encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # D. Name: sq_miles # Description: square miles encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # E. Name: prod_year # Description: The year of production associated with polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # F. Name: source # Description: The source of polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # G. Name: n_points # Description: number of points sampled in the watershed (note that resampling of points is not accounted for, so a point sampled n times is counted as n points) # # H. Name: n_species # Description: number of unique taxa observed in the watershed # # I.
Name: simpson_div_nat # Description: inverse Simpson's diversity of watershed taxa community # # J.- END: [name of taxon observed in the database] # Description: Number of observations of [named taxa] in this watershed # fwrite(watershed_occurrence_wide[ ,.SD ,.SDcols = !c("geometry" )], file = "data&scripts/data/output/DRUM/watershed_occurrence_wide.csv") # DataVizFigs ------------------------------------------------------- # **distribution of lakes and surveys --------------------------------------- #' # Generate Data Viz for Pub #' #' ## Map and Timeline #plot with lake shapes and point locs!! # pwi_l[order_ID %in% unique(plants[ , order_ID]), ,] # # ggplot(pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , , ], aes(geometry=geometry)) + # geom_sf() + # labs(caption = "Map of lakes with surveys in our database") #Conversion of data frame to sf object to add points plants_pts <- st_as_sf(x = plants[!is.na(Longitude)], coords = c("Longitude", "Latitude"), crs = "+proj=longlat +datum=WGS84") # #map points # ggplot(plants_pts, aes(geometry=geometry)) + # geom_sf() + # labs(caption = "Map of survey points in our database") #plot all together! #other data for fig # usa <- map_data("usa") # canada <- map_data("world", region = "canada") # states <- map_data("state") states <- sf::st_as_sf(maps::map("state", plot = FALSE, fill = TRUE)) mn_df <- subset(states, ID == "minnesota") #Projection transformation plants_pts = st_transform(plants_pts, crs = "+proj=utm +zone=15") pwi_l <- st_sf(pwi_l) pwi_l <- st_transform(pwi_l, crs = st_crs(mn_df)) setDT(pwi_l) watersheds_huc8 <- st_sf(watersheds_huc8) watersheds_huc8 <- st_transform(watersheds_huc8, crs = st_crs(mn_df)) #' ### Sampled Lakes Spatial Distribution #map study_map <- ggplot(data = pwi_l, aes(geometry=geometry))+ geom_sf(data = watersheds_huc8,aes(geometry = geometry), alpha = .05, color = "gray")+ geom_sf(alpha = .5, color = "blue")+ # geom_point( # aes(geometry = geometry), # stat = "sf_coordinates", color = "blue", alpha = 0.5)+ geom_point(data = pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , , ], stat = "sf_coordinates", aes(geometry=geometry), color = "red", alpha = .5)+ geom_sf(data = mn_df,aes(geometry = geom), color = "black", alpha = .05)+ scale_shape_discrete(solid = FALSE)+ theme(text = element_text(size=20))+ # theme_bw()+ ylab("Latitude")+ xlab("Longitude") # study_map pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , plant_survey := T ,] #' ### Schupp's lake classes ggplot(pwi_l, aes(lake_class))+ scale_y_log10()+ geom_histogram( binwidth = 1 )+ geom_histogram(binwidth = 1, data =pwi_l[plant_survey == T], aes(lake_class), color = "red", alpha = .5)+ labs( title = "Distribution versus sampling of Schupp's lake classes \n https://files.dnr.state.mn.us/publications/fisheries/investigational_reports/417.pdf")+ scale_x_continuous(breaks = seq(0,44,2) ) # geom_density(aes(color = plant_survey)) #lake area ggplot(pwi_l, aes(acres.x))+ scale_x_log10()+ scale_y_log10()+ geom_histogram( )+ geom_histogram( data = pwi_l[plant_survey == T], aes(acres.x), color = "red", alpha = .5) ggplot(pwi_l, aes(acres.x))+ # geom_histogram()+ scale_x_log10()+ geom_density(aes(color = plant_survey)) # hist(pwi_l$lake_class) # temporal accumulation ---------------------------------------------------- #' ### Temporal Accumulation # of surveys plants[ , length(unique(SURVEY_ID)), year(SURVEY_DATE)] #of obs plants[ , length(unique(OBS_ID)), year(SURVEY_DATE)] #taxa plants[!is.na(TAXON) , length(unique(TAXON)), year(SURVEY_DATE)]
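#' A minimal sketch (hypothetical IDs and dates, not the pipeline data) of the setorder() + .I idiom used just below to build the cumulative survey-count curve: after sorting by first survey date, the row index .I is the running total of surveys accumulated through that date.
# toy accumulation table; V1 mirrors the first(SURVEY_DATE) column created below
toy_acc <- data.table(SURVEY_ID = 1:4,
                      V1 = as.IDate(c("2004-06-01", "2002-07-15", "2010-08-20", "2006-05-30")))
setorder(toy_acc, V1)     # order surveys by date
toy_acc[ , cumval := .I ] # cumulative count of surveys through each date
# toy_acc
rm(toy_acc) # illustration only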
missing_data_surveys[ , .N , SURVEY_DATE] plotdat <- rbindlist(list(plants[ , first(SURVEY_DATE) , SURVEY_ID],missing_data_surveys[ , first(SURVEY_DATE) , SURVEY_ID]))[!is.na(V1)] setorder(plotdat, V1) plotdat[ , cumval := .I , ] temporal_accumulation <- ggplot(plotdat, aes(V1, cumval)) + geom_line()+ # theme_bw()+ xlab("Year")+ ylab("Cumulative Survey Count")+ theme(text = element_text(size=20)) # plotdat[ , metric := "surveys" , ] # # plotdat_pts <- plants[ , first(SURVEY_DATE) , POINT_ID] # setorder(plotdat_pts, V1) # plotdat_pts[ , cumval := .I , ] # plotdat_pts[ , metric := "points" , ] # # # plotdat_taxa <- plants[ , first(SURVEY_DATE) , TAXON] # setorder(plotdat_taxa, V1) # plotdat_taxa[ , cumval := .I , ] # plotdat_taxa[ , metric := "taxa" , ] # # # plotdat_all <- rbind(rbind(plotdat[ ,2:4 ], plotdat_pts[ ,2:4 ] ) , plotdat_taxa[ ,2:4 ]) # # ggplot(plotdat_all, aes(V1, cumval)) + # geom_line()+ # facet_wrap(~ metric, scales = "free") # #' ### Figure 2 # arrange plots.row <- align_plots(study_map, temporal_accumulation, align="hv", axis="tblr") div.rows <- plot_grid(plots.row[[1]], plots.row[[2]], nrow=1, label_size = 20, label_fontface = "plain", labels = c("(a)", "(b)"), hjust = -0, vjust = 2.4) #write to file # png(file = "Fig_Map_Time.png", width = 10, height = 5, units = "in", res = 1200) div.rows # dev.off() # species abundance distributions ------------------------------------- #' ## Species Abundance Distribution sad.dat <- plants[!is.na(TAXON) , .N , TAXON] setorder(sad.dat, -N) sad.dat[ , TAXON := factor(TAXON, levels = sad.dat$TAXON)] sad.dat[ , perc_abund := N/sum(sad.dat$N) , ] # ggplot(plotdat[], aes(TAXON, perc_abund))+ # geom_point()+ # scale_y_log10()+ # xlab("Taxa")+ # ylab("log10(percent of all observations)")+ # theme_bw()+ # theme(axis.text.x = element_blank()) # write.csv(plotdat, "data&scripts/data/output/species_abund_list.csv") #' Excluding unvegetated points, recalculating and ordering by perc-abund sad.dat <- sad.dat %>% filter(TAXON != "") %>% mutate(perc_abund = N/sum(N)) %>% arrange(desc(perc_abund)) %>% mutate(rank = 1:n()) %>% rename(Taxon = TAXON) sum(sad.dat$perc_abund) #' Most and least abundant top1 <- sad.dat %>% filter(rank %in% 1:22) %>% dplyr::select(Taxon, N) top2 <- sad.dat %>% filter(rank %in% 23:50) %>% dplyr::select(Taxon, N) #' #' ### Figure 3 sad.plot <- ggplot(sad.dat, aes(x = rank, y = perc_abund)) + geom_segment(aes(xend = 105, y = 0.03, x = 52, yend = 0.03), arrow = arrow(length = unit(0.25, "cm")), color = "black") + geom_table(data = top1, aes(x = 175, y = 0.26, label=list(top1)), size = 2, table.theme=ttheme_gtsimple, vjust = "top", hjust = "right") + geom_table(data = top2, aes(x = 237, y = 0.26, label=list(top2)), size = 2, table.theme=ttheme_gtsimple, vjust = "top", hjust = "right") + geom_point(shape = 19, size = 2, alpha = 0.5, color = "steelblue") + geom_rect(aes(xmin = -2, xmax = 52, ymin = 0.0015, ymax = 0.18), fill = "transparent", color = "gray30", linetype = 1) + scale_y_log10(breaks = c(0.1, 0.01, 0.001, 0.0001, 0.00001), labels = c(0.1, 0.01, 0.001, expression(paste("1 × 10"^{-4})), expression(paste("1 × 10"^{-5})))) + scale_x_continuous(breaks = c(seq(0, 200, 50))) + xlab(expression(Species~(italic(n)^{th}~most~abundant))) + ylab("Proportion of total abundance") + coord_cartesian(xlim = c(5, 225), ylim = c(0.000002, 0.15)) + theme(panel.grid.minor=element_blank()) + annotate("text", x = 80, y = 0.03, size = 3, adj = 0.5, label = "50 most\nabundant taxa") # png(file = "Fig_SAD.png", width = 5.5,
height = 5, units = "in", res = 1200) sad.plot # dev.off() # diversity environment relationships ------------------------------------ #' ## Diversity-Environment Relationships #' #' ### Depth summary(plants_rakeabund_wide$simpsons_div_nat) summary(plants_rakeabund_wide$DEPTH_FT) # desc(sort(plants_rakeabund_wide$DEPTH_FT)) #' Removing zero depth, extreme depth, and Inf ENSpie points plants_rakeabund_wide <- plants_rakeabund_wide %>% filter(simpsons_div_nat != Inf) %>% filter(DEPTH_FT != 0) %>% filter(DEPTH_FT < 50) label_depth <- "Point~scale~(italic(N) == 70745)" point_depth <- ggplot(plants_rakeabund_wide, aes(DEPTH_FT*0.3048, simpsons_div_nat)) + geom_point(shape = 19, size = 1, alpha = 0.1, color = "steelblue4")+ scale_x_log10() + geom_smooth(method = 'gam', color = "black") + ylab(expression(italic(ENS[PIE]))) + xlab("Water depth (m)") + ylim(c(0, 15)) + annotate("text", x = 0.08, y = 15 - 0.025*15, size = 2.25, adj = 0, label = label_depth, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0, 0.1, 0.1), "cm")) #' ### Secchi #' #' Removing NAs summary(surveys$simpsons_div_nat) summary(surveys$Secchi_m) surveys <- surveys %>% filter(!is.na(Secchi_m)) label_Secchi <- "Lake~scale~(italic(N) == 2955)" lake_Secchi <- ggplot(surveys, aes(Secchi_m, simpsons_div_nat)) + geom_point(shape = 19, size = 1.5, alpha = 0.3, color = "steelblue4")+ geom_smooth(method = 'gam', color = "black") + ylab("") + xlab("Secchi depth (m)") + ylim(c(0, 21)) + annotate("text", x = 0.07, y = 21 - 21*0.025, size = 2.25, adj = 0, label = label_Secchi, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0, 0.1, 0), "cm")) #' ### Area #' #' Removing NAs summary(watersheds_huc8$simpson_div_nat) watersheds_huc8 <- watersheds_huc8 %>% dplyr::select(-geometry) %>% filter(!is.na(simpson_div_nat)) %>% filter(simpson_div_nat != Inf) label_area <- "Watershed~scale~(italic(N) == 64)" wshed_area <- ggplot(watersheds_huc8, aes(acres*0.404686, simpson_div_nat)) + geom_point(shape = 19, size = 2, alpha = 0.6, color = "steelblue4")+ geom_smooth(method = 'gam', color = "black") + ylab("") + xlab("Area (ha)") + ylim(c(0, 30)) + scale_x_continuous(breaks = c(0, 200000, 400000, 600000), labels = c(0, "200,000", "400,000", "600,000")) + annotate("text", x = 0, y = 30 - 30*0.025, size = 2.25, adj = 0, label = label_area, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0.1, 0.1, 0), "cm")) #' ### Figure 4 plots.row <- align_plots(point_depth, lake_Secchi, wshed_area, align="hv", axis="tblr") div.rows <- plot_grid(plots.row[[1]], plots.row[[2]], plots.row[[3]], nrow=1, label_size = 7.5, label_fontface = "plain", labels = c("(a)", "(b)", "(c)"), hjust = -0.25, vjust = 2) # png(file = "Fig_DivEnv.png", width = 6.5, height = 2.25, units = "in", res = 1200) div.rows # dev.off() # footer ------------------------------------------------------------------ #' ## Document footer # workspace cleanup ------------------------------------------------------- rm(div.rows, lake_Secchi, max_depth, mn_df, plots.row, point_depth, sad.dat, sad.plot, states, study_map, temporal_accumulation, top1, top2, watersheds, wshed_area, bad_abund_surveys, label_area, label_depth, label_Secchi) #' #' Session Information: #+ sessionInfo sessionInfo() #script runtime Sys.time() - strttime
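#' Appendix note (illustration only): the inverse Simpson's diversity used in the figures above is the "effective number of species" ENSpie = 1 / sum(p_i^2), where p_i is the relative abundance of taxon i. A toy check with hypothetical counts against vegan::diversity():
# hypothetical counts for three taxa
toy_counts <- c(taxon_a = 5, taxon_b = 3, taxon_c = 2)
p <- toy_counts / sum(toy_counts)
1 / sum(p^2)                                       # manual ENSpie (inverse Simpson)
vegan::diversity(toy_counts, index = "invsimpson") # same value via vegan
rm(toy_counts, p) # illustration only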