This script will pull in plant observation data from PI surveys, Secchi clarity, lake/watershed geodata, species statuses, and collaborator feedback. Then we’ll sync them into our dataset, creating a full macrophyte observation and environmental dataset for MN lakes.

# Outstanding Work:

Document Preamble

# Keep warnings and messages out of the knitted chunks.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)

# Store the time at which the script started, then print the working directory.
strttime <- Sys.time()
getwd()
## [1] "E:/My Drive/Documents/UMN/Grad School/Larkin Lab/R_projects/MN_aquatic_plants_synthesis"
# load libraries ------------------------------------------------------------------

Libraries

  library(data.table) 
    # update_dev_pkg()# remotes::install_github("Rdatatable/data.table")
  library(ggplot2)
  library(stringr)
  library(sf)
  library(vegan)
  library(gridExtra)
  library(dplyr)
  library(tidyr)
  library(janitor)
  # library(lme4)
  # library(sjPlot)
  # library(mediation)
  library(ggpubr)
  # library(EnvStats)
  # library(lmerTest)
  # library(merTools)
  # library(rstanarm)
  # library(ggsn)
  library(ggpmisc)
  library(cowplot)


# load in functions -------------------------------------------------------

Functions

  # Replace missing values with the character string "0" in selected columns.
  #
  # DT: table to modify; it is updated in place through set().
  # x:  columns to process, given by name or by number (by number is slightly
  #     faster than by name).
  # Returns NULL invisibly (the value of the for loop); used for its side effect.
  # NOTE(review): the fill value is the string "0", not numeric 0 -- presumably
  # the target columns are character; confirm against callers.
  f_dowle3natozeros <- function(DT, x) {
    for (col in x) {
      na_rows <- which(is.na(DT[[col]]))
      set(DT, i = na_rows, j = col, value = "0")
    }
  }


# load in data -------------------------------------------------

Data

The plants observations data and collaborator corrections are datasets that we have generated.

Secchi data have been aggregated from public sources by Kelsey Vitense for https://aslopubs.onlinelibrary.wiley.com/doi/full/10.1002/lol2.10323#lol210323-bib-0034

  # #plants observation dataset:
  # plants <- fread(input = "data&scripts/data/input/plant_surveys_mn.csv", drop = 1:2) #import, dropping the exported row numbers

  # #collaborator corrections and feedback:
  # coll_edits <- fread(input = "data&scripts/data/input/Edited_post_contrib_feedback.csv")

  # Secchi data; the first column (exported row numbers) is dropped on import:
  secchi <- fread(
    input = "data&scripts/data/input/AllSecchi_plus_ShallowLakesSecchi.csv",
    drop = 1
  )


  
  #' The MN DNR does not allow publication of copies of their datasets, and thus the following datasets must be downloaded by a user in order to run this code. 
  #'
  #' Hydrography (https://gisdata.mn.gov/dataset/water-dnr-hydrography; 5April2022) and 
  #' watershed (https://gisdata.mn.gov/dataset/geos-dnr-watersheds; 10Aug2022) data were
  #' retrieved from the MN Geospatial commons. 
  #' 
  #' Species statuses were retrieved from the MN DNR website
  #' (https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html; 5April2022).
  #' 
  #'Citations:
  #'DNR Hydrography Dataset. (2012). Retrieved 5April2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/water_dnr_hydrography/metadata/metadata.html
  #'DNR Watersheds—DNR Level 04—HUC 08—Majors. (2023). Retrieved 10Aug2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/geos_dnr_watersheds/metadata/dnr_watersheds_dnr_level_04_huc_08_majors.html
  #'MNTaxa: The State of Minnesota Vascular Plant Checklist. (2013). Retrieved 5April2022, from https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html
  #' 
  
  
  
  ## MN DNR Datasets
  
  # hydrography & watersheds: read the DNR hydrography features layer first
  pwi_l <- st_read(
    dsn = "data&scripts/data/input/shp_water_dnr_hydrography",
    layer = "dnr_hydro_features_all"
  )
## Reading layer `dnr_hydro_features_all' from data source 
##   `E:\My Drive\Documents\UMN\Grad School\Larkin Lab\R_projects\MN_aquatic_plants_synthesis\data&scripts\data\input\shp_water_dnr_hydrography' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 130913 features and 43 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 189729.8 ymin: 4793853 xmax: 1165764 ymax: 5514207
## Projected CRS: NAD83 / UTM zone 15N
  # DNR level 04 / HUC 08 major watersheds layer
  watersheds_huc8 <- st_read(
    dsn = "data&scripts/data/input/shp_geos_dnr_watersheds",
    layer = "dnr_watersheds_dnr_level_04_huc_08_majors"
  )
## Reading layer `dnr_watersheds_dnr_level_04_huc_08_majors' from data source 
##   `E:\My Drive\Documents\UMN\Grad School\Larkin Lab\R_projects\MN_aquatic_plants_synthesis\data&scripts\data\input\shp_geos_dnr_watersheds' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 81 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 189775.3 ymin: 4816305 xmax: 761638.2 ymax: 5472428
## Projected CRS: NAD83 / UTM zone 15N
  # species statuses (2013 DNR plant checklist)
  rte <- fread(
    input = "data&scripts/data/input/2013_dnr_plant_checklist_web.csv"
  )

  

  

# Strip Rare Species ------------------------------------------------------

Strip Rare Species Idents

Due to legal protections on rare, threatened, and endangered species, the following section strips the identities of rare species in the dataset. The identities can be recovered by requesting either the full data imported above (currently commented out), or the key to the protected species names below (note that only the former will get you any location info that was associated with those data).

  # De-identify rare species.
  # Start by tidying the column names of the species-status table.
  rte <- janitor::clean_names(rte)
  
  # #Strip species ID from protected species
  # namesstripkey <- plants[ TAXON %in% rte[rarity_status != "" , mn_dnr_scientific_name,], .N , TAXON]
  # others <- plants[ !(TAXON %in% rte[, mn_dnr_scientific_name,]), .N , TAXON]
  # rm(others) #none of these trigger me to need to deident them
  # 
  # namesstripkey[ , new_name := paste("ProtectedSpecies", .I, sep = "_") , ,]
  # 
  # plants[ namesstripkey , on = .(TAXON=TAXON) , new_name := new_name]
  # plants[ TAXON %in% namesstripkey[,TAXON],  new_name  ]
  # 
  # plants[ , .N , new_name]
  # 
  # plants[ !is.na(new_name),  TAXON := new_name ]
  # 
  # plants[ , new_name := NULL]
  # colnames(plants)
  # 
  # #strip locs from all points with a protected species
  # 
  # ps_points <- plants[str_detect(TAXON, "ProtectedSpecies" ), unique(POINT_ID)  ,  ]#give me the point ids for all points with a protected species
  # 
  # #any loc data there?
  # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY")]
  # 
  # #delete that data
  # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY"):= NA, ]
  # 
  # #export rte key:
  # fwrite(namesstripkey, file = "data&scripts/data/output/rte_namestrip_key.csv" )
  # #export rte stripped product:
  # fwrite(plants, file = "data&scripts/data/output/plants_input_data_rtestrip.csv")
  #import rte stripped product:
  # plants <- fread(input = "data&scripts/data/output/plants_input_data_rtestrip.csv") #import, dropping the exported row numbers

  
  

# stripped out peoples personally identifiable information ------------------------------------------------

  # Collaborator corrections and feedback ("noPII" file):
  coll_edits <- fread(
    input = "data&scripts/data/input/Edited_post_contrib_feedback_noPII.csv"
  )

  # Plants input product ("rtestrip_noPII" file):
  plants <- fread(
    input = "data&scripts/data/output/plants_input_data_rtestrip_noPII.csv"
  )
  # How many rows have an empty-string taxon?
  plants[TAXON == "", .N]
## [1] 689009
  # Recode empty-string taxa as missing.
  plants[TAXON == "", TAXON := NA_character_]
  
  

# *****DatasetUpdates***** ------------------------------------------------

Dataset Updates

# collaborator corrections ------------------------------------------------

Collaborator Corrections

This section uses the collaborator feedback to revise the dataset.

  # check survey ID alignment
  #sum(!coll_edits[, SURVEY_ID, ] %in% plants[ , SURVEY_ID]) #100% of collaborator input has a match in plants
  # Rename the first column to "feedback" by reference. setnames() avoids the
  # copy that `names(x)[1] <-` makes of a data.table, which matters here because
  # coll_edits is modified in place with `:=` further down.
  setnames(coll_edits, 1L, "feedback")
  # Tally rows per feedback category.
  coll_edits[, .N, by = feedback]
##                                         feedback     N
##                                           <char> <int>
##  1:             curlyleaf pondweed survey delete    23
##  2:                               data available     9
##  3:                       delete erroneous entry     1
##  4:                             duplicate delete     6
##  5:                           errors from import     2
##  6:                             missing metadata     2
##  7:                            no data available    92
##  8: Point 69 has Lmin entered as 5 - change to 2     1
##  9:                  rake density data available     1
## 10:                            reimport required     2
## 11:                             unuseable delete   683
## 12:                        useable data reimport   149
## 13:                                               1853
  # --- deletions ---
  # Surveys whose feedback mentions "delete" are removed outright
  # (this drops about 10k observations from the dataset).
  delete_ids <- coll_edits[str_detect(feedback, "delete"), SURVEY_ID]
  plants <- plants[!(SURVEY_ID %in% delete_ids)]

  # --- tag for reimport ---
  # These surveys need to be reimported and any current data deleted.
  # We have these data in our files.
  reimport_ids <- coll_edits[str_detect(feedback, "import"), SURVEY_ID]

  # Peel off the reimport rows and compress them to one row per survey
  # (a botched import may have produced some data rows).
  sel <- plants[SURVEY_ID %in% reimport_ids]
  sel <- sel[!duplicated(SURVEY_ID)]

  # Blank the point- and observation-level columns to reflect the bad import.
  blank_cols <- c(
    "STA_NBR_DATASOURCE", "DEPTH_FT", "NO_VEG_FOUND", "REL_ABUND",
    "WHOLE_RAKE_REL_ABUND", "SUBSTRATE", "SURVEYOR", "TAXON", "SAMPLE_NOTES",
    "SURFACE_GROWTH", "POINT_LVL_SECCHI", "X", "Y", "NORTHING", "EASTING",
    "LATITUDE", "LONGITUDE", "UTMX", "UTMY", "POINT_ID", "OBS_ID"
  )
  sel[, (blank_cols) := NA]
  sel[, INDATABASE := FALSE] # mark these as not in database

  # Swap the original rows for the one-row placeholders (drops ~900 obs).
  plants <- plants[!(SURVEY_ID %in% reimport_ids)]
  plants <- rbind(plants, sel)
  plants[SURVEY_ID %in% sel$SURVEY_ID, SURVEY_FEEDBACK := "reimport required"]

  # --- data available from collaborator ---
  plants[
    SURVEY_ID %in% coll_edits[feedback %in% c("data available", "missing metadata"), SURVEY_ID],
    SURVEY_FEEDBACK := "data available from collaborator"
  ]
  plants[
    SURVEY_ID %in% coll_edits[feedback %in% c("rake density data available"), SURVEY_ID],
    SURVEY_FEEDBACK := "rake density data available from collaborator"
  ]

  # --- no data available ---
  plants[
    SURVEY_ID %in% coll_edits[feedback %in% c("no data available"), SURVEY_ID],
    SURVEY_FEEDBACK := "no data available"
  ]

  # Remove the scratch objects so the workspace matches what later code expects.
  rm(delete_ids, reimport_ids, blank_cols)

  # How many surveys have data unavailable / raw data location unknown?
  plants[SURVEY_FEEDBACK == "no data available", uniqueN(SURVEY_ID)]
## [1] 92
  # --- one-offs ---
  # Collaborator note: "Point 69 has Lmin entered as 5 - change to 2".
  plants[
    SURVEY_ID == coll_edits[
      feedback == "Point 69 has Lmin entered as 5 - change to 2",
      SURVEY_ID
    ] &
      STA_NBR_DATASOURCE == 69 &
      REL_ABUND == 5,
    REL_ABUND := 2
  ]

  # --- taxa naming problem ---
  plants[TAXON == "Mitellopsis", TAXON := "Nitellopsis"]

  # --- preferred datasource name ---
  # Look up the edited datasource name by matching on DATASOURCE.
  plants[, SURVEY_DATASOURCE := coll_edits$EDIT_DATASOURCE[match(DATASOURCE, coll_edits$DATASOURCE)]]

  # Manual overrides keyed on the original DATASOURCE value.
  plants[DATASOURCE == "DNR Lakes and Rivers", SURVEY_DATASOURCE := "DNR Lakes and Rivers"]
  plants[DATASOURCE %in% c("DNR Fisheries", "Rantala TIP"), SURVEY_DATASOURCE := "DNR Fisheries"]
  plants[DATASOURCE == "Muthukrishnan Et al", SURVEY_DATASOURCE := "DNR Shallow Lakes"]
  # Fold "DNR Fisheries Research" into "DNR Fisheries" (applied after the overrides above).
  plants[SURVEY_DATASOURCE == "DNR Fisheries Research", SURVEY_DATASOURCE := "DNR Fisheries"]

  # Check the number of surveys contributed under each new datasource name.
  plants[, uniqueN(SURVEY_ID), by = SURVEY_DATASOURCE]
##                              SURVEY_DATASOURCE    V1
##                                         <char> <int>
##  1:                          DNR Shallow Lakes  1666
##  2:             Freshwater Scientific Services   200
##  3:         Newman Lab University of Minnesota   115
##  4:         Minnehaha Creek Watershed District   111
##  5:                                              495
##  6:               DNR Invasive Species Program   352
##  7:                              DNR Fisheries    37
##  8:                         Blue Water Science   110
##  9:        Minneapolis Park & Recreation Board    10
## 10:          Endangered Resource Services, LLC     7
## 11:                           Barr Engineering    85
## 12:                 Three Rivers Park District    63
## 13:                    AIS Consulting Services    13
## 14:                              Ramsey County    91
## 15: Ramsey-Washington Metro Watershed District    30
## 16:          Capitol Region Watershed District    60
## 17:           Emmons & Olivier Resources, Inc.     8
  # --- lake name corrections ---
  # Pull each survey's edited lake name into a scratch column, apply the three
  # listed corrections, then drop the scratch column.
  plants[, NEW_LAKE_NAME := coll_edits$EDIT_LAKE_NAME[match(SURVEY_ID, coll_edits$SURVEY_ID)]]
  plants[
    NEW_LAKE_NAME %in% c("lake of the isles", "clear", "bde maka ska"),
    LAKE_NAME := NEW_LAKE_NAME
  ]
  plants[, NEW_LAKE_NAME := NULL]

  # --- surveyor corrections ---
  # Overwrite SURVEYOR wherever a non-empty edited value exists.
  plants[, NEW_SURVEYOR := coll_edits$EDIT_SURVEYOR[match(SURVEY_ID, coll_edits$SURVEY_ID)]]
  plants[!is.na(NEW_SURVEYOR) & NEW_SURVEYOR != "", SURVEYOR := NEW_SURVEYOR]
  plants[, NEW_SURVEYOR := NULL]

  # --- date corrections ---
  # Edited dates are parsed with the "%d%b%Y" format.
  plants[, NEW_DATE := coll_edits$EDIT_DATE[match(SURVEY_ID, coll_edits$SURVEY_ID)]]
  plants[
    !is.na(NEW_DATE) & NEW_DATE != "",
    SURVEY_DATE := as.Date(NEW_DATE, format = "%d%b%Y")
  ]
  plants[, NEW_DATE := NULL]

  # --- input rake density scales ---
  # Overwrite any bad rake scales with the edited value where one was supplied.
  coll_edits[
    !is.na(`EDITED_SCALE_RAKE_DENS (0-X)`),
    `SCALE_RAKE_DENS (0-X)` := `EDITED_SCALE_RAKE_DENS (0-X)`
  ]
  # Scales of 1 or 2 aren't real abundance scales -- mark them NA to indicate
  # that only presence/absence data are useable.
  coll_edits[`SCALE_RAKE_DENS (0-X)` %in% c(1, 2), `SCALE_RAKE_DENS (0-X)` := NA]

  # Push the scale over to the plants table, matched by survey.
  plants[, RAKE_SCALE_USED := coll_edits[["SCALE_RAKE_DENS (0-X)"]][match(SURVEY_ID, coll_edits$SURVEY_ID)]]

  # Surveys where observed abundance exceeds the reported scale, i.e. the
  # collaborator's rake-scale input was not correct:
  plants[REL_ABUND > RAKE_SCALE_USED, .N, by = .(SURVEY_ID)]
##    SURVEY_ID     N
##        <int> <int>
## 1:      1418    48
## 2:      3128     1
  # Two surveys are left to change by hand: use the max scale observed (5)
  # rather than the reported scale.
  plants[SURVEY_ID %in% c(1418, 3128), RAKE_SCALE_USED := 5]

  # Check process: largest abundance value observed under each rake scale.
  plants[, .(max_observed_in_data = max(REL_ABUND, na.rm = TRUE)), by = RAKE_SCALE_USED]
##    RAKE_SCALE_USED max_observed_in_data
##              <int>                <int>
## 1:              NA                    2
## 2:               4                    4
## 3:               5                    5
## 4:               3                    3
  # There are 29 unique surveys where no rake scale data are provided and no
  # easy inference exists -- swap these to presence/absence.
  plants[
    SURVEY_ID %in% plants[is.na(RAKE_SCALE_USED) & REL_ABUND > 1, SURVEY_ID],
    .N,
    by = RAKE_SCALE_USED
  ]
##    RAKE_SCALE_USED     N
##              <int> <int>
## 1:              NA  3443
  # Abundance values currently recorded on rows without a rake scale:
  plants[is.na(RAKE_SCALE_USED), .N, by = REL_ABUND]
##    REL_ABUND      N
##        <int>  <int>
## 1:        NA 984135
## 2:         1  17188
## 3:         2    135
  # Drop the abundance data from these rows.
  plants[is.na(RAKE_SCALE_USED), REL_ABUND := NA_integer_]


  # Clean up the workspace.
  rm(coll_edits, sel)

  # summarize plants dataset ---------------------------------------------------

Review Data Summaries

Review the current data status and outline the changes still needed.

  # Print the structure of the plants table (column types and first values).
  str(plants) #what data formats?
## Classes 'data.table' and 'data.frame':   1278192 obs. of  48 variables:
##  $ SURVEY_ID           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ LAKE_NAME           : chr  "little prairie" "little prairie" "little prairie" "little prairie" ...
##  $ DATASOURCE          : chr  "Muthukrishnan Et al" "Muthukrishnan Et al" "Muthukrishnan Et al" "Muthukrishnan Et al" ...
##  $ SURVEY_DATE         : IDate, format: "2011-08-16" "2011-08-16" ...
##  $ STA_NBR_DATASOURCE  : chr  "1" "1" "10" "11" ...
##  $ DEPTH_FT            : num  3.8 3.8 2.5 5.8 6 6.8 7 6.8 7 6.2 ...
##  $ NO_VEG_FOUND        : logi  FALSE FALSE FALSE TRUE TRUE FALSE ...
##  $ REL_ABUND           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WHOLE_RAKE_REL_ABUND: chr  "" "" "" "" ...
##  $ SUBSTRATE           : chr  "" "" "" "" ...
##  $ SURVEYOR            : chr  "surveyors_1" "surveyors_1" "surveyors_1" "surveyors_1" ...
##  $ TAXON               : chr  "Ceratophyllum demersum" "Vallisneria americana" "Drepanocladus" NA ...
##  $ SURVEY_ID_DATASOURCE: chr  "4664" "4664" "4664" "4664" ...
##  $ SAMPLE_NOTES        : chr  "" "" "" "" ...
##  $ SURFACE_GROWTH      : chr  "" "" "" "" ...
##  $ POINT_LVL_SECCHI    : num  2 2 2.25 2.25 2.25 2.25 2 2 2 2 ...
##  $ X                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Y                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NORTHING            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ EASTING             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LATITUDE            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ LONGITUDE           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ UTMX                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ UTMY                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ POINT_ID            : int  1 1 2 3 4 5 6 7 8 9 ...
##  $ OBS_ID              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ OLD_SURVEY_ID       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DATESURVEYSTART     : chr  "8/16/2011" "8/16/2011" "8/16/2011" "8/16/2011" ...
##  $ DOW                 : int  1001600 1001600 1001600 1001600 1001600 1001600 1001600 1001600 1001600 1001600 ...
##  $ COHORT              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DATEINFO            : chr  "" "" "" "" ...
##  $ MONTH               : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ DAY                 : int  16 16 16 16 16 16 16 16 16 16 ...
##  $ YEAR                : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ SUBBASIN            : chr  "" "" "" "" ...
##  $ INVENTORY_STAFF     : chr  "" "" "" "" ...
##  $ INVENTORY_STAFFDATE : chr  "" "" "" "" ...
##  $ USEABLE             : chr  "" "" "" "" ...
##  $ CLEANED             : chr  "" "" "" "" ...
##  $ INDATABASE          : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ INVENTORY_NOTES     : chr  "" "" "" "" ...
##  $ SUBMISSION_STAFF    : chr  "staff_1" "staff_1" "staff_1" "staff_1" ...
##  $ SUBMISSION_STAFFDATE: chr  "" "" "" "" ...
##  $ SUBMISSION_NOTES    : chr  "" "" "" "" ...
##  $ MULTIPARTSURVEY     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SURVEY_FEEDBACK     : chr  NA NA NA NA ...
##  $ SURVEY_DATASOURCE   : chr  "DNR Shallow Lakes" "DNR Shallow Lakes" "DNR Shallow Lakes" "DNR Shallow Lakes" ...
##  $ RAKE_SCALE_USED     : int  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "index")= int(0) 
##   ..- attr(*, "__SURVEY_ID")= int [1:1278192] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "__SURVEY_FEEDBACK")= int [1:1278192] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "__DATASOURCE")= int [1:1278192] 1 2 3 4 5 6 7 8 9 10 ...
  # Print the column names of the plants table.
  names(plants) #field names
##  [1] "SURVEY_ID"            "LAKE_NAME"            "DATASOURCE"          
##  [4] "SURVEY_DATE"          "STA_NBR_DATASOURCE"   "DEPTH_FT"            
##  [7] "NO_VEG_FOUND"         "REL_ABUND"            "WHOLE_RAKE_REL_ABUND"
## [10] "SUBSTRATE"            "SURVEYOR"             "TAXON"               
## [13] "SURVEY_ID_DATASOURCE" "SAMPLE_NOTES"         "SURFACE_GROWTH"      
## [16] "POINT_LVL_SECCHI"     "X"                    "Y"                   
## [19] "NORTHING"             "EASTING"              "LATITUDE"            
## [22] "LONGITUDE"            "UTMX"                 "UTMY"                
## [25] "POINT_ID"             "OBS_ID"               "OLD_SURVEY_ID"       
## [28] "DATESURVEYSTART"      "DOW"                  "COHORT"              
## [31] "DATEINFO"             "MONTH"                "DAY"                 
## [34] "YEAR"                 "SUBBASIN"             "INVENTORY_STAFF"     
## [37] "INVENTORY_STAFFDATE"  "USEABLE"              "CLEANED"             
## [40] "INDATABASE"           "INVENTORY_NOTES"      "SUBMISSION_STAFF"    
## [43] "SUBMISSION_STAFFDATE" "SUBMISSION_NOTES"     "MULTIPARTSURVEY"     
## [46] "SURVEY_FEEDBACK"      "SURVEY_DATASOURCE"    "RAKE_SCALE_USED"
  # How many surveys in all?
  plants[, uniqueN(SURVEY_ID)]
## [1] 3453
  # How many surveys do we have the data in our db for?
  plants[INDATABASE == TRUE, uniqueN(SURVEY_ID)]
## [1] 3196
  # How many lakes in all?
  plants[, uniqueN(DOW)]
## [1] 1553
  # How many years of data?
  plants[, uniqueN(YEAR)]
## [1] 22
  # How many samples (points) were pulled from the lakes?
  plants[, uniqueN(POINT_ID)]
## [1] 372827
  # How many times was a plant identified in these data?
  plants[!is.na(TAXON), uniqueN(OBS_ID)]
## [1] 594127
## [1] 594127
  #' Let's see how many surveys (then number of points) we have been given by
  #' each contributor. First, list the contributor names:

  plants[, unique(SURVEY_DATASOURCE)]
##  [1] "DNR Shallow Lakes"                         
##  [2] "Freshwater Scientific Services"            
##  [3] "Newman Lab University of Minnesota"        
##  [4] "Minnehaha Creek Watershed District"        
##  [5] ""                                          
##  [6] "DNR Invasive Species Program"              
##  [7] "DNR Fisheries"                             
##  [8] "Blue Water Science"                        
##  [9] "Minneapolis Park & Recreation Board"       
## [10] "Endangered Resource Services, LLC"         
## [11] "Barr Engineering"                          
## [12] "Three Rivers Park District"                
## [13] "AIS Consulting Services"                   
## [14] "Ramsey County"                             
## [15] "Ramsey-Washington Metro Watershed District"
## [16] "Capitol Region Watershed District"         
## [17] "Emmons & Olivier Resources, Inc."
  # Survey contribution viz: one row per survey, bars filled by INDATABASE.
  ggplot(
    data = plants[, .N, by = .(SURVEY_ID, SURVEY_DATASOURCE, INDATABASE)],
    mapping = aes(x = SURVEY_DATASOURCE, fill = INDATABASE)
  ) +
    geom_bar(stat = "count", position = "stack") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ggtitle(label = "n surveys by contributor") +
    scale_y_log10()

  # Point contributions: one row per point, restricted to surveys in the database.
  ggplot(
    data = plants[INDATABASE == TRUE, .N, by = .(POINT_ID, SURVEY_DATASOURCE, INDATABASE)],
    mapping = aes(x = SURVEY_DATASOURCE)
  ) +
    geom_bar(stat = "count", position = "stack") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ggtitle(label = "n points by contributor") +
    scale_y_log10()

The database has all the surveys we know exist for MN in it, including those for which we do not have the data. It is often useful to snip those no-data ones off right away to avoid running any calcs using all those rows w/o any species data.

  # Set the rows flagged as not in the database aside, then continue with only
  # the rows flagged as in the database.
  missing_data_surveys <- plants[INDATABASE == FALSE]
  plants <- plants[INDATABASE == TRUE]
  
  
  # drop zeros --------------------------------------------------------------

Zero Depths

We dropped surveys with no depth data in an early cleaning step. This happened before we merged datasets from the MN DNR into the database, meaning that we’ve still got to do a purge of 0 and NA depths to be sure we’ve handled DNR and other collaborator data consistently:

  # Any remaining points with depth == NA or 0? They need to be dropped to be
  # consistent in the handling of all no-depth sampled points (currently only
  # MNDNR). Tally them per survey and datasource:
  plants[is.na(DEPTH_FT) | DEPTH_FT == 0, .N, by = .(SURVEY_ID, DATASOURCE)]
##      SURVEY_ID          DATASOURCE     N
##          <int>              <char> <int>
##   1:        16 Muthukrishnan Et al     8
##   2:       128 Muthukrishnan Et al     2
##   3:       134 Muthukrishnan Et al     1
##   4:       197 Muthukrishnan Et al     1
##   5:       227 Muthukrishnan Et al    11
##  ---                                    
## 191:      3305 Muthukrishnan Et al     2
## 192:      3307 Muthukrishnan Et al     5
## 193:       827           source_29    23
## 194:       832           source_29    13
## 195:      4335           source_29    15
  # Which datasources still have NA/0 depths? These only still remain in the
  # DNR data -- that's because we did the DNR data merge after cleaning up the
  # other data.
  plants[is.na(DEPTH_FT) | DEPTH_FT == 0, unique(DATASOURCE)]
## [1] "Muthukrishnan Et al" "source_2"            "source_29"          
## [4] "source_28"           "source_38"           "source_27"          
## [7] "source_34"
  # Count all rows belonging to the surveys that contain at least one NA/0 depth.
  plants[
    SURVEY_ID %in% plants[is.na(DEPTH_FT) | DEPTH_FT == 0, SURVEY_ID],
    .N
  ]
## [1] 113259
  # Drop every point whose depth is missing or zero.
  # The negation must wrap the whole condition: `!` binds more tightly than `|`,
  # so the unparenthesized `!is.na(DEPTH_FT) | DEPTH_FT == 0` evaluates to TRUE
  # for zero-depth rows and keeps the very rows this step is meant to remove.
  plants <- plants[!(is.na(DEPTH_FT) | DEPTH_FT == 0)]
  
  
  # duplicated entries ------------------------------------------------------

Duplicated Records

It’s become apparent to me that when we cast the data to long format in the survey collation project, we ended up with many cases of multiple “observations” of the same thing from within a single point. Here we clean up this issue. I found the cause of it by opening up the survey collation project.

  #drop duplicated entries:
  # Print the column names for reference.
  names(plants)
##  [1] "SURVEY_ID"            "LAKE_NAME"            "DATASOURCE"          
##  [4] "SURVEY_DATE"          "STA_NBR_DATASOURCE"   "DEPTH_FT"            
##  [7] "NO_VEG_FOUND"         "REL_ABUND"            "WHOLE_RAKE_REL_ABUND"
## [10] "SUBSTRATE"            "SURVEYOR"             "TAXON"               
## [13] "SURVEY_ID_DATASOURCE" "SAMPLE_NOTES"         "SURFACE_GROWTH"      
## [16] "POINT_LVL_SECCHI"     "X"                    "Y"                   
## [19] "NORTHING"             "EASTING"              "LATITUDE"            
## [22] "LONGITUDE"            "UTMX"                 "UTMY"                
## [25] "POINT_ID"             "OBS_ID"               "OLD_SURVEY_ID"       
## [28] "DATESURVEYSTART"      "DOW"                  "COHORT"              
## [31] "DATEINFO"             "MONTH"                "DAY"                 
## [34] "YEAR"                 "SUBBASIN"             "INVENTORY_STAFF"     
## [37] "INVENTORY_STAFFDATE"  "USEABLE"              "CLEANED"             
## [40] "INDATABASE"           "INVENTORY_NOTES"      "SUBMISSION_STAFF"    
## [43] "SUBMISSION_STAFFDATE" "SUBMISSION_NOTES"     "MULTIPARTSURVEY"     
## [46] "SURVEY_FEEDBACK"      "SURVEY_DATASOURCE"    "RAKE_SCALE_USED"
  # Count rows per survey / point / veg flag / depth / substrate / surveyor /
  # taxon combination (proplight is left out of the key), then plot a histogram
  # of the counts for combinations that occur more than once.
  plants[
    ,
    .N,
    by = .(SURVEY_ID, POINT_ID, NO_VEG_FOUND, DEPTH_FT, SUBSTRATE, SURVEYOR, TAXON)
  ][N > 1, hist(N)]

## $breaks
##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
## 
## $counts
##  [1]  2397    15     5     0     0     0     0     0     0     0     0     0
## [13]     0 36151
## 
## $density
##  [1] 0.0621499689 0.0003889235 0.0001296412 0.0000000000 0.0000000000
##  [6] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [11] 0.0000000000 0.0000000000 0.0000000000 0.9373314665
## 
## $mids
##  [1]  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5 14.5 15.5
## 
## $xname
## [1] "N"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
  # OBS_ID is unique, but it was generated on row value, not on a unique key set.
  plants[, sum(duplicated(OBS_ID))]
## [1] 0
  # Columns that make up the de-duplication key: everything except OBS_ID.
  setdiff(names(plants), "OBS_ID")
##  [1] "SURVEY_ID"            "LAKE_NAME"            "DATASOURCE"          
##  [4] "SURVEY_DATE"          "STA_NBR_DATASOURCE"   "DEPTH_FT"            
##  [7] "NO_VEG_FOUND"         "REL_ABUND"            "WHOLE_RAKE_REL_ABUND"
## [10] "SUBSTRATE"            "SURVEYOR"             "TAXON"               
## [13] "SURVEY_ID_DATASOURCE" "SAMPLE_NOTES"         "SURFACE_GROWTH"      
## [16] "POINT_LVL_SECCHI"     "X"                    "Y"                   
## [19] "NORTHING"             "EASTING"              "LATITUDE"            
## [22] "LONGITUDE"            "UTMX"                 "UTMY"                
## [25] "POINT_ID"             "OLD_SURVEY_ID"        "DATESURVEYSTART"     
## [28] "DOW"                  "COHORT"               "DATEINFO"            
## [31] "MONTH"                "DAY"                  "YEAR"                
## [34] "SUBBASIN"             "INVENTORY_STAFF"      "INVENTORY_STAFFDATE" 
## [37] "USEABLE"              "CLEANED"              "INDATABASE"          
## [40] "INVENTORY_NOTES"      "SUBMISSION_STAFF"     "SUBMISSION_STAFFDATE"
## [43] "SUBMISSION_NOTES"     "MULTIPARTSURVEY"      "SURVEY_FEEDBACK"     
## [46] "SURVEY_DATASOURCE"    "RAKE_SCALE_USED"
  # Using the unique key set (everything BUT OBS_ID), count the duplicated rows;
  # this tells us to drop 500k observations!
  sum(duplicated(plants, by = setdiff(names(plants), "OBS_ID")))
## [1] 543494
  # Keep only the first row of each key combination.
  plants <- plants[!duplicated(plants, by = setdiff(names(plants), "OBS_ID"))]
  
  # still a bunch of dups leftover where we've got:
  # two abunds for one species or two of something for one species...
  # with a little sleuthing, I can see that these are a whole mix of things. For
  # example, James Johnson submitted one survey with two samples for point 213...
  # the solution I'll use is to allow these obs to stay (assuming that both obs 
  # are real, and the data entry resulted in a bad point ID for one of them).
  # because of this, when we agg to the point level, we'll have to choose an obs
  # to use that taxon. You'll see this play out in the species matrix
  # construction below:
  
  
  # How many key combinations (proplight excluded) still occur more than once?
  plants[
    ,
    .N,
    by = .(SURVEY_ID, POINT_ID, NO_VEG_FOUND, DEPTH_FT, SUBSTRATE, SURVEYOR, TAXON)
  ][N > 1, .N]
## [1] 1292
  # Which points do those leftover repeats belong to?
  plants[
    ,
    .N,
    by = .(SURVEY_ID, POINT_ID, NO_VEG_FOUND, DEPTH_FT, SUBSTRATE, SURVEYOR, TAXON)
  ][N > 1, unique(POINT_ID)]
##   [1]  96591  96844 138942 138943 138944 138945 138948 138949 138950 138951
##  [11] 138952 138953 138954 138955 138956 138957 138958 138959 138960 138961
##  [21] 138962 138963 138964 138966 138967 138968 138969 138970 138971 138973
##  [31] 138974 138975 138976 138977 138979 138980 138981 138982 138983 138984
##  [41] 138987 138988 138989 138990 138991 138992 138993 138994 138995 138996
##  [51] 138997 138998 138999 139000 139001 139002 139003 139004 139005 139006
##  [61] 139007 139008 139009 139010 139012 139013 139014 139015 139016 139017
##  [71] 139018 139019 139020 139021 139022 139023 139024 139025 139026 139027
##  [81] 139028 139029 139030 139031 139032 139033 139034 139035 139036 139037
##  [91] 139038 139039 139040 139043 139044 139045 139046 139047 139050 139052
## [101] 139053 139054 139055 139056 139059 139063 139065 139066 139067 139068
## [111] 139070 139071 139072 139073 139074 139075 139076 139077 139080 139081
## [121] 139082 139083 139084 139085 139087 139088 139089 139090 139096 139097
## [131] 139098 139099 139101 139102 139103 139104 139105 139108 139109 139110
## [141] 139111 139112 139113 139114 139115 139116 139117 139118 139119 139121
## [151] 139122 139123 139124 139125 139126 139129 139130 139131 139132 139133
## [161] 139134 139140 139142 139145 139146 139147 139148 139151 139152 139153
## [171] 139154 139156 139158 139159 139161 139162 139164 139165 139166 139167
## [181] 139168 139169 139170 139171 139172 139173 139175 139176 139177 139180
## [191] 139181 139182 139185 139186 139188 139189 139190 139191 139192 139193
## [201] 139194 139197 139198 139199 139200 139201 139203 139204 139205 139206
## [211] 139208 139209 139210 139211 139212 139213 139214 139215 139216 139217
## [221] 139219 139220 139222 139223 139224 139225 139228 139229 139231 139232
## [231] 139233 139234 139235 139236 139237 139238 139239 139240 139241 139242
## [241] 139243 139244 139246 139247 139250 139251 139252 139254 139256 139258
## [251] 139259 139260 139261 139262 139263 139264 139265 139266 139267 139268
## [261] 139269 139270 139271 139272 139273 139274 139275 139276 139277 139278
## [271] 139279 139280 139281 139282 139283 139284 139285 139287 139289 139290
## [281] 139291 139292 139293 139294 139295 139296 139297 139298 139299 139300
## [291] 139301 139302 139303 139304 139305 139306 139307 139308 139309 139310
## [301] 139311 139312 139313 139315 139316 139317 139318 139319 139320 139321
## [311] 139322 139325 139326 139330 139332 139333 139334 139335 139336 139337
## [321] 139338 139339 139340 139341 139342 139343 139345 139346 139347 139348
## [331] 139350 139352 139353 139354 139357 139358 139359 139360 139361 139364
## [341] 139365 139368 139369 139370 139371 139372 139373 139374 139375 139376
## [351] 139377 139378 139379 139380 139381 139385 139386 139388 139391 139392
## [361] 139395 139396 139397 139398 139402 139403 139405 139406 139407 139408
## [371] 139411 139412 139413 139415 139416 139417 139418 139420 139423 139424
## [381] 139427 139428 139429 139430 139432 139434 139435 139436 139437 139438
## [391] 139441 139442 139443 139444 139447 139448 139449 139451 139455 139456
## [401] 139457 139459 139460 139461 139462 139463 139465 139466 139467 139470
## [411] 139472 139473 139474 139476 139477 139478 139479 139480 139483 139487
## [421] 139489 139490 139494 139495 139496 139497 139499 139501 139502 139503
## [431] 139505 139507 139508 139509 139511 139513 139514 139515 139516 139517
## [441] 139518 139519 139522 139523 139524 139525 139527 139530 139531 139532
## [451] 139533 139535 139536 139537 139540 139542 139543 139544 139555 139557
## [461] 139558 139560 139561 139562 139564 139567 139571 139576 139583 139589
## [471] 139596 139597 139598 139600 139605 139609 139610 139611 139612 139616
## [481] 139618 139619 139620 139621 139622 139623 139624 139625 139626 139627
## [491] 139629 139630 139631 139635 139637 139638 139639 139643 139645 139646
## [501] 139647 139650 139651 139652 139655 139656 139657 139659 139660 139661
## [511] 139662 139663 139665 139667 139668 139678 139682 139683 139692 139701
## [521] 139708 139710 139722 139723 139728 139729 139730 139733 139734 139735
## [531] 139736 139741 139742 139743 139744 139746 139747 139748 139750 139753
## [541] 139754 139755 139756 139759 139761 139763 139765 139767 139768 139769
## [551] 139771 139772 139775 139777 139780 139781 139782 139783 139784 139786
## [561] 139787 139788 139790 139792 139793 139799 139803 139807 139809 139816
## [571] 139817 139818 139819 139822 139826 139831 139833 139834 139835 139836
## [581] 139846 139848 139854 139855 139856 139857 139859 139860 139861 139862
## [591] 139866 139867 139870 139871 139872 139873 139875 139876 139877 139879
## [601] 139880 139881 139883 139884 139885 139887 139889 139891 139893 139894
## [611] 139895 139898 139899 139900 139901 139903 139906 139907 139908 139910
## [621] 139916 139918 139926 139933 139942 139943 139946 139947 139952 139959
## [631] 139965 139973 139981 139986 139992 139994 139995 139996 139997 139998
## [641] 139999 140001 140002 140007 140011 140013 140014 140015 140019 140020
## [651] 140021 140023 140027 140965 140983 253279 255349 255353 359306

Review Data Format

Now “plants” is only those surveys for which we were able to gather and collate the data. Below we organize these data 3 ways:

  1. As long format: each row is a species observation within a point (multiple rows per point) including all fields retained through cleaning processes
  2. As a wide format of occurrences: each row is a point record, and the columns include a species observation (presence/absence) matrix. Here we keep only fields we
  3. As a wide format with species abundances (a subset of “2.”) where each row is a point record , and the columns include a species abundance matrix
  # distribution of the number of distinct points sampled within each survey
  hist(plants[ , uniqueN(POINT_ID) , by = SURVEY_ID][["V1"]], breaks = 100, main = "N points per survey", xlab = "Npoints")

  # every distinct taxon label in the dataset (NA = no taxon recorded on the row)
  plants[ , unique(TAXON)]
##   [1] "Ceratophyllum demersum"         "Vallisneria americana"         
##   [3] "Drepanocladus"                  NA                              
##   [5] "Potamogeton robbinsii"          "Carex"                         
##   [7] "Equisetum"                      "Nuphar variegata"              
##   [9] "Potentilla palustris"           "Sagittaria"                    
##  [11] "Sparganium (floating)"          "Nymphaea odorata"              
##  [13] "Typha angustifolia"             "Potamogeton zosteriformis"     
##  [15] "Zizania palustris"              "Schoenoplectus acutus"         
##  [17] "Potamogeton praelongus"         "Brasenia schreberi"            
##  [19] "Utricularia vulgaris"           "Potamogeton natans"            
##  [21] "Myriophyllum sibiricum"         "Pontederia cordata"            
##  [23] "Schoenoplectus subterminalis"   "Potamogeton amplifolius"       
##  [25] "Eleocharis"                     "Nitella"                       
##  [27] "Najas flexilis"                 "Potamogeton illinoensis"       
##  [29] "Potamogeton gramineus"          "Chara"                         
##  [31] "Utricularia intermedia"         "Sparganium (emergent)"         
##  [33] "Nymphaeaceae"                   "Calla palustris"               
##  [35] "Sagittaria cuneata"             "Bidens beckii"                 
##  [37] "Potamogeton richardsonii"       "Elodea canadensis"             
##  [39] "Potamogeton (narrow)"           "Eleocharis acicularis"         
##  [41] "Potamogeton friesii"            "Potamogeton epihydrus"         
##  [43] "ProtectedSpecies_1"             "Utricularia minor"             
##  [45] "Utricularia"                    "Iris"                          
##  [47] "Schoenoplectus pungens"         "Eleocharis smallii"            
##  [49] "Salix"                          "Schoenoplectus"                
##  [51] "Sparganium"                     "Stuckenia pectinata"           
##  [53] "Sparganium fluctuans"           "Heteranthera dubia"            
##  [55] "Elodea"                         "Najas"                         
##  [57] "Isoetes"                        "Ranunculus flammula"           
##  [59] "Potamogeton crispus"            "Phragmites australis"          
##  [61] "Ranunculus"                     "Dulichium arundinaceum"        
##  [63] "Bolboschoenus fluviatilis"      "Spirodela polyrhiza"           
##  [65] "Potamogeton alpinus"            "Myriophyllum tenellum"         
##  [67] "Schoenoplectus tabernaemontani" "Eriocaulon aquaticum"          
##  [69] "Hypericum"                      "Acorus americanus"             
##  [71] "Persicaria"                     "Impatiens"                     
##  [73] "Lemna trisulca"                 "ProtectedSpecies_2"            
##  [75] "Potamogeton (broad)"            "Lemna minor"                   
##  [77] "Myriophyllum"                   "Hippuris vulgaris"             
##  [79] "Typha latifolia"                "ProtectedSpecies_3"            
##  [81] "Zannichellia palustris"         "ProtectedSpecies_4"            
##  [83] "Poaceae"                        "Potamogeton pusillus"          
##  [85] "Potamogeton foliosus"           "Potamogeton hillii"            
##  [87] "Typha"                          "Typha glauca"                  
##  [89] "Stuckenia"                      "Glyceria borealis"             
##  [91] "Stuckenia filiformis"           "Sium suave"                    
##  [93] "Carex comosa"                   "Scirpus atrovirens"            
##  [95] "Myriophyllum spicatum"          "Sagittaria latifolia"          
##  [97] "Andromeda polifolia"            "Eupatorium perfoliatum"        
##  [99] "Scirpus cyperinus"              "Eragrostis"                    
## [101] "Phalaris arundinacea"           "Sagittaria graminea"           
## [103] "Scutellaria"                    "Sagittaria rigida"             
## [105] "Sparganium eurycarpum"          "Asclepias incarnata"           
## [107] "Alisma triviale"                "Calamagrostis canadensis"      
## [109] "Eupatorium dubium"              "Wolffia columbiana"            
## [111] "Najas guadalupensis"            "Potamogeton nodosus"           
## [113] "Nelumbo lutea"                  "Scirpus validus"               
## [115] "Nuphar advena"                  "Scirpus"                       
## [117] "Nuphar"                         "Nymphaea"                      
## [119] "Ranunculus longirostris"        "Juncus"                        
## [121] "Fontinalis antipyretica"        "Lemna"                         
## [123] "Ranunculus aquatilis"           "Potamogeton"                   
## [125] "Wolffia"                        "Zizania"                       
## [127] "Equisetum fluviatile"           "Eutrochium"                    
## [129] "Sparganium emersum"             "Persicaria amphibia"           
## [131] "Chara canescens"                "Myriophyllum farwellii"        
## [133] "Sparganium angustifolium"       "Utricularia gibba"             
## [135] "Potamogeton vaseyi"             "Drepanocladus aduncus"         
## [137] "Nitellopsis"                    "Eleocharis palustris"          
## [139] "ProtectedSpecies_5"             "Potamogeton spirillus"         
## [141] "Juncus pelocarpus"              "Elatine"                       
## [143] "Riccia fluitans"                "Tolypella intricata"           
## [145] "Potamogeton strictifolius"      "Sparganium natans"             
## [147] "Bidens"                         "Carex lacustris"               
## [149] "Iris versicolor"                "Myriophyllum verticillatum"    
## [151] "Lemna turionifera"              "Zosterella"                    
## [153] "Nuphar microphylla"             "Lysimachia terrestris"         
## [155] "Menyanthes trifoliata"          "Carex aquatilis"               
## [157] "Lobelia dortmanna"              "Characeae"                     
## [159] "ProtectedSpecies_6"             "Lamiaceae"                     
## [161] "Leersia oryzoides"              "Sphagnum"                      
## [163] "Chamaedaphne calyculata"        "Alnus"                         
## [165] "Betula pumila"                  "Sparganium americanum"         
## [167] "Elodea nuttallii"               "Lythrum salicaria"             
## [169] "Impatiens capensis"             "Verbena"                       
## [171] "Persicaria lapathifolia"        "Polygonum amphibium"           
## [173] "Najas minor"                    "Ranunculus flabellaris"        
## [175] "Lychnothamnus barbatus"         "Wolffia borealis"              
## [177] "Caltha palustris"               "Cyperaceae"                    
## [179] "Hypericum ellipticum"           "ProtectedSpecies_7"            
## [181] "Butomus umbellatus"             "Scorpidium scorpioides"        
## [183] "Fontinalis sullivantii"         "Eupatorium maculatum"          
## [185] "Ledum groenlandicum"            "ProtectedSpecies_8"            
## [187] "Sagittaria cristata"            "Boltonia asteroides"           
## [189] "Scolochloa festucacea"          "ProtectedSpecies_9"            
## [191] "Myrica gale"                    "Potamogeton obtusifolius"      
## [193] "Schoenoplectus x oblongus"      "Solidago"                      
## [195] "Utricularia macrorhiza"         "Chara globularis"              
## [197] "Alisma"                         "Asteraceae"                    
## [199] "Lycopus americanus"             "Triadenum fraseri"             
## [201] "Isoetes echinospora"            "Callitriche"                   
## [203] "Nymphaea tuberosa"              "ProtectedSpecies_10"           
## [205] "Acorus"                         "Lysimachia"                    
## [207] "Myriophyllum alterniflorum"     "Littorella uniflora"           
## [209] "Andromeda glaucophylla"         "Sphagnum magellanicum"         
## [211] "Elatine minima"                 "Ricciocarpos natans"           
## [213] "Carex pellita"                  "Carex scoparia"                
## [215] "Iris virginica"                 "Juncus effusus"                
## [217] "Eleocharis erythropoda"         "Juncus arcticus"               
## [219] "Juncus canadensis"              "Ceratophyllum"                 
## [221] "Cicuta"                         "ProtectedSpecies_11"           
## [223] "Nasturtium officinale"          "Rumex orbiculatus"             
## [225] "ProtectedSpecies_12"            "Cicuta maculata"               
## [227] "Veronica americana"             "Bolboschoenus maritimus"       
## [229] "Scutellaria lateriflora"        "Myriophyllum exalbescens"      
## [231] "Schoenoplectus americanus"      "Riccia"
  # distinct taxon labels per survey; wrapping j in .() lets us name the
  # summary column on the fly
  plants[ , .(Ntaxa = uniqueN(TAXON)) , by = SURVEY_ID]
##       SURVEY_ID Ntaxa
##           <int> <int>
##    1:         1    13
##    2:         2    16
##    3:         3    25
##    4:         4    14
##    5:         5    16
##   ---                
## 3190:      4338     9
## 3191:      4339     8
## 3192:      4340     8
## 3193:      4341    26
## 3194:      4333     6
  # richness distributions: distinct taxon labels per survey, then per point
  hist(plants[ , uniqueN(TAXON) , by = SURVEY_ID]$V1, main = "N taxa per survey", xlab = "N taxa")

  hist(plants[ , uniqueN(TAXON) , by = POINT_ID]$V1, main = "N taxa per point", xlab = "N taxa")

# rake scale normalization -------------------------------------------

Rake Abundance Normalization

This code normalizes the relative rake density data from the whole PI dataset, shifting all values to a 0,1,2,3 scale. It was developed in the surveycollation project, but is implemented here (post-collaborator feedback) so that collaborators could tell us which rake scale they used.

  # work on a copy holding only the surveys that reported a 3-, 4- or 5-level
  # rake scale (everything else is left out of the re-binning)
  rakes1 <- plants[RAKE_SCALE_USED %in% 3:5]
  # rakes1[ , .N , REL_ABUND]
  # rakes1[  , .N  , SURVEY_ID] #982

  # Re-bin every scale onto 1-3.
  # 4-level scale: 3 -> 2 and 4 -> 3 (1 and 2 are left as they are)
  rakes1[RAKE_SCALE_USED == 4 & REL_ABUND %in% c(3, 4),
         REL_ABUND := REL_ABUND - 1L]
  # 5-level scale: 3 and 4 both collapse to 2, 5 -> 3 (1 and 2 are left alone)
  rakes1[RAKE_SCALE_USED == 5 & REL_ABUND %in% c(3, 4),
         REL_ABUND := 2L]
  rakes1[RAKE_SCALE_USED == 5 & REL_ABUND == 5,
         REL_ABUND := 3L]
  
  
  # #check that the max vals are all 3's
  # hist(rakes1[ !is.na(REL_ABUND) , max(REL_ABUND) , SURVEY_ID  ][,V1])
  # 
  # #and all data are distributed in 1-3 rake density framework
  # hist(rakes1[ !is.na(REL_ABUND) , REL_ABUND ,  ])
  # 
  # # count the number of surveys we've got
  # rakes1[ , .N , SURVEY_ID ] # N points per survey (includes NA's--points where no species were observed)
  
  
  # put the corrected rake scales back into the plants db
  # plants[ , , ] 
  # rakes1[ , .N , OBS_ID][N>1]
  
  # rakes1[is.na(OBS_ID) , ,]
  
  # remove any cases where people told us the rake scale but data to-date are
  # not in db (these rows have no OBS_ID to key on):
  rakes1 <- rakes1[!is.na(OBS_ID)]

  # pop these corrected rake scale data into the plants dataset.
  # This is an update join keyed on OBS_ID, so each corrected value is written
  # to the row carrying the same OBS_ID. (Assigning rakes1$REL_ABUND by position
  # into the rows selected with %in% only lines up when OBS_ID is unique and
  # both tables are in the same row order.) Rows with no match in rakes1 get NA.
  plants[rakes1, on = "OBS_ID", REL_ABUND_CORRECTED := i.REL_ABUND]

  #clean out intermediates
  rm(rakes1)

  #which surveys used rake abundances (row counts per reported rake scale)
  plants[ , .N , by = RAKE_SCALE_USED]
##    RAKE_SCALE_USED      N
##              <int>  <int>
## 1:              NA 485133
## 2:               4 115191
## 3:               5  98010
## 4:               3  33965
  # among rows that have both a rake scale and a taxon, tabulate the corrected
  # abundance values -- the NA group is the puzzle we chase below
  plants[!is.na(TAXON) & !is.na(RAKE_SCALE_USED), .N, by = REL_ABUND_CORRECTED]
##    REL_ABUND_CORRECTED      N
##                  <int>  <int>
## 1:                   2  67019
## 2:                   1 118929
## 3:                   3  16762
## 4:                  NA   3412
  # Some NAs are expected to come through here. It may be worth going back to
  # the DNR data to see how these plant obs were recorded and how they carry
  # through our workflow: a record showing a taxon present under a rake scale
  # ought to have a numeric abundance -- were these recorded as 0/1?
  # Count the offending rows per survey:
  plants[!is.na(TAXON) & !is.na(RAKE_SCALE_USED) & is.na(REL_ABUND), .N, by = SURVEY_ID]
##     SURVEY_ID     N
##         <int> <int>
##  1:       430   272
##  2:       480  1121
##  3:       515    58
##  4:       592    11
##  5:       718    23
##  6:       738    22
##  7:       855    29
##  8:      1531  1857
##  9:      3172    18
## 10:      2822     1
  # surveys with a rake scale and a taxon but no abundance value on some rows
  bad_abund_surveys <- plants[!is.na(TAXON) & !is.na(RAKE_SCALE_USED) & is.na(REL_ABUND),
                              unique(SURVEY_ID)]
  # treat those whole surveys as presence/absence: blank every abundance field
  plants[SURVEY_ID %in% bad_abund_surveys,
         `:=`(RAKE_SCALE_USED = NA, REL_ABUND = NA, REL_ABUND_CORRECTED = NA)]

 
# georeference data -------------------------------------------------------

Georeference Data

This section uses MN hydrography geodata to add direct geodata into the dataset. After this section has run, pwi_l and plants can be linked on the shared “order_ID” column, and HUC-8 level watersheds are included in the dataset.

  # merge geospatial files
  #change sf data.frame to a data.table (setDT converts in place)
  setDT(pwi_l)
  
  # linking plants db to spatial reference:
  #shapefile dows need to be made numeric (drops leading zeros)
  # pwi_l[ , dowlknum , ]
  # dow_main is the DOW with its last two digits zeroed. floor() is used rather
  # than round(): rounding would push any suffix of 50 or more up to the next
  # hundred, i.e. onto a different DOW.
  pwi_l[ , dow_main := floor(as.numeric(dowlknum) / 100) * 100 , ]
  
  #there's a lot of junk in there, work towards a 1:1 of plants dows and pwi_l dows
  pwi_l <- pwi_l[!is.na(dowlknum)]# drops many polygons that aren't lakes (islands, rivers, etc)
  # order_ID is the row number at this point; rows are dropped afterwards, so it
  # is a stable key and not a running index
  pwi_l[  , order_ID:= .I , ]#adds a key
  #drop non-mn shapes
  pwi_l <- pwi_l[!outside_mn == "Y"]
  #which dows are duplicated in the shapes? (prints the 2nd+ occurrences)
  pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum]
##  [1] "32005700" "66001400" "34002800" "43011500" "86017800" "70001600"
##  [7] "27000300" "02000500" "86025202" "86025202" "73008200" "75002400"
## [13] "78002400" "56097900" "56078100" "18039400" "18014500" "09003900"
## [19] "18026900" "18031100" "03007700" "03011200" "56164900" "03030400"
## [25] "03037402" "56078600" "60021700" "60017800" "60021700" "04029700"
## [31] "31090300" "31090300" "69058000" "69000300" "16063300" "69034500"
  # Duplicated DOWs: rather than adjudicate between polygons we keep the first
  # polygon listed for each dowlknum and drop every later repeat.
  #pwi_l[dowlknum %in% pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum],]
  pwi_l <- pwi_l[!duplicated(dowlknum)]

  # how many distinct plants DOWs find no match among the shapefile dowlknum?
  sum(!(plants[ , unique(DOW)] %in% pwi_l[ , unique(dowlknum)]))
## [1] 217
  # ...and how many find no match among the shapefile main-lake DOWs (dow_main)?
  sum(!(plants[ , unique(DOW)] %in% pwi_l[ , unique(dow_main)]))
## [1] 42
  # Give every plants row a polygon key (the order_ID built on pwi_l).
  # First pass: look the plants DOW up among the main-lake DOWs (dow_main).
  plants[ , order_ID := pwi_l[["order_ID"]][match(DOW, as.numeric(pwi_l[["dow_main"]]))]]

  # Second pass, only for rows still without a key: look the DOW up among the
  # basin-specific DOWs (dowlknum).
  plants[is.na(order_ID),
         order_ID := pwi_l[["order_ID"]][match(DOW, as.numeric(pwi_l[["dowlknum"]]))]]

  # What remains has a DOW but no polygon -- list those lakes for manual fixes.
  plants[is.na(order_ID) & !is.na(DOW), .N, by = .(LAKE_NAME, DOW, DATASOURCE)]
##                LAKE_NAME      DOW          DATASOURCE     N
##                   <char>    <int>              <char> <int>
## 1:           sakatah bay 40000201           source_17   934
## 2: rrwma - pool 1 - east 68000501 Muthukrishnan Et al    74
## 3: rrwma - pool 1 - west 68000502 Muthukrishnan Et al    94
## 4: katherine abbott pond 82009999           source_25    36
  # Hand-assign polygons to the DOWs that matched nothing. Each patch first
  # prints the order_ID being borrowed, then writes it onto the plants rows.
  pwi_l[dowlknum %in% "40000200", order_ID]
## [1] 1267
  plants[DOW %in% 40000201, order_ID := pwi_l[dowlknum %in% "40000200", order_ID]] #Upper Sakatah polygon

  pwi_l[dowlknum %in% "68000500", order_ID]
## [1] 26678
  plants[DOW == 68000501 | DOW == 68000502, order_ID := pwi_l[dowlknum %in% "68000500", order_ID]] #Roseau River WMA

  # pwi_l[ dowlknum == "70005000", order_ID]
  # plants[ DOW == 70050000, ]  #Carls Lake
  #
  pwi_l[dowlknum %in% "82011800", order_ID]
## [1] 7235
  plants[DOW %in% 82009999, order_ID := pwi_l[dowlknum %in% "82011800", order_ID]] #Katherine Abbott Pond

  # plants-dataset lakes that still have no polygon in the hydrography layer:
  plants[is.na(order_ID), .N, by = .(LAKE_NAME, DOW, DATASOURCE)]
##                                     LAKE_NAME   DOW          DATASOURCE     N
##                                        <char> <int>              <char> <int>
##  1:                                    pool 2    NA Muthukrishnan Et al    97
##  2:                               reynen pond    NA Muthukrishnan Et al    46
##  3:                        unnamed delong wpa    NA Muthukrishnan Et al    22
##  4:                                   big sob    NA           source_26    64
##  5:                    ivanhoe wma east basin    NA Muthukrishnan Et al    26
##  6:                                olson pool    NA Muthukrishnan Et al   104
##  7: goldmine slough section - vermilion river    NA Muthukrishnan Et al   215
##  8: vermilion falls section - vermilion river    NA Muthukrishnan Et al    88
##  9:                                gull river    NA Muthukrishnan Et al   335
## 10:                         mississippi river    NA Muthukrishnan Et al  1769
## 11:                            little elk wma    NA Muthukrishnan Et al    98
## 12:                                   unnamed    NA Muthukrishnan Et al    20
## 13:                          sand prairie wma    NA Muthukrishnan Et al    75
## 14:                    loerch wma impoundment    NA Muthukrishnan Et al    31
## 15:             daggett brook wma impoundment    NA Muthukrishnan Et al   107
## 16:                               sterle pool    NA Muthukrishnan Et al   175
## 17:                              trettle pool    NA Muthukrishnan Et al   323
## 18:                              dundee marsh    NA Muthukrishnan Et al     8
  # in total, this is 18 lake-name groups (the rows of the table printed above)
  # and 3603 observations without ANY geolocation -- the same 3603 shows up as
  # the NA count in the order_ID summary below
  plants[, summary(order_ID)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      13    5269    9954   11269   17166   26778    3603
  # fix up local geospatial info
  # check for weird X,Y vals in the UTM-looking columns.
  # Summaries are taken over all rows so the NA count is reported too. The
  # earlier `!is.na("X")` / `!is.na("Y")` filters tested a string literal, which
  # is always TRUE, so they never removed anything.
  plants[ , summary(X) , ]
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##     45.1 341123.0 495722.0 464619.0 564792.0 598930.0   728703
  plants[ , summary(Y) , ]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     -93  166689 4909613 2728113 5084488 5184300  725818
  # Some X/Y entries are far too small to be UTM coordinates: they look like
  # decimal degrees. Show the small X values first...
  plants[X < 4600, X]
##  [1] 45.09664 45.09664 45.09664 45.09664 45.09664 45.09664 45.09664 45.09664
##  [9] 45.09664 45.09664 45.09664 45.09727 45.09727 45.09727 45.09727 45.09727
  # ...then copy them to LATITUDE and blank the X slot in a single step
  plants[X < 4600, `:=`(LATITUDE = X, X = NA)]

  plants[Y < 10000, summary(Y)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -93.07   44.90   44.91   43.39   44.98   45.00
  # small positive Y values go to LATITUDE...
  plants[Y > 0 & Y < 10000, `:=`(LATITUDE = Y, Y = NA)]

  # ...and whatever small Y values are left after that step go to LONGITUDE
  plants[Y < 10000, `:=`(LONGITUDE = Y, Y = NA)]
  
  # whatever is left over here is weird and muddled.
  # Summaries are taken over all rows so the NA count is reported too. The
  # earlier `!is.na("X")` / `!is.na("Y")` filters tested a string literal, which
  # is always TRUE, so they never removed anything.
  plants[ , summary(X) , ]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  237606  341171  495725  466695  564792  598930  728719
  plants[ , summary(Y) , ]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  140723  210010 5003604 3499077 5086243 5184300  727246
  # rows whose Y falls below 4,800,000 are not usable as UTM northings: blank
  # both coordinates on those rows
  # plants[Y<4800000, .N, DATASOURCE ]
  plants[Y < 4800000, `:=`(X = NA, Y = NA)]

  # X/Y now look clean; before moving them, look at what the UTM slots hold
  summary(plants[!is.na(UTMY), UTMY])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4828777 5066644 5161078 5134746 5204415 5339053
  summary(plants[!is.na(UTMX), UTMX])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  229942  380655  408661  417766  453041  712374
  # would the move overwrite anything? (rows holding both a UTM and an X/Y value)
  plants[!is.na(X) & !is.na(UTMX)]
## Empty data.table (0 rows and 50 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  plants[!is.na(Y) & !is.na(UTMY)]
## Empty data.table (0 rows and 50 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  # no clashes, so copy X -> UTMX and Y -> UTMY where present
  plants[!is.na(X), UTMX := X]
  plants[!is.na(Y), UTMY := Y]

  # the X and Y columns are now redundant
  plants[ , `:=`(X = NULL, Y = NULL)]
  
  
  #now Northing Easting, which happen to look like clean UTM data
  plants[!is.na(NORTHING), summary(NORTHING) ,]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 4860888 5003560 5136308 5089221 5203396 5208310
  plants[!is.na(EASTING), summary(EASTING) ,]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  315321  339882  401439  413157  455109  515302
  #overlap/ conflict? Each check pairs a source column with the UTM column it
  # is about to be written into (EASTING -> UTMX, NORTHING -> UTMY), so a
  # non-empty result means the move below would overwrite an existing value.
  plants[!is.na(UTMX) & !is.na(EASTING)]
## Empty data.table (0 rows and 48 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  plants[!is.na(UTMY) & !is.na(NORTHING)]
## Empty data.table (0 rows and 48 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  #move Northing and easting to UTMs
  plants[!is.na(NORTHING), UTMY := NORTHING  ,]
  plants[!is.na(EASTING), UTMX := EASTING  ,]
  
  # the source columns are now redundant
  plants[ , c("NORTHING", "EASTING") := NULL , ]
  
  # Next step is a single CRS for everything. First confirm no row carries both
  # a UTM coordinate and a lat/long coordinate on the same axis:
  plants[!is.na(LONGITUDE) & !is.na(UTMX)]
## Empty data.table (0 rows and 46 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  plants[!is.na(LATITUDE) & !is.na(UTMY)]
## Empty data.table (0 rows and 46 cols): SURVEY_ID,LAKE_NAME,DATASOURCE,SURVEY_DATE,STA_NBR_DATASOURCE,DEPTH_FT...
  # wherever a lat/long value exists it takes precedence: blank the UTM slot
  plants[!is.na(LATITUDE), UTMY := NA]
  plants[!is.na(LONGITUDE), UTMX := NA]
  
  
  #here we'll split into a non, UTM, and LL georef set, then convert ref'd to sf objects, then merge all back together 
  # plants complete x,y in one CRS or another? NOPE... Oh well. moving on.
  # plants[!is.na(UTMX) & is.na(UTMY)]
  # # plants[]
  # 
  # plants[!is.na(LATITUDE) & is.na(LONGITUDE)]
  # plants[is.na(LATITUDE) & !is.na(LONGITUDE)]
  
  #Conversion of data frame to sf object (note we've assumed NAD1983, Z15N for UTMs)
  # NOTE(review): the proj string below names the UTM zone but no datum, so the
  # NAD83 assumption is not actually declared to sf -- confirm this is intended.
  plants_UTMS <- st_as_sf(x = plants[!is.na(UTMX)],                         
                          coords = c("UTMX", "UTMY"),
                          crs = "+proj=utm +zone=15")
  
  #Projection transformation: UTM points re-expressed as WGS84 long/lat
  plants_U_LL = st_transform(plants_UTMS, crs = "+proj=longlat +datum=WGS84")
  
  setDT(plants_U_LL)
  
  #Conversion of data frame to sf object (rows that already carry long/lat)
  plants_LLS <- st_as_sf(x = plants[!is.na(LONGITUDE)],                         
                         coords = c("LONGITUDE", "LATITUDE"),
                         crs = "+proj=longlat +datum=WGS84")
  
  setDT(plants_LLS)
  
  #drop unusedCRS cols from each, so both tables carry the same set of columns:
  plants_U_LL[ , c("LATITUDE", "LONGITUDE") := NULL , ]
  plants_LLS[ , c("UTMX","UTMY") := NULL, ]
  
  # stack the two georeferenced sets, matching columns by name and not by
  # position, so a difference in column order cannot misalign the fields
  plants2 <- rbindlist(list(plants_LLS, plants_U_LL), use.names = TRUE)
  
  # pull the point coordinates out of the geometry column as plain X / Y columns
  plants2 <- cbind(plants2, st_coordinates(st_as_sf(plants2)))
  
  plants2[ , geometry := NULL ,]
  
  # rename with an explicit old -> new mapping, so the result does not depend
  # on where X and Y sit in the table (setnames errors if either is missing)
  setnames(plants2, old = c("X", "Y"), new = c("Longitude", "Latitude"))
  
  #merge back to plants (check dims to ensure no duplications or overlaps):
  dim(plants)
## [1] 732299     46
  # rows with neither a UTM easting nor a longitude, plus the georeferenced
  # rows, should add up to the full table
  nrow(plants[is.na(UTMX) & is.na(LONGITUDE)]) + nrow(plants2)
## [1] 732299
  # the non-georeferenced remainder
  plants1 <- plants[is.na(UTMX) & is.na(LONGITUDE)]

  # swap its four raw coordinate columns for empty Longitude/Latitude columns
  # so that it lines up with plants2
  plants1[ , `:=`(UTMX = NULL, UTMY = NULL, LATITUDE = NULL, LONGITUDE = NULL)]

  plants1[ , `:=`(Longitude = NA, Latitude = NA)]

  plants1[ , .N] + plants2[ , .N]
## [1] 732299
  # stack the two pieces back into a single plants table
  plants <- rbindlist(list(plants1, plants2))

  # the intermediates are no longer needed
  rm(plants1, plants2, plants_LLS, plants_U_LL)
  
  # label all pwi_l with watershed names
  
  # st_join(pwi_l, watersheds_huc8)
  # 
  # st_crs(plants_UTMS) <- st_crs(watersheds_huc8)
  # 
  # plantsUTMS <- st_join(plants_UTMS, left = TRUE, watersheds_huc8)
  
  # Rebuild pwi_l as an sf object and stamp it with the watershed layer's CRS.
  # This only assigns the CRS; it does not re-project. Ignore the warning: the
  # CRS was lost during the data.table manipulation above.
  pwi_l <- st_set_crs(st_sf(pwi_l), st_crs(watersheds_huc8))
  # left spatial join: every pwi_l polygon is kept and picks up watershed fields
  # NOTE(review): a polygon that intersects more than one HUC-8 would come back
  # as several rows here -- confirm order_ID stays unique afterwards.
  pwi_l <- st_join(pwi_l, watersheds_huc8, left = TRUE)
  setDT(pwi_l)
  rm(plants_UTMS)

  

  
# add in secchi data ------------------------------------------------------

Secchi Data Join

This code evaluates the fuzzy join of Secchi to plants data, calculates Secchi metrics based on the chosen fuzzy join, and then executes the join. The code includes a solution adapted from a script written by Dan Larkin for the niches project (https://conservancy.umn.edu/handle/11299/218009).

Assign a Secchi to each observation: use the closest Secchi temporally. For each plant observation, we’ll append the Secchi observation from that DOW that was closest in time to the plant obs.

  # Overview of the Secchi table: number of readings by year, then by month.
  # (No main/xlab is given, so hist() builds its labels from the expression
  # text as written.)
  hist(secchi[,year(Date)])

  hist(secchi[,month(Date)])

  # number of readings contributed by each Source
  secchi[,.N,Source]
##                  Source      N
##                  <char>  <int>
## 1:        Shallow Lakes   1124
## 2:                  PCA 576270
## 3:              DNRFish  10165
## 4: DNR Historical Files   4013
## 5:   ll_biologist_files   2320
  # Same overview for the plants data. Rows with no INDATABASE value first need
  # YEAR and SURVEY_DATE filled from the character field DATESURVEYSTART.
  plants[is.na(INDATABASE),
         `:=`(YEAR = year(as.IDate(DATESURVEYSTART)),
              SURVEY_DATE = as.IDate(DATESURVEYSTART))]
  hist(plants[ ,.N , .(SURVEY_ID,YEAR) ][ , YEAR,])

  # prepare secchi for the join: calendar fields first...
  secchi[ , `:=`(YEAR = year(Date), MONTH = month(Date))]

  # ...then an integer DOW; the original value is kept alongside in old_DOW
  secchi[ , old_DOW := DOW]
  secchi[ , DOW := as.integer(DOW)]
  # show the identifiers that did not convert to an integer
  secchi[is.na(DOW), old_DOW]
## [1] "03IMP002" "24IMP001" "26IMP001" "R001-46G" "R001-46V" "R1-96-1"  "W0127601"
## [8] "W0655001"
  # coverage 1: of the distinct plants DOWs, how many appear in secchi at all?
  summary(unique(plants[["DOW"]]) %in% secchi[["DOW"]])
##    Mode   FALSE    TRUE 
## logical     156    1364
  # coverage 2: of the distinct DOW x YEAR combinations in plants, how many
  # have a Secchi reading in that same year?
  summary(unique(plants[ , .(DOW, YEAR)])[ , paste(DOW, YEAR, sep = "_")] %in%
            secchi[ , paste(DOW, YEAR, sep = "_")])
##    Mode   FALSE    TRUE 
## logical     555    2129
  # coverage 3: the same question at DOW x YEAR x MONTH resolution
  summary(unique(plants[ , .(DOW, YEAR, MONTH)])[ , paste(DOW, YEAR, MONTH, sep = "_")] %in%
            secchi[ , paste(DOW, YEAR, MONTH, sep = "_")])
##    Mode   FALSE    TRUE 
## logical     729    2373
  # join key on the plants side: a copy of the survey date
  plants[ , date := SURVEY_DATE]

  # Collapse secchi to one row per DOW x Date (mean of that day's readings),
  # leaving out rows with no usable DOW.
  # NOTE(review): mean() has no na.rm here, so an NA reading makes that day's
  # mean NA -- confirm that is intended.
  secchi <- secchi[!is.na(DOW), .(Secchi_m = mean(Secchi_m)), by = .(DOW, Date)]
  # keep the reading's own date in a separate column for comparison afterwards
  secchi[ , SECCHI_DATE := Date]

  # for every plants row, pull the Secchi row from the same DOW whose Date is
  # nearest to the survey date (rolling join; all plants rows are kept)
  plants <- secchi[plants, on = .(DOW, Date = date), roll = "nearest"]

  # Date now just repeats SURVEY_DATE, so drop it
  plants[ , Date := NULL]
  
  #how far apart are plant and secchi obs? (survey date minus Secchi date)
  hist(plants[,SURVEY_DATE-SECCHI_DATE,])

  #seasonal offset only: absolute day-of-year difference, which ignores the year
  hist(plants[ , abs(yday(SECCHI_DATE) - yday(SURVEY_DATE)), ])

  #keep only Secchi obs taken fewer than 30 days from the survey date.
  #The gap is measured on the actual dates rather than on day-of-year within the
  #same calendar year, so a reading a few days away across a year boundary is
  #not rejected. Rows without a matched Secchi date give NA and are left unset.
  plants[abs(as.numeric(SURVEY_DATE - SECCHI_DATE)) < 30, SECCHI_m_ACCEPTED := Secchi_m]

  #cleanup:
  rm(secchi)

Calc Light Availability

  # calculate point level light avail
  # proplight = exp(-(ln(10) / Secchi_m) * depth_m) = 10^(-depth_m / Secchi_m),
  # i.e. the value is 0.1 when the depth equals the accepted Secchi depth.
  # DEPTH_FT is converted to metres with 3.28084 ft per m (was mistyped as 3.2804).
  # Rows without SECCHI_m_ACCEPTED or DEPTH_FT get NA.
  plants[ , proplight := exp(-(log(10)/SECCHI_m_ACCEPTED)*(DEPTH_FT/3.28084)) ]
  nrow(plants[!is.na(proplight) , .N , POINT_ID])/ #how many points can we do this for?
  nrow(plants[, .N , POINT_ID])#out of total n points
## [1] 0.7834643
  # Histogram of point-level light availability. The coverage figure in the title
  # (distinct POINT_IDs with a non-missing proplight over all distinct POINT_IDs)
  # is computed from the data so it cannot drift from a hand-typed value.
  plants[ , hist(proplight, breaks = 100,
                 main = sprintf("%.0f%% coverage for light availability",
                                100 * uniqueN(POINT_ID[!is.na(proplight)]) / uniqueN(POINT_ID)))]

## $breaks
##   [1] 0.00 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.10 0.11 0.12 0.13 0.14
##  [16] 0.15 0.16 0.17 0.18 0.19 0.20 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29
##  [31] 0.30 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.40 0.41 0.42 0.43 0.44
##  [46] 0.45 0.46 0.47 0.48 0.49 0.50 0.51 0.52 0.53 0.54 0.55 0.56 0.57 0.58 0.59
##  [61] 0.60 0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69 0.70 0.71 0.72 0.73 0.74
##  [76] 0.75 0.76 0.77 0.78 0.79 0.80 0.81 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89
##  [91] 0.90 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.00
## 
## $counts
##   [1] 87425 21541 20051 14027 13790 12125 12384 11809 10560 16318  8779 10295
##  [13] 11928 10512  9710 10318  7846 11072  7909  9283  5139 11261  5899  7674
##  [25]  6808  7357  8058  6522  7034  5416  4022  9332  4087  6153  4514  7329
##  [37]  4885  5087  3903  5843  3253  5504  4414  5296  4431  3439  5605  2951
##  [49]  3747  3006  3890  5037  2405  3224  4298  1994  3505  3115  2712  3075
##  [61]  2234  3197  1338  3328  2724  2579  2934  1052  2193  1870  1651  2135
##  [73]  1066  1485  1283  1893   660  1695   906  1243  1033  1312   768   909
##  [85]   674   523   664   361   419   566   409   291   168   120   107   122
##  [97]    55    41    37  4525
## 
## $density
##   [1] 15.351832211  3.782600145  3.520956107  2.463141555  2.421524349
##   [6]  2.129150307  2.174630713  2.073660699  1.854336267  2.865441213
##  [11]  1.541592622  1.807802260  2.094557102  1.845907466  1.705076246
##  [16]  1.811841061  1.377757798  1.944243480  1.388820600  1.630095035
##  [21]  0.902408530  1.977431885  1.035864549  1.347554594  1.195484972
##  [26]  1.291889386  1.414985004  1.145263365  1.235170578  0.951049737
##  [31]  0.706263302  1.638699436  0.717677303  1.080466956  0.792658514
##  [36]  1.286972585  0.857806124  0.893277329  0.685366899  1.026030948
##  [41]  0.571226882  0.966502539  0.775098512  0.929977734  0.778083712
##  [46]  0.603888487  0.984238142  0.518195675  0.657973295  0.527853676
##  [51]  0.683084098  0.884497327  0.422318061  0.566134482  0.754728909
##  [56]  0.350146450  0.615478089  0.546994079  0.476227269  0.539970078
##  [61]  0.392290456  0.561393281  0.234952834  0.584396884  0.478334469
##  [66]  0.452872465  0.515210474  0.184731227  0.385090855  0.328372047
##  [71]  0.289915642  0.374906054  0.187189627  0.260766038  0.225294832
##  [76]  0.332410848  0.115896017  0.297642043  0.159093623  0.218270831
##  [81]  0.181394826  0.230387233  0.134860819  0.159620423  0.118354417
##  [86]  0.091838813  0.116598417  0.063391609  0.073576411  0.099389614
##  [91]  0.071820410  0.051099607  0.029500804  0.021072003  0.018789203
##  [96]  0.021423203  0.009658001  0.007199601  0.006497201  0.794590114
## 
## $mids
##   [1] 0.005 0.015 0.025 0.035 0.045 0.055 0.065 0.075 0.085 0.095 0.105 0.115
##  [13] 0.125 0.135 0.145 0.155 0.165 0.175 0.185 0.195 0.205 0.215 0.225 0.235
##  [25] 0.245 0.255 0.265 0.275 0.285 0.295 0.305 0.315 0.325 0.335 0.345 0.355
##  [37] 0.365 0.375 0.385 0.395 0.405 0.415 0.425 0.435 0.445 0.455 0.465 0.475
##  [49] 0.485 0.495 0.505 0.515 0.525 0.535 0.545 0.555 0.565 0.575 0.585 0.595
##  [61] 0.605 0.615 0.625 0.635 0.645 0.655 0.665 0.675 0.685 0.695 0.705 0.715
##  [73] 0.725 0.735 0.745 0.755 0.765 0.775 0.785 0.795 0.805 0.815 0.825 0.835
##  [85] 0.845 0.855 0.865 0.875 0.885 0.895 0.905 0.915 0.925 0.935 0.945 0.955
##  [97] 0.965 0.975 0.985 0.995
## 
## $xname
## [1] "proplight"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
# clean taxa names here ---------------------------------------------------

Tidy Taxa Names

  # The raw names should already be close to final; tally rows per TAXON value
  
  plants[, .N, by = TAXON]
##                       TAXON      N
##                      <char>  <int>
##   1: Ceratophyllum demersum  91658
##   2:  Vallisneria americana  13764
##   3:          Drepanocladus   1772
##   4:                   <NA> 140457
##   5:  Potamogeton robbinsii  10131
##  ---                              
## 228:    Littorella uniflora      3
## 229: Andromeda glaucophylla      2
## 230:  Sphagnum magellanicum      1
## 231:         Elatine minima      3
## 232:        Cicuta maculata      1
  # Preview the distinct names after replacing underscores with spaces and lower-casing
  sort(unique(tolower(gsub("_", " ", plants$TAXON))))
##   [1] "acorus"                         "acorus americanus"             
##   [3] "alisma"                         "alisma triviale"               
##   [5] "alnus"                          "andromeda glaucophylla"        
##   [7] "andromeda polifolia"            "asclepias incarnata"           
##   [9] "asteraceae"                     "betula pumila"                 
##  [11] "bidens"                         "bidens beckii"                 
##  [13] "bolboschoenus fluviatilis"      "bolboschoenus maritimus"       
##  [15] "boltonia asteroides"            "brasenia schreberi"            
##  [17] "butomus umbellatus"             "calamagrostis canadensis"      
##  [19] "calla palustris"                "callitriche"                   
##  [21] "caltha palustris"               "carex"                         
##  [23] "carex aquatilis"                "carex comosa"                  
##  [25] "carex lacustris"                "carex pellita"                 
##  [27] "carex scoparia"                 "ceratophyllum"                 
##  [29] "ceratophyllum demersum"         "chamaedaphne calyculata"       
##  [31] "chara"                          "chara canescens"               
##  [33] "chara globularis"               "characeae"                     
##  [35] "cicuta"                         "cicuta maculata"               
##  [37] "cyperaceae"                     "drepanocladus"                 
##  [39] "drepanocladus aduncus"          "dulichium arundinaceum"        
##  [41] "elatine"                        "elatine minima"                
##  [43] "eleocharis"                     "eleocharis acicularis"         
##  [45] "eleocharis erythropoda"         "eleocharis palustris"          
##  [47] "eleocharis smallii"             "elodea"                        
##  [49] "elodea canadensis"              "elodea nuttallii"              
##  [51] "equisetum"                      "equisetum fluviatile"          
##  [53] "eragrostis"                     "eriocaulon aquaticum"          
##  [55] "eupatorium dubium"              "eupatorium maculatum"          
##  [57] "eupatorium perfoliatum"         "eutrochium"                    
##  [59] "fontinalis antipyretica"        "fontinalis sullivantii"        
##  [61] "glyceria borealis"              "heteranthera dubia"            
##  [63] "hippuris vulgaris"              "hypericum"                     
##  [65] "hypericum ellipticum"           "impatiens"                     
##  [67] "impatiens capensis"             "iris"                          
##  [69] "iris versicolor"                "iris virginica"                
##  [71] "isoetes"                        "isoetes echinospora"           
##  [73] "juncus"                         "juncus arcticus"               
##  [75] "juncus canadensis"              "juncus effusus"                
##  [77] "juncus pelocarpus"              "lamiaceae"                     
##  [79] "ledum groenlandicum"            "leersia oryzoides"             
##  [81] "lemna"                          "lemna minor"                   
##  [83] "lemna trisulca"                 "lemna turionifera"             
##  [85] "littorella uniflora"            "lobelia dortmanna"             
##  [87] "lychnothamnus barbatus"         "lycopus americanus"            
##  [89] "lysimachia"                     "lysimachia terrestris"         
##  [91] "lythrum salicaria"              "menyanthes trifoliata"         
##  [93] "myrica gale"                    "myriophyllum"                  
##  [95] "myriophyllum alterniflorum"     "myriophyllum exalbescens"      
##  [97] "myriophyllum farwellii"         "myriophyllum sibiricum"        
##  [99] "myriophyllum spicatum"          "myriophyllum tenellum"         
## [101] "myriophyllum verticillatum"     "najas"                         
## [103] "najas flexilis"                 "najas guadalupensis"           
## [105] "najas minor"                    "nasturtium officinale"         
## [107] "nelumbo lutea"                  "nitella"                       
## [109] "nitellopsis"                    "nuphar"                        
## [111] "nuphar advena"                  "nuphar microphylla"            
## [113] "nuphar variegata"               "nymphaea"                      
## [115] "nymphaea odorata"               "nymphaea tuberosa"             
## [117] "nymphaeaceae"                   "persicaria"                    
## [119] "persicaria amphibia"            "persicaria lapathifolia"       
## [121] "phalaris arundinacea"           "phragmites australis"          
## [123] "poaceae"                        "polygonum amphibium"           
## [125] "pontederia cordata"             "potamogeton"                   
## [127] "potamogeton (broad)"            "potamogeton (narrow)"          
## [129] "potamogeton alpinus"            "potamogeton amplifolius"       
## [131] "potamogeton crispus"            "potamogeton epihydrus"         
## [133] "potamogeton foliosus"           "potamogeton friesii"           
## [135] "potamogeton gramineus"          "potamogeton hillii"            
## [137] "potamogeton illinoensis"        "potamogeton natans"            
## [139] "potamogeton nodosus"            "potamogeton obtusifolius"      
## [141] "potamogeton praelongus"         "potamogeton pusillus"          
## [143] "potamogeton richardsonii"       "potamogeton robbinsii"         
## [145] "potamogeton spirillus"          "potamogeton strictifolius"     
## [147] "potamogeton vaseyi"             "potamogeton zosteriformis"     
## [149] "potentilla palustris"           "protectedspecies 1"            
## [151] "protectedspecies 10"            "protectedspecies 11"           
## [153] "protectedspecies 12"            "protectedspecies 2"            
## [155] "protectedspecies 3"             "protectedspecies 4"            
## [157] "protectedspecies 5"             "protectedspecies 6"            
## [159] "protectedspecies 7"             "protectedspecies 8"            
## [161] "protectedspecies 9"             "ranunculus"                    
## [163] "ranunculus aquatilis"           "ranunculus flabellaris"        
## [165] "ranunculus flammula"            "ranunculus longirostris"       
## [167] "riccia"                         "riccia fluitans"               
## [169] "ricciocarpos natans"            "rumex orbiculatus"             
## [171] "sagittaria"                     "sagittaria cristata"           
## [173] "sagittaria cuneata"             "sagittaria graminea"           
## [175] "sagittaria latifolia"           "sagittaria rigida"             
## [177] "salix"                          "schoenoplectus"                
## [179] "schoenoplectus acutus"          "schoenoplectus americanus"     
## [181] "schoenoplectus pungens"         "schoenoplectus subterminalis"  
## [183] "schoenoplectus tabernaemontani" "schoenoplectus x oblongus"     
## [185] "scirpus"                        "scirpus atrovirens"            
## [187] "scirpus cyperinus"              "scirpus validus"               
## [189] "scolochloa festucacea"          "scorpidium scorpioides"        
## [191] "scutellaria"                    "scutellaria lateriflora"       
## [193] "sium suave"                     "solidago"                      
## [195] "sparganium"                     "sparganium (emergent)"         
## [197] "sparganium (floating)"          "sparganium americanum"         
## [199] "sparganium angustifolium"       "sparganium emersum"            
## [201] "sparganium eurycarpum"          "sparganium fluctuans"          
## [203] "sparganium natans"              "sphagnum"                      
## [205] "sphagnum magellanicum"          "spirodela polyrhiza"           
## [207] "stuckenia"                      "stuckenia filiformis"          
## [209] "stuckenia pectinata"            "tolypella intricata"           
## [211] "triadenum fraseri"              "typha"                         
## [213] "typha angustifolia"             "typha glauca"                  
## [215] "typha latifolia"                "utricularia"                   
## [217] "utricularia gibba"              "utricularia intermedia"        
## [219] "utricularia macrorhiza"         "utricularia minor"             
## [221] "utricularia vulgaris"           "vallisneria americana"         
## [223] "verbena"                        "veronica americana"            
## [225] "wolffia"                        "wolffia borealis"              
## [227] "wolffia columbiana"             "zannichellia palustris"        
## [229] "zizania"                        "zizania palustris"             
## [231] "zosterella"
# Apply the normalisation previewed above: underscores become spaces, all lower case
set(plants, j = "TAXON", value = tolower(gsub("_", " ", plants$TAXON)))
  

# clean up  taxonomy using macroniche (see paper or git repo) taxonomy ------------------------------------------------------

Verhoeven, M. R., Glisson, W. J., & Larkin, D. J. (2020). Niche models differentiate potential impacts of two aquatic invasive plant species on native macrophytes. Diversity, 12, 162. https://doi.org/10.3390/d12040162
  
  # Read the taxonomy-correction lookup, skipping the first column of the file
  tnrs <- fread(file = "data&scripts/data/input/tnrs.final.csv", drop = 1)
  # Remove periods and lower-case the submitted names so they match plants$TAXON
  tnrs[, submittedname := tolower(gsub("\\.", "", submittedname))]

  # Number of plant rows whose TAXON has an entry in the lookup
  plants[, sum(TAXON %in% tnrs$submittedname)]
## [1] 484941
  # Preview the lookup species for every plant row (NA where TAXON has no match)
  tnrs[match(plants$TAXON, tnrs$submittedname), .(species)]
##                        species
##                         <char>
##      1: Ceratophyllum demersum
##      2:  Vallisneria americana
##      3:                   <NA>
##      4:                   <NA>
##      5:                   <NA>
##     ---                       
## 732295: Myriophyllum sibiricum
## 732296:                   <NA>
## 732297:     Heteranthera dubia
## 732298:         Lemna trisulca
## 732299: Myriophyllum sibiricum
# implement name changes --------------------------------------------------------
  
# Add in a Taxon corrected column: the species from the tnrs lookup for each
# row's TAXON, or NA where TAXON has no entry in tnrs$submittedname
  plants[, TAXONC := tnrs[["species"]][match(TAXON, tnrs$submittedname)]]
  #these are all looking good to me.
  # print.data.table has no max.levels argument (it was silently ignored and the
  # table was truncated); nrows raises the truncation limit so every corrected
  # name is shown for review.
  print(plants[, .N, TAXONC], nrows = 200)
##                      TAXONC      N
##                      <char>  <int>
##   1: Ceratophyllum demersum  91658
##   2:  Vallisneria americana  13764
##   3:                   <NA> 247358
##   4:  Potamogeton robbinsii  10131
##   5:       Nuphar variegata  10020
##  ---                              
## 152:    Littorella uniflora      3
## 153: Andromeda glaucophylla      2
## 154:  Sphagnum magellanicum      1
## 155:         Elatine minima      3
## 156:        Cicuta maculata      1
  #tidy up names
  
  
  # Review the names that received no corrected name, with row counts
  plants[is.na(TAXONC), .N, by = TAXON]
##                       TAXON      N
##                      <char>  <int>
##  1:           drepanocladus   1772
##  2:                    <NA> 140457
##  3:                   carex    680
##  4:               equisetum    294
##  5:              sagittaria   1895
##  6:   sparganium (floating)    865
##  7:              eleocharis   1128
##  8:                 nitella   1481
##  9:                   chara  51713
## 10:   sparganium (emergent)     90
## 11:            nymphaeaceae      5
## 12:    potamogeton (narrow)   7426
## 13:      protectedspecies 1    243
## 14:             utricularia   1314
## 15:                    iris     13
## 16:      eleocharis smallii     67
## 17:                   salix     58
## 18:          schoenoplectus   5094
## 19:              sparganium    721
## 20:                  elodea   2519
## 21:                   najas   7351
## 22:                 isoetes    166
## 23:              ranunculus   2091
## 24:               hypericum     20
## 25:              persicaria    174
## 26:               impatiens     14
## 27:      protectedspecies 2    884
## 28:     potamogeton (broad)     61
## 29:            myriophyllum   2078
## 30:      protectedspecies 3    277
## 31:      protectedspecies 4    114
## 32:                 poaceae     43
## 33:                   typha   1529
## 34:            typha glauca     28
## 35:               stuckenia    104
## 36:     andromeda polifolia      2
## 37:              eragrostis      4
## 38:             scutellaria      7
## 39:         alisma triviale      4
## 40:                 scirpus    639
## 41:                  juncus     45
## 42:                   lemna   1112
## 43:             potamogeton   5876
## 44:                 wolffia   1205
## 45:                 zizania    502
## 46:      protectedspecies 5    162
## 47:                 elatine      5
## 48:                  bidens     24
## 49:               characeae   1479
## 50:                nymphaea   1945
## 51:      protectedspecies 6      3
## 52:               lamiaceae      6
## 53:                sphagnum      4
## 54:                   alnus     14
## 55:                 verbena      8
## 56:                  nuphar   1100
## 57:      protectedspecies 7     14
## 58:      protectedspecies 8      9
## 59:      protectedspecies 9     58
## 60:              zosterella     20
## 61:             callitriche      2
## 62:     protectedspecies 10      4
## 63:              cyperaceae      6
## 64:                  alisma     29
## 65:           ceratophyllum      4
## 66:                  cicuta      1
## 67:     protectedspecies 11      1
## 68:   nasturtium officinale      1
## 69:     protectedspecies 12     11
## 70: bolboschoenus maritimus      2
## 71:     ricciocarpos natans      2
## 72:                  riccia     12
## 73:              eutrochium      3
## 74:             nitellopsis    259
## 75:                solidago      2
## 76:                  acorus      1
## 77:              lysimachia      2
##                       TAXON      N
  # Preview: unmatched names made of exactly two space-separated words and
  # containing no "(", shown with the first letter of the first word capitalised
  plants[
    is.na(TAXONC) &
      !str_detect(TAXON, "\\(") &
      lengths(strsplit(TAXON, " ")) == 2,
    .N,
    by = sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE)
  ]
##                         sub     N
##                      <char> <int>
##  1:      Protectedspecies 1   243
##  2:      Eleocharis smallii    67
##  3:      Protectedspecies 2   884
##  4:      Protectedspecies 3   277
##  5:      Protectedspecies 4   114
##  6:            Typha glauca    28
##  7:     Andromeda polifolia     2
##  8:         Alisma triviale     4
##  9:      Protectedspecies 5   162
## 10:      Protectedspecies 6     3
## 11:      Protectedspecies 7    14
## 12:      Protectedspecies 8     9
## 13:      Protectedspecies 9    58
## 14:     Protectedspecies 10     4
## 15:     Protectedspecies 11     1
## 16:   Nasturtium officinale     1
## 17:     Protectedspecies 12    11
## 18: Bolboschoenus maritimus     2
## 19:     Ricciocarpos natans     2
  # Capitalise the first letter of unmatched two-word names (no parentheses)
  plants[
    is.na(TAXONC) &
      !str_detect(TAXON, "\\(") &
      lengths(strsplit(TAXON, " ")) == 2,
    TAXON := sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE)
  ]

  # Wherever a corrected name exists, it replaces TAXON
  plants[!is.na(TAXONC), TAXON := TAXONC]

  # The helper column is no longer needed
  plants[, TAXONC := NULL]

  # Review the resulting set of distinct taxa
  sort(unique(plants$TAXON))
##   [1] "acorus"                         "Acorus americanus"             
##   [3] "alisma"                         "Alisma triviale"               
##   [5] "alnus"                          "Andromeda glaucophylla"        
##   [7] "Andromeda polifolia"            "Asclepias incarnata"           
##   [9] "Asteraceae"                     "Betula pumila"                 
##  [11] "bidens"                         "Bidens beckii"                 
##  [13] "Bolboschoenus fluviatilis"      "Bolboschoenus maritimus"       
##  [15] "Boltonia asteroides"            "Brasenia schreberi"            
##  [17] "Butomus umbellatus"             "Calamagrostis canadensis"      
##  [19] "Calla palustris"                "callitriche"                   
##  [21] "Caltha palustris"               "carex"                         
##  [23] "Carex aquatilis"                "Carex comosa"                  
##  [25] "Carex lacustris"                "Carex pellita"                 
##  [27] "Carex scoparia"                 "ceratophyllum"                 
##  [29] "Ceratophyllum demersum"         "Chamaedaphne calyculata"       
##  [31] "chara"                          "Chara canescens"               
##  [33] "Chara globularis"               "characeae"                     
##  [35] "cicuta"                         "Cicuta maculata"               
##  [37] "cyperaceae"                     "drepanocladus"                 
##  [39] "Drepanocladus aduncus"          "Dulichium arundinaceum"        
##  [41] "elatine"                        "Elatine minima"                
##  [43] "eleocharis"                     "Eleocharis acicularis"         
##  [45] "Eleocharis erythropoda"         "Eleocharis palustris"          
##  [47] "Eleocharis smallii"             "elodea"                        
##  [49] "Elodea canadensis"              "Elodea nuttallii"              
##  [51] "equisetum"                      "Equisetum fluviatile"          
##  [53] "eragrostis"                     "Eriocaulon aquaticum"          
##  [55] "Eupatorium dubium"              "Eupatorium maculatum"          
##  [57] "Eupatorium perfoliatum"         "eutrochium"                    
##  [59] "Fontinalis antipyretica"        "Fontinalis sullivantii"        
##  [61] "Glyceria borealis"              "Heteranthera dubia"            
##  [63] "Hippuris vulgaris"              "hypericum"                     
##  [65] "Hypericum ellipticum"           "impatiens"                     
##  [67] "Impatiens capensis"             "iris"                          
##  [69] "Iris versicolor"                "Iris virginica"                
##  [71] "isoetes"                        "Isoetes echinospora"           
##  [73] "juncus"                         "Juncus arcticus"               
##  [75] "Juncus canadensis"              "Juncus effusus"                
##  [77] "Juncus pelocarpus"              "lamiaceae"                     
##  [79] "Ledum groenlandicum"            "Leersia oryzoides"             
##  [81] "lemna"                          "Lemna minor"                   
##  [83] "Lemna trisulca"                 "Lemna turionifera"             
##  [85] "Littorella uniflora"            "Lobelia dortmanna"             
##  [87] "Lychnothamnus barbatus"         "Lycopus americanus"            
##  [89] "lysimachia"                     "Lysimachia terrestris"         
##  [91] "Lythrum salicaria"              "Menyanthes trifoliata"         
##  [93] "Myrica gale"                    "myriophyllum"                  
##  [95] "Myriophyllum alterniflorum"     "Myriophyllum exalbescens"      
##  [97] "Myriophyllum farwellii"         "Myriophyllum sibiricum"        
##  [99] "Myriophyllum spicatum"          "Myriophyllum tenellum"         
## [101] "Myriophyllum verticillatum"     "najas"                         
## [103] "Najas flexilis"                 "Najas guadalupensis"           
## [105] "Najas minor"                    "Nasturtium officinale"         
## [107] "Nelumbo lutea"                  "nitella"                       
## [109] "nitellopsis"                    "nuphar"                        
## [111] "Nuphar advena"                  "Nuphar microphylla"            
## [113] "Nuphar variegata"               "nymphaea"                      
## [115] "Nymphaea odorata"               "Nymphaea tuberosa"             
## [117] "nymphaeaceae"                   "persicaria"                    
## [119] "Persicaria amphibia"            "Persicaria lapathifolia"       
## [121] "Phalaris arundinacea"           "Phragmites australis"          
## [123] "poaceae"                        "Polygonum amphibium"           
## [125] "Pontederia cordata"             "potamogeton"                   
## [127] "potamogeton (broad)"            "potamogeton (narrow)"          
## [129] "Potamogeton alpinus"            "Potamogeton amplifolius"       
## [131] "Potamogeton crispus"            "Potamogeton epihydrus"         
## [133] "Potamogeton foliosus"           "Potamogeton friesii"           
## [135] "Potamogeton gramineus"          "Potamogeton hillii"            
## [137] "Potamogeton illinoensis"        "Potamogeton natans"            
## [139] "Potamogeton nodosus"            "Potamogeton obtusifolius"      
## [141] "Potamogeton praelongus"         "Potamogeton pusillus"          
## [143] "Potamogeton richardsonii"       "Potamogeton robbinsii"         
## [145] "Potamogeton spirillus"          "Potamogeton strictifolius"     
## [147] "Potamogeton vaseyi"             "Potamogeton zosteriformis"     
## [149] "Potentilla palustris"           "Protectedspecies 1"            
## [151] "Protectedspecies 10"            "Protectedspecies 11"           
## [153] "Protectedspecies 12"            "Protectedspecies 2"            
## [155] "Protectedspecies 3"             "Protectedspecies 4"            
## [157] "Protectedspecies 5"             "Protectedspecies 6"            
## [159] "Protectedspecies 7"             "Protectedspecies 8"            
## [161] "Protectedspecies 9"             "ranunculus"                    
## [163] "Ranunculus aquatilis"           "Ranunculus flabellaris"        
## [165] "Ranunculus flammula"            "Ranunculus longirostris"       
## [167] "riccia"                         "Riccia fluitans"               
## [169] "Ricciocarpos natans"            "Rumex orbiculatus"             
## [171] "sagittaria"                     "Sagittaria cristata"           
## [173] "Sagittaria cuneata"             "Sagittaria graminea"           
## [175] "Sagittaria latifolia"           "Sagittaria rigida"             
## [177] "salix"                          "schoenoplectus"                
## [179] "Schoenoplectus acutus"          "Schoenoplectus americanus"     
## [181] "Schoenoplectus pungens"         "Schoenoplectus subterminalis"  
## [183] "Schoenoplectus tabernaemontani" "Schoenoplectus x oblongus"     
## [185] "scirpus"                        "Scirpus atrovirens"            
## [187] "Scirpus cyperinus"              "Scirpus validus"               
## [189] "Scolochloa festucacea"          "Scorpidium scorpioides"        
## [191] "scutellaria"                    "Scutellaria lateriflora"       
## [193] "Sium suave"                     "solidago"                      
## [195] "sparganium"                     "sparganium (emergent)"         
## [197] "sparganium (floating)"          "Sparganium americanum"         
## [199] "Sparganium angustifolium"       "Sparganium emersum"            
## [201] "Sparganium eurycarpum"          "Sparganium fluctuans"          
## [203] "Sparganium natans"              "sphagnum"                      
## [205] "Sphagnum magellanicum"          "Spirodela polyrhiza"           
## [207] "stuckenia"                      "Stuckenia filiformis"          
## [209] "Stuckenia pectinata"            "Tolypella intricata"           
## [211] "Triadenum fraseri"              "typha"                         
## [213] "Typha angustifolia"             "Typha glauca"                  
## [215] "Typha latifolia"                "utricularia"                   
## [217] "Utricularia gibba"              "Utricularia intermedia"        
## [219] "Utricularia macrorhiza"         "Utricularia minor"             
## [221] "Utricularia vulgaris"           "Vallisneria americana"         
## [223] "verbena"                        "Veronica americana"            
## [225] "wolffia"                        "Wolffia borealis"              
## [227] "Wolffia columbiana"             "Zannichellia palustris"        
## [229] "zizania"                        "Zizania palustris"             
## [231] "zosterella"
  # Only one species of Nitellopsis, so the genus-level entry is given the species name
  plants[TAXON == "nitellopsis", TAXON := "Nitellopsis obtusa"]


  # Check that every row with a missing TAXON is flagged NO_VEG_FOUND:
  # tally NO_VEG_FOUND among the rows where TAXON is NA
  plants[is.na(TAXON), .N, by = NO_VEG_FOUND]
##    NO_VEG_FOUND      N
##          <lgcl>  <int>
## 1:         TRUE 140457
  # #now we need to ensure we retain rows where no useable taxa were found
  # plants[ NO_VEG_FOUND == TRUE, , ]
  #
  # #now delete all rows where TAXON == DELETE & NO_VEG_FOUND == F
  # # plants[ TAXON == "DELETE" & NO_VEG_FOUND == F, ]
  # plants <-   plants[ !(TAXON == "DELETE" & NO_VEG_FOUND == F), ]
  
  # Rows per taxon, most frequent first
  plants[, .N, by = TAXON][order(-N)]
##                          TAXON      N
##                         <char>  <int>
##   1:                      <NA> 140457
##   2:    Ceratophyllum demersum  91658
##   3:                     chara  51713
##   4:       Potamogeton crispus  36032
##   5: Potamogeton zosteriformis  35772
##  ---                                 
## 228:    Scorpidium scorpioides      1
## 229:    Fontinalis sullivantii      1
## 230:                    acorus      1
## 231:     Sphagnum magellanicum      1
## 232:           Cicuta maculata      1
  #and finally a new unique ID for each observation in the dataset
  plants[, OBS_ID := .I]
  
  #update SURVEY_ID
  # List every SURVEY_ID that occurs under more than one DATASOURCE. The threshold
  # is > 1 (it was > 2, which let IDs shared by exactly two sources go unnoticed).
  plants[ , length(unique(DATASOURCE)) , .(SURVEY_ID)][V1 > 1] #not needed unless this call is non-empty
## Empty data.table (0 rows and 2 cols): SURVEY_ID,V1
  # plants[ , SURVEY_ID_NEW:= .GRP , .(SURVEY_ID, DATASOURCE) ]

# plant status information ------------------------------------------------

Plant Status Key

  # Native diversity must exclude invasive/introduced species, so generate a data frame that can be used to select columns in these categories
 
  
  # Taxa present in the dataset that are not on the exclusion list (rte names with
  # native_status "I", plus Nitellopsis obtusa and Typha glauca).
  # The filter is built from the same sorted vector it is applied to. Previously
  # the mask came from the unsorted unique(TAXON), which also contains NA, so it
  # was misaligned with the sorted names and dropped or kept the wrong taxa.
  local({
    taxa <- sort(unique(plants$TAXON))
    excluded <- c(rte[native_status == "I", mn_dnr_scientific_name],
                  "Nitellopsis obtusa", "Typha glauca")
    taxa[!(taxa %in% excluded)]
  })
##   [1] "acorus"                         "Acorus americanus"             
##   [3] "alisma"                         "Alisma triviale"               
##   [5] "alnus"                          "Andromeda glaucophylla"        
##   [7] "Andromeda polifolia"            "Asclepias incarnata"           
##   [9] "Asteraceae"                     "Betula pumila"                 
##  [11] "bidens"                         "Bidens beckii"                 
##  [13] "Bolboschoenus maritimus"        "Boltonia asteroides"           
##  [15] "Brasenia schreberi"             "Butomus umbellatus"            
##  [17] "Calamagrostis canadensis"       "Calla palustris"               
##  [19] "callitriche"                    "Caltha palustris"              
##  [21] "carex"                          "Carex aquatilis"               
##  [23] "Carex comosa"                   "Carex lacustris"               
##  [25] "Carex pellita"                  "Carex scoparia"                
##  [27] "ceratophyllum"                  "Ceratophyllum demersum"        
##  [29] "Chamaedaphne calyculata"        "chara"                         
##  [31] "Chara canescens"                "Chara globularis"              
##  [33] "characeae"                      "cicuta"                        
##  [35] "Cicuta maculata"                "cyperaceae"                    
##  [37] "drepanocladus"                  "Drepanocladus aduncus"         
##  [39] "Dulichium arundinaceum"         "elatine"                       
##  [41] "Elatine minima"                 "eleocharis"                    
##  [43] "Eleocharis acicularis"          "Eleocharis erythropoda"        
##  [45] "Eleocharis palustris"           "Eleocharis smallii"            
##  [47] "elodea"                         "Elodea canadensis"             
##  [49] "Elodea nuttallii"               "equisetum"                     
##  [51] "Equisetum fluviatile"           "eragrostis"                    
##  [53] "Eriocaulon aquaticum"           "Eupatorium dubium"             
##  [55] "Eupatorium maculatum"           "Eupatorium perfoliatum"        
##  [57] "eutrochium"                     "Fontinalis sullivantii"        
##  [59] "Glyceria borealis"              "Heteranthera dubia"            
##  [61] "Hippuris vulgaris"              "hypericum"                     
##  [63] "Hypericum ellipticum"           "impatiens"                     
##  [65] "Impatiens capensis"             "iris"                          
##  [67] "Iris versicolor"                "Iris virginica"                
##  [69] "isoetes"                        "Isoetes echinospora"           
##  [71] "juncus"                         "Juncus arcticus"               
##  [73] "Juncus canadensis"              "Juncus effusus"                
##  [75] "Juncus pelocarpus"              "lamiaceae"                     
##  [77] "Ledum groenlandicum"            "Leersia oryzoides"             
##  [79] "lemna"                          "Lemna minor"                   
##  [81] "Lemna trisulca"                 "Lemna turionifera"             
##  [83] "Littorella uniflora"            "Lobelia dortmanna"             
##  [85] "Lychnothamnus barbatus"         "lysimachia"                    
##  [87] "Lysimachia terrestris"          "Lythrum salicaria"             
##  [89] "Menyanthes trifoliata"          "Myrica gale"                   
##  [91] "myriophyllum"                   "Myriophyllum exalbescens"      
##  [93] "Myriophyllum farwellii"         "Myriophyllum sibiricum"        
##  [95] "Myriophyllum spicatum"          "Myriophyllum tenellum"         
##  [97] "najas"                          "Najas flexilis"                
##  [99] "Najas guadalupensis"            "Najas minor"                   
## [101] "Nasturtium officinale"          "Nelumbo lutea"                 
## [103] "nitella"                        "Nitellopsis obtusa"            
## [105] "nuphar"                         "Nuphar advena"                 
## [107] "Nuphar microphylla"             "Nuphar variegata"              
## [109] "nymphaea"                       "Nymphaea odorata"              
## [111] "Nymphaea tuberosa"              "nymphaeaceae"                  
## [113] "persicaria"                     "Persicaria amphibia"           
## [115] "Persicaria lapathifolia"        "Phalaris arundinacea"          
## [117] "Phragmites australis"           "poaceae"                       
## [119] "Polygonum amphibium"            "Pontederia cordata"            
## [121] "potamogeton"                    "potamogeton (broad)"           
## [123] "potamogeton (narrow)"           "Potamogeton alpinus"           
## [125] "Potamogeton amplifolius"        "Potamogeton crispus"           
## [127] "Potamogeton epihydrus"          "Potamogeton foliosus"          
## [129] "Potamogeton friesii"            "Potamogeton gramineus"         
## [131] "Potamogeton hillii"             "Potamogeton illinoensis"       
## [133] "Potamogeton natans"             "Potamogeton nodosus"           
## [135] "Potamogeton obtusifolius"       "Potamogeton praelongus"        
## [137] "Potamogeton pusillus"           "Potamogeton richardsonii"      
## [139] "Potamogeton robbinsii"          "Potamogeton spirillus"         
## [141] "Potamogeton strictifolius"      "Potamogeton vaseyi"            
## [143] "Potamogeton zosteriformis"      "Protectedspecies 1"            
## [145] "Protectedspecies 10"            "Protectedspecies 11"           
## [147] "Protectedspecies 12"            "Protectedspecies 2"            
## [149] "Protectedspecies 4"             "Protectedspecies 5"            
## [151] "Protectedspecies 6"             "Protectedspecies 7"            
## [153] "Protectedspecies 8"             "Protectedspecies 9"            
## [155] "ranunculus"                     "Ranunculus aquatilis"          
## [157] "Ranunculus flabellaris"         "Ranunculus flammula"           
## [159] "Ranunculus longirostris"        "riccia"                        
## [161] "Riccia fluitans"                "Ricciocarpos natans"           
## [163] "Rumex orbiculatus"              "sagittaria"                    
## [165] "Sagittaria cristata"            "Sagittaria cuneata"            
## [167] "Sagittaria graminea"            "Sagittaria latifolia"          
## [169] "Sagittaria rigida"              "salix"                         
## [171] "schoenoplectus"                 "Schoenoplectus acutus"         
## [173] "Schoenoplectus americanus"      "Schoenoplectus pungens"        
## [175] "Schoenoplectus subterminalis"   "Schoenoplectus tabernaemontani"
## [177] "Schoenoplectus x oblongus"      "scirpus"                       
## [179] "Scirpus atrovirens"             "Scirpus cyperinus"             
## [181] "Scirpus validus"                "Scolochloa festucacea"         
## [183] "Scorpidium scorpioides"         "Scutellaria lateriflora"       
## [185] "Sium suave"                     "solidago"                      
## [187] "sparganium"                     "sparganium (emergent)"         
## [189] "sparganium (floating)"          "Sparganium americanum"         
## [191] "Sparganium angustifolium"       "Sparganium emersum"            
## [193] "Sparganium eurycarpum"          "Sparganium fluctuans"          
## [195] "Sparganium natans"              "sphagnum"                      
## [197] "Sphagnum magellanicum"          "Spirodela polyrhiza"           
## [199] "stuckenia"                      "Stuckenia filiformis"          
## [201] "Stuckenia pectinata"            "Tolypella intricata"           
## [203] "typha"                          "Typha angustifolia"            
## [205] "Typha glauca"                   "Typha latifolia"               
## [207] "utricularia"                    "Utricularia intermedia"        
## [209] "Utricularia macrorhiza"         "Utricularia minor"             
## [211] "Utricularia vulgaris"           "Vallisneria americana"         
## [213] "verbena"                        "Veronica americana"            
## [215] "wolffia"                        "Wolffia borealis"              
## [217] "Wolffia columbiana"             "Zannichellia palustris"        
## [219] "zizania"                        "Zizania palustris"             
## [221] "zosterella"                     NA
  # Column selectors used when calculating diversity metrics:
  #   taxacols - every non-missing taxon name observed in `plants`
  #   natcols  - the same names minus taxa flagged "I" in `rte` and two taxa
  #              excluded by name
  taxacols <- plants[!is.na(TAXON), unique(TAXON)]
  natcols <- taxacols[!(taxacols %in% c(rte[native_status == "I", mn_dnr_scientific_name],
                                        "Nitellopsis obtusa", "Typha glauca"))]

 

# Prep Data Products ------------------------------------------------------


# **plants_db -------------------------------------------------------------

Prep Data Products

Observations Long

  # List the columns currently held in the long-format observation table
  colnames(plants)
##  [1] "DOW"                  "Secchi_m"             "SECCHI_DATE"         
##  [4] "SURVEY_ID"            "LAKE_NAME"            "DATASOURCE"          
##  [7] "SURVEY_DATE"          "STA_NBR_DATASOURCE"   "DEPTH_FT"            
## [10] "NO_VEG_FOUND"         "REL_ABUND"            "WHOLE_RAKE_REL_ABUND"
## [13] "SUBSTRATE"            "SURVEYOR"             "TAXON"               
## [16] "SURVEY_ID_DATASOURCE" "SAMPLE_NOTES"         "SURFACE_GROWTH"      
## [19] "POINT_LVL_SECCHI"     "POINT_ID"             "OBS_ID"              
## [22] "OLD_SURVEY_ID"        "DATESURVEYSTART"      "COHORT"              
## [25] "DATEINFO"             "MONTH"                "DAY"                 
## [28] "YEAR"                 "SUBBASIN"             "INVENTORY_STAFF"     
## [31] "INVENTORY_STAFFDATE"  "USEABLE"              "CLEANED"             
## [34] "INDATABASE"           "INVENTORY_NOTES"      "SUBMISSION_STAFF"    
## [37] "SUBMISSION_STAFFDATE" "SUBMISSION_NOTES"     "MULTIPARTSURVEY"     
## [40] "SURVEY_FEEDBACK"      "SURVEY_DATASOURCE"    "RAKE_SCALE_USED"     
## [43] "REL_ABUND_CORRECTED"  "order_ID"             "Longitude"           
## [46] "Latitude"             "SECCHI_m_ACCEPTED"    "proplight"
  # Number of observation rows per calendar month of the survey date
  plants[, .N, by = .(month = month(SURVEY_DATE))]
##    month      N
##    <int>  <int>
## 1:     8 224560
## 2:     7 237049
## 3:     6 187167
## 4:     9  27043
## 5:     5  43651
## 6:    10   1641
## 7:     4  10214
## 8:     3    974
  # Peek at the first 100 distinct raw DATESURVEYSTART strings
  unique(plants[["DATESURVEYSTART"]])[1:100]
##   [1] "8/16/2011" "8/10/2011" "7/17/2014" "6/25/2014" "7/15/2009" "7/29/2011"
##   [7] "7/21/2011" "6/28/2011" "8/3/2011"  "6/29/2010" "7/1/2009"  "6/10/2002"
##  [13] "7/12/2006" "6/30/2009" "6/18/2014" "7/31/2009" "7/10/2009" "7/11/2012"
##  [19] "6/18/2012" "6/8/2011"  "6/29/2011" "8/5/2011"  "8/20/2012" "8/26/2010"
##  [25] "6/16/2011" "8/6/2009"  "7/7/2014"  "6/16/2005" "6/3/2009"  "6/21/2012"
##  [31] "6/21/2007" "6/12/2007" "7/9/2010"  "7/23/2012" "8/17/2011" "7/3/2014" 
##  [37] "6/23/2011" "6/15/2007" "7/31/2014" "8/3/2009"  "7/2/2010"  "6/28/2010"
##  [43] "6/17/2013" "7/7/2003"  "6/6/2006"  "7/19/2012" "8/15/2012" "7/12/2012"
##  [49] "8/16/2012" "7/15/2003" "8/6/2013"  "6/13/2006" "8/18/2005" "8/18/2014"
##  [55] "6/14/2006" "8/22/2002" "7/1/2005"  "7/23/2008" "7/22/2011" "7/31/2003"
##  [61] "7/22/2003" "6/29/2004" "6/12/2012" "7/24/2012" "7/23/2003" "7/25/2003"
##  [67] "9/23/2014" "8/21/2002" "9/22/2014" "7/29/2013" "8/1/2013"  "7/25/2013"
##  [73] "6/20/2011" "8/11/2011" "7/20/2004" "7/30/2008" "7/21/2010" "8/15/2003"
##  [79] "8/12/2003" "9/9/2003"  "6/24/2003" "6/19/2013" "9/17/2002" "6/24/2013"
##  [85] "7/29/2003" "8/19/2003" "9/18/2002" "6/25/2003" "8/6/2012"  "8/14/2003"
##  [91] "8/2/2012"  "9/19/2003" "8/8/2011"  "8/15/2011" "9/24/2014" "8/7/2012" 
##  [97] "7/9/2013"  "6/16/2003" "7/28/2011" "7/28/2010"
  # Inspect the records for one point: one of these has a substrate, one not
  plants[POINT_ID == 151207]
##         DOW Secchi_m SECCHI_DATE SURVEY_ID   LAKE_NAME DATASOURCE SURVEY_DATE
##       <int>    <num>      <IDat>     <int>      <char>     <char>      <IDat>
## 1: 29014600   4.8768  2016-08-22      1440 belle taine   source_2  2016-08-22
## 2: 29014600   4.8768  2016-08-22      1440 belle taine   source_2  2016-08-22
##    STA_NBR_DATASOURCE DEPTH_FT NO_VEG_FOUND REL_ABUND WHOLE_RAKE_REL_ABUND
##                <char>    <num>       <lgcl>     <int>               <char>
## 1:                 27      3.5         TRUE        NA                     
## 2:                 27      3.5         TRUE        NA                     
##    SUBSTRATE    SURVEYOR  TAXON SURVEY_ID_DATASOURCE SAMPLE_NOTES
##       <char>      <char> <char>               <char>       <char>
## 1:           surveyors_6   <NA>    11981608342433000             
## 2:      sand surveyors_6   <NA>    11981608342433000             
##    SURFACE_GROWTH POINT_LVL_SECCHI POINT_ID OBS_ID OLD_SURVEY_ID
##            <char>            <num>    <int>  <int>         <int>
## 1:                              NA   151207 577513          1440
## 2:                              NA   151207 577514          1440
##    DATESURVEYSTART COHORT DATEINFO MONTH   DAY  YEAR SUBBASIN INVENTORY_STAFF
##             <char>  <int>   <char> <int> <int> <int>   <char>          <char>
## 1:       8/22/2016     NA              8    22  2016                         
## 2:       8/22/2016     NA              8    22  2016                         
##    INVENTORY_STAFFDATE USEABLE CLEANED INDATABASE INVENTORY_NOTES
##                 <char>  <char>  <char>     <lgcl>          <char>
## 1:                                           TRUE                
## 2:                                           TRUE                
##    SUBMISSION_STAFF SUBMISSION_STAFFDATE SUBMISSION_NOTES MULTIPARTSURVEY
##              <char>               <char>           <char>           <num>
## 1:          staff_1                                                    NA
## 2:          staff_1                                                    NA
##    SURVEY_FEEDBACK SURVEY_DATASOURCE RAKE_SCALE_USED REL_ABUND_CORRECTED
##             <char>            <char>           <int>               <int>
## 1:            <NA>                                NA                  NA
## 2:            <NA>                                NA                  NA
##    order_ID Longitude Latitude SECCHI_m_ACCEPTED proplight
##       <int>     <num>    <num>             <num>     <num>
## 1:    18230 -94.91323 46.93566            4.8768 0.6042556
## 2:    18230 -94.91323 46.93566            4.8768 0.6042556
  # Are any rows identical once SUBSTRATE and OBS_ID are ignored?
  anyDuplicated(plants, by = setdiff(names(plants), c("SUBSTRATE", "OBS_ID"))) > 0
## [1] TRUE
  # Show the rows that repeat an earlier row in every column except SUBSTRATE and OBS_ID
  plants[duplicated(plants, by = setdiff(names(plants), c("SUBSTRATE", "OBS_ID")))]
##         DOW Secchi_m SECCHI_DATE SURVEY_ID   LAKE_NAME DATASOURCE SURVEY_DATE
##       <int>    <num>      <IDat>     <int>      <char>     <char>      <IDat>
## 1: 29014600   4.8768  2016-08-22      1440 belle taine   source_2  2016-08-22
##    STA_NBR_DATASOURCE DEPTH_FT NO_VEG_FOUND REL_ABUND WHOLE_RAKE_REL_ABUND
##                <char>    <num>       <lgcl>     <int>               <char>
## 1:                 27      3.5         TRUE        NA                     
##    SUBSTRATE    SURVEYOR  TAXON SURVEY_ID_DATASOURCE SAMPLE_NOTES
##       <char>      <char> <char>               <char>       <char>
## 1:      sand surveyors_6   <NA>    11981608342433000             
##    SURFACE_GROWTH POINT_LVL_SECCHI POINT_ID OBS_ID OLD_SURVEY_ID
##            <char>            <num>    <int>  <int>         <int>
## 1:                              NA   151207 577514          1440
##    DATESURVEYSTART COHORT DATEINFO MONTH   DAY  YEAR SUBBASIN INVENTORY_STAFF
##             <char>  <int>   <char> <int> <int> <int>   <char>          <char>
## 1:       8/22/2016     NA              8    22  2016                         
##    INVENTORY_STAFFDATE USEABLE CLEANED INDATABASE INVENTORY_NOTES
##                 <char>  <char>  <char>     <lgcl>          <char>
## 1:                                           TRUE                
##    SUBMISSION_STAFF SUBMISSION_STAFFDATE SUBMISSION_NOTES MULTIPARTSURVEY
##              <char>               <char>           <char>           <num>
## 1:          staff_1                                                    NA
##    SURVEY_FEEDBACK SURVEY_DATASOURCE RAKE_SCALE_USED REL_ABUND_CORRECTED
##             <char>            <char>           <int>               <int>
## 1:            <NA>                                NA                  NA
##    order_ID Longitude Latitude SECCHI_m_ACCEPTED proplight
##       <int>     <num>    <num>             <num>     <num>
## 1:    18230 -94.91323 46.93566            4.8768 0.6042556
  # Drop records that are identical in every column except SUBSTRATE and OBS_ID.
  # Within each set of such duplicates, a row with a recorded substrate is kept in
  # preference to one where SUBSTRATE is blank/NA, so the substrate information is
  # not thrown away. The retained records stay in their original row order.
  plants <- local({
    dedupe_cols <- setdiff(names(plants), c("SUBSTRATE", "OBS_ID"))
    substrate_blank <- is.na(plants[["SUBSTRATE"]]) | plants[["SUBSTRATE"]] == ""
    # order() leaves ties in their original order, so rows with a substrate simply
    # move ahead of blank ones without otherwise reshuffling the table
    priority <- order(substrate_blank)
    keep <- priority[!duplicated(plants[priority], by = dedupe_cols)]
    plants[sort(keep)]
  })
  
  # Tally observation rows with (FALSE) and without (TRUE) an order_ID
  plants[, .N, by = .(is.na = is.na(order_ID))]
##     is.na      N
##    <lgcl>  <int>
## 1:  FALSE 728695
## 2:   TRUE   3603
  # Row counts per waterbody for records lacking an order_ID.
  # I've not been able to resolve the location on these WBs, so we'll leave them un-georeferenced
  plants[is.na(order_ID), .N, by = LAKE_NAME]
##                                     LAKE_NAME     N
##                                        <char> <int>
##  1:                                    pool 2    97
##  2:                               reynen pond    46
##  3:                        unnamed delong wpa    22
##  4:                                   big sob    64
##  5:                    ivanhoe wma east basin    26
##  6:                                olson pool   104
##  7: goldmine slough section - vermilion river   215
##  8: vermilion falls section - vermilion river    88
##  9:                                gull river   335
## 10:                         mississippi river  1769
## 11:                            little elk wma    98
## 12:                                   unnamed    20
## 13:                          sand prairie wma    75
## 14:                    loerch wma impoundment    31
## 15:             daggett brook wma impoundment   107
## 16:                               sterle pool   175
## 17:                              trettle pool   323
## 18:                              dundee marsh     8
  # Column inventory of the long-format table after de-duplication
  colnames(plants)
##  [1] "DOW"                  "Secchi_m"             "SECCHI_DATE"         
##  [4] "SURVEY_ID"            "LAKE_NAME"            "DATASOURCE"          
##  [7] "SURVEY_DATE"          "STA_NBR_DATASOURCE"   "DEPTH_FT"            
## [10] "NO_VEG_FOUND"         "REL_ABUND"            "WHOLE_RAKE_REL_ABUND"
## [13] "SUBSTRATE"            "SURVEYOR"             "TAXON"               
## [16] "SURVEY_ID_DATASOURCE" "SAMPLE_NOTES"         "SURFACE_GROWTH"      
## [19] "POINT_LVL_SECCHI"     "POINT_ID"             "OBS_ID"              
## [22] "OLD_SURVEY_ID"        "DATESURVEYSTART"      "COHORT"              
## [25] "DATEINFO"             "MONTH"                "DAY"                 
## [28] "YEAR"                 "SUBBASIN"             "INVENTORY_STAFF"     
## [31] "INVENTORY_STAFFDATE"  "USEABLE"              "CLEANED"             
## [34] "INDATABASE"           "INVENTORY_NOTES"      "SUBMISSION_STAFF"    
## [37] "SUBMISSION_STAFFDATE" "SUBMISSION_NOTES"     "MULTIPARTSURVEY"     
## [40] "SURVEY_FEEDBACK"      "SURVEY_DATASOURCE"    "RAKE_SCALE_USED"     
## [43] "REL_ABUND_CORRECTED"  "order_ID"             "Longitude"           
## [46] "Latitude"             "SECCHI_m_ACCEPTED"    "proplight"
# **point level p/a  -------------------------------------------------

Point Occurrences Wide

  # Row counts for each REL_ABUND value (including NA)
  plants[, .N, by = REL_ABUND]
##    REL_ABUND      N
##        <int>  <int>
## 1:        NA 535525
## 2:         1 116421
## 3:         2  39231
## 4:         3  21579
## 5:         4  13274
## 6:         5   6268
  # Row counts for each INDATABASE value; this column is used as the fill value below
  plants[, .N, by = INDATABASE]
##    INDATABASE      N
##        <lgcl>  <int>
## 1:       TRUE 732298
  # In case of desired fill 1/0 rather than T/F
  # plants[ , FILLFIELD := as.numeric(INDATABASE) ,]
  # plants[ , .N , FILLFIELD ]
  
  # Cast the long observations to one row per survey point with one column per TAXON.
  # A logical variable that is set for all included data (INDATABASE) is used as the
  # value so that the species matrix is all T/F; combinations with no record are
  # filled with FALSE. See the commented lines just above for a 0/1 fill instead.
  plants_occurrence_wide <- dcast(
    data = plants,
    formula = SURVEY_ID + POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT +
      SUBSTRATE + SURVEYOR ~ TAXON,
    fun.aggregate = last,
    value.var = "INDATABASE",
    fill = FALSE
  )
  
  # Diversity metrics: with presence/absence data only richness is available
  # (no "evenness", no "diversity").
  # point_natcols <- names(plants_occurrence_wide)[names(plants_occurrence_wide)%in%natcols]
  # names(plants_occurrence_wide)
  # richness     = number of taxa columns marked present at the point
  # nat_richness = the same count restricted to the columns named in natcols
  plants_occurrence_wide[, richness := rowSums(as.matrix(.SD) > 0), .SDcols = taxacols]
  plants_occurrence_wide[, nat_richness := rowSums(as.matrix(.SD) > 0), .SDcols = natcols]
  
  # Bring all survey-level variables back into the wide dataset by joining the long
  # table onto it using the same columns that formed the rows of the cast. Where
  # several long rows match a wide row, only the last match is taken.
  # Join checks:
  # nrow(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
  # names(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
  #
  plants_occurrence_wide <- plants[
    plants_occurrence_wide,
    on = c("POINT_ID", "SURVEY_ID", "NO_VEG_FOUND", "proplight",
           "DEPTH_FT", "SUBSTRATE", "SURVEYOR"),
    mult = "last"
  ]
  

  # Next: drop unneeded cols & those with loss of meaning through munging.
  # Start by listing everything the joined wide table currently holds.
  colnames(plants_occurrence_wide)
##   [1] "DOW"                            "Secchi_m"                      
##   [3] "SECCHI_DATE"                    "SURVEY_ID"                     
##   [5] "LAKE_NAME"                      "DATASOURCE"                    
##   [7] "SURVEY_DATE"                    "STA_NBR_DATASOURCE"            
##   [9] "DEPTH_FT"                       "NO_VEG_FOUND"                  
##  [11] "REL_ABUND"                      "WHOLE_RAKE_REL_ABUND"          
##  [13] "SUBSTRATE"                      "SURVEYOR"                      
##  [15] "TAXON"                          "SURVEY_ID_DATASOURCE"          
##  [17] "SAMPLE_NOTES"                   "SURFACE_GROWTH"                
##  [19] "POINT_LVL_SECCHI"               "POINT_ID"                      
##  [21] "OBS_ID"                         "OLD_SURVEY_ID"                 
##  [23] "DATESURVEYSTART"                "COHORT"                        
##  [25] "DATEINFO"                       "MONTH"                         
##  [27] "DAY"                            "YEAR"                          
##  [29] "SUBBASIN"                       "INVENTORY_STAFF"               
##  [31] "INVENTORY_STAFFDATE"            "USEABLE"                       
##  [33] "CLEANED"                        "INDATABASE"                    
##  [35] "INVENTORY_NOTES"                "SUBMISSION_STAFF"              
##  [37] "SUBMISSION_STAFFDATE"           "SUBMISSION_NOTES"              
##  [39] "MULTIPARTSURVEY"                "SURVEY_FEEDBACK"               
##  [41] "SURVEY_DATASOURCE"              "RAKE_SCALE_USED"               
##  [43] "REL_ABUND_CORRECTED"            "order_ID"                      
##  [45] "Longitude"                      "Latitude"                      
##  [47] "SECCHI_m_ACCEPTED"              "proplight"                     
##  [49] "NA"                             "Acorus americanus"             
##  [51] "Alisma triviale"                "Andromeda glaucophylla"        
##  [53] "Andromeda polifolia"            "Asclepias incarnata"           
##  [55] "Asteraceae"                     "Betula pumila"                 
##  [57] "Bidens beckii"                  "Bolboschoenus fluviatilis"     
##  [59] "Bolboschoenus maritimus"        "Boltonia asteroides"           
##  [61] "Brasenia schreberi"             "Butomus umbellatus"            
##  [63] "Calamagrostis canadensis"       "Calla palustris"               
##  [65] "Caltha palustris"               "Carex aquatilis"               
##  [67] "Carex comosa"                   "Carex lacustris"               
##  [69] "Carex pellita"                  "Carex scoparia"                
##  [71] "Ceratophyllum demersum"         "Chamaedaphne calyculata"       
##  [73] "Chara canescens"                "Chara globularis"              
##  [75] "Cicuta maculata"                "Drepanocladus aduncus"         
##  [77] "Dulichium arundinaceum"         "Elatine minima"                
##  [79] "Eleocharis acicularis"          "Eleocharis erythropoda"        
##  [81] "Eleocharis palustris"           "Eleocharis smallii"            
##  [83] "Elodea canadensis"              "Elodea nuttallii"              
##  [85] "Equisetum fluviatile"           "Eriocaulon aquaticum"          
##  [87] "Eupatorium dubium"              "Eupatorium maculatum"          
##  [89] "Eupatorium perfoliatum"         "Fontinalis antipyretica"       
##  [91] "Fontinalis sullivantii"         "Glyceria borealis"             
##  [93] "Heteranthera dubia"             "Hippuris vulgaris"             
##  [95] "Hypericum ellipticum"           "Impatiens capensis"            
##  [97] "Iris versicolor"                "Iris virginica"                
##  [99] "Isoetes echinospora"            "Juncus arcticus"               
## [101] "Juncus canadensis"              "Juncus effusus"                
## [103] "Juncus pelocarpus"              "Ledum groenlandicum"           
## [105] "Leersia oryzoides"              "Lemna minor"                   
## [107] "Lemna trisulca"                 "Lemna turionifera"             
## [109] "Littorella uniflora"            "Lobelia dortmanna"             
## [111] "Lychnothamnus barbatus"         "Lycopus americanus"            
## [113] "Lysimachia terrestris"          "Lythrum salicaria"             
## [115] "Menyanthes trifoliata"          "Myrica gale"                   
## [117] "Myriophyllum alterniflorum"     "Myriophyllum exalbescens"      
## [119] "Myriophyllum farwellii"         "Myriophyllum sibiricum"        
## [121] "Myriophyllum spicatum"          "Myriophyllum tenellum"         
## [123] "Myriophyllum verticillatum"     "Najas flexilis"                
## [125] "Najas guadalupensis"            "Najas minor"                   
## [127] "Nasturtium officinale"          "Nelumbo lutea"                 
## [129] "Nitellopsis obtusa"             "Nuphar advena"                 
## [131] "Nuphar microphylla"             "Nuphar variegata"              
## [133] "Nymphaea odorata"               "Nymphaea tuberosa"             
## [135] "Persicaria amphibia"            "Persicaria lapathifolia"       
## [137] "Phalaris arundinacea"           "Phragmites australis"          
## [139] "Polygonum amphibium"            "Pontederia cordata"            
## [141] "Potamogeton alpinus"            "Potamogeton amplifolius"       
## [143] "Potamogeton crispus"            "Potamogeton epihydrus"         
## [145] "Potamogeton foliosus"           "Potamogeton friesii"           
## [147] "Potamogeton gramineus"          "Potamogeton hillii"            
## [149] "Potamogeton illinoensis"        "Potamogeton natans"            
## [151] "Potamogeton nodosus"            "Potamogeton obtusifolius"      
## [153] "Potamogeton praelongus"         "Potamogeton pusillus"          
## [155] "Potamogeton richardsonii"       "Potamogeton robbinsii"         
## [157] "Potamogeton spirillus"          "Potamogeton strictifolius"     
## [159] "Potamogeton vaseyi"             "Potamogeton zosteriformis"     
## [161] "Potentilla palustris"           "Protectedspecies 1"            
## [163] "Protectedspecies 10"            "Protectedspecies 11"           
## [165] "Protectedspecies 12"            "Protectedspecies 2"            
## [167] "Protectedspecies 3"             "Protectedspecies 4"            
## [169] "Protectedspecies 5"             "Protectedspecies 6"            
## [171] "Protectedspecies 7"             "Protectedspecies 8"            
## [173] "Protectedspecies 9"             "Ranunculus aquatilis"          
## [175] "Ranunculus flabellaris"         "Ranunculus flammula"           
## [177] "Ranunculus longirostris"        "Riccia fluitans"               
## [179] "Ricciocarpos natans"            "Rumex orbiculatus"             
## [181] "Sagittaria cristata"            "Sagittaria cuneata"            
## [183] "Sagittaria graminea"            "Sagittaria latifolia"          
## [185] "Sagittaria rigida"              "Schoenoplectus acutus"         
## [187] "Schoenoplectus americanus"      "Schoenoplectus pungens"        
## [189] "Schoenoplectus subterminalis"   "Schoenoplectus tabernaemontani"
## [191] "Schoenoplectus x oblongus"      "Scirpus atrovirens"            
## [193] "Scirpus cyperinus"              "Scirpus validus"               
## [195] "Scolochloa festucacea"          "Scorpidium scorpioides"        
## [197] "Scutellaria lateriflora"        "Sium suave"                    
## [199] "Sparganium americanum"          "Sparganium angustifolium"      
## [201] "Sparganium emersum"             "Sparganium eurycarpum"         
## [203] "Sparganium fluctuans"           "Sparganium natans"             
## [205] "Sphagnum magellanicum"          "Spirodela polyrhiza"           
## [207] "Stuckenia filiformis"           "Stuckenia pectinata"           
## [209] "Tolypella intricata"            "Triadenum fraseri"             
## [211] "Typha angustifolia"             "Typha glauca"                  
## [213] "Typha latifolia"                "Utricularia gibba"             
## [215] "Utricularia intermedia"         "Utricularia macrorhiza"        
## [217] "Utricularia minor"              "Utricularia vulgaris"          
## [219] "Vallisneria americana"          "Veronica americana"            
## [221] "Wolffia borealis"               "Wolffia columbiana"            
## [223] "Zannichellia palustris"         "Zizania palustris"             
## [225] "acorus"                         "alisma"                        
## [227] "alnus"                          "bidens"                        
## [229] "callitriche"                    "carex"                         
## [231] "ceratophyllum"                  "chara"                         
## [233] "characeae"                      "cicuta"                        
## [235] "cyperaceae"                     "drepanocladus"                 
## [237] "elatine"                        "eleocharis"                    
## [239] "elodea"                         "equisetum"                     
## [241] "eragrostis"                     "eutrochium"                    
## [243] "hypericum"                      "impatiens"                     
## [245] "iris"                           "isoetes"                       
## [247] "juncus"                         "lamiaceae"                     
## [249] "lemna"                          "lysimachia"                    
## [251] "myriophyllum"                   "najas"                         
## [253] "nitella"                        "nuphar"                        
## [255] "nymphaea"                       "nymphaeaceae"                  
## [257] "persicaria"                     "poaceae"                       
## [259] "potamogeton"                    "potamogeton (broad)"           
## [261] "potamogeton (narrow)"           "ranunculus"                    
## [263] "riccia"                         "sagittaria"                    
## [265] "salix"                          "schoenoplectus"                
## [267] "scirpus"                        "scutellaria"                   
## [269] "solidago"                       "sparganium"                    
## [271] "sparganium (emergent)"          "sparganium (floating)"         
## [273] "sphagnum"                       "stuckenia"                     
## [275] "typha"                          "utricularia"                   
## [277] "verbena"                        "wolffia"                       
## [279] "zizania"                        "zosterella"                    
## [281] "richness"                       "nat_richness"
  # Remove (by reference) the columns not wanted in the point-level product
  plants_occurrence_wide[, c(
    # source-specific identifiers
    "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "OLD_SURVEY_ID", "DATASOURCE",
    # observation-level fields from the long table
    "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "RAKE_SCALE_USED",
    "TAXON", "NA", "OBS_ID", "SUBSTRATE", "NO_VEG_FOUND",
    "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI",
    # raw date fields
    "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART",
    # inventory / submission bookkeeping
    "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES",
    "USEABLE", "CLEANED", "INDATABASE",
    "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES",
    "SURVEY_FEEDBACK"
  ) := NULL]

  # Move the lake, survey, Secchi and point descriptors to the front, in this order;
  # the remaining columns follow in their existing order
  setcolorder(
    plants_occurrence_wide,
    neworder = c(
      "DOW", "LAKE_NAME", "order_ID", "SUBBASIN",
      "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR",
      "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED",
      "POINT_ID", "DEPTH_FT", "proplight", "Longitude", "Latitude"
    )
  )
  

  #check to make sure I didn't dump something critical for an ident:
  plants_occurrence_wide[duplicated(plants_occurrence_wide)]
## Empty data.table (0 rows and 250 cols): DOW,LAKE_NAME,order_ID,SUBBASIN,SURVEY_ID,SURVEY_DATASOURCE...
# **point level rake abund --------------------------------------------------

Point Abundances Wide

  # Spread the long-format observations that have a rake scale recorded into a
  # wide table: one row per unique combination of the left-hand-side fields,
  # one column per TAXON holding REL_ABUND_CORRECTED. When several records fall
  # in the same cell the last one is kept; absent taxa are filled with 0.
  plants_rakeabund_wide <- dcast(
    plants[!is.na(RAKE_SCALE_USED)],
    SURVEY_ID + POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT + SUBSTRATE + SURVEYOR ~ TAXON,
    value.var = "REL_ABUND_CORRECTED",
    fun.aggregate = last,
    fill = 0
  )

  # tally of rows with / without vegetation found
  plants_rakeabund_wide[, .N, by = NO_VEG_FOUND]
##    NO_VEG_FOUND     N
##          <lgcl> <int>
## 1:        FALSE 79660
## 2:         TRUE 39208
  # Diversity metrics for each row of the rake-abundance table.
  # Only taxon columns that actually exist in this wide table are used:
  # rake_taxacols = all taxa, rake_natcols = the subset listed in natcols.
  rake_taxacols <- names(plants_rakeabund_wide)[
    names(plants_rakeabund_wide) %in% taxacols
  ]
  rake_natcols <- names(plants_rakeabund_wide)[
    names(plants_rakeabund_wide) %in% natcols
  ]

  # Shannon and inverse-Simpson diversity across all taxa ...
  plants_rakeabund_wide[
    ,
    `:=`(
      shannon_div = diversity(.SD, index = "shannon"),
      simpsons_div = diversity(.SD, index = "invsimpson")
    ),
    .SDcols = rake_taxacols
  ]
  # ... and across the natcols subset only
  plants_rakeabund_wide[
    ,
    `:=`(
      shannon_div_nat = diversity(.SD, index = "shannon"),
      simpsons_div_nat = diversity(.SD, index = "invsimpson")
    ),
    .SDcols = rake_natcols
  ]

  # richness = number of taxon columns with a non-zero abundance in the row
  plants_rakeabund_wide[, richness := rowSums(.SD > 0), .SDcols = rake_taxacols]
  plants_rakeabund_wide[, nat_richness := rowSums(.SD > 0), .SDcols = rake_natcols]

  # check that max rakeabund is 3
  summary(plants_rakeabund_wide[["Potamogeton crispus"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2689  0.0000  3.0000
  # Re-attach the remaining plants columns to the wide rake table.
  # The join uses the same fields that formed the left-hand side of the cast;
  # where several plants rows match a wide row, only the last match is kept.
  # (to check the join interactively, wrap the call below in nrow() / names())
  plants_rakeabund_wide <- plants[
    plants_rakeabund_wide,
    on = c(
      "POINT_ID", "SURVEY_ID", "NO_VEG_FOUND", "proplight",
      "DEPTH_FT", "SURVEYOR", "SUBSTRATE"
    ),
    mult = "last"
  ]


  # Drop columns that are not needed or that lost their meaning in the
  # long-to-wide reshaping (removed by reference).
  plants_rakeabund_wide[
    ,
    c(
      "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "OLD_SURVEY_ID", "DATASOURCE",
      "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "RAKE_SCALE_USED",
      "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI",
      "DATESURVEYSTART", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR",
      "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES",
      "USEABLE", "CLEANED", "INDATABASE",
      "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK"
    ) := NULL
  ]


  # list what is left
  names(plants_rakeabund_wide)
##   [1] "DOW"                            "Secchi_m"                      
##   [3] "SECCHI_DATE"                    "SURVEY_ID"                     
##   [5] "LAKE_NAME"                      "SURVEY_DATE"                   
##   [7] "DEPTH_FT"                       "NO_VEG_FOUND"                  
##   [9] "SUBSTRATE"                      "SURVEYOR"                      
##  [11] "TAXON"                          "POINT_ID"                      
##  [13] "OBS_ID"                         "SUBBASIN"                      
##  [15] "MULTIPARTSURVEY"                "SURVEY_DATASOURCE"             
##  [17] "order_ID"                       "Longitude"                     
##  [19] "Latitude"                       "SECCHI_m_ACCEPTED"             
##  [21] "proplight"                      "NA"                            
##  [23] "Bidens beckii"                  "Bolboschoenus fluviatilis"     
##  [25] "Brasenia schreberi"             "Caltha palustris"              
##  [27] "Carex comosa"                   "Carex pellita"                 
##  [29] "Carex scoparia"                 "Ceratophyllum demersum"        
##  [31] "Chara globularis"               "Eleocharis acicularis"         
##  [33] "Eleocharis erythropoda"         "Eleocharis palustris"          
##  [35] "Elodea canadensis"              "Elodea nuttallii"              
##  [37] "Equisetum fluviatile"           "Fontinalis antipyretica"       
##  [39] "Glyceria borealis"              "Heteranthera dubia"            
##  [41] "Hippuris vulgaris"              "Iris virginica"                
##  [43] "Isoetes echinospora"            "Juncus arcticus"               
##  [45] "Juncus canadensis"              "Juncus effusus"                
##  [47] "Juncus pelocarpus"              "Lemna minor"                   
##  [49] "Lemna trisulca"                 "Lychnothamnus barbatus"        
##  [51] "Lythrum salicaria"              "Myriophyllum exalbescens"      
##  [53] "Myriophyllum farwellii"         "Myriophyllum sibiricum"        
##  [55] "Myriophyllum spicatum"          "Myriophyllum verticillatum"    
##  [57] "Najas flexilis"                 "Najas guadalupensis"           
##  [59] "Najas minor"                    "Nelumbo lutea"                 
##  [61] "Nitellopsis obtusa"             "Nuphar advena"                 
##  [63] "Nuphar variegata"               "Nymphaea odorata"              
##  [65] "Nymphaea tuberosa"              "Persicaria amphibia"           
##  [67] "Phalaris arundinacea"           "Phragmites australis"          
##  [69] "Polygonum amphibium"            "Pontederia cordata"            
##  [71] "Potamogeton amplifolius"        "Potamogeton crispus"           
##  [73] "Potamogeton epihydrus"          "Potamogeton foliosus"          
##  [75] "Potamogeton friesii"            "Potamogeton gramineus"         
##  [77] "Potamogeton illinoensis"        "Potamogeton natans"            
##  [79] "Potamogeton nodosus"            "Potamogeton obtusifolius"      
##  [81] "Potamogeton praelongus"         "Potamogeton pusillus"          
##  [83] "Potamogeton richardsonii"       "Potamogeton robbinsii"         
##  [85] "Potamogeton spirillus"          "Potamogeton strictifolius"     
##  [87] "Potamogeton zosteriformis"      "Protectedspecies 1"            
##  [89] "Protectedspecies 10"            "Protectedspecies 12"           
##  [91] "Protectedspecies 2"             "Protectedspecies 3"            
##  [93] "Protectedspecies 7"             "Protectedspecies 8"            
##  [95] "Ranunculus aquatilis"           "Ranunculus flabellaris"        
##  [97] "Ranunculus longirostris"        "Riccia fluitans"               
##  [99] "Ricciocarpos natans"            "Sagittaria cristata"           
## [101] "Sagittaria graminea"            "Sagittaria latifolia"          
## [103] "Sagittaria rigida"              "Schoenoplectus acutus"         
## [105] "Schoenoplectus americanus"      "Schoenoplectus pungens"        
## [107] "Schoenoplectus subterminalis"   "Schoenoplectus tabernaemontani"
## [109] "Scirpus cyperinus"              "Scirpus validus"               
## [111] "Sium suave"                     "Sparganium eurycarpum"         
## [113] "Spirodela polyrhiza"            "Stuckenia filiformis"          
## [115] "Stuckenia pectinata"            "Tolypella intricata"           
## [117] "Typha angustifolia"             "Typha glauca"                  
## [119] "Typha latifolia"                "Utricularia gibba"             
## [121] "Utricularia macrorhiza"         "Utricularia minor"             
## [123] "Utricularia vulgaris"           "Vallisneria americana"         
## [125] "Wolffia borealis"               "Wolffia columbiana"            
## [127] "Zannichellia palustris"         "carex"                         
## [129] "ceratophyllum"                  "chara"                         
## [131] "characeae"                      "cyperaceae"                    
## [133] "drepanocladus"                  "eleocharis"                    
## [135] "elodea"                         "juncus"                        
## [137] "lemna"                          "myriophyllum"                  
## [139] "najas"                          "nitella"                       
## [141] "nuphar"                         "nymphaea"                      
## [143] "poaceae"                        "potamogeton"                   
## [145] "potamogeton (broad)"            "potamogeton (narrow)"          
## [147] "ranunculus"                     "riccia"                        
## [149] "sagittaria"                     "salix"                         
## [151] "schoenoplectus"                 "scirpus"                       
## [153] "sparganium"                     "typha"                         
## [155] "utricularia"                    "wolffia"                       
## [157] "zizania"                        "shannon_div"                   
## [159] "simpsons_div"                   "shannon_div_nat"               
## [161] "simpsons_div_nat"               "richness"                      
## [163] "nat_richness"
  # Final column tidy-up for the point-level rake abundance table.
  # Most names in this list were already removed right after the join back to
  # plants, so asking data.table to delete them again raises one warning per
  # missing column. Restrict the drop to columns that are still present so the
  # step is warning-free and can be re-run safely.
  rake_dropcols <- intersect(
    c(
      "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE",
      "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND",
      "SAMPLE_NOTES",
      "SURFACE_GROWTH",
      "POINT_LVL_SECCHI",
      "OLD_SURVEY_ID",
      "COHORT",
      "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART",
      "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES",
      "USEABLE", "CLEANED", "INDATABASE",
      "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES",
      "SURVEY_FEEDBACK", "DATASOURCE", "RAKE_SCALE_USED",
      "NA", "TAXON", "SUBSTRATE", "OBS_ID", "NO_VEG_FOUND"
    ),
    names(plants_rakeabund_wide)
  )
  if (length(rake_dropcols) > 0) {
    plants_rakeabund_wide[, (rake_dropcols) := NULL]
  }

  # Lake / survey / point descriptors first; all other columns keep their
  # relative order after them.
  setcolorder(plants_rakeabund_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN",
                                        "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR",
                                        "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED",
                                        "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude"))


  #cleanup:
  rm(rake_taxacols, rake_natcols, rake_dropcols)

  


# **survey level stats -------------------------------------------

Survey Level Aggregation

  # One row per survey:
  #   tot_n_samp    - number of distinct points sampled
  #   taxa_richness - number of distinct non-missing taxa recorded
  # Both are computed in the same grouped call so each value is tied to its
  # SURVEY_ID by construction, rather than by pasting separately grouped
  # vectors together by position. uniqueN(na.rm = TRUE) ignores the NA taxon
  # that marks "nothing found", which is what the old "count, then subtract one
  # if any NA" arithmetic achieved. as.numeric() keeps the column a double as
  # before.
  surveys <- plants[
    ,
    .(
      tot_n_samp = uniqueN(POINT_ID),
      taxa_richness = as.numeric(uniqueN(TAXON, na.rm = TRUE))
    ),
    by = SURVEY_ID
  ]

  # extent of vegetation in survey (proportion vegetated):
  # count the distinct points with at least one taxon recorded; surveys with
  # none have no row in the right-hand table, so their NA is set to 0.
  # (merge() returns the table sorted and keyed by SURVEY_ID.)
  surveys <- merge(
    surveys,
    plants[!is.na(TAXON), .(n_points_vegetated = uniqueN(POINT_ID)), by = SURVEY_ID],
    by = "SURVEY_ID",
    all.x = TRUE
  )
  surveys[is.na(n_points_vegetated), n_points_vegetated := 0L]
  surveys[, prop_veg := n_points_vegetated / tot_n_samp]
  
  
  # Survey x taxon matrix: each cell is the number of distinct points at which
  # the taxon was recorded in that survey (0 where it was not recorded).
  # Only surveys with at least one taxon observation get a row here
  # (~70 surveys had no species observed).
  survey_species_matrix <- dcast(
    plants[!is.na(TAXON), .(count = length(unique(POINT_ID))), by = .(SURVEY_ID, TAXON)],
    SURVEY_ID ~ TAXON,
    value.var = "count",
    fill = 0
  )

  # diversity indices
  # Narrow natcols to the taxa present as columns in this matrix.
  # Note: this overwrites the natcols object used by later steps.
  natcols <- names(survey_species_matrix)[
    names(survey_species_matrix) %in% natcols
  ]

  # total diversity (all taxa columns)
  survey_species_matrix[
    ,
    `:=`(
      shannon_div = diversity(.SD, index = "shannon"),
      simpsons_div = diversity(.SD, index = "invsimpson")
    ),
    .SDcols = taxacols
  ]

  # native diversity (natcols only); an inverse Simpson of Inf is recoded to 0
  survey_species_matrix[
    ,
    `:=`(
      shannon_div_nat = diversity(.SD, index = "shannon"),
      simpsons_div_nat = diversity(.SD, index = "invsimpson")
    ),
    .SDcols = natcols
  ]
  survey_species_matrix[simpsons_div_nat == Inf, simpsons_div_nat := 0]

  # native richness = number of natcols taxa with a non-zero count
  survey_species_matrix[, nat_richness := rowSums(.SD > 0), .SDcols = natcols]

  # Depth statistics over all sampled records with a depth, one join per stat.
  # NOTE(review): surveys[i, on = ...] returns one row per row of i, so a survey
  # whose records all have NA DEPTH_FT is dropped from surveys here - confirm
  # that this is intended.
  surveys <- surveys[
    plants[!is.na(DEPTH_FT), .(max_depth_surveyed = max(DEPTH_FT)), by = SURVEY_ID],
    on = "SURVEY_ID"
  ]
  surveys <- surveys[
    plants[!is.na(DEPTH_FT), .(min_depth_surveyed = min(DEPTH_FT)), by = SURVEY_ID],
    on = "SURVEY_ID"
  ]
  surveys <- surveys[
    plants[!is.na(DEPTH_FT), .(mean_depth_surveyed = mean(DEPTH_FT)), by = SURVEY_ID],
    on = "SURVEY_ID"
  ]
  surveys <- surveys[
    plants[!is.na(DEPTH_FT), .(median_depth_surveyed = median(DEPTH_FT)), by = SURVEY_ID],
    on = "SURVEY_ID"
  ]
  surveys <- surveys[
    plants[!is.na(DEPTH_FT), .(IQR_depth_surveyed = IQR(DEPTH_FT)), by = SURVEY_ID],
    on = "SURVEY_ID"
  ]

  # Vegetated-depth data.
  # List the surveys holding vegetated records deeper than 50 ft, with the
  # number of such records; some of these might warrant removal, depending on
  # what is being done with the data.
  plants[
    NO_VEG_FOUND == FALSE & DEPTH_FT > 50,
    length(POINT_ID),
    by = .(SURVEY_DATASOURCE, DOW, SUBBASIN, SURVEY_DATE, LAKE_NAME)
  ]
##                     SURVEY_DATASOURCE      DOW   SUBBASIN SURVEY_DATE
##                                <char>    <int>     <char>      <IDat>
## 1: Newman Lab University of Minnesota 10000200             2011-06-29
## 2:       DNR Invasive Species Program 40005600             2012-06-15
## 3:       DNR Invasive Species Program 82010600             2016-04-30
## 4:         Three Rivers Park District 27019101 west basin  2014-06-20
## 5:     Freshwater Scientific Services 27013300  Grays Bay  2017-08-28
## 6:                                    21005700             2009-08-10
## 7:                                    21005700             2011-08-17
## 8:                      DNR Fisheries 21005700             2008-06-02
## 9:                                    34003200             2011-06-23
##     LAKE_NAME    V1
##        <char> <int>
## 1:      riley     1
## 2:       rays     2
## 3:       elmo     6
## 4:      sarah     2
## 5: minnetonka     4
## 6:     carlos     2
## 7:     carlos     1
## 8:     carlos     1
## 9:     carrie     2
  # preview: deepest vegetated record (< 50 ft) in each survey
  plants[
    NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
    .(max_depth_vegetated = max(DEPTH_FT)),
    by = SURVEY_ID
  ]
##       SURVEY_ID max_depth_vegetated
##           <int>               <num>
##    1:         1                 7.0
##    2:         2                11.0
##    3:         3                 6.0
##    4:         4                 4.0
##    5:         5                 6.2
##   ---                              
## 3122:      2041                10.0
## 3123:      2042                13.0
## 3124:      2043                 8.0
## 3125:      2822                19.0
## 3126:      4336                 9.0
  # Per-survey depth statistics over vegetated records (NO_VEG_FOUND == FALSE)
  # shallower than 50 ft. Surveys without any such record keep NA (all.x = TRUE).
  # max depth vegetated within survey:
  surveys <- merge(
    surveys,
    plants[NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
           .(max_depth_vegetated = max(DEPTH_FT, na.rm = TRUE)), by = SURVEY_ID],
    by = "SURVEY_ID", all.x = TRUE
  )
  #other depth vegetated stats:
  surveys <- merge(
    surveys,
    plants[NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
           .(min_depth_vegetated = min(DEPTH_FT, na.rm = TRUE)), by = SURVEY_ID],
    by = "SURVEY_ID", all.x = TRUE
  )
  surveys <- merge(
    surveys,
    plants[NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
           .(mean_depth_vegetated = mean(DEPTH_FT, na.rm = TRUE)), by = SURVEY_ID],
    by = "SURVEY_ID", all.x = TRUE
  )
  surveys <- merge(
    surveys,
    plants[NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
           .(median_depth_vegetated = median(DEPTH_FT, na.rm = TRUE)), by = SURVEY_ID],
    by = "SURVEY_ID", all.x = TRUE
  )
  # The IQR must use the same "< 50 ft" filter as the other four statistics.
  # It previously selected DEPTH_FT > 50, i.e. it was computed only from the
  # deep records that the other statistics exclude and was NA for every survey
  # without such records.
  surveys <- merge(
    surveys,
    plants[NO_VEG_FOUND == FALSE & DEPTH_FT < 50,
           .(IQR_depth_vegetated = IQR(DEPTH_FT, na.rm = TRUE)), by = SURVEY_ID],
    by = "SURVEY_ID", all.x = TRUE
  )


  # Attach the survey x taxon matrix (counts plus the diversity columns computed
  # on it) to the survey table, keeping every survey.
  surveys <- merge(
    surveys,
    survey_species_matrix,
    by = "SURVEY_ID",
    all.x = TRUE
  )
  # Surveys absent from the matrix come out of the merge with NA in the matrix
  # columns; those are non-observations, so replace them with 0.
  # NOTE(review): f_dowle3natozeros() assigns the string "0", so this relies on
  # data.table coercing it to each column's type - confirm that is intended.
  f_dowle3natozeros(surveys, names(survey_species_matrix))

  # check work:
  # summary(surveys[,1:17])

  # Prepend the basic survey descriptors from the plants table: one row per
  # combination of the grouping fields, with the number of plants records (nobs)
  # and the earliest SURVEY_DATE in the group.
  # names(plants)
  surveys <- merge(
    plants[
      order(SURVEY_DATE),
      .(nobs = .N, SURVEY_DATE = first(SURVEY_DATE)),
      by = .(SURVEY_ID, SURVEY_DATASOURCE, LAKE_NAME, DOW, SUBBASIN, MULTIPARTSURVEY, order_ID)
    ],
    surveys,
    by = "SURVEY_ID"
  )
  # summary(surveys)
  # names(surveys) <- gsub(" ", "_", gsub( "\\(", "_", gsub( "\\)", "_", names(surveys))))

  # secchi data metrics
  # rescue the secchi data from the plants db for these surveys:
  # update join on SURVEY_ID that copies the Secchi depth and its date onto each
  # survey row. The i. prefix pins the right-hand side to the plants columns.
  # Without it, `Secchi_m := Secchi_m` resolves to surveys' own column as soon
  # as that column exists, so re-running the step would never refresh it.
  # Where several plants rows match one survey, the last match wins.
  surveys[
    plants,
    on = "SURVEY_ID",
    `:=`(Secchi_m = i.Secchi_m, Secchi_m_date = i.SECCHI_DATE)
  ]

  #OPTIONAL: merge in the geodata + lake data to the survey work
  # surveys <- pwi_l[surveys, on = .(order_ID), mult = "first" ]  
  #have to strip off the geometry to prevent failure in write to csv
  # surveys[ ,geometry := NULL ,]

  
  # n samples within historical max depth
  
  # surveys <- merge(surveys,plants[!is.na(TAXON), .(alltime_maxvegdep = max(DEPTH_FT)) , .(DOW, SUBBASIN) ], by = c("DOW", "SUBBASIN"), all.x = TRUE) [is.na(alltime_maxvegdep), alltime_maxvegdep := 0  ]
  # summary(surveys$alltime_maxvegdep)
  # surveys[ , hist(alltime_maxvegdep) , ]
  
  
  # For records with a taxon recorded, what is the max depth by lake and by
  # survey? Depths of 50 ft or more are ignored.

  # Remove the two derived columns only if they are already present (e.g. when
  # this section is re-run). An unconditional `:= NULL` warns on a clean run,
  # where the columns do not exist yet.
  if ("alltime_maxvegdep" %in% names(plants)) {
    plants[, alltime_maxvegdep := NULL]
  }
  if ("survey_maxvegdep" %in% names(plants)) {
    plants[, survey_maxvegdep := NULL]
  }

  # Deepest vegetated record per lake basin (DOW x SUBBASIN) across all surveys
  max_depth <- plants[
    !is.na(TAXON) & DEPTH_FT < 50,
    .(alltime_maxvegdep = max(DEPTH_FT, na.rm = TRUE)),
    by = .(DOW, SUBBASIN)
  ]
  # Merge the result back into the original data; basins without any vegetated
  # record get NA. Note that merge() re-sorts and keys plants by the by-columns.
  plants <- merge(plants, max_depth, by = c("DOW", "SUBBASIN"), all.x = TRUE)

  # Deepest vegetated record within each survey
  max_depth <- plants[
    !is.na(TAXON) & DEPTH_FT < 50,
    .(survey_maxvegdep = max(DEPTH_FT, na.rm = TRUE)),
    by = .(SURVEY_ID)
  ]
  # Merge the result back into the original data (plants ends up keyed by SURVEY_ID)
  plants <- merge(plants, max_depth, by = "SURVEY_ID", all.x = TRUE)

  # one line per survey: both depth limits and the number of records
  plants[, .N, by = .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID)]
##       alltime_maxvegdep survey_maxvegdep SURVEY_ID     N
##                   <num>            <num>     <int> <int>
##    1:               7.0              7.0         1    63
##    2:              11.0             11.0         2   126
##    3:               6.0              6.0         3   160
##    4:               4.0              4.0         4   147
##    5:               6.2              6.2         5   118
##   ---                                                   
## 3190:              20.6             14.5      4337   270
## 3191:               7.0              5.2      4338    57
## 3192:               7.0              7.0      4339    68
## 3193:              12.0             12.0      4340   114
## 3194:              26.9             23.0      4341   744
  # distribution of both depth limits (and record counts) across surveys
  summary(
    plants[, .N, by = .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID)]
  )
##  alltime_maxvegdep survey_maxvegdep   SURVEY_ID            N         
##  Min.   : 0.00     Min.   : 0.000   Min.   :   1.0   Min.   :   3.0  
##  1st Qu.: 5.20     1st Qu.: 4.800   1st Qu.: 823.2   1st Qu.:  66.0  
##  Median :10.00     Median : 8.000   Median :1649.5   Median : 131.5  
##  Mean   :11.84     Mean   : 9.597   Mean   :1655.2   Mean   : 229.3  
##  3rd Qu.:16.40     3rd Qu.:13.100   3rd Qu.:2483.8   3rd Qu.: 270.0  
##  Max.   :45.60     Max.   :45.600   Max.   :4341.0   Max.   :3836.0  
##  NA's   :39        NA's   :68
  # records lacking a survey-level max vegetated depth, split by NO_VEG_FOUND
  plants[is.na(survey_maxvegdep), .N, by = NO_VEG_FOUND]
##    NO_VEG_FOUND     N
##          <lgcl> <int>
## 1:         TRUE  3741
  # all-time vs. survey-specific max vegetated depth, one dot per survey
  plot(
    alltime_maxvegdep ~ survey_maxvegdep,
    data = plants[, .N, by = .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID)]
  )

  # n_points within all time max vegetated depth
  # (preview only: record count per survey, NA where the survey's first
  #  alltime_maxvegdep value is missing)
  plants[
    ,
    .(alltime_maxvegdep_n_samp = fifelse(
      is.na(first(alltime_maxvegdep)),
      NA,
      length(POINT_ID)
    )),
    by = SURVEY_ID
  ]
## Key: <SURVEY_ID>
##       SURVEY_ID alltime_maxvegdep_n_samp
##           <int>                    <int>
##    1:         1                       63
##    2:         2                      126
##    3:         3                      160
##    4:         4                      147
##    5:         5                      118
##   ---                                   
## 3190:      4337                      270
## 3191:      4338                       57
## 3192:      4339                       68
## 3193:      4340                      114
## 3194:      4341                      744
  # Number of distinct points in each survey that lie at or above the lake's
  # all-time max vegetated depth, plus that depth itself. The count is NA when
  # the survey's first alltime_maxvegdep value is NA.
  #
  # The count is taken on the POINT_IDs that pass the depth test. The earlier
  # form turned failing points into NA with ifelse() and relied on
  # unique(..., na.rm = T); unique() has no na.rm argument, so that NA was
  # counted as one extra "point" whenever any record was deeper than the limit
  # or had a missing depth. which() drops NA comparisons.
  surveys <- merge(
    surveys,
    plants[
      ,
      .(
        alltime_maxvegdep_n_samp =
          if (is.na(first(alltime_maxvegdep))) {
            NA_integer_
          } else {
            uniqueN(POINT_ID[which(DEPTH_FT <= alltime_maxvegdep)], na.rm = TRUE)
          },
        alltime_maxvegdep = first(alltime_maxvegdep)
      ),
      by = SURVEY_ID
    ],
    by = "SURVEY_ID", all.x = TRUE
  )
  
  # n_points within survey specific max vegetated depth
  # Number of distinct points in each survey at or above that survey's own max
  # vegetated depth, plus that depth itself. The count is NA when the survey's
  # first survey_maxvegdep value is NA.
  #
  # Count the POINT_IDs that pass the depth test directly. unique() has no
  # na.rm argument, so the former unique(ifelse(..., POINT_ID, NA), na.rm = T)
  # counted the NA placeholder as an extra point whenever any record failed the
  # test or had a missing depth. which() drops NA comparisons.
  surveys <- merge(
    surveys,
    plants[
      ,
      .(
        survey_maxvegdep_n_samp =
          if (is.na(first(survey_maxvegdep))) {
            NA_integer_
          } else {
            uniqueN(POINT_ID[which(DEPTH_FT <= survey_maxvegdep)], na.rm = TRUE)
          },
        survey_maxvegdep = first(survey_maxvegdep)
      ),
      by = SURVEY_ID
    ],
    by = "SURVEY_ID", all.x = TRUE
  )
  
  
  # Visual checks of the new point counts against the existing survey metrics.

  # points within the all-time vegetated depth vs. points actually vegetated
  plot(
    alltime_maxvegdep_n_samp ~ n_points_vegetated,
    data = surveys,
    xlab = "n points vegetated in this survey",
    ylab = "n points within all time max vegetated depth"
  )

  # the same comparison as proportions of all points sampled
  plot(
    propsamples_in_histmaxveg_depth ~ prop_samples_vegetated,
    data = surveys[, .(
      propsamples_in_histmaxveg_depth = (alltime_maxvegdep_n_samp / tot_n_samp),
      prop_samples_vegetated = (n_points_vegetated / tot_n_samp)
    )]
  )

  # all-time vs. survey-specific point counts
  plot(
    alltime_maxvegdep_n_samp ~ survey_maxvegdep_n_samp,
    data = surveys,
    xlab = "n points within survey specific vegetated depth",
    ylab = "n points within all time max vegetated depth"
  )

  # all-time vs. survey-specific depth limits
  plot(alltime_maxvegdep ~ survey_maxvegdep, data = surveys)

  # ranges and NA counts of the vegetated-depth columns
  summary(
    surveys[, .(alltime_maxvegdep, survey_maxvegdep, max_depth_vegetated, min_depth_vegetated)]
  )
##  alltime_maxvegdep survey_maxvegdep max_depth_vegetated min_depth_vegetated
##  Min.   : 0.00     Min.   : 0.000   Min.   : 0.000      Min.   :0.000      
##  1st Qu.: 5.20     1st Qu.: 4.800   1st Qu.: 4.800      1st Qu.:1.000      
##  Median :10.00     Median : 8.000   Median : 8.000      Median :1.300      
##  Mean   :11.84     Mean   : 9.597   Mean   : 9.597      Mean   :1.586      
##  3rd Qu.:16.40     3rd Qu.:13.100   3rd Qu.:13.100      3rd Qu.:2.000      
##  Max.   :45.60     Max.   :45.600   Max.   :45.600      Max.   :9.800      
##  NA's   :39        NA's   :68       NA's   :68          NA's   :68
  # surveys with no taxa recorded, counted by their lake's all-time max vegetated depth
  surveys[taxa_richness == 0, .N, by = alltime_maxvegdep]
##     alltime_maxvegdep     N
##                 <num> <int>
##  1:               2.5     3
##  2:                NA    39
##  3:              12.0     1
##  4:              10.6     1
##  5:               1.0     2
##  6:               4.5     1
##  7:               8.8     3
##  8:              11.2     4
##  9:               5.5     2
## 10:               5.0     1
## 11:               9.8     3
## 12:               2.0     1
## 13:               8.5     1
## 14:              10.0     1
## 15:               2.8     1
## 16:              11.0     1
## 17:               4.8     1
## 18:               6.5     2
# **species pools & watershed metrics ----------------------------------------------------

Watersheds and Species Pools Across Scales

We have excellent species pool data because we have species abundances across multiple scales. At the smallest (point) scale – plants_rakeabund_wide or plants_occurrence_wide – we have a species abundance matrix that can be treated as a product of the species pool above it (whole survey/lake), for which we also have an abundance matrix! We can also move up to the landscape scale, building species abundance matrices by aggregating the data from the next-lower scale.

For example, we can do as described above (compressing the matrices to richness for visualization) and aggregate to the HUC-8 watershed level.

  # Look up each point's survey-level taxa richness (surveys$taxa_richness, all
  # taxa) by SURVEY_ID and store it on both point-level tables.
  plants_rakeabund_wide[
    ,
    surveyrichness := surveys[["taxa_richness"]][
      match(plants_rakeabund_wide[["SURVEY_ID"]], surveys[["SURVEY_ID"]])
    ]
  ]

  plants_occurrence_wide[
    ,
    surveyrichness := surveys[["taxa_richness"]][
      match(plants_occurrence_wide[["SURVEY_ID"]], surveys[["SURVEY_ID"]])
    ]
  ]

    # Point-level native richness against survey-level richness (both jittered
    # to separate overplotted integer values). The x variable is taxa_richness,
    # which counts all taxa, so the axis label says so; it used to read
    # "suvey level native richness".
    ggplot( data = plants_rakeabund_wide,
            aes(jitter(surveyrichness), jitter(nat_richness)))+
      geom_point(alpha = .05)+
      ylab("point level native richness")+
      xlab("survey level taxa richness")

  # Build a watershed-level species matrix.
  # (keys to check when debugging: pwi_l$order_ID, plants$order_ID)

  # Tag every plants record with the `major` value of its pwi_l row, matched on
  # order_ID. The plants column is referenced explicitly because a bare
  # order_ID inside pwi_l[...] would resolve to pwi_l's own column.
  plants[
    ,
    watershed := pwi_l[match(plants[["order_ID"]], pwi_l[["order_ID"]]), major]
  ]

  # plants[ , length(unique(POINT_ID)) , watershed]


  # per watershed: number of distinct points and number of distinct taxa
  watersheds <- plants[, .(n_points = length(unique(POINT_ID))), by = watershed]

  watersheds <- merge(
    watersheds,
    plants[!is.na(TAXON), .(n_species = length(unique(TAXON))), by = watershed],
    by = "watershed",
    all.x = TRUE
  )


  # watershed x taxon table: each cell is the sum of INDATABASE over the
  # matching records (0 where the taxon has no record in the watershed)
  watershed_occurrence_wide <- dcast(
    plants,
    watershed ~ TAXON,
    value.var = "INDATABASE",
    fun.aggregate = sum,
    fill = 0
  )

  watershed_occurrence_wide <- merge(
    watersheds,
    watershed_occurrence_wide,
    by = "watershed",
    all.x = TRUE
  )

  # Carry watershed and its species count down to the point-level tables:
  # watershed via the first plants record of the same survey, richness via the
  # watershed's row in the wide table.
  plants_rakeabund_wide[
    ,
    watershed := plants[["watershed"]][
      match(plants_rakeabund_wide[["SURVEY_ID"]], plants[["SURVEY_ID"]])
    ]
  ]

  plants_rakeabund_wide[
    ,
    watershedrichness := watershed_occurrence_wide[["n_species"]][
      match(plants_rakeabund_wide[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]

  # add to occurrence wide set:
  plants_occurrence_wide[
    ,
    watershed := plants[["watershed"]][
      match(plants_occurrence_wide[["SURVEY_ID"]], plants[["SURVEY_ID"]])
    ]
  ]

  plants_occurrence_wide[
    ,
    watershedrichness := watershed_occurrence_wide[["n_species"]][
      match(plants_occurrence_wide[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]
  
  # Point-scale native richness against the species count of the point's
  # watershed (both jittered).
  ggplot(
    data = plants_rakeabund_wide,
    mapping = aes(jitter(watershedrichness), jitter(nat_richness))
  ) +
    geom_point() +
    ylab("point level native richness") +
    xlab("watershed level native richness")

  # Same lookups for the survey table: watershed from the first plants record
  # of the survey, then that watershed's species count (0 where no match).
  surveys[
    ,
    watershed := plants[["watershed"]][
      match(surveys[["SURVEY_ID"]], plants[["SURVEY_ID"]])
    ]
  ]
  surveys[
    ,
    watershedrichness := watershed_occurrence_wide[["n_species"]][
      match(surveys[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]

  surveys[is.na(watershedrichness), watershedrichness := 0]

  # survey nat_richness against watershed species count, with a loess smoother
  ggplot(
    data = surveys,
    mapping = aes(watershedrichness, nat_richness)
  ) +
    geom_point() +
    geom_smooth(method = "loess") +
    ylab("Survey Richness") +
    xlab("HUC-8 Watershed Richness") +
    theme_bw()

  # Get watershed Diversity

  # which columns of the watershed table are in natcols?
  is.element(names(watershed_occurrence_wide), natcols)
##   [1] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
##  [73]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE
##  [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [97]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
## [169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
  # Inverse Simpson diversity per watershed, computed over the columns of the
  # watershed table whose names are in natcols.
  watershed_occurrence_wide[
    ,
    simpson_div_nat := diversity(.SD, index = "invsimpson"),
    .SDcols = names(watershed_occurrence_wide) %in% natcols
  ]

  hist(watershed_occurrence_wide[["simpson_div_nat"]])

  # Push the watershed-scale value down to the point tables (matched on watershed) ...
  plants_rakeabund_wide[
    ,
    watershedsimpson_nat := watershed_occurrence_wide[["simpson_div_nat"]][
      match(plants_rakeabund_wide[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]

  plants_occurrence_wide[
    ,
    watershedsimpson_nat := watershed_occurrence_wide[["simpson_div_nat"]][
      match(plants_occurrence_wide[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]

  # ... the survey-scale value down to the point tables (matched on SURVEY_ID) ...
  plants_rakeabund_wide[
    ,
    surveysimpson_nat := surveys[["simpsons_div_nat"]][
      match(plants_rakeabund_wide[["SURVEY_ID"]], surveys[["SURVEY_ID"]])
    ]
  ]

  plants_occurrence_wide[
    ,
    surveysimpson_nat := surveys[["simpsons_div_nat"]][
      match(plants_occurrence_wide[["SURVEY_ID"]], surveys[["SURVEY_ID"]])
    ]
  ]

  # ... and the watershed-scale value onto the survey table
  surveys[
    ,
    watershedsimpson_nat := watershed_occurrence_wide[["simpson_div_nat"]][
      match(surveys[["watershed"]], watershed_occurrence_wide[["watershed"]])
    ]
  ]
  
  
  # Species pool plots on the inverse Simpson scale.
  # Point scale against lake (survey) scale; rows whose point-level value is
  # Inf are left out.
  point_pools <- ggplot(
    data = plants_rakeabund_wide[simpsons_div_nat != Inf],
    mapping = aes(surveysimpson_nat, simpsons_div_nat)
  ) +
    geom_point() +
    geom_smooth(method = "lm") +
    ylab("Point-scale ENSpie") +
    xlab("Lake-scale ENSpie") +
    theme_bw()

  # Lake (survey) scale against watershed scale
  lake_pools <- ggplot(
    data = surveys,
    mapping = aes(watershedsimpson_nat, simpsons_div_nat)
  ) +
    geom_point() +
    geom_smooth(method = "lm") +
    ylab("Lake-scale ENSpie") +
    xlab("Watershed-scale ENSpie") +
    theme_bw()

  # show the two panels side by side
  ggarrange(point_pools, lake_pools)

   #bring simpsons div back to HUc8 table
   setDT(watersheds_huc8)
   watersheds_huc8[watershed_occurrence_wide, on = .(major = watershed) , simpson_div_nat := simpson_div_nat ]
  
   
# Clean up intermediates: remove the two plot objects and the survey species
# matrix from the workspace.

   rm(list = c("lake_pools", "point_pools", "survey_species_matrix"))



# **data products -----------------------------------------------------------

Export Datasets

We have 6 datasets to export:

  1. plants –> plants_env_data.csv – As long format: each row is a species observation within a point (multiple rows per point) including all fields retained through cleaning processes
   # names(plants)
   # Remove (by reference) the fields that are not carried into the exported
   # plants file.
   plants[, c(
     "DATASOURCE", "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE",
     "SAMPLE_NOTES", "OLD_SURVEY_ID", "DATESURVEYSTART", "COHORT",
     "DATEINFO", "MONTH", "DAY", "YEAR",
     "INVENTORY_STAFF", "INVENTORY_STAFFDATE",
     "USEABLE", "CLEANED", "INDATABASE", "INVENTORY_NOTES",
     "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES",
     "SURVEY_FEEDBACK"
   ) := NULL]

   # Put the export columns in a fixed order: lake, survey, Secchi, point, and
   # finally the observation-level fields.
   setcolorder(
     plants,
     neworder = c(
       # lake
       "DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed",
       "alltime_maxvegdep",
       # survey
       "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY",
       "SURVEYOR", "RAKE_SCALE_USED", "survey_maxvegdep",
       # Secchi
       "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED",
       # point
       "POINT_ID", "DEPTH_FT", "proplight", "Longitude", "Latitude",
       "NO_VEG_FOUND", "WHOLE_RAKE_REL_ABUND", "SUBSTRATE", "SURFACE_GROWTH",
       "POINT_LVL_SECCHI",
       # observation
       "OBS_ID", "TAXON", "REL_ABUND", "REL_ABUND_CORRECTED"
     )
   )
   # export_names_plants <- tolower(names(plants))
   
# metadata for column names in this file
   # dow: MN Dept of Waters Ident.
   # lake_name: Name of the lake.
   # order_ID: key used to link to MN Hydrography dataset
   # subbasin: Sub-basin where the observation was made.
   # watershed: Watershed associated with the observation.
   # alltime_maxvegdep: Maximum vegetation depth ever observed in the lake (excludes any depth observation >50ft).
   # survey_id: Identification number for the survey.
   # survey_datasource: Name of the source of the survey data.
   # survey_date: Date when the survey was conducted, if multiple dates uses the first day of the survey.
   # multipartsurvey: Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART
   # surveyor: Person or entity conducting the survey if known.
   # rake_scale_used: Scale used for rake abundance measurements.
   # survey_maxvegdep: Maximum vegetation depth observed during the survey.
   # secchi_m: Nearest temporal Secchi depth measured in meters.
   # secchi_date: Date when Secchi depth was measured.
   # secchi_m_accepted: Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation).
   # point_id: Identification number for the observation point.
   # depth_ft: Depth in feet.
   # proplight: Proportion of surface light remaining at DEPTH_FT.
   # longitude: Longitude coordinate of the observation point.
   # latitude: Latitude coordinate of the observation point.
   # no_veg_found: Indicator if no vegetation was found at point.
   # whole_rake_rel_abund: Relative abundance rating assigned to the whole rake (all species), if assigned.
   # substrate: Substrate type.
   # surface_growth: Indicator for whether plant growth reached the water surface.
   # point_lvl_secchi: Secchi level at the observation point if recorded.
   # obs_id: Identification number for the observation.
   # taxon: Name of taxon observed.
   # rel_abund: Relative abundance observed (see RAKE_SCALE_USED for possible values).
   # rel_abund_corrected: Corrected relative abundance (fixes all relative abunds to scale of 1,2,3).
   
  # fwrite(plants, file = "data&scripts/data/output/DRUM/plants_env_data.csv")   
  1. plants_occurrence_wide –> plants_env_data_wide.csv – As a wide format of occurrences: each row is a point record, and the columns include a species observation (presence/absence) matrix.
   # names(plants_occurrence_wide)

   # Move the descriptor columns to the front of plants_occurrence_wide in a
   # fixed order; the taxon columns follow in their existing order.
   setcolorder(
     plants_occurrence_wide,
     neworder = c(
       # lake
       "DOW", "LAKE_NAME", "order_ID", "SUBBASIN",
       # watershed
       "watershed", "watershedrichness", "watershedsimpson_nat",
       # survey
       "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY",
       "SURVEYOR", "surveyrichness", "surveysimpson_nat",
       # Secchi
       "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED",
       # point
       "POINT_ID", "DEPTH_FT", "proplight", "Longitude", "Latitude",
       "richness", "nat_richness"
     )
   )

   # export_names_plants_occurrence_wide <- tolower(names(plants_occurrence_wide))

   # Print the final column order.
   names(plants_occurrence_wide)
##   [1] "DOW"                            "LAKE_NAME"                     
##   [3] "order_ID"                       "SUBBASIN"                      
##   [5] "watershed"                      "watershedrichness"             
##   [7] "watershedsimpson_nat"           "SURVEY_ID"                     
##   [9] "SURVEY_DATASOURCE"              "SURVEY_DATE"                   
##  [11] "MULTIPARTSURVEY"                "SURVEYOR"                      
##  [13] "surveyrichness"                 "surveysimpson_nat"             
##  [15] "Secchi_m"                       "SECCHI_DATE"                   
##  [17] "SECCHI_m_ACCEPTED"              "POINT_ID"                      
##  [19] "DEPTH_FT"                       "proplight"                     
##  [21] "Longitude"                      "Latitude"                      
##  [23] "richness"                       "nat_richness"                  
##  [25] "Acorus americanus"              "Alisma triviale"               
##  [27] "Andromeda glaucophylla"         "Andromeda polifolia"           
##  [29] "Asclepias incarnata"            "Asteraceae"                    
##  [31] "Betula pumila"                  "Bidens beckii"                 
##  [33] "Bolboschoenus fluviatilis"      "Bolboschoenus maritimus"       
##  [35] "Boltonia asteroides"            "Brasenia schreberi"            
##  [37] "Butomus umbellatus"             "Calamagrostis canadensis"      
##  [39] "Calla palustris"                "Caltha palustris"              
##  [41] "Carex aquatilis"                "Carex comosa"                  
##  [43] "Carex lacustris"                "Carex pellita"                 
##  [45] "Carex scoparia"                 "Ceratophyllum demersum"        
##  [47] "Chamaedaphne calyculata"        "Chara canescens"               
##  [49] "Chara globularis"               "Cicuta maculata"               
##  [51] "Drepanocladus aduncus"          "Dulichium arundinaceum"        
##  [53] "Elatine minima"                 "Eleocharis acicularis"         
##  [55] "Eleocharis erythropoda"         "Eleocharis palustris"          
##  [57] "Eleocharis smallii"             "Elodea canadensis"             
##  [59] "Elodea nuttallii"               "Equisetum fluviatile"          
##  [61] "Eriocaulon aquaticum"           "Eupatorium dubium"             
##  [63] "Eupatorium maculatum"           "Eupatorium perfoliatum"        
##  [65] "Fontinalis antipyretica"        "Fontinalis sullivantii"        
##  [67] "Glyceria borealis"              "Heteranthera dubia"            
##  [69] "Hippuris vulgaris"              "Hypericum ellipticum"          
##  [71] "Impatiens capensis"             "Iris versicolor"               
##  [73] "Iris virginica"                 "Isoetes echinospora"           
##  [75] "Juncus arcticus"                "Juncus canadensis"             
##  [77] "Juncus effusus"                 "Juncus pelocarpus"             
##  [79] "Ledum groenlandicum"            "Leersia oryzoides"             
##  [81] "Lemna minor"                    "Lemna trisulca"                
##  [83] "Lemna turionifera"              "Littorella uniflora"           
##  [85] "Lobelia dortmanna"              "Lychnothamnus barbatus"        
##  [87] "Lycopus americanus"             "Lysimachia terrestris"         
##  [89] "Lythrum salicaria"              "Menyanthes trifoliata"         
##  [91] "Myrica gale"                    "Myriophyllum alterniflorum"    
##  [93] "Myriophyllum exalbescens"       "Myriophyllum farwellii"        
##  [95] "Myriophyllum sibiricum"         "Myriophyllum spicatum"         
##  [97] "Myriophyllum tenellum"          "Myriophyllum verticillatum"    
##  [99] "Najas flexilis"                 "Najas guadalupensis"           
## [101] "Najas minor"                    "Nasturtium officinale"         
## [103] "Nelumbo lutea"                  "Nitellopsis obtusa"            
## [105] "Nuphar advena"                  "Nuphar microphylla"            
## [107] "Nuphar variegata"               "Nymphaea odorata"              
## [109] "Nymphaea tuberosa"              "Persicaria amphibia"           
## [111] "Persicaria lapathifolia"        "Phalaris arundinacea"          
## [113] "Phragmites australis"           "Polygonum amphibium"           
## [115] "Pontederia cordata"             "Potamogeton alpinus"           
## [117] "Potamogeton amplifolius"        "Potamogeton crispus"           
## [119] "Potamogeton epihydrus"          "Potamogeton foliosus"          
## [121] "Potamogeton friesii"            "Potamogeton gramineus"         
## [123] "Potamogeton hillii"             "Potamogeton illinoensis"       
## [125] "Potamogeton natans"             "Potamogeton nodosus"           
## [127] "Potamogeton obtusifolius"       "Potamogeton praelongus"        
## [129] "Potamogeton pusillus"           "Potamogeton richardsonii"      
## [131] "Potamogeton robbinsii"          "Potamogeton spirillus"         
## [133] "Potamogeton strictifolius"      "Potamogeton vaseyi"            
## [135] "Potamogeton zosteriformis"      "Potentilla palustris"          
## [137] "Protectedspecies 1"             "Protectedspecies 10"           
## [139] "Protectedspecies 11"            "Protectedspecies 12"           
## [141] "Protectedspecies 2"             "Protectedspecies 3"            
## [143] "Protectedspecies 4"             "Protectedspecies 5"            
## [145] "Protectedspecies 6"             "Protectedspecies 7"            
## [147] "Protectedspecies 8"             "Protectedspecies 9"            
## [149] "Ranunculus aquatilis"           "Ranunculus flabellaris"        
## [151] "Ranunculus flammula"            "Ranunculus longirostris"       
## [153] "Riccia fluitans"                "Ricciocarpos natans"           
## [155] "Rumex orbiculatus"              "Sagittaria cristata"           
## [157] "Sagittaria cuneata"             "Sagittaria graminea"           
## [159] "Sagittaria latifolia"           "Sagittaria rigida"             
## [161] "Schoenoplectus acutus"          "Schoenoplectus americanus"     
## [163] "Schoenoplectus pungens"         "Schoenoplectus subterminalis"  
## [165] "Schoenoplectus tabernaemontani" "Schoenoplectus x oblongus"     
## [167] "Scirpus atrovirens"             "Scirpus cyperinus"             
## [169] "Scirpus validus"                "Scolochloa festucacea"         
## [171] "Scorpidium scorpioides"         "Scutellaria lateriflora"       
## [173] "Sium suave"                     "Sparganium americanum"         
## [175] "Sparganium angustifolium"       "Sparganium emersum"            
## [177] "Sparganium eurycarpum"          "Sparganium fluctuans"          
## [179] "Sparganium natans"              "Sphagnum magellanicum"         
## [181] "Spirodela polyrhiza"            "Stuckenia filiformis"          
## [183] "Stuckenia pectinata"            "Tolypella intricata"           
## [185] "Triadenum fraseri"              "Typha angustifolia"            
## [187] "Typha glauca"                   "Typha latifolia"               
## [189] "Utricularia gibba"              "Utricularia intermedia"        
## [191] "Utricularia macrorhiza"         "Utricularia minor"             
## [193] "Utricularia vulgaris"           "Vallisneria americana"         
## [195] "Veronica americana"             "Wolffia borealis"              
## [197] "Wolffia columbiana"             "Zannichellia palustris"        
## [199] "Zizania palustris"              "acorus"                        
## [201] "alisma"                         "alnus"                         
## [203] "bidens"                         "callitriche"                   
## [205] "carex"                          "ceratophyllum"                 
## [207] "chara"                          "characeae"                     
## [209] "cicuta"                         "cyperaceae"                    
## [211] "drepanocladus"                  "elatine"                       
## [213] "eleocharis"                     "elodea"                        
## [215] "equisetum"                      "eragrostis"                    
## [217] "eutrochium"                     "hypericum"                     
## [219] "impatiens"                      "iris"                          
## [221] "isoetes"                        "juncus"                        
## [223] "lamiaceae"                      "lemna"                         
## [225] "lysimachia"                     "myriophyllum"                  
## [227] "najas"                          "nitella"                       
## [229] "nuphar"                         "nymphaea"                      
## [231] "nymphaeaceae"                   "persicaria"                    
## [233] "poaceae"                        "potamogeton"                   
## [235] "potamogeton (broad)"            "potamogeton (narrow)"          
## [237] "ranunculus"                     "riccia"                        
## [239] "sagittaria"                     "salix"                         
## [241] "schoenoplectus"                 "scirpus"                       
## [243] "scutellaria"                    "solidago"                      
## [245] "sparganium"                     "sparganium (emergent)"         
## [247] "sparganium (floating)"          "sphagnum"                      
## [249] "stuckenia"                      "typha"                         
## [251] "utricularia"                    "verbena"                       
## [253] "wolffia"                        "zizania"                       
## [255] "zosterella"
# I ran glimpse() and then added in metadata:
# DOW                              <int> MN Dept of Waters Ident.
# LAKE_NAME                        <chr> Name of the lake.
# order_ID                         <int> key used to link to MN Hydrography dataset
# SUBBASIN                         <chr> Sub-basin where the observation was made.
# watershed                        <dbl> Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. )
# watershedrichness                <int> taxa richness across all surveys in watershed
# watershedsimpson_nat             <dbl> Inverse Simpson diversity of native taxa in watershed
# SURVEY_ID                        <int> Identification number for the survey.
# SURVEY_DATASOURCE                <chr> Name of the source of the survey data.
# SURVEY_DATE                      <IDate> Date when the survey was conducted, if multiple dates uses the first day of the survey.
# MULTIPARTSURVEY                  <dbl> Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART]
# SURVEYOR                         <chr> Person or entity conducting the survey if known.
# surveyrichness                   <dbl> Taxonomic richness of survey
# surveysimpson_nat                <dbl> Inverse Simpson diversity of native taxa in survey
# Secchi_m                         <dbl> Nearest temporal Secchi depth measured in meters.
# SECCHI_DATE                      <IDate> Date when Secchi depth was measured.
# SECCHI_m_ACCEPTED                <dbl> Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation).
# POINT_ID                         <int> Identification number for the observation point.
# DEPTH_FT                         <dbl> Depth to substrate in feet.
# proplight                        <dbl> Proportion of surface light remaining at DEPTH_FT.
# Longitude                        <dbl> Longitude coordinate of the observation point.
# Latitude                         <dbl> Latitude coordinate of the observation point.
# richness                         <dbl> Number of unique taxa observed at the point.
# nat_richness                     <dbl> Number of unique native taxa observed at the point.
# the remaining columns are species occurrence columns, indicating whether a taxon was observed (1) or not observed (0) at a sample point
   
   # fwrite(plants_occurrence_wide, file = "data&scripts/data/output/DRUM/plants_occurrence_wide.csv")
  1. plants_rakeabund_wide –> plants_abund_env_data_wide.csv – As a wide format with species abundances (a subset of “2.”) where each row is a point record , and the columns include a species abundance matrix
   # names(plants_rakeabund_wide)
   
   # Move the descriptor columns to the front of plants_rakeabund_wide in a
   # fixed order; the taxon columns follow in their existing order.
   setcolorder(
     plants_rakeabund_wide,
     neworder = c(
       # lake
       "DOW", "LAKE_NAME", "order_ID", "SUBBASIN",
       # watershed
       "watershed", "watershedrichness", "watershedsimpson_nat",
       # survey
       "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY",
       "SURVEYOR", "surveyrichness", "surveysimpson_nat",
       # Secchi
       "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED",
       # point
       "POINT_ID", "DEPTH_FT", "proplight", "Longitude", "Latitude",
       # point-level diversity and richness
       "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat",
       "richness", "nat_richness"
     )
   )

   # export_names_plants_rakeabund_wide <- tolower(names(plants_rakeabund_wide))

   # Print the final column order.
   names(plants_rakeabund_wide)
##   [1] "DOW"                            "LAKE_NAME"                     
##   [3] "order_ID"                       "SUBBASIN"                      
##   [5] "watershed"                      "watershedrichness"             
##   [7] "watershedsimpson_nat"           "SURVEY_ID"                     
##   [9] "SURVEY_DATASOURCE"              "SURVEY_DATE"                   
##  [11] "MULTIPARTSURVEY"                "SURVEYOR"                      
##  [13] "surveyrichness"                 "surveysimpson_nat"             
##  [15] "Secchi_m"                       "SECCHI_DATE"                   
##  [17] "SECCHI_m_ACCEPTED"              "POINT_ID"                      
##  [19] "DEPTH_FT"                       "proplight"                     
##  [21] "Longitude"                      "Latitude"                      
##  [23] "shannon_div"                    "simpsons_div"                  
##  [25] "shannon_div_nat"                "simpsons_div_nat"              
##  [27] "richness"                       "nat_richness"                  
##  [29] "Bidens beckii"                  "Bolboschoenus fluviatilis"     
##  [31] "Brasenia schreberi"             "Caltha palustris"              
##  [33] "Carex comosa"                   "Carex pellita"                 
##  [35] "Carex scoparia"                 "Ceratophyllum demersum"        
##  [37] "Chara globularis"               "Eleocharis acicularis"         
##  [39] "Eleocharis erythropoda"         "Eleocharis palustris"          
##  [41] "Elodea canadensis"              "Elodea nuttallii"              
##  [43] "Equisetum fluviatile"           "Fontinalis antipyretica"       
##  [45] "Glyceria borealis"              "Heteranthera dubia"            
##  [47] "Hippuris vulgaris"              "Iris virginica"                
##  [49] "Isoetes echinospora"            "Juncus arcticus"               
##  [51] "Juncus canadensis"              "Juncus effusus"                
##  [53] "Juncus pelocarpus"              "Lemna minor"                   
##  [55] "Lemna trisulca"                 "Lychnothamnus barbatus"        
##  [57] "Lythrum salicaria"              "Myriophyllum exalbescens"      
##  [59] "Myriophyllum farwellii"         "Myriophyllum sibiricum"        
##  [61] "Myriophyllum spicatum"          "Myriophyllum verticillatum"    
##  [63] "Najas flexilis"                 "Najas guadalupensis"           
##  [65] "Najas minor"                    "Nelumbo lutea"                 
##  [67] "Nitellopsis obtusa"             "Nuphar advena"                 
##  [69] "Nuphar variegata"               "Nymphaea odorata"              
##  [71] "Nymphaea tuberosa"              "Persicaria amphibia"           
##  [73] "Phalaris arundinacea"           "Phragmites australis"          
##  [75] "Polygonum amphibium"            "Pontederia cordata"            
##  [77] "Potamogeton amplifolius"        "Potamogeton crispus"           
##  [79] "Potamogeton epihydrus"          "Potamogeton foliosus"          
##  [81] "Potamogeton friesii"            "Potamogeton gramineus"         
##  [83] "Potamogeton illinoensis"        "Potamogeton natans"            
##  [85] "Potamogeton nodosus"            "Potamogeton obtusifolius"      
##  [87] "Potamogeton praelongus"         "Potamogeton pusillus"          
##  [89] "Potamogeton richardsonii"       "Potamogeton robbinsii"         
##  [91] "Potamogeton spirillus"          "Potamogeton strictifolius"     
##  [93] "Potamogeton zosteriformis"      "Protectedspecies 1"            
##  [95] "Protectedspecies 10"            "Protectedspecies 12"           
##  [97] "Protectedspecies 2"             "Protectedspecies 3"            
##  [99] "Protectedspecies 7"             "Protectedspecies 8"            
## [101] "Ranunculus aquatilis"           "Ranunculus flabellaris"        
## [103] "Ranunculus longirostris"        "Riccia fluitans"               
## [105] "Ricciocarpos natans"            "Sagittaria cristata"           
## [107] "Sagittaria graminea"            "Sagittaria latifolia"          
## [109] "Sagittaria rigida"              "Schoenoplectus acutus"         
## [111] "Schoenoplectus americanus"      "Schoenoplectus pungens"        
## [113] "Schoenoplectus subterminalis"   "Schoenoplectus tabernaemontani"
## [115] "Scirpus cyperinus"              "Scirpus validus"               
## [117] "Sium suave"                     "Sparganium eurycarpum"         
## [119] "Spirodela polyrhiza"            "Stuckenia filiformis"          
## [121] "Stuckenia pectinata"            "Tolypella intricata"           
## [123] "Typha angustifolia"             "Typha glauca"                  
## [125] "Typha latifolia"                "Utricularia gibba"             
## [127] "Utricularia macrorhiza"         "Utricularia minor"             
## [129] "Utricularia vulgaris"           "Vallisneria americana"         
## [131] "Wolffia borealis"               "Wolffia columbiana"            
## [133] "Zannichellia palustris"         "carex"                         
## [135] "ceratophyllum"                  "chara"                         
## [137] "characeae"                      "cyperaceae"                    
## [139] "drepanocladus"                  "eleocharis"                    
## [141] "elodea"                         "juncus"                        
## [143] "lemna"                          "myriophyllum"                  
## [145] "najas"                          "nitella"                       
## [147] "nuphar"                         "nymphaea"                      
## [149] "poaceae"                        "potamogeton"                   
## [151] "potamogeton (broad)"            "potamogeton (narrow)"          
## [153] "ranunculus"                     "riccia"                        
## [155] "sagittaria"                     "salix"                         
## [157] "schoenoplectus"                 "scirpus"                       
## [159] "sparganium"                     "typha"                         
## [161] "utricularia"                    "wolffia"                       
## [163] "zizania"
# I ran glimpse() and then added in metadata:
# DOW                              <int> MN Dept of Waters Ident.
# LAKE_NAME                        <chr> Name of the lake.
# order_ID                         <int> key used to link to MN Hydrography dataset
# SUBBASIN                         <chr> Sub-basin where the observation was made.
# watershed                        <dbl> Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. )
# watershedrichness                <int> taxa richness across all surveys in watershed
# watershedsimpson_nat             <dbl> Inverse Simpson diversity of native taxa in watershed
# SURVEY_ID                        <int> Identification number for the survey.
# SURVEY_DATASOURCE                <chr> Name of the source of the survey data.
# SURVEY_DATE                      <IDate> Date when the survey was conducted, if multiple dates uses the first day of the survey.
# MULTIPARTSURVEY                  <dbl> Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART]
# SURVEYOR                         <chr> Person or entity conducting the survey if known.
# surveyrichness                   <dbl> Taxonomic richness of survey
# surveysimpson_nat                <dbl> Inverse Simpson diversity of native taxa in survey
# Secchi_m                         <dbl> Nearest temporal Secchi depth measured in meters.
# SECCHI_DATE                      <IDate> Date when Secchi depth was measured.
# SECCHI_m_ACCEPTED                <dbl> Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation).
# POINT_ID                         <int> Identification number for the observation point.
# DEPTH_FT                         <dbl> Depth to substrate in feet.
# proplight                        <dbl> Proportion of surface light remaining at DEPTH_FT.
# Longitude                        <dbl> Longitude coordinate of the observation point.
# Latitude                         <dbl> Latitude coordinate of the observation point.
# shannon_div                      <dbl> Shannon diversity of taxa observed at the point
# simpsons_div                     <dbl> Inverse simpsons diversity of taxa observed at the point
# shannon_div_nat                  <dbl> Shannon diversity of native taxa observed at the point
# simpsons_div_nat                 <dbl> Inverse simpsons diversity of native taxa observed at the point
# richness                         <dbl> Number of unique taxa observed at the point.
# nat_richness                     <dbl> Number of unique native taxa observed at the point.
# the remaining columns are species abundance columns, giving each taxon's relative rake abundance (1-3) where it was observed, or 0 where it was not observed, at a sample point

   # fwrite(plants_rakeabund_wide, file = "data&scripts/data/output/DRUM/plants_abund_env_data_wide.csv")
  1. surveys –> surveys_aqplants.csv – aggregated plants data at the survey level. Each row is a set of survey-level summary stats and abundances (number of obs) for all species in the dataset.
   # Move the survey-level descriptor and summary columns to the front of
   # surveys in a fixed order; the taxon columns follow in their existing order.
   setcolorder(
     surveys,
     neworder = c(
       # lake
       "DOW", "LAKE_NAME", "order_ID", "SUBBASIN",
       # watershed
       "watershed", "watershedrichness", "watershedsimpson_nat",
       # survey
       "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY",
       # Secchi
       "Secchi_m", "Secchi_m_date",
       # sampling effort
       "nobs", "tot_n_samp",
       # depths surveyed
       "max_depth_surveyed", "min_depth_surveyed", "mean_depth_surveyed",
       "median_depth_surveyed", "IQR_depth_surveyed",
       # depths vegetated
       "max_depth_vegetated", "min_depth_vegetated", "mean_depth_vegetated",
       "median_depth_vegetated", "IQR_depth_vegetated",
       # maximum vegetated depth
       "alltime_maxvegdep", "alltime_maxvegdep_n_samp",
       "survey_maxvegdep", "survey_maxvegdep_n_samp",
       # vegetation frequency
       "n_points_vegetated", "prop_veg",
       # survey-level diversity and richness
       "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat",
       "taxa_richness", "nat_richness"
     )
   )

   # Inspect column types and leading values of the reordered table.
   glimpse(surveys)
## Rows: 3,194
## Columns: 268
## $ DOW                              <int> 1001600, 1003400, 1003500, 1005300, 1…
## $ LAKE_NAME                        <chr> "little prairie", "horseshoe", "mud",…
## $ order_ID                         <int> 16369, 16343, 16671, 16418, 16689, 16…
## $ SUBBASIN                         <chr> "", "", "", "", "", "", "", "", "", "…
## $ watershed                        <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 21, 21, 10…
## $ watershedrichness                <int> 107, 107, 107, 107, 107, 107, 107, 10…
## $ watershedsimpson_nat             <dbl> 17.02048, 17.02048, 17.02048, 17.0204…
## $ SURVEY_ID                        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12…
## $ SURVEY_DATASOURCE                <chr> "DNR Shallow Lakes", "DNR Shallow Lak…
## $ SURVEY_DATE                      <IDate> 2011-08-16, 2011-08-10, 2014-07-17,…
## $ MULTIPARTSURVEY                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Secchi_m                         <dbl> 0.7620, 0.6000, 1.0000, 0.8382, 1.524…
## $ Secchi_m_date                    <IDate> 2011-08-16, 2011-08-07, 2014-07-17,…
## $ nobs                             <int> 63, 126, 160, 147, 118, 133, 128, 133…
## $ tot_n_samp                       <int> 41, 66, 37, 35, 52, 39, 56, 45, 56, 4…
## $ max_depth_surveyed               <dbl> 7.5, 11.2, 15.0, 4.0, 6.8, 6.0, 12.0,…
## $ min_depth_surveyed               <dbl> 1.2, 2.2, 0.5, 1.2, 1.5, 1.5, 2.0, 2.…
## $ mean_depth_surveyed              <dbl> 4.665079, 5.526984, 4.320625, 3.24217…
## $ median_depth_surveyed            <dbl> 5.50, 5.00, 4.65, 3.50, 4.00, 3.80, 5…
## $ IQR_depth_surveyed               <dbl> 3.600, 3.150, 1.700, 0.650, 1.600, 1.…
## $ max_depth_vegetated              <dbl> 7.0, 11.0, 6.0, 4.0, 6.2, 5.0, 10.0, …
## $ min_depth_vegetated              <dbl> 1.2, 2.2, 0.5, 1.2, 1.5, 1.5, 2.0, 2.…
## $ mean_depth_vegetated             <dbl> 3.715000, 4.198780, 4.018301, 3.24217…
## $ median_depth_vegetated           <dbl> 3.8, 3.8, 4.0, 3.5, 4.0, 3.5, 5.0, 6.…
## $ IQR_depth_vegetated              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ alltime_maxvegdep                <dbl> 7.0, 11.0, 6.0, 4.0, 6.2, 5.0, 10.0, …
## $ alltime_maxvegdep_n_samp         <int> 41, 66, 32, 35, 52, 36, 55, 45, 48, 4…
## $ survey_maxvegdep                 <dbl> 7.0, 11.0, 6.0, 4.0, 6.2, 5.0, 10.0, …
## $ survey_maxvegdep_n_samp          <int> 41, 66, 32, 35, 52, 36, 55, 45, 48, 4…
## $ n_points_vegetated               <int> 18, 22, 30, 35, 45, 33, 31, 45, 36, 4…
## $ prop_veg                         <dbl> 0.4390244, 0.3333333, 0.8108108, 1.00…
## $ shannon_div                      <dbl> 2.149490, 2.471995, 2.711635, 2.15670…
## $ simpsons_div                     <dbl> 6.250000, 10.035821, 11.292330, 7.059…
## $ shannon_div_nat                  <dbl> 2.053658, 2.471995, 2.689823, 2.13046…
## $ simpsons_div_nat                 <dbl> 5.730159, 10.035821, 11.150579, 6.966…
## $ taxa_richness                    <dbl> 12, 15, 24, 14, 15, 15, 23, 8, 22, 17…
## $ nat_richness                     <dbl> 11, 15, 23, 13, 15, 14, 23, 8, 21, 16…
## $ `Acorus americanus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Alisma triviale`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Andromeda glaucophylla`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Andromeda polifolia`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Asclepias incarnata`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Asteraceae                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Betula pumila`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Bidens beckii`                  <int> 0, 0, 0, 0, 7, 6, 2, 0, 0, 5, 0, 0, 1…
## $ `Bolboschoenus fluviatilis`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Bolboschoenus maritimus`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Boltonia asteroides`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Brasenia schreberi`             <int> 0, 5, 10, 0, 0, 0, 0, 0, 25, 0, 0, 2,…
## $ `Butomus umbellatus`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Calamagrostis canadensis`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Calla palustris`                <int> 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Caltha palustris`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex aquatilis`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex comosa`                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex lacustris`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex pellita`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex scoparia`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ceratophyllum demersum`         <int> 1, 5, 4, 30, 4, 23, 15, 42, 1, 26, 6,…
## $ `Chamaedaphne calyculata`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Chara canescens`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Chara globularis`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Cicuta maculata`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Drepanocladus aduncus`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Dulichium arundinaceum`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Elatine minima`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis acicularis`          <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis erythropoda`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis palustris`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis smallii`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ `Elodea canadensis`              <int> 0, 0, 0, 0, 0, 2, 10, 10, 0, 20, 7, 0…
## $ `Elodea nuttallii`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Equisetum fluviatile`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eriocaulon aquaticum`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eupatorium dubium`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eupatorium maculatum`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eupatorium perfoliatum`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Fontinalis antipyretica`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Fontinalis sullivantii`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Glyceria borealis`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Heteranthera dubia`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `Hippuris vulgaris`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Hypericum ellipticum`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Impatiens capensis`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Iris versicolor`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Iris virginica`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Isoetes echinospora`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus arcticus`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus canadensis`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus effusus`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus pelocarpus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ledum groenlandicum`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Leersia oryzoides`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lemna minor`                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lemna trisulca`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lemna turionifera`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Littorella uniflora`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lobelia dortmanna`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lychnothamnus barbatus`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lycopus americanus`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lysimachia terrestris`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lythrum salicaria`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Menyanthes trifoliata`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myrica gale`                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum alterniflorum`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum exalbescens`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum farwellii`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum sibiricum`         <int> 0, 1, 0, 0, 1, 0, 2, 0, 0, 6, 0, 7, 0…
## $ `Myriophyllum spicatum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum tenellum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum verticillatum`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Najas flexilis`                 <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 28, …
## $ `Najas guadalupensis`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Najas minor`                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nasturtium officinale`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nelumbo lutea`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nitellopsis obtusa`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nuphar advena`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nuphar microphylla`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nuphar variegata`               <int> 1, 6, 5, 15, 11, 10, 8, 1, 5, 13, 15,…
## $ `Nymphaea odorata`               <int> 5, 12, 28, 13, 8, 20, 7, 0, 21, 21, 2…
## $ `Nymphaea tuberosa`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Persicaria amphibia`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Persicaria lapathifolia`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Phalaris arundinacea`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Phragmites australis`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Polygonum amphibium`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Pontederia cordata`             <int> 0, 1, 12, 2, 0, 0, 3, 0, 0, 0, 0, 0, …
## $ `Potamogeton alpinus`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton amplifolius`        <int> 0, 0, 10, 0, 0, 0, 5, 0, 0, 0, 6, 16,…
## $ `Potamogeton crispus`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton epihydrus`          <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ `Potamogeton foliosus`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton friesii`            <int> 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton gramineus`          <int> 0, 0, 1, 0, 0, 0, 1, 0, 4, 0, 2, 0, 1…
## $ `Potamogeton hillii`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton illinoensis`        <int> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `Potamogeton natans`             <int> 0, 3, 13, 15, 5, 4, 1, 5, 0, 11, 1, 0…
## $ `Potamogeton nodosus`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton obtusifolius`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton praelongus`         <int> 0, 4, 0, 0, 1, 0, 2, 0, 0, 0, 13, 9, …
## $ `Potamogeton pusillus`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton richardsonii`       <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton robbinsii`          <int> 5, 0, 0, 15, 0, 0, 0, 0, 2, 0, 0, 7, …
## $ `Potamogeton spirillus`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton strictifolius`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton vaseyi`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton zosteriformis`      <int> 0, 11, 6, 7, 14, 15, 11, 43, 0, 37, 1…
## $ `Potentilla palustris`           <int> 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `Protectedspecies 1`             <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3…
## $ `Protectedspecies 10`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 11`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 12`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 2`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 3`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 4`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 5`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 6`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 7`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 8`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 9`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus aquatilis`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus flabellaris`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus flammula`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus longirostris`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Riccia fluitans`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ricciocarpos natans`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Rumex orbiculatus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria cristata`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria cuneata`             <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria graminea`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria latifolia`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria rigida`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Schoenoplectus acutus`          <int> 0, 2, 0, 0, 0, 0, 8, 1, 0, 4, 1, 0, 0…
## $ `Schoenoplectus americanus`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Schoenoplectus pungens`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0…
## $ `Schoenoplectus subterminalis`   <int> 0, 0, 17, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `Schoenoplectus tabernaemontani` <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Schoenoplectus x oblongus`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus atrovirens`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus cyperinus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus validus`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scolochloa festucacea`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scorpidium scorpioides`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scutellaria lateriflora`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sium suave`                     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium americanum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium angustifolium`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium emersum`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium eurycarpum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium fluctuans`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0…
## $ `Sparganium natans`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sphagnum magellanicum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Spirodela polyrhiza`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Stuckenia filiformis`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Stuckenia pectinata`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0…
## $ `Tolypella intricata`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Triadenum fraseri`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Typha angustifolia`             <int> 2, 0, 1, 1, 0, 1, 0, 0, 1, 3, 0, 0, 0…
## $ `Typha glauca`                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Typha latifolia`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Utricularia gibba`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Utricularia intermedia`         <int> 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `Utricularia macrorhiza`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Utricularia minor`              <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ `Utricularia vulgaris`           <int> 0, 4, 18, 10, 1, 3, 5, 3, 9, 14, 3, 0…
## $ `Vallisneria americana`          <int> 3, 6, 2, 0, 13, 9, 1, 0, 0, 0, 11, 2,…
## $ `Veronica americana`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Wolffia borealis`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Wolffia columbiana`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Zannichellia palustris`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Zizania palustris`              <int> 0, 14, 4, 34, 33, 22, 4, 28, 0, 39, 2…
## $ acorus                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ alisma                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ alnus                            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ bidens                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ callitriche                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ carex                            <int> 3, 0, 4, 1, 0, 9, 0, 0, 1, 0, 0, 0, 0…
## $ ceratophyllum                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ chara                            <int> 0, 0, 1, 0, 0, 1, 5, 0, 0, 1, 0, 1, 1…
## $ characeae                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cicuta                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cyperaceae                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ drepanocladus                    <int> 13, 6, 2, 0, 8, 0, 0, 0, 13, 0, 0, 0,…
## $ elatine                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ eleocharis                       <int> 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ elodea                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2…
## $ equisetum                        <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0…
## $ eragrostis                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ eutrochium                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hypericum                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ impatiens                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ iris                             <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ isoetes                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ juncus                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ lamiaceae                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ lemna                            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ lysimachia                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ myriophyllum                     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ najas                            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ nitella                          <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ nuphar                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ nymphaea                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ nymphaeaceae                     <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ persicaria                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poaceae                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ potamogeton                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `potamogeton (broad)`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `potamogeton (narrow)`           <int> 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 25, …
## $ ranunculus                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ riccia                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sagittaria                       <int> 2, 0, 4, 1, 3, 0, 0, 0, 3, 10, 6, 0, …
## $ salix                            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ schoenoplectus                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ scirpus                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ scutellaria                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ solidago                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sparganium                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ `sparganium (emergent)`          <int> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `sparganium (floating)`          <int> 2, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0…
## $ sphagnum                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ stuckenia                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ typha                            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ utricularia                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ verbena                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ wolffia                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zizania                          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zosterella                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
    # export_names_surveys <- tolower(names(surveys))
    
# DOW                              <int> MN Dept of Waters Ident.
# LAKE_NAME                        <chr> Name of the lake.
# order_ID                         <int> key used to link to MN Hydrography dataset
# SUBBASIN                         <chr> Sub-basin where the observation was made.
# watershed                        <dbl> Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. )
# watershedrichness                <int> taxa richness across all surveys in watershed
# watershedsimpson_nat             <dbl> inverse simpsons diversity in watershed
# SURVEY_ID                        <int> Identification number for the survey.
# SURVEY_DATASOURCE                <chr> Name of the source of the survey data.
# SURVEY_DATE                      <IDate> Date when the survey was conducted, if multiple dates uses the first day of the survey.
# MULTIPARTSURVEY                  <dbl> Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART]
# Secchi_m                         <dbl> Nearest temporal Secchi depth measured in meters.
# Secchi_m_date                      <IDate> Date when Secchi depth was measured.
# nobs                             <int> number of observations in this survey (5 taxa observed at a point = 5 observations--thus this metric can exceed the tot_n_samp value)
# tot_n_samp                       <int> total number of samples taken/points sampled
# max_depth_surveyed               <dbl> max depth that surveyors sampled (ALL DEPTHS IN FEET)
# min_depth_surveyed               <dbl> min depth that surveyors sampled (ALL DEPTHS IN FEET)
# mean_depth_surveyed              <dbl> mean depth that surveyors sampled (ALL DEPTHS IN FEET)
# median_depth_surveyed            <dbl> median depth that surveyors sampled (ALL DEPTHS IN FEET)
# IQR_depth_surveyed               <dbl> inter-quartile range depth that surveyors sampled (ALL DEPTHS IN FEET)
# max_depth_vegetated              <dbl> maximum depth where vegetation was observed (ALL DEPTHS IN FEET)
# min_depth_vegetated              <dbl> min depth where vegetation was observed (ALL DEPTHS IN FEET)
# mean_depth_vegetated             <dbl> mean depth where vegetation was observed (ALL DEPTHS IN FEET)
# median_depth_vegetated           <dbl> median depth where vegetation was observed (ALL DEPTHS IN FEET)
# IQR_depth_vegetated              <dbl> inter-quartile range depth where vegetation was observed (ALL DEPTHS IN FEET) 
# alltime_maxvegdep                <dbl> the max depth of plants ever observed in this lake (across all surveys in this db)
# alltime_maxvegdep_n_samp         <int> Number of samples taken from points less than alltime_maxvegdep during this survey
# survey_maxvegdep                 <dbl> Survey maximum vegetation depth.
# survey_maxvegdep_n_samp          <int> Number of samples for survey maximum vegetation depth.
# n_points_vegetated               <int> Number of points with veg present
# prop_veg                         <dbl> n_points_vegetated/tot_n_samp
# shannon_div                      <dbl> Shannon diversity index for this survey
# simpsons_div                     <dbl> survey inverse Simpson's diversity index.
# shannon_div_nat                  <dbl> survey Shannon diversity index including native taxa only.
# simpsons_div_nat                 <dbl> survey inverse Simpson's diversity index including native taxa only 
# taxa_richness                    <dbl> count of taxa in this survey
# nat_richness                     <dbl> native species taxon count in this survey     
# the remaining columns are species occurrence count columns, indicating the number of points at which a taxon was observed (1+) or not observed (0) during a survey.
    
       
    # fwrite(surveys, file = "data&scripts/data/output/DRUM/surveys_aqplants.csv")
  1. missing surveys –> missing_data_surveys.csv – Surveys that were identified, but for which plant data were not successfully collated for this project. Each row is a survey.
   # names(missing_data_surveys)
   # Remove (by reference) the sample-, point-, and observation-level columns
   # from the missing-survey table.
   missing_data_surveys[, c(
     "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "SAMPLE_NOTES",
     "OLD_SURVEY_ID", "DATESURVEYSTART",
     "COHORT", "POINT_ID", "DEPTH_FT", "NO_VEG_FOUND",
     "WHOLE_RAKE_REL_ABUND", "SUBSTRATE", "SURFACE_GROWTH",
     "POINT_LVL_SECCHI", "OBS_ID", "TAXON", "REL_ABUND", "RAKE_SCALE_USED",
     "X", "Y", "NORTHING", "EASTING", "LATITUDE", "LONGITUDE", "UTMX", "UTMY"
   ) := NULL]

   # Put the columns into their export order (by reference): lake fields,
   # survey fields, date parts, then the inventory/submission tracking fields.
   setcolorder(
     missing_data_surveys,
     c(
       "DOW", "LAKE_NAME", "SUBBASIN",
       "DATASOURCE", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE",
       "MULTIPARTSURVEY", "SURVEYOR",
       "DATEINFO", "MONTH", "DAY", "YEAR",
       "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "USEABLE", "CLEANED",
       "INDATABASE",
       "INVENTORY_NOTES", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE",
       "SUBMISSION_NOTES", "SURVEY_FEEDBACK"
     )
   )

   # export_missing_data_surveys <- tolower(names(missing_data_surveys))

   # Print the resulting structure for the rendered document.
   glimpse(missing_data_surveys)
## Rows: 257
## Columns: 23
## $ DOW                  <int> 62000600, 62000600, 62000600, 62000600, 62000600,…
## $ LAKE_NAME            <chr> "kohlman", "kohlman", "kohlman", "kohlman", "kohl…
## $ SUBBASIN             <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ DATASOURCE           <chr> "source_14", "source_14", "source_14", "source_14…
## $ SURVEY_ID            <int> 3319, 3320, 3321, 3322, 3323, 3324, 3325, 3326, 3…
## $ SURVEY_DATASOURCE    <chr> "Ramsey County", "Ramsey County", "Ramsey County"…
## $ SURVEY_DATE          <IDate> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ MULTIPARTSURVEY      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ SURVEYOR             <chr> "Ramsey County", "Ramsey County", "Ramsey County"…
## $ DATEINFO             <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ MONTH                <int> 8, 8, 4, 8, 5, 4, 8, 9, 6, 9, 6, 8, 8, 9, 8, 8, 9…
## $ DAY                  <int> 23, 21, 28, 19, 13, 1, 23, 26, 16, 10, 9, 23, 26,…
## $ YEAR                 <int> 2004, 2007, 2008, 2008, 2009, 2010, 2004, 2008, 2…
## $ INVENTORY_STAFF      <chr> "Staff_1", "Staff_1", "Staff_1", "Staff_1", "Staf…
## $ INVENTORY_STAFFDATE  <chr> "1/13/2020", "1/13/2020", "1/13/2020", "1/13/2020…
## $ USEABLE              <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",…
## $ CLEANED              <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",…
## $ INDATABASE           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ INVENTORY_NOTES      <chr> "No raw data;", "No raw data;", "No raw data;", "…
## $ SUBMISSION_STAFF     <chr> "staff_5", "staff_5", "staff_3", "staff_3", "staf…
## $ SUBMISSION_STAFFDATE <chr> "2/21/2019", "2/21/2019", "8/6/2019", "8/6/2019",…
## $ SUBMISSION_NOTES     <chr> "excel with general stats but no raw data;", "exc…
## $ SURVEY_FEEDBACK      <chr> "no data available", "no data available", "no dat…
# DOW                  <int> MN Dept of Waters Ident.
# LAKE_NAME            <chr> Name of the lake
# SUBBASIN             <chr> Subbasin where the survey was conducted if applicable
# DATASOURCE           <chr> Internal listing for source that identified the survey
# SURVEY_ID            <int> Unique identifier for the survey
# SURVEY_DATASOURCE    <chr> Source or authority for the survey data (could be contacted to try to acquire these data)
# SURVEY_DATE          <IDate> Date when the survey was conducted, if multiple dates uses the first day of the survey
# MULTIPARTSURVEY      <dbl> Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART
# SURVEYOR             <chr> Surveyor name(s) if known
# DATEINFO             <chr> Date information that may help in identifying the survey
# MONTH                <int> Month of the survey
# DAY                  <int> Day of the survey.
# YEAR                 <int> Year of the survey
# INVENTORY_STAFF      <chr> Inventory staff name
# INVENTORY_STAFFDATE  <chr> Date of inventory by staff
# USEABLE              <chr> Indicator for data usability as submitted to project team
# CLEANED              <chr> Indicator for successful pre-cleaning of the data
# INDATABASE           <lgl> Indicator for successful processing into database
# INVENTORY_NOTES      <chr> Inventory notes from project staff
# SUBMISSION_STAFF     <chr> staff name that processed the original submission
# SUBMISSION_STAFFDATE <chr> Date of submission processing
# SUBMISSION_NOTES     <chr> Submission notes from project staff
# SURVEY_FEEDBACK      <chr> Feedback from the survey of data contributors    
   
   
   # fwrite(missing_data_surveys, file = "data&scripts/data/output/DRUM/missing_data_surveys.csv")
  1. watershed level summaries –> watershed_occurrence_wide.csv – Watershed-level attributes, summary metrics, and taxon occurrence counts. Each row is a watershed.
   # watersheds_huc8[ , simpson_div_nat := NULL , ]
   
   # Join the watersheds_huc8 attributes onto the occurrence table, matching
   # `watershed` (occurrence table) to `major` (watersheds_huc8). The
   # simpson_div_nat column of watersheds_huc8 is excluded from the join input.
   watershed_occurrence_wide <- watershed_occurrence_wide[
     as.data.table(watersheds_huc8)[, .SD, .SDcols = !c("simpson_div_nat")],
     on = c(watershed = "major")
   ]

   # Remove columns that are not part of the export (by reference).
   watershed_occurrence_wide[
     , c("NA", "HUC_8", "Shape_Leng", "Shape_Area") := NULL
   ]

   # colnames(watershed_occurrence_wide)

   # Columns to move to the front of the table; all other columns follow.
   # NOTE(review): `names` masks base::names() as a variable; the binding is
   # kept as-is in case later code reads it -- confirm before renaming.
   names <- c(
     "watershed", "major_name",
     "acres", "sq_miles", "prod_year", "source", "geometry",
     "n_points", "n_species", "simpson_div_nat"
   )

   setcolorder(watershed_occurrence_wide, names)

   # Print the resulting structure for the rendered document.
   glimpse(watershed_occurrence_wide)
## Rows: 81
## Columns: 241
## $ watershed                        <dbl> 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ major_name                       <chr> "Mississippi River - Headwaters", "Le…
## $ acres                            <dbl> 1228884, 857968, 1332793, 1076295, 50…
## $ sq_miles                         <dbl> 1920, 1341, 2082, 1682, 783, 1983, 89…
## $ prod_year                        <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2…
## $ source                           <chr> "DNR Catchment Dataset", "DNR Catchme…
## $ geometry                         <MULTIPOLYGON [m]> MULTIPOLYGON (((449453.7…
## $ n_points                         <int> 8458, 23372, 7304, 7560, 15915, 18619…
## $ n_species                        <int> 93, 83, 107, 107, 92, 115, 39, 79, 75…
## $ simpson_div_nat                  <dbl> 18.708643, 11.810110, 17.020477, 17.0…
## $ `Acorus americanus`              <int> 0, 3, 2, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `Alisma triviale`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Andromeda glaucophylla`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Andromeda polifolia`            <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Asclepias incarnata`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Asteraceae                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Betula pumila`                  <int> 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Bidens beckii`                  <int> 89, 162, 103, 159, 404, 251, 0, 146, …
## $ `Bolboschoenus fluviatilis`      <int> 7, 0, 2, 0, 2, 0, 0, 0, 7, 0, 0, 0, 9…
## $ `Bolboschoenus maritimus`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Boltonia asteroides`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6…
## $ `Brasenia schreberi`             <int> 20, 279, 186, 297, 166, 604, 16, 127,…
## $ `Butomus umbellatus`             <int> 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Calamagrostis canadensis`       <int> 0, 0, 4, 6, 0, 9, 0, 0, 0, 0, 0, 0, 0…
## $ `Calla palustris`                <int> 0, 0, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Caltha palustris`               <int> 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex aquatilis`                <int> 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex comosa`                   <int> 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex lacustris`                <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `Carex pellita`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Carex scoparia`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ceratophyllum demersum`         <int> 1493, 1827, 1709, 2756, 4378, 5275, 7…
## $ `Chamaedaphne calyculata`        <int> 0, 0, 5, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `Chara canescens`                <int> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Chara globularis`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Cicuta maculata`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Drepanocladus aduncus`          <int> 1, 2, 1, 0, 3, 2, 0, 1, 0, 0, 0, 0, 0…
## $ `Dulichium arundinaceum`         <int> 1, 6, 17, 4, 4, 9, 0, 0, 0, 4, 0, 0, …
## $ `Elatine minima`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis acicularis`          <int> 5, 38, 6, 48, 119, 26, 0, 91, 18, 0, …
## $ `Eleocharis erythropoda`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eleocharis palustris`           <int> 8, 1, 12, 0, 0, 11, 0, 0, 0, 0, 0, 0,…
## $ `Eleocharis smallii`             <int> 4, 0, 2, 9, 1, 10, 0, 0, 0, 0, 0, 2, …
## $ `Elodea canadensis`              <int> 651, 2280, 113, 1023, 1949, 2667, 9, …
## $ `Elodea nuttallii`               <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Equisetum fluviatile`           <int> 44, 2, 23, 10, 15, 18, 0, 16, 2, 0, 0…
## $ `Eriocaulon aquaticum`           <int> 0, 3, 4, 14, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Eupatorium dubium`              <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eupatorium maculatum`           <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Eupatorium perfoliatum`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Fontinalis antipyretica`        <int> 0, 0, 0, 36, 0, 0, 0, 27, 0, 0, 1, 0,…
## $ `Fontinalis sullivantii`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Glyceria borealis`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Heteranthera dubia`             <int> 49, 162, 46, 282, 481, 290, 0, 211, 1…
## $ `Hippuris vulgaris`              <int> 24, 21, 0, 0, 0, 87, 2, 7, 0, 1, 0, 8…
## $ `Hypericum ellipticum`           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Impatiens capensis`             <int> 2, 0, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0…
## $ `Iris versicolor`                <int> 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 2, 0…
## $ `Iris virginica`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Isoetes echinospora`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus arcticus`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus canadensis`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus effusus`                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Juncus pelocarpus`              <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ledum groenlandicum`            <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Leersia oryzoides`              <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lemna minor`                    <int> 5, 0, 1, 28, 3, 92, 0, 2, 73, 58, 357…
## $ `Lemna trisulca`                 <int> 772, 183, 490, 1644, 548, 749, 0, 919…
## $ `Lemna turionifera`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Littorella uniflora`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lobelia dortmanna`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lychnothamnus barbatus`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lycopus americanus`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ `Lysimachia terrestris`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Lythrum salicaria`              <int> 1, 0, 0, 2, 0, 6, 0, 0, 0, 0, 0, 0, 0…
## $ `Menyanthes trifoliata`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myrica gale`                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum alterniflorum`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum exalbescens`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Myriophyllum farwellii`         <int> 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Myriophyllum sibiricum`         <int> 513, 1486, 464, 484, 2250, 1769, 20, …
## $ `Myriophyllum spicatum`          <int> 0, 0, 0, 12, 87, 0, 0, 113, 0, 0, 89,…
## $ `Myriophyllum tenellum`          <int> 0, 43, 2, 4, 5, 0, 0, 0, 0, 0, 0, 0, …
## $ `Myriophyllum verticillatum`     <int> 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 17, …
## $ `Najas flexilis`                 <int> 1101, 1847, 319, 501, 1608, 3493, 128…
## $ `Najas guadalupensis`            <int> 48, 50, 3, 5, 1937, 566, 0, 1595, 222…
## $ `Najas minor`                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nasturtium officinale`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ `Nelumbo lutea`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nitellopsis obtusa`             <int> 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 196…
## $ `Nuphar advena`                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9…
## $ `Nuphar microphylla`             <int> 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Nuphar variegata`               <int> 652, 484, 392, 868, 413, 1361, 80, 61…
## $ `Nymphaea odorata`               <int> 639, 544, 666, 1143, 736, 1277, 42, 6…
## $ `Nymphaea tuberosa`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Persicaria amphibia`            <int> 0, 0, 2, 0, 4, 14, 0, 0, 0, 0, 0, 0, …
## $ `Persicaria lapathifolia`        <int> 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0…
## $ `Phalaris arundinacea`           <int> 0, 0, 0, 0, 3, 2, 1, 2, 2, 0, 0, 3, 1…
## $ `Phragmites australis`           <int> 38, 11, 3, 25, 5, 13, 0, 5, 4, 1, 1, …
## $ `Polygonum amphibium`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0…
## $ `Pontederia cordata`             <int> 0, 0, 19, 2, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `Potamogeton alpinus`            <int> 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton amplifolius`        <int> 120, 905, 134, 391, 598, 1015, 22, 38…
## $ `Potamogeton crispus`            <int> 95, 0, 83, 1048, 97, 1211, 1, 1082, 4…
## $ `Potamogeton epihydrus`          <int> 0, 0, 21, 13, 10, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton foliosus`           <int> 2, 121, 3, 3, 0, 128, 0, 118, 0, 17, …
## $ `Potamogeton friesii`            <int> 389, 441, 528, 162, 713, 547, 0, 368,…
## $ `Potamogeton gramineus`          <int> 78, 387, 165, 186, 551, 265, 2, 154, …
## $ `Potamogeton hillii`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton illinoensis`        <int> 293, 359, 199, 93, 682, 692, 2, 531, …
## $ `Potamogeton natans`             <int> 374, 398, 200, 179, 225, 484, 59, 183…
## $ `Potamogeton nodosus`            <int> 0, 0, 7, 1, 0, 0, 7, 1, 4, 2, 1, 26, …
## $ `Potamogeton obtusifolius`       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton praelongus`         <int> 269, 784, 175, 349, 1234, 754, 1, 629…
## $ `Potamogeton pusillus`           <int> 1, 21, 1, 8, 14, 22, 1, 52, 7, 41, 24…
## $ `Potamogeton richardsonii`       <int> 335, 532, 238, 95, 677, 568, 5, 520, …
## $ `Potamogeton robbinsii`          <int> 221, 1174, 58, 384, 328, 1306, 9, 805…
## $ `Potamogeton spirillus`          <int> 0, 1, 13, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `Potamogeton strictifolius`      <int> 12, 1, 0, 0, 1, 139, 0, 4, 0, 0, 2, 0…
## $ `Potamogeton vaseyi`             <int> 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Potamogeton zosteriformis`      <int> 1441, 2676, 840, 886, 3540, 3292, 39,…
## $ `Potentilla palustris`           <int> 2, 0, 16, 14, 1, 2, 1, 0, 0, 0, 0, 0,…
## $ `Protectedspecies 1`             <int> 1, 3, 58, 18, 2, 10, 0, 0, 14, 0, 0, …
## $ `Protectedspecies 10`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 11`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ `Protectedspecies 12`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, …
## $ `Protectedspecies 2`             <int> 0, 0, 0, 0, 0, 49, 2, 12, 0, 0, 0, 12…
## $ `Protectedspecies 3`             <int> 9, 0, 2, 0, 0, 0, 0, 10, 0, 0, 0, 3, …
## $ `Protectedspecies 4`             <int> 0, 3, 0, 12, 0, 3, 0, 0, 0, 0, 0, 0, …
## $ `Protectedspecies 5`             <int> 0, 64, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0,…
## $ `Protectedspecies 6`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 7`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 8`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Protectedspecies 9`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus aquatilis`           <int> 2, 0, 0, 0, 0, 6, 0, 3, 23, 0, 42, 47…
## $ `Ranunculus flabellaris`         <int> 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0…
## $ `Ranunculus flammula`            <int> 0, 14, 0, 3, 15, 0, 0, 0, 0, 0, 0, 0,…
## $ `Ranunculus longirostris`        <int> 0, 0, 0, 24, 0, 44, 0, 6, 0, 9, 0, 0,…
## $ `Riccia fluitans`                <int> 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0…
## $ `Ricciocarpos natans`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Rumex orbiculatus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, …
## $ `Sagittaria cristata`            <int> 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0…
## $ `Sagittaria cuneata`             <int> 3, 0, 4, 5, 1, 0, 0, 1, 3, 0, 0, 0, 0…
## $ `Sagittaria graminea`            <int> 0, 0, 2, 4, 2, 1, 0, 31, 0, 0, 0, 0, …
## $ `Sagittaria latifolia`           <int> 2, 2, 11, 1, 0, 26, 0, 0, 0, 0, 0, 2,…
## $ `Sagittaria rigida`              <int> 0, 0, 7, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0…
## $ `Schoenoplectus acutus`          <int> 480, 81, 151, 360, 24, 599, 25, 255, …
## $ `Schoenoplectus americanus`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Schoenoplectus pungens`         <int> 3, 3, 3, 4, 20, 15, 0, 3, 0, 0, 0, 0,…
## $ `Schoenoplectus subterminalis`   <int> 8, 97, 36, 81, 24, 83, 0, 0, 3, 0, 0,…
## $ `Schoenoplectus tabernaemontani` <int> 1, 0, 6, 5, 4, 7, 0, 6, 7, 2, 10, 6, …
## $ `Schoenoplectus x oblongus`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus atrovirens`             <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus cyperinus`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scirpus validus`                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scolochloa festucacea`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scorpidium scorpioides`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Scutellaria lateriflora`        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sium suave`                     <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0…
## $ `Sparganium americanum`          <int> 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium angustifolium`       <int> 0, 1, 0, 1, 1, 10, 0, 0, 0, 0, 0, 0, …
## $ `Sparganium emersum`             <int> 7, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Sparganium eurycarpum`          <int> 4, 2, 13, 12, 14, 2, 0, 5, 2, 1, 0, 5…
## $ `Sparganium fluctuans`           <int> 3, 2, 18, 12, 29, 14, 0, 0, 0, 0, 0, …
## $ `Sparganium natans`              <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `Sphagnum magellanicum`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Spirodela polyrhiza`            <int> 31, 31, 12, 43, 33, 64, 0, 1, 121, 2,…
## $ `Stuckenia filiformis`           <int> 19, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `Stuckenia pectinata`            <int> 429, 246, 248, 189, 498, 553, 5, 438,…
## $ `Tolypella intricata`            <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Triadenum fraseri`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Typha angustifolia`             <int> 27, 11, 39, 41, 12, 157, 0, 82, 37, 8…
## $ `Typha glauca`                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Typha latifolia`                <int> 22, 4, 21, 2, 6, 33, 4, 0, 0, 6, 1, 4…
## $ `Utricularia gibba`              <int> 1, 42, 3, 21, 30, 8, 11, 0, 46, 0, 0,…
## $ `Utricularia intermedia`         <int> 41, 75, 11, 64, 95, 103, 0, 7, 28, 0,…
## $ `Utricularia macrorhiza`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Utricularia minor`              <int> 23, 59, 29, 102, 52, 139, 0, 62, 8, 0…
## $ `Utricularia vulgaris`           <int> 604, 929, 400, 466, 703, 963, 41, 127…
## $ `Vallisneria americana`          <int> 526, 780, 425, 280, 1277, 446, 8, 558…
## $ `Veronica americana`             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Wolffia borealis`               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Wolffia columbiana`             <int> 0, 1, 0, 1, 0, 0, 0, 1, 6, 0, 173, 11…
## $ `Zannichellia palustris`         <int> 1, 0, 0, 8, 1, 10, 0, 3, 0, 25, 102, …
## $ `Zizania palustris`              <int> 1078, 808, 693, 894, 1026, 1133, 77, …
## $ acorus                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ alisma                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, …
## $ alnus                            <int> 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0…
## $ bidens                           <int> 0, 0, 0, 0, 1, 9, 0, 0, 0, 0, 0, 0, 0…
## $ callitriche                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ carex                            <int> 19, 2, 53, 41, 10, 64, 4, 14, 8, 10, …
## $ ceratophyllum                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ chara                            <int> 1977, 6712, 1931, 1298, 4144, 4953, 6…
## $ characeae                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cicuta                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cyperaceae                       <int> 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0…
## $ drepanocladus                    <int> 38, 18, 71, 161, 5, 104, 3, 34, 37, 0…
## $ elatine                          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ eleocharis                       <int> 53, 62, 47, 79, 91, 104, 1, 14, 12, 0…
## $ elodea                           <int> 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 2, 26, …
## $ equisetum                        <int> 9, 2, 44, 22, 6, 7, 0, 0, 0, 0, 1, 0,…
## $ eragrostis                       <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ eutrochium                       <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hypericum                        <int> 0, 0, 0, 8, 0, 11, 0, 0, 0, 0, 0, 0, …
## $ impatiens                        <int> 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 2, 1…
## $ iris                             <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ isoetes                          <int> 0, 32, 3, 17, 12, 4, 0, 1, 0, 0, 0, 0…
## $ juncus                           <int> 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ lamiaceae                        <int> 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0…
## $ lemna                            <int> 27, 4, 10, 8, 17, 27, 0, 7, 12, 0, 0,…
## $ lysimachia                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ myriophyllum                     <int> 8, 44, 2, 20, 0, 26, 0, 146, 0, 0, 14…
## $ najas                            <int> 218, 805, 62, 147, 754, 608, 0, 514, …
## $ nitella                          <int> 9, 13, 11, 162, 13, 143, 0, 9, 14, 0,…
## $ nuphar                           <int> 26, 11, 176, 0, 0, 238, 0, 0, 33, 1, …
## $ nymphaea                         <int> 16, 0, 119, 0, 0, 78, 0, 0, 38, 8, 0,…
## $ nymphaeaceae                     <int> 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ persicaria                       <int> 2, 0, 0, 4, 1, 4, 0, 4, 1, 1, 3, 4, 0…
## $ poaceae                          <int> 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ potamogeton                      <int> 203, 569, 174, 219, 1265, 648, 0, 397…
## $ `potamogeton (broad)`            <int> 0, 1, 11, 0, 6, 5, 0, 2, 1, 0, 0, 0, …
## $ `potamogeton (narrow)`           <int> 112, 73, 107, 138, 45, 420, 31, 119, …
## $ ranunculus                       <int> 91, 57, 192, 79, 292, 147, 4, 134, 10…
## $ riccia                           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sagittaria                       <int> 63, 112, 116, 97, 180, 120, 8, 27, 56…
## $ salix                            <int> 0, 0, 1, 8, 1, 6, 0, 1, 1, 0, 0, 17, …
## $ schoenoplectus                   <int> 567, 551, 353, 255, 513, 409, 3, 186,…
## $ scirpus                          <int> 39, 0, 98, 0, 0, 108, 0, 47, 1, 22, 0…
## $ scutellaria                      <int> 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ solidago                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sparganium                       <int> 35, 34, 27, 24, 20, 47, 1, 28, 13, 0,…
## $ `sparganium (emergent)`          <int> 7, 0, 5, 3, 2, 7, 0, 0, 0, 0, 1, 0, 0…
## $ `sparganium (floating)`          <int> 5, 0, 36, 8, 16, 7, 0, 0, 0, 0, 3, 24…
## $ sphagnum                         <int> 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0…
## $ stuckenia                        <int> 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ typha                            <int> 34, 23, 193, 10, 7, 198, 0, 30, 44, 2…
## $ utricularia                      <int> 104, 15, 5, 17, 5, 376, 1, 353, 15, 0…
## $ verbena                          <int> 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0…
## $ wolffia                          <int> 0, 0, 0, 15, 3, 21, 0, 0, 2, 0, 0, 0,…
## $ zizania                          <int> 0, 0, 0, 260, 0, 99, 0, 33, 55, 1, 3,…
## $ zosterella                       <int> 0, 0, 0, 0, 0, 4, 0, 0, 13, 0, 0, 0, …
#    A. Name: watershed
#    Description: numeric code for watershed. matches to same field in other dataset. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023).
#    
#    B. Name: major_name    
#    Description: name of major watershed that corresponds to the watershed code. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023).
#    
#    C. Name: acres
#    Description: acres encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023).
#    
#    D. Name: sq_mile
#    Description: square miles encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023).
#    
#    E. Name: prod_year
#    Description:   The year of production associated with polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023).
#    
#    F. Name: source
#    Description:   The source of polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). 
#    
#    G. Name: n_points
#    Description: number of points sampled in the watershed (note: resampling of a point is not accounted for, so a point sampled n times is counted as n points)
#    
#    H. Name: n_species
#    Description: number of unique taxa observed in the watershed 
#    
#    I. Name: simpson_div_nat
#    Description: inverse Simpson's diversity of watershed taxa community  
# 
#     J.- END: [name of taxon observed in the database]
#   Description: Number of observations of [named taxa] in this watershed

   
   
   
   
   
   # fwrite(watershed_occurrence_wide[ ,.SD ,.SDcols = !c("geometry" )], file = "data&scripts/data/output/DRUM/watershed_occurrence_wide.csv")   
   
  
# DataVizFigs -------------------------------------------------------

# **distribution of lakes and surveys ---------------------------------------

Generate Data Viz for Pub

Map and Timeline

#plot with lakes shapes and point locs!! 
   
   # pwi_l[order_ID %in% unique(plants[ , order_ID]), ,]
   # 
   # ggplot(pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , , ], aes(geometry=geometry)) +
   #   geom_sf() +
   #   labs(caption = "Map of lakes with surveys in our database")
   
   # Turn every plants row that has a longitude into an sf point feature
   # (coordinates read as lon/lat on the WGS84 datum).
   plants_pts <- plants[!is.na(Longitude)] %>%
     st_as_sf(coords = c("Longitude", "Latitude"),
              crs = "+proj=lonlat +datum=WGS84")
   
   # #map points
   # ggplot(plants_pts, aes(geometry=geometry)) +
   #   geom_sf() +
   #   labs(caption = "Map of survey points in our database")
   
   #plot all together!
   
   # Base layers for the figure: US state polygons from the maps package,
   # then the Minnesota polygon on its own.
   # usa <- map_data("usa")
   # canada <- map_data("world", region = "canada")
   # states <- map_data("state")
   states <- sf::st_as_sf(maps::map("state", plot = FALSE, fill = TRUE))
   mn_df <- subset(states, ID == "minnesota")
   
   # Re-project: survey points go to UTM zone 15; lake polygons and HUC8
   # watersheds are moved to the CRS of the Minnesota outline.
   plants_pts <- st_transform(plants_pts, crs = "+proj=utm +zone=15")
   pwi_l <- pwi_l %>%
     st_sf() %>%
     st_transform(crs = st_crs(mn_df))
   setDT(pwi_l) # back to data.table so the [i, j] syntax used below works
   
   watersheds_huc8 <- watersheds_huc8 %>%
     st_sf() %>%
     st_transform(crs = st_crs(mn_df))

Sampled Lakes Spatial Dist

   # Study-area map. Layers, bottom to top: HUC8 watersheds (gray), all lake
   # polygons in pwi_l (blue), one red point per lake whose order_ID appears
   # in plants, and the Minnesota outline (black).
   study_map <- ggplot(data = pwi_l, aes(geometry = geometry)) +
     geom_sf(data = watersheds_huc8, aes(geometry = geometry), alpha = .05, color = "gray") +
     geom_sf(alpha = .5, color = "blue") +
     # geom_point(
     #   aes(geometry = geometry),
     #   stat = "sf_coordinates", color = "blue", alpha = 0.5)+
     geom_point(data = pwi_l[order_ID %in% plants[, unique(order_ID)]],
                stat = "sf_coordinates",
                aes(geometry = geometry),
                color = "red", alpha = .5) +
     geom_sf(data = mn_df, aes(geometry = geom), color = "black", alpha = .05) +
     scale_shape_discrete(solid = FALSE) +
     # the dangling `legend.position = ` (empty argument) was dropped
     theme(text = element_text(size = 20)) +
     # theme_bw()+
     # x runs east-west (longitude) and y runs north-south (latitude); the
     # two labels were previously swapped
     xlab("Longitude") +
     ylab("Latitude")
   
   # study_map




   # Flag lakes whose order_ID appears in the plants data (by reference);
   # rows that do not match are left untouched. TRUE is spelled out because
   # `T` is an ordinary variable that can be reassigned.
   pwi_l[order_ID %in% plants[, unique(order_ID)], plant_survey := TRUE]

Schupps lake classes

   # Histogram of lake_class for all lakes, overlaid (red outline) with the
   # lakes flagged plant_survey; counts are shown on a log10 y axis.
   ggplot(pwi_l, aes(lake_class)) +
     scale_y_log10() +
     geom_histogram(binwidth = 1) +
     geom_histogram(binwidth = 1, data = pwi_l[plant_survey == TRUE], aes(lake_class), color = "red", alpha = .5) +
     labs( title = "Distribution versus sampling of Schupps lake classes \n
           https://files.dnr.state.mn.us/publications/fisheries/investigational_reports/417.pdf") +
     scale_x_continuous(breaks = seq(0, 44, 2))

     # geom_density(aes(color = plant_survey))
   
   
   #lake area
   # Log-log histogram of acres.x for all lakes, overlaid (red outline) with
   # the lakes flagged plant_survey.
   ggplot(pwi_l, aes(acres.x)) +
     scale_x_log10() +
     scale_y_log10() +
     geom_histogram() +
     geom_histogram(data = pwi_l[plant_survey == TRUE], aes(acres.x), color = "red", alpha = .5)

   # Density of acres.x (log10 x axis), one curve per plant_survey value
   ggplot(pwi_l, aes(acres.x)) +
     # geom_histogram()+
     scale_x_log10() +
     geom_density(aes(color = plant_survey))

 # hist(pwi_l$lake_class)

# temporal accumulation ----------------------------------------------------

Temporal Accumulation

   # number of distinct surveys per calendar year
   plants[, uniqueN(SURVEY_ID), by = year(SURVEY_DATE)]
##      year    V1
##     <int> <int>
##  1:  2011   276
##  2:  2014   315
##  3:  2009   295
##  4:  2010   264
##  5:  2002    56
##  6:  2006   157
##  7:  2012   352
##  8:  2005   127
##  9:  2007   188
## 10:  2013   272
## 11:  2003   111
## 12:  2008   243
## 13:  2004    97
## 14:  2017    77
## 15:  2015   183
## 16:  2016    94
## 17:  2018    77
## 18:  2001     8
## 19:  2000     3
   # number of distinct observations per calendar year
   plants[, uniqueN(OBS_ID), by = year(SURVEY_DATE)]
##      year    V1
##     <int> <int>
##  1:  2011 65833
##  2:  2014 62634
##  3:  2009 71907
##  4:  2010 65009
##  5:  2002 10736
##  6:  2006 40471
##  7:  2012 57732
##  8:  2005 27986
##  9:  2007 37378
## 10:  2013 49961
## 11:  2003 18285
## 12:  2008 63651
## 13:  2004 22362
## 14:  2017 26889
## 15:  2015 49523
## 16:  2016 30121
## 17:  2018 28381
## 18:  2001  2157
## 19:  2000  1282
   # number of distinct taxa per calendar year (rows with NA TAXON skipped)
   plants[!is.na(TAXON), uniqueN(TAXON), by = year(SURVEY_DATE)]
##      year    V1
##     <int> <int>
##  1:  2011   125
##  2:  2014   152
##  3:  2009   138
##  4:  2010   133
##  5:  2002    67
##  6:  2006   103
##  7:  2012   126
##  8:  2005    94
##  9:  2007   112
## 10:  2013   136
## 11:  2003    82
## 12:  2008   134
## 13:  2004    95
## 14:  2017    90
## 15:  2015   121
## 16:  2016   123
## 17:  2018   101
## 18:  2001    50
## 19:  2000    41
   # row count of missing_data_surveys per survey date (NA dates form a group)
   missing_data_surveys[, .N, by = SURVEY_DATE]
##     SURVEY_DATE     N
##          <IDat> <int>
##  1:        <NA>   243
##  2:  2009-06-24     1
##  3:  2012-09-14     1
##  4:  2014-09-09     1
##  5:  2012-07-31     1
##  6:  2011-08-23     1
##  7:  2009-07-30     1
##  8:  2012-08-17     1
##  9:  2006-08-01     1
## 10:  2004-06-01     1
## 11:  2018-07-29     1
## 12:  2017-06-06     1
## 13:  2017-08-07     1
## 14:  2017-07-20     1
## 15:  2017-08-24     1
   # One row per SURVEY_ID with its first SURVEY_DATE, pooled from plants and
   # missing_data_surveys; rows without a date are dropped.
   plotdat <- rbindlist(list(
     plants[, first(SURVEY_DATE), by = SURVEY_ID],
     missing_data_surveys[, first(SURVEY_DATE), by = SURVEY_ID]
   ))[!is.na(V1)]

   # Sort by date so the row index is the running count of surveys.
   setorder(plotdat, V1)
   plotdat[, cumval := .I]

   # Cumulative survey count through time. The dangling `legend.position = `
   # (empty argument) that used to sit in theme() was dropped.
   temporal_accumulation <- ggplot(plotdat, aes(V1, cumval)) +
     geom_line() +
     # theme_bw()+
     xlab("Year") +
     ylab("Cumulative Survey Count") +
     theme(text = element_text(size = 20))
 #   plotdat[ , metric := "surveys" , ]
 #   
 #   plotdat_pts <- plants[ , first(SURVEY_DATE) , POINT_ID]
 #      setorder(plotdat_pts, V1)
 #      plotdat_pts[ , cumval := .I , ]
 #      plotdat_pts[ , metric := "points" , ]
 #      
 #   
 #   plotdat_taxa <- plants[ , first(SURVEY_DATE) , TAXON]
 #     setorder(plotdat_taxa, V1)
 #     plotdat_taxa[ , cumval := .I , ]
 #     plotdat_taxa[ , metric := "taxa" , ]
 #     
 #     
 # plotdat_all <- rbind(rbind(plotdat[ ,2:4 ], plotdat_pts[ ,2:4 ] ) , plotdat_taxa[ ,2:4 ])
 # 
 # ggplot(plotdat_all, aes(V1, cumval)) +
 #   geom_line()+
 #   facet_wrap(~ metric, scales = "free")
 # 

Figure 2

  # Figure 2: align the map and the accumulation curve on all four axes,
  # then place them side by side as panels (a) and (b).
  plots.row <- align_plots(study_map, temporal_accumulation,
                           align = "hv", axis = "tblr")
  div.rows <- plot_grid(
    plotlist = plots.row[1:2],
    nrow = 1,
    labels = c("(a)", "(b)"),
    label_size = 20,
    label_fontface = "plain",
    hjust = 0,
    vjust = 2.4
  )


  # write to file (uncomment png()/dev.off() to export)
  # png(file = "Fig_Map_Time.png", width = 10, height = 5, units = "in", res = 1200)
  div.rows

  # dev.off()

# species abundance distributions -------------------------------------

Species Abundance Distribution

  # Count rows per taxon (NA taxa skipped) and sort most-frequent first.
  sad.dat <- plants[!is.na(TAXON), .N, by = TAXON]
  setorder(sad.dat, -N)

  # Fix the factor levels to the abundance order, then add each taxon's
  # share of all counted rows.
  sad.dat[, TAXON := factor(TAXON, levels = TAXON)]
  sad.dat[, perc_abund := N / sum(N)]
   
   
  
  # ggplot(plotdat[], aes(TAXON, perc_abund))+
  #   geom_point()+
  #   scale_y_log10()+
  #   xlab("Taxa")+
  #   ylab("log10(percent of all observations)")+
  #   theme_bw()+
  #   theme(axis.text.x = element_blank())





  # write.csv(plotdat, "data&scripts/data/output/species_abund_list.csv")

Excluding unvegetated points, recalculating and ordering by perc-abund

  # Drop the blank taxon, recompute each taxon's share over the remaining
  # rows, sort by share (descending) and number the rows as `rank`.
  sad.dat <-
    sad.dat %>%
    filter(TAXON != "") %>%
    mutate(perc_abund = N / sum(N)) %>%
    arrange(desc(perc_abund)) %>%
    # seq_len() instead of 1:n(): 1:0 would yield c(1, 0) on an empty table
    mutate(rank = seq_len(n())) %>%
    rename(Taxon = TAXON)

  # sanity check: the recomputed shares should sum to 1
  sum(sad.dat$perc_abund)
## [1] 1
  # Taxon/count tables for the 50 top-ranked taxa, split in two:
  # ranks 1-22 and ranks 23-50.
  top1 <- dplyr::select(filter(sad.dat, rank <= 22), Taxon, N)

  top2 <- dplyr::select(filter(sad.dat, rank > 22, rank <= 50), Taxon, N)

Figure 3

  # Rank-abundance plot (log10 y) with a box around the 50 top-ranked taxa,
  # an arrow pointing to two inset tables (top1, top2) and a text label.
  #
  # The arrow, the box and the two tables are each drawn from a one-row data
  # frame. Putting constants inside aes() while the layer carries multi-row
  # data recycles the constants over every row, so the same arrow/box/table
  # would be painted once per row on top of itself.
  sad.plot <- ggplot(sad.dat, aes(x = rank, y = perc_abund)) +
    geom_segment(data = data.frame(x = 52, xend = 105, y = 0.03, yend = 0.03),
                 aes(x = x, xend = xend, y = y, yend = yend),
                 inherit.aes = FALSE,
                 arrow = arrow(length = unit(0.25, "cm")), color = "black") +
    geom_table(data = tibble(x = 175, y = 0.26, tb = list(top1)),
               aes(x = x, y = y, label = tb), size = 2,
               table.theme = ttheme_gtsimple, vjust = "top", hjust = "right") +
    geom_table(data = tibble(x = 237, y = 0.26, tb = list(top2)),
               aes(x = x, y = y, label = tb), size = 2,
               table.theme = ttheme_gtsimple, vjust = "top", hjust = "right") +
    geom_point(shape = 19, size = 2, alpha = 0.5, color = "steelblue") +
    geom_rect(data = data.frame(xmin = -2, xmax = 52, ymin = 0.0015, ymax = 0.18),
              aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
              inherit.aes = FALSE,
              fill = "transparent", color = "gray30", linetype = 1) +
    scale_y_log10(breaks = c(0.1, 0.01, 0.001, 0.0001, 0.00001),
                  labels = c(0.1, 0.01, 0.001, expression(paste("1 × 10"^{-4})), 
                             expression(paste("1 × 10"^{-5})))) +
    scale_x_continuous(breaks = c(seq(0, 200, 50))) +
    xlab(expression(Species~(italic(n)^{th}~most~abundant))) + 
    ylab("Proportion of total abundance")  +
    # coord_cartesian() zooms without dropping data, so annotations placed
    # outside these limits are clipped rather than removed
    coord_cartesian(xlim = c(5, 225), ylim = c(0.000002, 0.15)) +
    theme(panel.grid.minor = element_blank()) +
    annotate("text", x = 80, y = 0.03, size = 3, adj = 0.5,
             label = "50 most\nabundant taxa")
  
  # png(file = "Fig_SAD.png", width = 5.5, height = 5, units = "in", res = 1200)
  sad.plot

  # dev.off()
  

# diversity environment relationships ------------------------------------

Diversity-Environment Relationships

Depth

  summary(plants_rakeabund_wide$simpsons_div_nat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     1.8     3.6     Inf     Inf     Inf
  summary(plants_rakeabund_wide$DEPTH_FT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.900   6.300   8.463  10.800 146.000
  # desc(sort(plants_rakeabund_wide$DEPTH_FT))
  
  #' Removing zero depth, extreme depth (>= 50 ft), and non-finite ENSpie points.
  #' is.finite() is a numeric test; the previous `!= "Inf"` coerced the whole
  #' column to character before comparing.
  plants_rakeabund_wide <- 
    plants_rakeabund_wide %>% 
    filter(is.finite(simpsons_div_nat)) %>% 
    filter(DEPTH_FT != 0) %>% 
    filter(DEPTH_FT < 50)
  
  # Point-scale panel: ENSpie against water depth (log10 x) with a GAM smoother.
  # NOTE(review): N in the label is hard-coded; confirm it still matches the
  # number of plotted points whenever the filters or inputs change.
  label_depth <- "Point~scale~(italic(N) == 70745)"
  # 1 ft = 0.3048 m (the factor was previously mistyped as 0.348)
  point_depth <- ggplot(plants_rakeabund_wide, aes(DEPTH_FT*0.3048, simpsons_div_nat)) +
    geom_point(shape = 19, size = 1, alpha = 0.1, color = "steelblue4")+
    scale_x_log10() +
    geom_smooth(method = 'gam', color = "black") +
    ylab(expression(italic(ENS[PIE]))) +
    xlab("Water depth (m)") + 
    ylim(c(0, 15)) +
    annotate("text", x = 0.08, y = 15 - 0.025*15, size = 2.25, adj = 0, 
             label = label_depth, parse = TRUE) +
    theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0, 0.1, 0.1), "cm")) 

Secchi

Removing NAs

  summary(surveys$simpsons_div_nat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.385   4.144   4.938   6.959  20.761
  summary(surveys$Secchi_m)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0100  0.6096  1.2192  1.6390  2.2000 11.1000     239
  # Keep only the surveys that have a Secchi value.
  surveys <- filter(surveys, !is.na(Secchi_m))

  # Lake-scale panel: ENSpie against Secchi depth with a GAM smoother; the
  # y-axis title is left blank.
  label_Secchi <- "Lake~scale~(italic(N) == 2955)"
  lake_Secchi <- ggplot(surveys, aes(x = Secchi_m, y = simpsons_div_nat)) +
    geom_point(shape = 19, size = 1.5, alpha = 0.3, color = "steelblue4") +
    geom_smooth(method = "gam", color = "black") +
    labs(x = "Secchi depth (m)", y = "") +
    ylim(c(0, 21)) +
    annotate("text", x = 0.07, y = 21 - 21*0.025, size = 2.25, adj = 0,
             label = label_Secchi, parse = TRUE) +
    theme(text = element_text(size = 7),
          plot.margin = unit(c(0.1, 0, 0.1, 0), "cm"))

Area

Removing NAs

  summary(watersheds_huc8$simpson_div_nat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.982   6.333  11.182     Inf  16.178     Inf      14
  # Keep watersheds with a finite diversity value. is.finite() rejects NA and
  # Inf in one numeric test (previously !is.na() plus a string comparison
  # against "Inf").
  # NOTE(review): if watersheds_huc8 is still an sf object here, sf keeps the
  # geometry column "sticky", so select(-geometry) may not drop it; confirm.
  watersheds_huc8 <- 
    watersheds_huc8 %>% 
    dplyr::select(-geometry) %>% 
    filter(is.finite(simpson_div_nat))
  
  # Watershed-scale panel: diversity against area with a GAM smoother;
  # acres * 0.404686 converts acres to hectares.
  # NOTE(review): N in the label is hard-coded; confirm it matches the data.
  label_area <- "Watershed~scale~(italic(N) == 64)"
  wshed_area <- ggplot(watersheds_huc8, aes(acres*0.404686, simpson_div_nat)) +
    geom_point(shape = 19, size = 2, alpha = 0.6, color = "steelblue4")+
    geom_smooth(method = 'gam', color = "black") +
    ylab("") +
    xlab("Area (ha)") + 
    ylim(c(0, 30)) +
    scale_x_continuous(breaks = c(0, 200000, 400000, 600000),
                       labels = c(0, "200,000", "400,000", "600,000")) +
    annotate("text", x = 0, y = 30 - 30*0.025, size = 2.25, adj = 0, 
             label = label_area, parse = TRUE) +
    theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0.1, 0.1, 0), "cm")) 

Figure 4

  # Figure 4: align the three diversity panels on all four axes and lay
  # them out in one row as (a), (b), (c).
  plots.row <- align_plots(point_depth, lake_Secchi, wshed_area,
                           align = "hv", axis = "tblr")
  div.rows <- plot_grid(
    plotlist = plots.row[1:3],
    nrow = 1,
    labels = c("(a)", "(b)", "(c)"),
    label_size = 7.5,
    label_fontface = "plain",
    hjust = -0.25,
    vjust = 2
  )

  # png(file = "Fig_DivEnv.png", width = 6.5, height = 2.25, units = "in", res = 1200)
  div.rows

  # dev.off()
  



# footer ------------------------------------------------------------------