#'--- #' title: "Minnesota Statewide Aquatic Plants and Associated Data" #' author: "Mike Verhoeven & Dan Larkin" #' output: #' html_document: #' toc: true #' theme: default #' toc_depth: 3 #' toc_float: #' collapsed: false #'--- #' This script will pull in plant observation data from PI surveys, Secchi #' clarity, lake/watershed geodata, species statuses, and collaborator feedback. #' Then we'll sync them into our dataset, creating a full macrophyte obs and env #' dataset for MN Lakes #' #' #' # Outstanding Work: #' # Document Preamble knitr::opts_chunk$set(warning = FALSE, message = FALSE) strttime <- Sys.time() getwd() # load libraries ------------------------------------------------------------------ #' ## Libraries library(data.table) # update_dev_pkg()# remotes::install_github("Rdatatable/data.table") library(ggplot2) library(stringr) library(sf) library(vegan) library(gridExtra) library(dplyr) library(tidyr) library(janitor) # library(lme4) # library(sjPlot) # library(mediation) library(ggpubr) # library(EnvStats) # library(lmerTest) # library(merTools) # library(rstanarm) # library(ggsn) library(ggpmisc) library(cowplot) # load in functions ------------------------------------------------------- #' ## Functions f_dowle3natozeros = function(DT, x) { # or by number (slightly faster than by name) : for (j in x) set(DT,which(is.na(DT[[j]])),j,"0") } # load in data ------------------------------------------------- #' ## Data #' #' The plants observations data and collaborator corrections are datasets that #' we have generated. #' #' Secchi data have been aggregated from public sources by #' Kelsey Vitense for https://aslopubs.onlinelibrary.wiley.com/doi/full/10.1002/lol2.10323#lol210323-bib-0034 #' #' #' # #plants observation dataset: # plants <- fread(input = "data&scripts/data/input/plant_surveys_mn.csv", drop = 1:2) #import, dropping the exported row numbers # #collaborator corrections and feedback: # coll_edits <- fread(input = "data&scripts/data/input/Edited_post_contrib_feedback.csv") #secchi data: secchi <- fread(input = "data&scripts/data/input/AllSecchi_plus_ShallowLakesSecchi.csv", drop = 1) #import, dropping the exported row numbers #' The MN DNR does not allow publication of copies of their datasets, and thus the following datasets must be downloaded by a user in order to run this code. #' #' Hydrography (https://gisdata.mn.gov/dataset/water-dnr-hydrography; 5April2022) and #' watershed (https://gisdata.mn.gov/dataset/geos-dnr-watersheds; 10Aug2022) data were #' retrieved from the MN Geospatial commons. #' #' Species statuses were retrieved from the MN DNR website #' (https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html; 5April2022). #' #'Citations: #'DNR Hydrography Dataset. (2012). Retrieved 5April2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/water_dnr_hydrography/metadata/metadata.html #'DNR Watersheds—DNR Level 04—HUC 08—Majors. (2023). Retrieved 10Aug2022, from https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/geos_dnr_watersheds/metadata/dnr_watersheds_dnr_level_04_huc_08_majors.html #'MNTaxa: The State of Minnesota Vascular Plant Checklist. (2013). 
Retrieved 5April2022, from https://www.dnr.state.mn.us/eco/mcbs/plant_lists.html #' ## MN DNR Datasets #hydrography & watersheds pwi_l <- st_read(dsn = "data&scripts/data/input/shp_water_dnr_hydrography", layer = "dnr_hydro_features_all") watersheds_huc8 <- st_read(dsn = "data&scripts/data/input/shp_geos_dnr_watersheds", layer = "dnr_watersheds_dnr_level_04_huc_08_majors") #species statuses rte <- fread(input = "data&scripts/data/input/2013_dnr_plant_checklist_web.csv") # Strip Rare Species ------------------------------------------------------ #' ### Strip Rare Species Idents #' Due to legal protections on rare, threatened, and endangered species, the following section strips the identities of rare species in the dataset. The identities can be recovered by requesting either the full data imported above (currently commented out), or the key to the protected species names below (note that only the former will get you any location info that was associated with those data). # de-identify rare species: rte <- clean_names(rte)#tidy names on status dataset # #Strip species ID from protected species # namesstripkey <- plants[ TAXON %in% rte[rarity_status != "" , mn_dnr_scientific_name,], .N , TAXON] # others <- plants[ !(TAXON %in% rte[, mn_dnr_scientific_name,]), .N , TAXON] # rm(others) #none of these trigger me to need to deident them # # namesstripkey[ , new_name := paste("ProtectedSpecies", .I, sep = "_") , ,] # # plants[ namesstripkey , on = .(TAXON=TAXON) , new_name := new_name] # plants[ TAXON %in% namesstripkey[,TAXON], new_name ] # # plants[ , .N , new_name] # # plants[ !is.na(new_name), TAXON := new_name ] # # plants[ , new_name := NULL] # colnames(plants) # # #strip locs from all points with a protected species # # ps_points <- plants[str_detect(TAXON, "ProtectedSpecies" ), unique(POINT_ID) , ]#give me the point ids for all points with a protected species # # #any loc data there? # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY")] # # #delete that data # plants[POINT_ID %in% ps_points, c("X","Y","NORTHING","EASTING","LATITUDE","LONGITUDE","UTMX","UTMY"):= NA, ] # # #export rte key: # fwrite(namesstripkey, file = "data&scripts/data/output/rte_namestrip_key.csv" ) # #export rte stripped product: # fwrite(plants, file = "data&scripts/data/output/plants_input_data_rtestrip.csv") #import rte stripped product: # plants <- fread(input = "data&scripts/data/output/plants_input_data_rtestrip.csv") #import, dropping the exported row numbers # stripped out peoples personally identifiable information ------------------------------------------------ # #collaborator corrections and feedback: coll_edits <- fread(input = "data&scripts/data/input/Edited_post_contrib_feedback_noPII.csv") #import PLANTS INPUT product: plants <- fread(input = "data&scripts/data/output/plants_input_data_rtestrip_noPII.csv") #import, dropping the exported row numbers plants[TAXON == "", .N] plants[TAXON == "", TAXON:= NA] # *****DatasetUpdates***** ------------------------------------------------ #' # Dataset Updates # collaborator corrections ------------------------------------------------ #' ## Collaborator Corrections #' This section uses the collaborator feedback to revise the dataset. 
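#' Most of the corrections below follow one pattern: look up a survey-level
#' edit in coll_edits and write it into every matching row of plants. A minimal
#' toy sketch of that pattern (invented values, not part of these data), for
#' readers unfamiliar with match()-based lookups:
# toy_plants <- data.table(SURVEY_ID = c(1, 1, 2), LAKE_NAME = c("old", "old", "keep"))
# toy_edits  <- data.table(SURVEY_ID = 1, EDIT_LAKE_NAME = "new")
# toy_plants[ , NEW_LAKE_NAME := toy_edits[match(toy_plants$SURVEY_ID, toy_edits$SURVEY_ID), EDIT_LAKE_NAME] ]
# toy_plants[!is.na(NEW_LAKE_NAME), LAKE_NAME := NEW_LAKE_NAME][ , NEW_LAKE_NAME := NULL]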
# check survey ID alignment #sum(!coll_edits[, SURVEY_ID, ] %in% plants[ , SURVEY_ID]) #100% of collaborator input has a match in plants names(coll_edits)[1] <- "feedback" coll_edits[ , .N , feedback ] # deletions # coll_edits[ str_detect(feedback, "delete",), SURVEY_ID , ]#which are marked for deletion? plants <- plants[ !SURVEY_ID %in% coll_edits[ str_detect(feedback, "delete",), SURVEY_ID , ]#this drops about 10k observations from the dataset , , ] # tag for reimport # these surveys need to be reimported and any current data deleted. We have these data in our files. # coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ] # plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], .N , SURVEY_ID] sel <- plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], , ] #peel off those reimport data sel <- sel[!duplicated(sel[ , SURVEY_ID , ]),] #compress to one row where botched import produced some data # plants[ SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], .N , SURVEY_ID] # drop or modify some cols to reflect bad import sel[ , c("STA_NBR_DATASOURCE", "DEPTH_FT", "NO_VEG_FOUND", "REL_ABUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURVEYOR", "TAXON", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "X", "Y", "NORTHING", "EASTING", "LATITUDE", "LONGITUDE", "UTMX", "UTMY", "POINT_ID", "OBS_ID") := NA ,]#in sel, dump these columns sel[ , INDATABASE := FALSE]#mark these as not in database plants <- plants[ !SURVEY_ID %in% coll_edits[str_detect(feedback, "import",) , SURVEY_ID , ], , ] #drops ~900 obs plants <- rbind(plants, sel) plants[ SURVEY_ID %in% sel[, SURVEY_ID] , SURVEY_FEEDBACK := "reimport required" , ] # data available from collaborator # coll_edits[ feedback %in% c("data available", "missing metadata") , SURVEY_ID , ] plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("data available", "missing metadata") , SURVEY_ID , ] , SURVEY_FEEDBACK := "data available from collaborator" , ] plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("rake density data available") , SURVEY_ID , ] , SURVEY_FEEDBACK := "rake density data available from collaborator" , ] # no data available plants[ SURVEY_ID %in% coll_edits[ feedback %in% c("no data available") , SURVEY_ID , ] , SURVEY_FEEDBACK := "no data available" , ] nrow(plants[SURVEY_FEEDBACK == "no data available" , .N , SURVEY_ID] )# how many cases with data unavailable/ not known where raw data are? 
# one-offs # coll_edits[feedback == "Point 69 has Lmin entered as 5 - change to 2", SURVEY_ID] plants[SURVEY_ID == coll_edits[feedback == "Point 69 has Lmin entered as 5 - change to 2", SURVEY_ID] & STA_NBR_DATASOURCE == 69 & REL_ABUND == 5, REL_ABUND := 2] # Taxa naming problem: plants[TAXON == "Mitellopsis", TAXON := "Nitellopsis"] # preferred datasource name # coll_edits[ , .N , EDIT_DATASOURCE] plants[ , SURVEY_DATASOURCE := coll_edits[match(plants$DATASOURCE, coll_edits$DATASOURCE), EDIT_DATASOURCE ] , ] # plants[ , .N , SURVEY_DATASOURCE] # plants[SURVEY_DATASOURCE == "", .N , DATASOURCE ] plants[DATASOURCE == "DNR Lakes and Rivers", SURVEY_DATASOURCE := "DNR Lakes and Rivers"] plants[DATASOURCE == "DNR Fisheries", SURVEY_DATASOURCE := "DNR Fisheries"] plants[DATASOURCE == "Rantala TIP", SURVEY_DATASOURCE := "DNR Fisheries"] plants[DATASOURCE == "Muthukrishnan Et al", SURVEY_DATASOURCE := "DNR Shallow Lakes" , ] plants[SURVEY_DATASOURCE == "DNR Fisheries Research" , SURVEY_DATASOURCE := "DNR Fisheries"] # check contribution # of surveys by new named datasources plants[ , length(unique(SURVEY_ID)) , SURVEY_DATASOURCE ] # lake name corrections plants[ , NEW_LAKE_NAME := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_LAKE_NAME ] , ] plants[NEW_LAKE_NAME %in% c("lake of the isles", "clear", "bde maka ska"), LAKE_NAME := NEW_LAKE_NAME ] plants[ , NEW_LAKE_NAME := NULL ,] # surveyor corrections plants[ , NEW_SURVEYOR := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_SURVEYOR ] , ] # plants[, .N, NEW_SURVEYOR ] plants[!NEW_SURVEYOR == "" & !is.na(NEW_SURVEYOR), SURVEYOR := NEW_SURVEYOR , ] plants[ , NEW_SURVEYOR := NULL ,] # a <- plants[ , length(unique(SURVEY_ID)) , SURVEYOR] # date corrections plants[ , NEW_DATE := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), EDIT_DATE ] , ] # plants[, .N, NEW_DATE ] # plants[!NEW_DATE == "" & !is.na(NEW_DATE), .N , NEW_DATE ] plants[!NEW_DATE == "" & !is.na(NEW_DATE) , SURVEY_DATE := as.Date(NEW_DATE, format = "%d%b%Y") ,] plants[ , NEW_DATE := NULL ,] # input rake density scales #overwrite any bad rake scales: coll_edits[!is.na(`EDITED_SCALE_RAKE_DENS (0-X)`) , `SCALE_RAKE_DENS (0-X)` := `EDITED_SCALE_RAKE_DENS (0-X)` ] # coll_edits[ ,.N , `SCALE_RAKE_DENS (0-X)` ] coll_edits[ `SCALE_RAKE_DENS (0-X)` %in% c(1,2) , `SCALE_RAKE_DENS (0-X)` := NA ]# these aren't real abundance scales--they should be marked as NA, to indicate only pres-abs data are useable. 
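# optional sanity check (not part of the original workflow): after the recode
# above, only NA and the 0-3, 0-4, and 0-5 scales should remain
# coll_edits[ , .N , keyby = `SCALE_RAKE_DENS (0-X)` ]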
#push over to plants DB:
plants[ , RAKE_SCALE_USED := coll_edits[match(plants$SURVEY_ID, coll_edits$SURVEY_ID), `SCALE_RAKE_DENS (0-X)` ] , ]

# for these surveys, we can see that our collaborators' input on rake scale was not correct:
plants[REL_ABUND>RAKE_SCALE_USED, .N , .(SURVEY_ID) ]

#two more left now to manually change to max scale observed rather than reported:
plants[SURVEY_ID == 1418 , RAKE_SCALE_USED := 5]
plants[SURVEY_ID == 3128 , RAKE_SCALE_USED := 5]

# check process:
plants[ , .("max_observed_in_data" = max(REL_ABUND, na.rm = T)) , RAKE_SCALE_USED]

#there are 29 unique surveys where no rake scale data are provided, and no easy inference exists -- swap these to presence/absence
plants[SURVEY_ID %in% plants[is.na(RAKE_SCALE_USED) & REL_ABUND >1, SURVEY_ID], .N , RAKE_SCALE_USED]
plants[is.na(RAKE_SCALE_USED), .N , REL_ABUND ]

#drop the abundance data from these
plants[is.na(RAKE_SCALE_USED), REL_ABUND := NA ]

# clean up WS
rm(coll_edits, sel)

# summarize plants dataset ---------------------------------------------------

#' ## Review Data Summaries
#'
#' Review the current data status and outline any changes needed.
#'

str(plants) #what data formats?
names(plants) #field names

plants[ , length(unique(SURVEY_ID)) , ] #how many surveys in all?
plants[ INDATABASE == T , length(unique(SURVEY_ID))] #how many surveys do we have the data in our db for?
plants[ , length((unique(DOW))) , ] #how many lakes in all?
plants[ , length(unique(YEAR)) , ] #how many years of data?
plants[ , length(unique(POINT_ID)),] #how many samples were pulled from the lakes?
plants[!is.na(TAXON) , length(unique(OBS_ID))] # how many times was a plant identified in these data?

#' Let's see how many surveys (then number of points) we have been given by each contributor:
plants[ , unique(SURVEY_DATASOURCE) ,]

# survey contribution viz
ggplot(plants[ , .N, .(SURVEY_ID, SURVEY_DATASOURCE, INDATABASE)], aes(SURVEY_DATASOURCE, fill = INDATABASE))+
  geom_bar(stat = "count", position = "stack" )+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle(label = "n surveys by contributor")+
  scale_y_log10()

# point contributions
ggplot(plants[INDATABASE==T , .N, .(POINT_ID, SURVEY_DATASOURCE, INDATABASE)], aes(SURVEY_DATASOURCE))+
  geom_bar(stat = "count", position = "stack" )+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ggtitle(label = "n points by contributor")+
  scale_y_log10()

#' The database has all the surveys we know exist for MN in it, including those
#' for which we do not have the data. It is often useful to snip those no-data
#' ones off right away to avoid running any calcs using all those rows w/o any
#' species data.

missing_data_surveys <- plants[ INDATABASE == F]
plants <- plants[INDATABASE == T]

# drop zeros --------------------------------------------------------------

#' ### Zero Depths
#' We dropped surveys with no depth data in an early cleaning step. This
#' happened before we merged datasets from the MN DNR into the database, meaning
#' that we've still got to do a purge of 0 and NA depths to be sure we've
#' handled DNR and other collaborator data consistently:

#any remaining points with depth == NA or 0?
# They need to be dropped so that points with no sampled depth are handled consistently across sources (currently these remain only in MN DNR data):
plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , ][ , .N , .(SURVEY_ID, DATASOURCE)]
plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , .N, .(SURVEY_ID, DATASOURCE)][, unique(DATASOURCE)] #these only still remain in the DNR data--that's because we did the DNR data merge after cleaning up the other data

sum(plants[ SURVEY_ID %in% plants[is.na(DEPTH_FT)|DEPTH_FT == 0 , .N, .(SURVEY_ID, DATASOURCE)][,SURVEY_ID], .N , .(SURVEY_ID, DATASOURCE) ][ , N]) #counts all points in those surveys

#drop them (both NA and 0 depths):
plants <- plants[!(is.na(DEPTH_FT) | DEPTH_FT == 0) ]

# duplicated entries ------------------------------------------------------

#' ## Duplicated Records
#' It's become apparent to me that when we cast the data to long format in the
#' survey collation project, we ended up with many cases of multiple
#' "observations" of the same thing from within a single point. Here we clean
#' up this issue. I found the cause of it by opening up the surveycollation
#' project.

#drop duplicated entries:
names(plants)
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 # proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , hist(N) , ]

sum(duplicated(plants$OBS_ID)) #obs ID is unique, but it was generated on row value, not on a unique key set

names(plants[ , .SD , .SDcols = !c("OBS_ID") ])
sum(duplicated(plants[ , .SD , .SDcols = !c("OBS_ID") ])) #here we see the unique key set (everything BUT obs ID) tells us to drop 500k observations!

plants <- plants[!duplicated(plants[ , .SD , .SDcols = !c("OBS_ID") ]) , , ]

# still a bunch of dups leftover where we've got:
# two abunds for one species or two of something for one species...
# with a little sleuthing, I can see that these are a whole mix of things. For
# example, James Johnson submitted one survey with two samples for point 213...
# the solution I'll use is to allow these obs to stay (assuming that both obs
# are real, and the data entry resulted in a bad point ID for one of them).
# because of this, when we agg to the point level, we'll have to choose an obs
# to use for that taxon. You'll see this play out in the species matrix
# construction below:
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 #proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , .N , ]
plants[ , .N , .(SURVEY_ID, POINT_ID ,
                 NO_VEG_FOUND ,
                 #proplight ,
                 DEPTH_FT ,
                 SUBSTRATE ,
                 SURVEYOR,
                 TAXON) ][N>1 , unique(POINT_ID) , ]

#' ## Review Data Format
#'
#' Now "plants" is only those surveys for which we were able to gather and
#' collate the data. Below we organize these data 3 ways:
#'
#' 1. As long format: each row is a species observation within a point (multiple
#' rows per point) including all fields retained through cleaning processes
#' 2. As a wide format of occurrences: each row is a point record, and the columns include a
#' species observation (presence/absence) matrix. Here we keep only fields that retain their meaning at the point level.
#' 3. As a wide format with species abundances (a subset of "2.") where each row
#' is a point record, and the columns include a species abundance matrix

#how many surveys and how many points were sampled in each?
hist(plants[ ,length(unique(POINT_ID)) , SURVEY_ID]$V1, breaks= 100, main = "N points per survey", xlab = "Npoints")

#how many unique TAXA?
unique(plants$TAXON)

# N taxa per survey:
plants[ , .("Ntaxa" = length(unique(TAXON))) , SURVEY_ID] #if you want to name cols on the fly you need to wrap in .() which makes a list from them

hist(plants[ , length(unique(TAXON)) , SURVEY_ID][ , V1], main = "N taxa per survey", xlab = "N taxa")
hist(plants[ , length(unique(TAXON)) , POINT_ID][ , V1], main = "N taxa per point", xlab = "N taxa")

# rake scale normalization -------------------------------------------

#' ## Rake Abundance Normalization
#' This code will clean the relative rake density data from the whole PI dataset,
#' shifting all to a 0,1,2,3 scale. This code was developed in the
#' surveycollation project, but is implemented here (post-collaborator feedback)
#' to allow collaborators to specify which rake scale they used.

#drop surveys with max vals of 1s and 1-2s
rakes1 <- plants[RAKE_SCALE_USED %in% c(3,4,5), ]
# rakes1[ , .N , REL_ABUND] #how many surveys in these categories?
# rakes1[ , .N , SURVEY_ID] #982

#Now shift/ realign data per discussion above
# mapping applied: 1-4 scale {1->1, 2->2, 3->2, 4->3}; 1-5 scale {1->1, 2->2, 3->2, 4->2, 5->3}

#1-4 survey shifted to 1-3
rakes1[RAKE_SCALE_USED == 4 & REL_ABUND == 3 , REL_ABUND := 2 ]
rakes1[RAKE_SCALE_USED == 4 & REL_ABUND == 4 , REL_ABUND := 3 ]

#1-5 surveys shifted to 1-3
rakes1[RAKE_SCALE_USED == 5 & (REL_ABUND == 3 |REL_ABUND == 4) , REL_ABUND := 2 ]
rakes1[RAKE_SCALE_USED == 5 & REL_ABUND == 5 , REL_ABUND := 3 ]

# #check that the max vals are all 3's
# hist(rakes1[ !is.na(REL_ABUND) , max(REL_ABUND) , SURVEY_ID ][,V1])
#
# #and all data are distributed in 1-3 rake density framework
# hist(rakes1[ !is.na(REL_ABUND) , REL_ABUND , ])
#
# # count the number of surveys we've got
# rakes1[ , .N , SURVEY_ID ] # N points per survey (includes NA's--points where no species were observed)

# put the corrected rake scales back into the plants db
# plants[ , , ]
# rakes1[ , .N , OBS_ID][N>1]
# rakes1[is.na(OBS_ID) , ,]

#remove any cases where people told us the rake scale but data to-date are not in db:
rakes1 <- rakes1[!is.na(OBS_ID)]

#pop these corrected rake scale data into the plants dataset
plants[OBS_ID %in% rakes1$OBS_ID , REL_ABUND_CORRECTED := rakes1$REL_ABUND , ]

#clean out intermediates
rm(rakes1)

#which surveys used rake abunds
plants[ , .N , RAKE_SCALE_USED]

#why are there NA abunds for species in these surveys?
plants[!is.na(RAKE_SCALE_USED)& !is.na(TAXON), .N , REL_ABUND_CORRECTED ]
#expect some NAs to come through on these. May be worth looking back at the
# DNR data and evaluating how these plant obs were recorded and how they were
# reported and then carried through our workflow. Seems to me that these surveys
# ought to have numbers assigned to them -- like how could a record show a
# species present but not have a numeric indicator on that? Were they 0/1?
plants[!is.na(RAKE_SCALE_USED) & is.na(REL_ABUND) & !is.na(TAXON), .N , SURVEY_ID ]

bad_abund_surveys <- plants[!is.na(RAKE_SCALE_USED) & is.na(REL_ABUND) & !is.na(TAXON), unique(SURVEY_ID), ]

#force pres/abs on those
plants[SURVEY_ID %in% bad_abund_surveys, c("RAKE_SCALE_USED", "REL_ABUND", "REL_ABUND_CORRECTED"):= NA , ]

# georeference data -------------------------------------------------------

#' ## Georeference Data
#' This section uses MN hydrography geodata to add direct geodata into the
#' dataset.
#' After this section runs, pwi_l and plants can be linked on the shared "order_ID"
#' column, and HUC-8 level watersheds are included in the dataset

# merge geospatial files
#change sf data.frame to a data.table
setDT(pwi_l)

# linking plants db to spatial reference:
#shapefile dows need to be made numeric (drops leading zeros)
# pwi_l[ , dowlknum , ]
pwi_l[ , dow_main := round(as.numeric(dowlknum)/100,0)*100 , ]

#there's a lot of junk in there, work towards a 1:1 of plants dows and pwi_l dows
pwi_l <- pwi_l[!is.na(dowlknum)]# drops many polygons that aren't lakes (islands, rivers, etc)
pwi_l[ , order_ID:= .I , ]#adds a key

#drop non-mn shapes
pwi_l <- pwi_l[!outside_mn == "Y"]

#which dows are duplicated in the shapes?
pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum]

#let's review those data and see if we can devise any cleaning ideas
#pwi_l[dowlknum %in% pwi_l[pwi_l[, duplicated(dowlknum),], dowlknum],]

# we can just use the first instance of these duplicated waterbodies:
# we'll do that by dropping the subsequent duplicates!
pwi_l <- pwi_l[!pwi_l[, duplicated(dowlknum),], , ]

# missing matches in the plants data to shapefile dows
sum(is.na(match(plants[ , unique(DOW) ,], unique(pwi_l[ , dowlknum , ]))))
# missing matches in the plants data to shapefile mainlake dows
sum(is.na(match(plants[ , unique(DOW) ,], unique(pwi_l[ , dow_main , ]))))

#append a polygon value to the plants data (here we'll use our order_ID from above)
plants[ , order_ID := pwi_l[ match( plants[ , DOW ,], pwi_l[ , as.numeric(dow_main) , ]) , order_ID , ] ]
#and any that didn't match on that, try the basin specific
plants[ is.na(order_ID) , order_ID := pwi_l[ match(plants[ is.na(order_ID) , DOW ,], pwi_l[ , as.numeric(dowlknum) , ]), order_ID , ] ]

# now to navigate these last non-compliant ones...
plants[is.na(order_ID) & !is.na(DOW), .N , .(LAKE_NAME, DOW, DATASOURCE)]

pwi_l[ dowlknum == "40000200", order_ID]
plants[ DOW == 40000201, order_ID := pwi_l[ dowlknum == "40000200", order_ID] ] #Upper Sakatah polygon

pwi_l[ dowlknum == "68000500", order_ID]
plants[ DOW %in% c(68000501,68000502), order_ID := pwi_l[ dowlknum == "68000500", order_ID] ] #Roseau River WMA

# pwi_l[ dowlknum == "70005000", order_ID]
# plants[ DOW == 70050000, ] #Carls Lake
# pwi_l[ dowlknum == "82011800", order_ID]
plants[ DOW == 82009999, order_ID := pwi_l[ dowlknum == "82011800", order_ID] ] #Katherine Abbott Pond

# the plants dataset lakes with no geodata in the hydrography layer we used:
plants[is.na(order_ID) , .N , .(LAKE_NAME, DOW, DATASOURCE)]
# in total, this is 18 surveys and 3611 observations without ANY geolocation
plants[, summary(order_ID)]

# fix up local geospatial info
#check for weird X,Y vals in the UTM-looking columns
plants[!is.na(X), summary(X) ,]
plants[!is.na(Y), summary(Y) ,]

#some are clearly lat/longs
plants[X < 4600, X ,]
plants[X < 4600, LATITUDE := X ,]
plants[X < 4600, X := NA ,]

plants[Y<10000, summary(Y) ,]
plants[Y<10000 & Y>0, LATITUDE := Y ,]
plants[Y<10000 & Y>0, Y := NA ,]
plants[Y<10000, LONGITUDE := Y ,]
plants[Y<10000, Y := NA ,]

#whatever is left over here is weird and muddled.
plants[!is.na(X), summary(X) ,]
plants[!is.na(Y), summary(Y) ,]

# we need to delete these non-UTM vals from X & Y
# plants[Y<4800000, .N, DATASOURCE ]
plants[ Y < 4800000, c("X","Y") := NA, ]

#looks clean, now move into the UTM slots?
plants[!is.na(UTMY) , summary(UTMY) ,]
plants[!is.na(UTMX) , summary(UTMX) ,]

#any conflicts with UTM loc data?
plants[!is.na(UTMX) & !is.na(X)]
plants[!is.na(UTMY) & !is.na(Y)]

#move X, Y to UTMs
plants[!is.na(X) , UTMX := X , ]
plants[!is.na(Y) , UTMY := Y , ]
plants[ , c("X", "Y") := NULL , ]

#now Northing Easting, which happen to look like clean UTM data
plants[!is.na(NORTHING), summary(NORTHING) ,]
plants[!is.na(EASTING), summary(EASTING) ,]

#overlap/ conflict?
plants[!is.na(UTMX) & !is.na(NORTHING)]
plants[!is.na(UTMY) & !is.na(EASTING)]

#move Northing and Easting to UTMs
plants[!is.na(NORTHING), UTMY := NORTHING ,]
plants[!is.na(EASTING), UTMX := EASTING ,]
plants[ , c("NORTHING", "EASTING") := NULL , ]

#now get all into same CRS:
#conflicts?
plants[!is.na(UTMX) & !is.na(LONGITUDE)]
plants[!is.na(UTMY) & !is.na(LATITUDE)]
plants[!is.na(LONGITUDE), UTMX:=NA]
plants[!is.na(LATITUDE), UTMY:=NA]

#here we'll split into non-georeferenced, UTM, and lat/long sets, then convert the georeferenced ones to sf objects, then merge all back together
# plants complete x,y in one CRS or another? NOPE... Oh well. moving on.
# plants[!is.na(UTMX) & is.na(UTMY)]
#
# plants[]
#
# plants[!is.na(LATITUDE) & is.na(LONGITUDE)]
# plants[is.na(LATITUDE) & !is.na(LONGITUDE)]

#Conversion of data frame to sf object (note we've assumed NAD1983, Z15N for UTMs)
plants_UTMS <- st_as_sf(x = plants[!is.na(UTMX)],
                        coords = c("UTMX", "UTMY"),
                        crs = "+proj=utm +zone=15")
#Projection transformation
plants_U_LL = st_transform(plants_UTMS, crs = "+proj=longlat +datum=WGS84")
setDT(plants_U_LL)

#Conversion of data frame to sf object
plants_LLS <- st_as_sf(x = plants[!is.na(LONGITUDE)],
                       coords = c("LONGITUDE", "LATITUDE"),
                       crs = "+proj=longlat +datum=WGS84")
setDT(plants_LLS)

#drop unused CRS cols from each:
plants_U_LL[ , c("LATITUDE", "LONGITUDE") := NULL , ]
plants_LLS[ , c("UTMX","UTMY") := NULL, ]

plants2 <- rbindlist(list(plants_LLS, plants_U_LL))
plants2 <- cbind(plants2, st_coordinates(st_as_sf(plants2)))
plants2[ , geometry := NULL ,]
names(plants2)[names(plants2)%in% c("X","Y")] <- c("Longitude","Latitude")

#merge back to plants (check dims to ensure no duplications or overlaps):
dim(plants)
plants[is.na(UTMX) & is.na(LONGITUDE) , .N , ]+ plants2[ , .N ,]

plants1 <- plants[is.na(UTMX) & is.na(LONGITUDE), ]
plants1[ , c("UTMX", "UTMY", "LATITUDE", "LONGITUDE") := NULL ,]
plants1[ , c("Longitude", "Latitude") := NA, ]

nrow(plants1)+nrow(plants2)

plants <- rbindlist(list(plants1, plants2))

rm(plants_LLS,plants_U_LL, plants1, plants2)

# label all pwi_l with watershed names
# st_join(pwi_l, watersheds_huc8)
#
# st_crs(plants_UTMS) <- st_crs(watersheds_huc8)
#
# plantsUTMS <- st_join(plants_UTMS, left = TRUE, watersheds_huc8)

pwi_l <- st_sf(pwi_l)
st_crs(pwi_l) <- st_crs(watersheds_huc8) #ignore warning, no re-projection needed in this case; we do this because I lost the crs in some of my data manipulation
pwi_l <- st_join(pwi_l, left = TRUE, watersheds_huc8)
setDT(pwi_l)

rm( plants_UTMS)

# add in secchi data ------------------------------------------------------

#' ## Secchi Data Join
#'
#' This code will conduct an eval of the fuzzy join of Secchi to plants data,
#' calculate Secchi metrics based on the chosen fuzzy join, then execute the join.
#' The code includes a solution adapted from a script written by Dan Larkin for
#' the niches project (https://conservancy.umn.edu/handle/11299/218009).
#'
#' Assign a Secchi to each observation: use the closest Secchi temporally.
#' For each plant observation, we'll append the Secchi observation from that DOW
#' that was closest in time to the plant obs.
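#' A toy illustration (invented values, not from these datasets) of the
#' data.table rolling join used below: joining on DOW and date with
#' roll = "nearest" pulls, for each plant survey date, the Secchi record from
#' the same lake that is closest in time.
# toy_secchi <- data.table(DOW = 1, Date = as.IDate(c("2010-06-01", "2010-08-15")), Secchi_m = c(2.1, 1.4))
# toy_plants <- data.table(DOW = 1, date = as.IDate("2010-08-01"))
# toy_secchi[toy_plants, on = .(DOW, Date = date), roll = "nearest"]
# # returns Secchi_m = 1.4 -- the 2010-08-15 record is nearest to 2010-08-01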
#' # number of observations hist(secchi[,year(Date)]) hist(secchi[,month(Date)]) secchi[,.N,Source] # and for the plants data? plants[ is.na(INDATABASE) , YEAR:= year(as.IDate(DATESURVEYSTART)) , ] #new data imports need to move date in from the chr strings plants[ is.na(INDATABASE) , SURVEY_DATE:= as.IDate(DATESURVEYSTART) , ] #new data imports need to move date in from the chr strings hist(plants[ ,.N , .(SURVEY_ID,YEAR) ][ , YEAR,]) #clean some data in prep for join secchi[, YEAR := year(Date)] secchi[, MONTH := month(Date) ] secchi[ , old_DOW := DOW] secchi[, DOW := as.integer(DOW)] secchi[ is.na(DOW) , old_DOW ] #how many survey DOW's have a secchi for the lake (ever)? summary(plants[ , unique(DOW) , ]%in%secchi[ ,DOW ,]) #how many surveys have a secchi for that year? summary(plants[ , .N ,.(DOW,YEAR) ][,paste(DOW,YEAR, sep = "_"),] %in% secchi[ ,paste(DOW,YEAR, sep = "_") ,]) #how many surveys have a secchi for that month? summary(plants[ , .N ,.(DOW,YEAR,MONTH) ][,paste(paste(DOW,YEAR, sep = "_"),MONTH, sep = "_"),] %in% secchi[ ,paste(paste(DOW,YEAR, sep = "_"),MONTH, sep = "_") ,]) plants[ , date := SURVEY_DATE] #consolidate to the DOW-Date level secchi <- secchi[ , .("Secchi_m" = mean(Secchi_m)) , .(DOW, Date) ] secchi[ , SECCHI_DATE := Date] secchi <- secchi[!is.na(DOW)] plants <- secchi[plants, , on = .(DOW, Date = date), roll='nearest' ] #drop the Date field (now a dup of SURVEY_DATE) plants[ , Date := NULL] #how far apart are plant and secchi obs? hist(plants[,SURVEY_DATE-SECCHI_DATE,]) #keep only Secchi obs within a month (date+/-30d) hist(plants[ , abs(yday(SECCHI_DATE) - yday(SURVEY_DATE)), ]) plants[abs(yday(SECCHI_DATE) - yday(SURVEY_DATE))<30 & abs(year(SECCHI_DATE) - year(SURVEY_DATE))<1, SECCHI_m_ACCEPTED := Secchi_m ] #cleanup: rm(secchi) #' ### Calc Light Availability # calculate point level light avail plants[ , proplight := exp(-(log(10)/SECCHI_m_ACCEPTED)*(DEPTH_FT/3.2804)) ] nrow(plants[!is.na(proplight) , .N , POINT_ID])/ #how many points can we do this for? nrow(plants[, .N , POINT_ID])#out of total n points plants[ ,hist(proplight, breaks = 100, main = "78% coverage for light availability")] # clean taxa names here --------------------------------------------------- #' ## Tidy Taxa Names #these names ought to be pretty close already. plants[ , .N , TAXON] plants[ , sort(unique(tolower(gsub("_", " ", TAXON)))) , ] plants[ , TAXON := tolower(gsub("_", " ", TAXON)) , ] # clean up taxonomy using macroniche (see paper or git repo) taxonomy ------------------------------------------------------ #' Verhoeven, M. R., Glisson, W. J., & Larkin, D. J. (2020). Niche models #' differentiate potential impacts of two aquatic invasive plant species on #' native macrophytes. Diversity, 12, 162. https://doi.org/10.3390/d12040162 # pull in taxonomy corrections: tnrs <- fread( file = "data&scripts/data/input/tnrs.final.csv", drop = 1) tnrs[ , submittedname := tolower(gsub("\\.", "", submittedname)) ,]#make format match sum(plants$TAXON %in% tnrs$submittedname) tnrs[match(plants$TAXON, tnrs$submittedname), "species"] # implement name changes -------------------------------------------------------- # Add in a Taxon corrected column plants[, TAXONC := tnrs[match(plants$TAXON, tnrs$submittedname), "species"], ] #these are all looking good to me. 
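# The capitalization fix a few lines below uses a perl regex to title-case the
# genus in a two-word binomial; a quick illustration on an example string:
# sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", "potamogeton crispus", perl = TRUE)
# # returns "Potamogeton crispus" -- sub() only alters the first word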
print(plants[ , .N , TAXONC], max.levels = 200)

#tidy up names
plants[is.na(TAXONC), .N , TAXON ] #review unmatched

plants[is.na(TAXONC) & str_detect(TAXON, "\\(", negate = T ) & sapply(strsplit(TAXON, " "), length) == 2, .N , sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE) ]

#fix first cap in binomials:
plants[is.na(TAXONC) & str_detect(TAXON, "\\(", negate = T ) & sapply(strsplit(TAXON, " "), length) == 2, TAXON := sub("\\b(\\w)(\\w*)\\b", "\\U\\1\\L\\2", TAXON, perl = TRUE) ]

# overwrite TAXON with corrected names
plants[!is.na(TAXONC), TAXON := TAXONC]

#delete correction col:
plants[ , TAXONC := NULL ,]

#review taxa
plants[ , sort(unique(TAXON)) ,]

#only one species of nitellopsis:
plants[TAXON == "nitellopsis", TAXON := "Nitellopsis obtusa" , ]

# Make sure there aren't NA'd taxa that should be marked no-veg-found (are all NA taxa marked for NO_VEG_FOUND?) Yes, looks good
plants[ is.na(TAXON) , .N, NO_VEG_FOUND , ]

# #now we need to ensure we retain rows where no useable taxa were found
# plants[ NO_VEG_FOUND == TRUE, , ]
#
# #now delete all rows where TAXON == DELETE & NO_VEG_FOUND == F
#
# plants[ TAXON == "DELETE" & NO_VEG_FOUND == F, ]
# plants <- plants[ !(TAXON == "DELETE" & NO_VEG_FOUND == F), ]

# samples per taxon
plants[ , .N , TAXON][order(-N), ]

#and finally a new unique ID for each observation in the dataset
plants[, OBS_ID := .I]

#update SURVEY_ID
plants[ , length(unique(DATASOURCE)) , .(SURVEY_ID)][V1>2] #not needed unless this call is non-empty
# plants[ , SURVEY_ID_NEW:= .GRP , .(SURVEY_ID, DATASOURCE) ]

# plant status information ------------------------------------------------

#' ## Plant Status Key

# native diversity must exclude invasives/introduced species, so generate a df that can be used to select cols in these categories
plants[ , sort(unique(TAXON)) , ][!(plants[ , unique(TAXON) , ] %in% c(rte[native_status == "I", mn_dnr_scientific_name], "Nitellopsis obtusa", "Typha glauca"))]

#we'll use this to select columns as we calculate diversity metrics!
natcols <- plants[ !is.na(TAXON), unique(TAXON) , ][!(plants[ !is.na(TAXON), unique(TAXON) , ] %in% c(rte[native_status == "I", mn_dnr_scientific_name], "Nitellopsis obtusa", "Typha glauca"))] taxacols <- plants[!is.na(TAXON) , unique(TAXON) , ] # Prep Data Products ------------------------------------------------------ # **plants_db ------------------------------------------------------------- #' # Prep Data Products #' ## Observations Long names(plants) plants[ , .N , month(SURVEY_DATE) ] plants[ , unique(DATESURVEYSTART), , ][1:100] plants[ POINT_ID == 151207 , ] #one of these has a substrate, one not any(duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")])) plants[duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")]), , ] plants <- plants[!duplicated(plants[,.SD, .SDcols = !c("SUBSTRATE","OBS_ID")]), , ] #check has orderID field plants[ , .N , is.na(order_ID) ] plants[is.na(order_ID) , .N , LAKE_NAME ] #I've not been able to resolve the location on these WBs, so we'll leave them un-georeferenced names(plants) # **point level p/a ------------------------------------------------- #' ## Point Occurrences Wide plants[ ,.N , REL_ABUND] plants[ , .N , INDATABASE] # In case of desired fill 1/0 rather than T/F # plants[ , FILLFIELD := as.numeric(INDATABASE) ,] # plants[ , .N , FILLFIELD ] plants_occurrence_wide <- dcast(plants, SURVEY_ID+ POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT + SUBSTRATE + SURVEYOR ~ TAXON, fun.aggregate = last, value.var = "INDATABASE", fill = FALSE) #Specify a logical var for all included data (INDATABASE) so that this species matrix is all T/F; see previous lines for a 0/1 fill #diversity metrics (only have richness with p/a, no "evenness", no "diversity"): # point_natcols <- names(plants_occurrence_wide)[names(plants_occurrence_wide)%in%natcols] # names(plants_occurrence_wide) plants_occurrence_wide[ , richness := rowSums(.SD > 0), .SDcols = taxacols ] plants_occurrence_wide[ , nat_richness := rowSums(.SD > 0), .SDcols = natcols ] #bring all survey level variables back into the dataset #check join # nrow(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ]) # names(plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ]) # plants_occurrence_wide <- plants[plants_occurrence_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight, DEPTH_FT, SUBSTRATE, SURVEYOR), mult = "last" , ] #and drop unneeded cols & those with loss of meaning through munging: names(plants_occurrence_wide) plants_occurrence_wide[ , c("STA_NBR_DATASOURCE","SURVEY_ID_DATASOURCE", "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OLD_SURVEY_ID", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES", "USEABLE", "CLEANED", "INDATABASE", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK", "DATASOURCE", "RAKE_SCALE_USED", "NA", "TAXON", "SUBSTRATE", "OBS_ID", "NO_VEG_FOUND") := NULL , ] setcolorder(plants_occurrence_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude")) #check to make sure I didn't dump something critical for an ident: 
plants_occurrence_wide[duplicated(plants_occurrence_wide)]

# **point level rake abund --------------------------------------------------

#' ## Point Abundances Wide

plants_rakeabund_wide <- dcast(plants[!is.na(RAKE_SCALE_USED)], SURVEY_ID+ POINT_ID + NO_VEG_FOUND + proplight + DEPTH_FT + SUBSTRATE + SURVEYOR ~ TAXON, value.var = "REL_ABUND_CORRECTED", fun.aggregate = last, fill = 0)

plants_rakeabund_wide[ ,.N , NO_VEG_FOUND]

#calculate diversity metrics for each rake throw
rake_taxacols <- names(plants_rakeabund_wide)[names(plants_rakeabund_wide)%in%taxacols]
rake_natcols <- names(plants_rakeabund_wide)[names(plants_rakeabund_wide)%in%natcols]
# names(plants_rakeabund_wide)

# diversity metrics
plants_rakeabund_wide[ , shannon_div := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_taxacols],index = "shannon") ]
plants_rakeabund_wide[ , simpsons_div := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_taxacols],index = "invsimpson") ]
plants_rakeabund_wide[ , shannon_div_nat := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_natcols],index = "shannon") ]
plants_rakeabund_wide[ , simpsons_div_nat := diversity(plants_rakeabund_wide[,.SD, .SDcols = rake_natcols],index = "invsimpson") ]

# richness
plants_rakeabund_wide[ , richness := rowSums(.SD > 0), .SDcols = rake_taxacols ]
plants_rakeabund_wide[ , nat_richness := rowSums(.SD > 0), .SDcols = rake_natcols ]

summary(plants_rakeabund_wide$`Potamogeton crispus`)#check that max rakeabund is 3

#bring all survey level variables back into the dataset
#check join
# nrow(plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
# names(plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SUBSTRATE,SURVEYOR), mult = "last" , ])
# plants_rakeabund_wide <- plants[plants_rakeabund_wide, on = .(POINT_ID, SURVEY_ID, NO_VEG_FOUND, proplight,DEPTH_FT,SURVEYOR, SUBSTRATE), mult = "last" , ]

#and drop unneeded cols & those with loss of meaning through munging:
names(plants_rakeabund_wide)
plants_rakeabund_wide[ , c("STA_NBR_DATASOURCE","SURVEY_ID_DATASOURCE", "REL_ABUND", "REL_ABUND_CORRECTED", "WHOLE_RAKE_REL_ABUND", "SAMPLE_NOTES", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OLD_SURVEY_ID", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "DATESURVEYSTART", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "INVENTORY_NOTES", "USEABLE", "CLEANED", "INDATABASE", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK", "DATASOURCE", "RAKE_SCALE_USED", "NA","TAXON", "SUBSTRATE", "OBS_ID", "NO_VEG_FOUND") := NULL , ]

setcolorder(plants_rakeabund_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude"))

#cleanup:
rm(rake_taxacols,rake_natcols)

# **survey level stats -------------------------------------------

#' ## Survey Level Aggregation

surveys <- plants[ ,
                   .(tot_n_samp = length(unique(POINT_ID))) , SURVEY_ID ]

#add richness to the surveys dataset
surveys[ , taxa_richness := #take the "taxon count" and subtract one if the survey includes NAs (see next two lines)
           plants[ , length(unique(TAXON)) , SURVEY_ID ][ , V1]-# ("total richness", but counts NAs as a taxon) minus
           plants[ , ifelse(sum(is.na(TAXON))== 0, 0, 1), SURVEY_ID][,V1],]# (each survey gets a 0 if no NAs or a 1 if it contains NA's)

# extent of vegetation in survey (proportion vegetated)
surveys <- merge(surveys,plants[!is.na(TAXON), .(n_points_vegetated=length(unique(POINT_ID))) , SURVEY_ID ], by = "SURVEY_ID", all.x = TRUE)[is.na(n_points_vegetated), n_points_vegetated := 0 ]
surveys[ , prop_veg := n_points_vegetated/tot_n_samp ,]

#create a plant observation matrix (species abund by survey)
survey_species_matrix <- dcast(plants[!is.na(TAXON) , .("count" = length(unique(POINT_ID))) , .(SURVEY_ID,TAXON)], SURVEY_ID ~ TAXON, value.var = "count", fill = 0) #note that this line creates the matrix ONLY for surveys that had species observations (~70 surveys had no species observed)

#diversity indices:
# species names:
natcols <- names(survey_species_matrix)[names(survey_species_matrix) %in% natcols]

# total diversity
survey_species_matrix[ , shannon_div := diversity(.SD,index = "shannon"), .SDcols = taxacols ]
survey_species_matrix[ , simpsons_div := diversity(.SD,index = "invsimpson"), .SDcols = taxacols ]
# native diversity
survey_species_matrix[ , shannon_div_nat := diversity(.SD,index = "shannon"), .SDcols = natcols ]
survey_species_matrix[ , simpsons_div_nat := diversity(.SD,index = "invsimpson"), .SDcols = natcols ]
survey_species_matrix[simpsons_div_nat == Inf, simpsons_div_nat := 0]
# native richness
survey_species_matrix[ , nat_richness := rowSums(survey_species_matrix[ , .SD, .SDcols = natcols] > 0), ]

# depth stats
# depth surveyed stats:
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("max_depth_surveyed" = max(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("min_depth_surveyed" = min(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("mean_depth_surveyed" = mean(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("median_depth_surveyed" = median(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]
surveys <- surveys[plants[ !is.na(DEPTH_FT), .("IQR_depth_surveyed" = IQR(DEPTH_FT)) , SURVEY_ID], on = "SURVEY_ID" , ]

#vegetated depths data
#max depth vegetated within survey:
#some of these might warrant removal, depending on what's being done with the data
plants[NO_VEG_FOUND == F & DEPTH_FT>50, length(POINT_ID) , .(SURVEY_DATASOURCE, DOW, SUBBASIN, SURVEY_DATE, LAKE_NAME)]
plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("max_depth_vegetated" = max(DEPTH_FT)) , SURVEY_ID]
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("max_depth_vegetated" = max(DEPTH_FT, na.rm = T)) , SURVEY_ID] , by = "SURVEY_ID" , all.x =TRUE )

#other depth vegetated stats:
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("min_depth_vegetated" = min(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("mean_depth_vegetated" = mean(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("median_depth_vegetated" = median(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )
surveys <- merge( surveys , plants[ NO_VEG_FOUND == FALSE & DEPTH_FT<50 , .("IQR_depth_vegetated" = IQR(DEPTH_FT, na.rm = T)) , SURVEY_ID], by = "SURVEY_ID" , all.x =TRUE )

# species matrix into survey data
#species matrix for surveys
surveys <- merge(surveys, survey_species_matrix, by = "SURVEY_ID", all.x = T)
f_dowle3natozeros(surveys, names(survey_species_matrix)) #the merge incorrectly assigns NAs for non obs... here we replace those with 0s

# check work:
# summary(surveys[,1:17])

#append survey data (basic data from plants db) to these
# names(plants)
surveys <- merge(plants[order(SURVEY_DATE) , .("nobs" = .N, "SURVEY_DATE" = first(SURVEY_DATE)) , .(SURVEY_ID, SURVEY_DATASOURCE, LAKE_NAME, DOW, SUBBASIN, MULTIPARTSURVEY, order_ID) ],surveys, by = "SURVEY_ID")

# summary(surveys)
# names(surveys) <- gsub(" ", "_", gsub( "\\(", "_", gsub( "\\)", "_", names(surveys))))

# secchi data metrics
# rescue the secchi data from the plants db for these surveys
surveys[plants, Secchi_m := Secchi_m, on = "SURVEY_ID"]
surveys[plants, Secchi_m_date := SECCHI_DATE, on = "SURVEY_ID"]

#OPTIONAL: merge in the geodata + lake data to the survey work
# surveys <- pwi_l[surveys, on = .(order_ID), mult = "first" ]
#have to strip off the geometry to prevent failure in write to csv
# surveys[ ,geometry := NULL ,]

# n samples within historical max depth
# surveys <- merge(surveys,plants[!is.na(TAXON), .(alltime_maxvegdep = max(DEPTH_FT)) , .(DOW, SUBBASIN) ], by = c("DOW", "SUBBASIN"), all.x = TRUE) [is.na(alltime_maxvegdep), alltime_maxvegdep := 0 ]
# summary(surveys$alltime_maxvegdep)
# surveys[ , hist(alltime_maxvegdep) , ]

#for records with plants present, what's the max vegetated depth by lake?
#remove these cols if they exist (added this in troubleshooting, should kick a warning, but has no effect on the product)
plants[ ,alltime_maxvegdep := NULL]
plants[ ,survey_maxvegdep := NULL]

# Calculate max depth for non-NA TAXON records
max_depth <- plants[!is.na(TAXON) & DEPTH_FT < 50, .(alltime_maxvegdep = max(DEPTH_FT, na.rm = T)), by = .(DOW, SUBBASIN)]
# Merge the result back into the original data
plants <- merge(plants, max_depth, by = c("DOW", "SUBBASIN"), all.x = TRUE)

# Calculate max depth for non-NA TAXON records
max_depth <- plants[!is.na(TAXON) & DEPTH_FT < 50, .(survey_maxvegdep = max(DEPTH_FT, na.rm = T)), by = .(SURVEY_ID)]
# Merge the result back into the original data
plants <- merge(plants, max_depth, by = "SURVEY_ID", all.x = TRUE)

plants[ , .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ]
summary(plants[ , .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ])
plants[is.na(survey_maxvegdep), .N , NO_VEG_FOUND ]
plot(data = plants[, .N , .(alltime_maxvegdep, survey_maxvegdep, SURVEY_ID) , ],alltime_maxvegdep~survey_maxvegdep )

# n_points within all time max vegetated depth (na.omit drops the NA placeholder so points deeper than the max don't inflate the count)
plants[, .(alltime_maxvegdep_n_samp = fifelse(is.na(first(alltime_maxvegdep)) , NA , length(POINT_ID)) ) , SURVEY_ID ]
surveys <- merge(surveys,
                 plants[, .(alltime_maxvegdep_n_samp = fifelse(is.na(first(alltime_maxvegdep)) , NA ,
                                                               length( na.omit(unique(ifelse(DEPTH_FT <= alltime_maxvegdep, POINT_ID, NA))) ) ),
                            alltime_maxvegdep = first(alltime_maxvegdep)) , SURVEY_ID ],
                 by = "SURVEY_ID", all.x = TRUE )

# n_points within survey specific max vegetated depth
surveys <- merge(surveys,
                 plants[, .(survey_maxvegdep_n_samp = fifelse( is.na( first(survey_maxvegdep)) , NA ,
                                                               length( na.omit(unique( ifelse(DEPTH_FT <= survey_maxvegdep, POINT_ID, NA) )) ) ),
                            survey_maxvegdep = first(survey_maxvegdep) ) ,
                            SURVEY_ID ], by = "SURVEY_ID", all.x = TRUE )

plot( data = surveys, alltime_maxvegdep_n_samp ~ n_points_vegetated, xlab = "n points vegetated in this survey", ylab = "n points within all time max vegetated depth")
plot( data = surveys[ , .("propsamples_in_histmaxveg_depth" = (alltime_maxvegdep_n_samp/tot_n_samp), "prop_samples_vegetated" = (n_points_vegetated/tot_n_samp) ) , ], propsamples_in_histmaxveg_depth ~ prop_samples_vegetated)
plot(data = surveys, alltime_maxvegdep_n_samp ~ survey_maxvegdep_n_samp, xlab = "n points within survey specific vegetated depth", ylab = "n points within all time max vegetated depth")
plot(data = surveys, alltime_maxvegdep ~ survey_maxvegdep)

summary(surveys[ , .(alltime_maxvegdep, survey_maxvegdep, max_depth_vegetated, min_depth_vegetated)])

surveys[taxa_richness == 0, .N , alltime_maxvegdep]

# **species pools & watershed metrics ----------------------------------------------------

#' ## Watersheds and Species Pools Across Scales
#' We have super awesome species pool data because we've got species abunds
#' across multiple scales:
#' From the smallest (point -- plants_rakeabund_wide or plants_occurrence_wide)
#' scale we have a species abundance matrix that can be treated as a product of
#' the species pool above it (whole survey/lake), which we also have an
#' abundance matrix for! We can also move up to the landscape scale, building
#' species abundance matrices by aggregating these next-lower-scale data.
#'
#' For example, we can do as described above (compressing matrices to richness
#' for viz) and aggregate to the HUC-8 watershed level.
#'

#first add the lake level richness from each survey to the point rake abund data
plants_rakeabund_wide[ , surveyrichness := surveys[match(plants_rakeabund_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), taxa_richness] ,]
plants_occurrence_wide[ , surveyrichness := surveys[match(plants_occurrence_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), taxa_richness] ,]

ggplot( data = plants_rakeabund_wide, aes(jitter(surveyrichness), jitter(nat_richness)))+
  geom_point(alpha = .05)+
  ylab("point level native richness")+
  xlab("survey level native richness")

# now we need to create a watershed level species matrix:
#check keys
# pwi_l$order_ID
# plants$order_ID

plants[ , watershed := pwi_l[match(plants[ , order_ID ,],pwi_l[ , order_ID ,]), major , ],]
# plants[ , length(unique(POINT_ID)) , watershed]

watersheds <- plants[ , .("n_points" = length(unique(POINT_ID))) , watershed]
watersheds <- merge(watersheds, plants[!is.na(TAXON) , .("n_species" = length(unique(TAXON))) , watershed], by = "watershed", all.x = T)

watershed_occurrence_wide <- dcast(plants, watershed ~ TAXON, value.var = "INDATABASE", fun.aggregate = sum, fill = 0)
watershed_occurrence_wide <- merge(watersheds, watershed_occurrence_wide, by = "watershed", all.x = T)

plants_rakeabund_wide[ , watershed := plants[match(plants_rakeabund_wide[ ,SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,]
plants_rakeabund_wide[ , watershedrichness := watershed_occurrence_wide[match(plants_rakeabund_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,]

# add to occurrence wide set:
plants_occurrence_wide[ , watershed := plants[match(plants_occurrence_wide[ ,SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,]
plants_occurrence_wide[ , watershedrichness := watershed_occurrence_wide[match(plants_occurrence_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,]

# watershed richness as the predictor of point scale richness:
ggplot( data =
plants_rakeabund_wide, aes(jitter(watershedrichness), jitter(nat_richness)))+ geom_point()+ ylab("point level native richness")+ xlab("watershed level native richness") surveys[ , watershed := plants[match(surveys[ , SURVEY_ID ,], plants[, SURVEY_ID, ]), watershed] ,] surveys[ , watershedrichness := watershed_occurrence_wide[match(surveys[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), n_species] ,] surveys[is.na(watershedrichness), watershedrichness := 0] ggplot( data = surveys, aes(watershedrichness, nat_richness))+ geom_point()+ geom_smooth(method = "loess")+ ylab("Survey Richness")+ xlab("HUC-8 Watershed Richness")+ theme_bw() # Get watershed Diversity names(watershed_occurrence_wide) %in% natcols watershed_occurrence_wide[ , simpson_div_nat := diversity(.SD,index = "invsimpson" ) , .SDcols = c(names(watershed_occurrence_wide) %in% natcols)] hist(watershed_occurrence_wide$simpson_div_nat) plants_rakeabund_wide[ , watershedsimpson_nat := watershed_occurrence_wide[match(plants_rakeabund_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] plants_occurrence_wide[ , watershedsimpson_nat := watershed_occurrence_wide[match(plants_occurrence_wide[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] plants_rakeabund_wide[ , surveysimpson_nat := surveys[match(plants_rakeabund_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), simpsons_div_nat] ,] plants_occurrence_wide[ , surveysimpson_nat := surveys[match(plants_occurrence_wide[ ,SURVEY_ID ,], surveys[, SURVEY_ID, ]), simpsons_div_nat] ,] surveys[ , watershedsimpson_nat := watershed_occurrence_wide[match(surveys[ ,watershed ,], watershed_occurrence_wide[, watershed, ]), simpson_div_nat] ,] #redo species pool plots: point_pools <- ggplot( data = plants_rakeabund_wide[simpsons_div_nat != Inf], aes(surveysimpson_nat, simpsons_div_nat))+ geom_point()+ geom_smooth(method = "lm")+ ylab("Point-scale ENSpie")+ xlab("Lake-scale ENSpie")+ theme_bw() lake_pools <- ggplot( data = surveys, aes(watershedsimpson_nat, simpsons_div_nat))+ geom_point()+ geom_smooth(method = "lm")+ ylab("Lake-scale ENSpie")+ xlab("Watershed-scale ENSpie")+ theme_bw() ggarrange( point_pools, lake_pools ) #bring simpsons div back to HUc8 table setDT(watersheds_huc8) watersheds_huc8[watershed_occurrence_wide, on = .(major = watershed) , simpson_div_nat := simpson_div_nat ] #clean up intermediates: rm(lake_pools, point_pools, survey_species_matrix) # **data products ----------------------------------------------------------- #' # Export Datasets #' #' We have 6 datasets to export: #' #' #' #' 1. 
plants --> plants_env_data.csv -- As long format: each row is a species observation within a point (multiple #' rows per point) including all fields retained through cleaning processes #' # names(plants) plants[ , c("DATASOURCE", "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "SAMPLE_NOTES", "OLD_SURVEY_ID", "DATESURVEYSTART", "COHORT", "DATEINFO", "MONTH", "DAY", "YEAR", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "USEABLE", "CLEANED", "INDATABASE", "INVENTORY_NOTES", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK") := NULL , ] setcolorder(plants, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed","alltime_maxvegdep", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR","RAKE_SCALE_USED","survey_maxvegdep", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "NO_VEG_FOUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURFACE_GROWTH", "POINT_LVL_SECCHI", "OBS_ID", "TAXON", "REL_ABUND", "REL_ABUND_CORRECTED" )) # export_names_plants <- tolower(names(plants)) # metadata for column names in this file # dow: MN Dept of Waters Ident. # lake_name: Name of the lake. # order_ID: key used to link to MN Hydrography dataset # subbasin: Sub-basin where the observation was made. # watershed: Watershed associated with the observation. # alltime_maxvegdep: Maximum vegetation depth ever observed in the lake (excludes any depth observation >50ft). # survey_id: Identification number for the survey. # survey_datasource: Name of the source of the survey data. # survey_date: Date when the survey was conducted, if multiple dates uses the first day of the survey. # multipartsurvey: Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART # surveyor: Person or entity conducting the survey if known. # rake_scale_used: Scale used for rake abundance measurements. # survey_maxvegdep: Maximum vegetation depth observed during the survey. # secchi_m: Nearest temporal Secchi depth measured in meters. # secchi_date: Date when Secchi depth was measured. # secchi_m_accepted: Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # point_id: Identification number for the observation point. # depth_ft: Depth in feet. # proplight: Proportion of surface light remaining at DEPTH_FT. # longitude: Longitude coordinate of the observation point. # latitude: Latitude coordinate of the observation point. # no_veg_found: Indicator if no vegetation was found at point. # whole_rake_rel_abund: Relative abundance rating assigned to the whole rake (all species), if assigned. # substrate: Substrate type. # surface_growth: Indicator variable for plant growth reached surface of water. # point_lvl_secchi: Secchi level at the observation point if recorded. # obs_id: Identification number for the observation. # taxon: Name of taxon observed. # rel_abund: Relative abundance observed (see RAKE_SCALE_USED for possible values). # rel_abund_corrected: Corrected relative abundance (fixes all relative abunds to scale of 1,2,3). # fwrite(plants, file = "data&scripts/data/output/DRUM/plants_env_data.csv") #' 2. plants_occurrence_wide --> plants_env_data_wide.csv -- As a wide format of occurrences: each row is a point record, and the columns include a #' species observation (presence/absence) matrix. 
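#' A sketch (not run here) of how a downstream user might split the exported
#' wide file back into its metadata columns and its species matrix; the file
#' name matches the fwrite() call below and the metadata names match the
#' setcolorder() call below:
# wide <- fread("data&scripts/data/output/DRUM/plants_occurrence_wide.csv")
# meta_cols <- c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed",
#                "watershedrichness", "watershedsimpson_nat", "SURVEY_ID",
#                "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR",
#                "surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE",
#                "SECCHI_m_ACCEPTED", "POINT_ID", "DEPTH_FT", "proplight",
#                "Longitude", "Latitude", "richness", "nat_richness")
# species_matrix <- wide[ , .SD, .SDcols = setdiff(names(wide), meta_cols)]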
# names(plants_occurrence_wide) setcolorder(plants_occurrence_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR","surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "richness", "nat_richness")) # export_names_plants_occurrence_wide <- tolower(names(plants_occurrence_wide)) names(plants_occurrence_wide) # I ran glimpse() and then added in metadata: # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # SURVEYOR Person or entity conducting the survey if known. # surveyrichness Taxonomic richness of survey # surveysimpson_nat Inverse simpsons diversity of survey # Secchi_m Nearest temporal Secchi depth measured in meters. # SECCHI_DATE Date when Secchi depth was measured. # SECCHI_m_ACCEPTED Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # POINT_ID Identification number for the observation point. # DEPTH_FT Depth to substrate in feet. # proplight Proportion of surface light remaining at DEPTH_FT. # Longitude Longitude coordinate of the observation point. # Latitude Latitude coordinate of the observation point. # richness Number of unique taxa observed at the point. # nat_richness Number of unique native taxa observed at the point. # the remaining columns are species occurrence columns, indicating whether a taxon was observed, along with it's relative rake abundance (1-3) or not observed (0) at a sample point # fwrite(plants_occurrence_wide, file = "data&scripts/data/output/DRUM/plants_occurrence_wide.csv") #' #' 3. plants_rakeabund_wide --> plants_abund_env_data_wide.csv -- As a wide #' format with species abundances (a subset of "2.") where each row #' is a point record , and the columns include a species abundance matrix # names(plants_rakeabund_wide) setcolorder(plants_rakeabund_wide, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "surveyrichness", "surveysimpson_nat", "Secchi_m", "SECCHI_DATE", "SECCHI_m_ACCEPTED", "POINT_ID" ,"DEPTH_FT", "proplight", "Longitude", "Latitude", "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat", "richness", "nat_richness" )) # export_names_plants_rakeabund_wide <- tolower(names(plants_rakeabund_wide)) names(plants_rakeabund_wide) # I ran glimpse() and then added in metadata: # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. 
numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # SURVEYOR Person or entity conducting the survey if known. # surveyrichness Taxonomic richness of survey # surveysimpson_nat Inverse simpsons diversity of survey # Secchi_m Nearest temporal Secchi depth measured in meters. # SECCHI_DATE Date when Secchi depth was measured. # SECCHI_m_ACCEPTED Secchi depth measurement if observation is within 30d of the plant survey (used for proplight calculation). # POINT_ID Identification number for the observation point. # DEPTH_FT Depth to substrate in feet. # proplight Proportion of surface light remaining at DEPTH_FT. # Longitude Longitude coordinate of the observation point. # Latitude Latitude coordinate of the observation point. # shannon_div Shannon diversity of taxa observed at the point # simpsons_div Inverse simpsons diversity of taxa observed at the point # shannon_div_nat Shannon diversity of native taxa observed at the point # simpsons_div_nat Inverse simpsons diversity of native taxa observed at the point # richness Number of unique taxa observed at the point. # nat_richness Number of unique native taxa observed at the point. # the remaining columns are species occurrence columns, indicating whether a taxon was observed (1) or not observed (0) at a sample point # fwrite(plants_rakeabund_wide, file = "data&scripts/data/output/DRUM/plants_abund_env_data_wide.csv") #' 4. surveys --> surveys_aqplants.csv -- aggregated plants data at the survey #' level. Each row #' is a set of survey-level summary stats and abundances (number of obs) for all #' species in the dataset. setcolorder(surveys, c("DOW", "LAKE_NAME", "order_ID", "SUBBASIN", "watershed", "watershedrichness", "watershedsimpson_nat", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "Secchi_m", "Secchi_m_date", "nobs", "tot_n_samp", "max_depth_surveyed", "min_depth_surveyed", "mean_depth_surveyed", "median_depth_surveyed", "IQR_depth_surveyed", "max_depth_vegetated", "min_depth_vegetated", "mean_depth_vegetated", "median_depth_vegetated", "IQR_depth_vegetated", "alltime_maxvegdep", "alltime_maxvegdep_n_samp", "survey_maxvegdep", "survey_maxvegdep_n_samp", "n_points_vegetated", "prop_veg", "shannon_div", "simpsons_div", "shannon_div_nat", "simpsons_div_nat", "taxa_richness", "nat_richness" )) glimpse(surveys) # export_names_surveys <- tolower(names(surveys)) # DOW MN Dept of Waters Ident. # LAKE_NAME Name of the lake. # order_ID key used to link to MN Hydrography dataset # SUBBASIN Sub-basin where the observation was made. # watershed Watershed associated with the observation. numeric key for watershed (see watershed_occurrence_wide for detail on watersheds like names, sizes, etc. ) # watershedrichness taxa richness across all surveys in watershed # watershedsimpson_nat inverse simpsons diversity in watershed # SURVEY_ID Identification number for the survey. # SURVEY_DATASOURCE Name of the source of the survey data. # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey. 
# MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of [SURVEY.PART] # Secchi_m Nearest temporal Secchi depth measured in meters. # Secchi_m_date Date when Secchi depth was measured. # nobs number of observations in this survey (5 taxa observed at a point = 5 observations--thus this metric can exceed the tot_n_samp value) # tot_n_samp total number of samples taken/points sampled # max_depth_surveyed max depth that surveyors sampled (ALL DEPTHS IN FEET) # min_depth_surveyed min depth that surveyors sampled (ALL DEPTHS IN FEET) # mean_depth_surveyed mean depth that surveyors sampled (ALL DEPTHS IN FEET) # median_depth_surveyed median depth that surveyors sampled (ALL DEPTHS IN FEET) # IQR_depth_surveyed inter-quartile range depth that surveyors sampled (ALL DEPTHS IN FEET) # max_depth_vegetated maximum depth where vegetation was observed (ALL DEPTHS IN FEET) # min_depth_vegetated min depth where vegetation was observed (ALL DEPTHS IN FEET) # mean_depth_vegetated mean depth where vegetation was observed (ALL DEPTHS IN FEET) # median_depth_vegetated median depth where vegetation was observed (ALL DEPTHS IN FEET) # IQR_depth_vegetated inter-quartile range depth where vegetation was observed (ALL DEPTHS IN FEET) # alltime_maxvegdep the max depth of plants ever observed in this lake (across all surveys in this db) # alltime_maxvegdep_n_samp Number of samples taken from points shallower than alltime_maxvegdep during this survey # survey_maxvegdep Survey maximum vegetation depth. # survey_maxvegdep_n_samp Number of samples for survey maximum vegetation depth. # n_points_vegetated Number of points with veg present # prop_veg n_points_vegetated/tot_n_samp # shannon_div Shannon diversity index for this survey # simpsons_div survey inverse Simpson's diversity index. # shannon_div_nat survey Shannon diversity index including native taxa only. # simpsons_div_nat survey inverse Simpson's diversity index including native taxa only # taxa_richness count of taxa in this survey # nat_richness native species taxon count in this survey # the remaining columns are species occurrence count columns, indicating the number of points at which a taxon was observed (1+) or not observed (0) during a survey. # fwrite(surveys, file = "data&scripts/data/output/DRUM/surveys_aqplants.csv") #' 5. missing surveys --> missing_data_surveys.csv -- Surveys that were identified, but for #' which plant data were not successfully collated for this project. Each #' row is a survey. # names(missing_data_surveys) missing_data_surveys[ , c( "STA_NBR_DATASOURCE", "SURVEY_ID_DATASOURCE", "SAMPLE_NOTES", "OLD_SURVEY_ID", "DATESURVEYSTART", "COHORT","POINT_ID" ,"DEPTH_FT", "NO_VEG_FOUND", "WHOLE_RAKE_REL_ABUND","SUBSTRATE", "SURFACE_GROWTH", "POINT_LVL_SECCHI","OBS_ID", "TAXON", "REL_ABUND","RAKE_SCALE_USED", "X", "Y", "NORTHING", "EASTING", "LATITUDE", "LONGITUDE", "UTMX", "UTMY" ) := NULL , ] setcolorder(missing_data_surveys, c("DOW", "LAKE_NAME", "SUBBASIN", "DATASOURCE", "SURVEY_ID", "SURVEY_DATASOURCE", "SURVEY_DATE", "MULTIPARTSURVEY", "SURVEYOR", "DATEINFO", "MONTH", "DAY", "YEAR", "INVENTORY_STAFF", "INVENTORY_STAFFDATE", "USEABLE", "CLEANED", "INDATABASE", "INVENTORY_NOTES", "SUBMISSION_STAFF", "SUBMISSION_STAFFDATE", "SUBMISSION_NOTES", "SURVEY_FEEDBACK" )) # export_missing_data_surveys <- tolower(names(missing_data_surveys)) glimpse(missing_data_surveys) # DOW MN Dept of Waters Ident.
# LAKE_NAME Name of the lake # SUBBASIN Subbasin where the survey was conducted, if applicable # DATASOURCE Internal listing for source that identified the survey # SURVEY_ID Unique identifier for the survey # SURVEY_DATASOURCE Source or authority for the survey data (could be contacted to try to acquire these data) # SURVEY_DATE Date when the survey was conducted, if multiple dates uses the first day of the survey # MULTIPARTSURVEY Indicator for if the survey is part of a larger survey. Numeric with structure of SURVEY.PART # SURVEYOR Surveyor name(s) if known # DATEINFO Date information that may help in identifying the survey # MONTH Month of the survey # DAY Day of the survey. # YEAR Year of the survey # INVENTORY_STAFF Inventory staff name # INVENTORY_STAFFDATE Date of inventory by staff # USEABLE Indicator for data usability as submitted to project team # CLEANED Indicator for successful pre-cleaning of the data # INDATABASE Indicator for successful processing into database # INVENTORY_NOTES Inventory notes from project staff # SUBMISSION_STAFF staff name that processed the original submission # SUBMISSION_STAFFDATE Date of submission processing # SUBMISSION_NOTES Submission notes from project staff # SURVEY_FEEDBACK Feedback from the survey of data contributors # fwrite(missing_data_surveys, file = "data&scripts/data/output/DRUM/missing_data_surveys.csv") #' 6. watershed level summaries --> watershed_occurrence_wide.csv -- Watershed-level (HUC-8) summaries of the plants data joined to #' watershed attributes from the DNR watersheds layer. Each #' row is a watershed, with its attributes, summary statistics, and counts of observations for each taxon. # watersheds_huc8[ , simpson_div_nat := NULL , ] watershed_occurrence_wide <- watershed_occurrence_wide[as.data.table(watersheds_huc8)[ ,.SD , .SDcols = !c("simpson_div_nat")], on = .(watershed=major) , ] watershed_occurrence_wide[ , c("NA", "HUC_8", "Shape_Leng", "Shape_Area") := NULL , ] # colnames(watershed_occurrence_wide) names <- c("watershed","major_name", "acres", "sq_miles", "prod_year", "source", "geometry", "n_points", "n_species", "simpson_div_nat") setcolorder(watershed_occurrence_wide, names ) glimpse(watershed_occurrence_wide ) # A. Name: watershed # Description: numeric code for watershed; matches the watershed field in the other exported datasets. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # B. Name: major_name # Description: name of major watershed that corresponds to the watershed code. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # C. Name: acres # Description: acres encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # D. Name: sq_miles # Description: square miles encompassed by watershed. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # E. Name: prod_year # Description: The year of production associated with polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # F. Name: source # Description: The source of polygon linework. See also source file 4. from SHARING/ACCESS SECTION (DNR Watersheds 2023). # # G. Name: n_points # Description: number of points sampled in the watershed (note that resampling of points is not accounted for, so a point sampled n times is counted as n points) # # H. Name: n_species # Description: number of unique taxa observed in the watershed # # I.
Name: simpson_div_nat # Description: inverse Simpson's diversity of watershed taxa community # # J.- END: [name of taxon observed in the database] # Description: Number of observations of [named taxa] in this watershed # fwrite(watershed_occurrence_wide[ ,.SD ,.SDcols = !c("geometry" )], file = "data&scripts/data/output/DRUM/watershed_occurrence_wide.csv") # DataVizFigs ------------------------------------------------------- # **distribution of lakes and surveys --------------------------------------- #' # Generate Data Viz for Pub #' #' ## Map and Timeline #plot with lake shapes and point locs!! # pwi_l[order_ID %in% unique(plants[ , order_ID]), ,] # # ggplot(pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , , ], aes(geometry=geometry)) + # geom_sf() + # labs(caption = "Map of lakes with surveys in our database") #Conversion of data frame to sf object to add points plants_pts <- st_as_sf(x = plants[!is.na(Longitude)], coords = c("Longitude", "Latitude"), crs = "+proj=longlat +datum=WGS84") # #map points # ggplot(plants_pts, aes(geometry=geometry)) + # geom_sf() + # labs(caption = "Map of survey points in our database") #plot all together! #other data for fig # usa <- map_data("usa") # canada <- map_data("world", region = "canada") # states <- map_data("state") states <- sf::st_as_sf(maps::map("state", plot = FALSE, fill = TRUE)) mn_df <- subset(states, ID == "minnesota") #Projection transformation plants_pts = st_transform(plants_pts, crs = "+proj=utm +zone=15") pwi_l <- st_sf(pwi_l) pwi_l <- st_transform(pwi_l, crs = st_crs(mn_df)) setDT(pwi_l) watersheds_huc8 <- st_sf(watersheds_huc8) watersheds_huc8 <- st_transform(watersheds_huc8, crs = st_crs(mn_df)) #' ### Sampled Lakes Spatial Distribution #map study_map <- ggplot(data = pwi_l, aes(geometry=geometry))+ geom_sf(data = watersheds_huc8,aes(geometry = geometry), alpha = .05, color = "gray")+ geom_sf(alpha = .5, color = "blue")+ # geom_point( # aes(geometry = geometry), # stat = "sf_coordinates", color = "blue", alpha = 0.5)+ geom_point(data = pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , , ], stat = "sf_coordinates", aes(geometry=geometry), color = "red", alpha = .5)+ geom_sf(data = mn_df,aes(geometry = geom), color = "black", alpha = .05)+ scale_shape_discrete(solid = FALSE)+ theme(text = element_text(size=20))+ # theme_bw()+ ylab("Latitude")+ xlab("Longitude") # study_map pwi_l[order_ID %in% plants[ , unique(order_ID) , ] , plant_survey := T ,] #' ### Schupp's lake classes ggplot(pwi_l, aes(lake_class))+ scale_y_log10()+ geom_histogram( binwidth = 1 )+ geom_histogram(binwidth = 1, data =pwi_l[plant_survey == T], aes(lake_class), color = "red", alpha = .5)+ labs( title = "Distribution versus sampling of Schupp's lake classes \n https://files.dnr.state.mn.us/publications/fisheries/investigational_reports/417.pdf")+ scale_x_continuous(breaks = seq(0,44,2) ) # geom_density(aes(color = plant_survey)) #lake area ggplot(pwi_l, aes(acres.x))+ scale_x_log10()+ scale_y_log10()+ geom_histogram( )+ geom_histogram( data = pwi_l[plant_survey == T], aes(acres.x), color = "red", alpha = .5) ggplot(pwi_l, aes(acres.x))+ # geom_histogram()+ scale_x_log10()+ geom_density(aes(color = plant_survey)) # hist(pwi_l$lake_class) # temporal accumulation ---------------------------------------------------- #' ### Temporal Accumulation # of surveys plants[ , length(unique(SURVEY_ID)), year(SURVEY_DATE)] #of obs plants[ , length(unique(OBS_ID)), year(SURVEY_DATE)] #taxa plants[!is.na(TAXON) , length(unique(TAXON)), year(SURVEY_DATE)]
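#' A minimal sketch (hypothetical IDs and dates, not the pipeline data) of the setorder() + .I idiom used just below to build the cumulative survey-count curve: after sorting by first survey date, the row index .I is the running total of surveys accumulated through that date.
# toy accumulation table; V1 mirrors the first(SURVEY_DATE) column created below
toy_acc <- data.table(SURVEY_ID = 1:4,
                      V1 = as.IDate(c("2004-06-01", "2002-07-15", "2010-08-20", "2006-05-30")))
setorder(toy_acc, V1)     # order surveys by date
toy_acc[ , cumval := .I ] # cumulative count of surveys through each date
# toy_acc
rm(toy_acc) # illustration only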
missing_data_surveys[ , .N , SURVEY_DATE] plotdat <- rbindlist(list(plants[ , first(SURVEY_DATE) , SURVEY_ID],missing_data_surveys[ , first(SURVEY_DATE) , SURVEY_ID]))[!is.na(V1)] setorder(plotdat, V1) plotdat[ , cumval := .I , ] temporal_accumulation <- ggplot(plotdat, aes(V1, cumval)) + geom_line()+ # theme_bw()+ xlab("Year")+ ylab("Cumulative Survey Count")+ theme(text = element_text(size=20)) # plotdat[ , metric := "surveys" , ] # # plotdat_pts <- plants[ , first(SURVEY_DATE) , POINT_ID] # setorder(plotdat_pts, V1) # plotdat_pts[ , cumval := .I , ] # plotdat_pts[ , metric := "points" , ] # # # plotdat_taxa <- plants[ , first(SURVEY_DATE) , TAXON] # setorder(plotdat_taxa, V1) # plotdat_taxa[ , cumval := .I , ] # plotdat_taxa[ , metric := "taxa" , ] # # # plotdat_all <- rbind(rbind(plotdat[ ,2:4 ], plotdat_pts[ ,2:4 ] ) , plotdat_taxa[ ,2:4 ]) # # ggplot(plotdat_all, aes(V1, cumval)) + # geom_line()+ # facet_wrap(~ metric, scales = "free") # #' ### Figure 2 # arrange plots.row <- align_plots(study_map, temporal_accumulation, align="hv", axis="tblr") div.rows <- plot_grid(plots.row[[1]], plots.row[[2]], nrow=1, label_size = 20, label_fontface = "plain", labels = c("(a)", "(b)"), hjust = -0, vjust = 2.4) #write to file # png(file = "Fig_Map_Time.png", width = 10, height = 5, units = "in", res = 1200) div.rows # dev.off() # species abundance distributions ------------------------------------- #' ## Species Abundance Distribution sad.dat <- plants[!is.na(TAXON) , .N , TAXON] setorder(sad.dat, -N) sad.dat[ , TAXON := factor(TAXON, levels = sad.dat$TAXON)] sad.dat[ , perc_abund := N/sum(sad.dat$N) , ] # ggplot(plotdat[], aes(TAXON, perc_abund))+ # geom_point()+ # scale_y_log10()+ # xlab("Taxa")+ # ylab("log10(percent of all observations)")+ # theme_bw()+ # theme(axis.text.x = element_blank()) # write.csv(plotdat, "data&scripts/data/output/species_abund_list.csv") #' Excluding unvegetated points, recalculating and ordering by perc-abund sad.dat <- sad.dat %>% filter(TAXON != "") %>% mutate(perc_abund = N/sum(N)) %>% arrange(desc(perc_abund)) %>% mutate(rank = 1:n()) %>% rename(Taxon = TAXON) sum(sad.dat$perc_abund) #' Most and least abundant top1 <- sad.dat %>% filter(rank %in% 1:22) %>% dplyr::select(Taxon, N) top2 <- sad.dat %>% filter(rank %in% 23:50) %>% dplyr::select(Taxon, N) #' #' ### Figure 3 sad.plot <- ggplot(sad.dat, aes(x = rank, y = perc_abund)) + geom_segment(aes(xend = 105, y = 0.03, x = 52, yend = 0.03), arrow = arrow(length = unit(0.25, "cm")), color = "black") + geom_table(data = top1, aes(x = 175, y = 0.26, label=list(top1)), size = 2, table.theme=ttheme_gtsimple, vjust = "top", hjust = "right") + geom_table(data = top2, aes(x = 237, y = 0.26, label=list(top2)), size = 2, table.theme=ttheme_gtsimple, vjust = "top", hjust = "right") + geom_point(shape = 19, size = 2, alpha = 0.5, color = "steelblue") + geom_rect(aes(xmin = -2, xmax = 52, ymin = 0.0015, ymax = 0.18), fill = "transparent", color = "gray30", linetype = 1) + scale_y_log10(breaks = c(0.1, 0.01, 0.001, 0.0001, 0.00001), labels = c(0.1, 0.01, 0.001, expression(paste("1 × 10"^{-4})), expression(paste("1 × 10"^{-5})))) + scale_x_continuous(breaks = c(seq(0, 200, 50))) + xlab(expression(Species~(italic(n)^{th}~most~abundant))) + ylab("Proportion of total abundance") + coord_cartesian(xlim = c(5, 225), ylim = c(0.000002, 0.15)) + theme(panel.grid.minor=element_blank()) + annotate("text", x = 80, y = 0.03, size = 3, adj = 0.5, label = "50 most\nabundant taxa") # png(file = "Fig_SAD.png", width = 5.5,
height = 5, units = "in", res = 1200) sad.plot # dev.off() # diversity environment relationships ------------------------------------ #' ## Diversity-Environment Relationships #' #' ### Depth summary(plants_rakeabund_wide$simpsons_div_nat) summary(plants_rakeabund_wide$DEPTH_FT) # desc(sort(plants_rakeabund_wide$DEPTH_FT)) #' Removing zero depth, extreme depth, and Inf ENSpie points plants_rakeabund_wide <- plants_rakeabund_wide %>% filter(simpsons_div_nat != Inf) %>% filter(DEPTH_FT != 0) %>% filter(DEPTH_FT < 50) label_depth <- "Point~scale~(italic(N) == 70745)" point_depth <- ggplot(plants_rakeabund_wide, aes(DEPTH_FT*0.3048, simpsons_div_nat)) + geom_point(shape = 19, size = 1, alpha = 0.1, color = "steelblue4")+ scale_x_log10() + geom_smooth(method = 'gam', color = "black") + ylab(expression(italic(ENS[PIE]))) + xlab("Water depth (m)") + ylim(c(0, 15)) + annotate("text", x = 0.08, y = 15 - 0.025*15, size = 2.25, adj = 0, label = label_depth, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0, 0.1, 0.1), "cm")) #' ### Secchi #' #' Removing NAs summary(surveys$simpsons_div_nat) summary(surveys$Secchi_m) surveys <- surveys %>% filter(!is.na(Secchi_m)) label_Secchi <- "Lake~scale~(italic(N) == 2955)" lake_Secchi <- ggplot(surveys, aes(Secchi_m, simpsons_div_nat)) + geom_point(shape = 19, size = 1.5, alpha = 0.3, color = "steelblue4")+ geom_smooth(method = 'gam', color = "black") + ylab("") + xlab("Secchi depth (m)") + ylim(c(0, 21)) + annotate("text", x = 0.07, y = 21 - 21*0.025, size = 2.25, adj = 0, label = label_Secchi, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0, 0.1, 0), "cm")) #' ### Area #' #' Removing NAs summary(watersheds_huc8$simpson_div_nat) watersheds_huc8 <- watersheds_huc8 %>% dplyr::select(-geometry) %>% filter(!is.na(simpson_div_nat)) %>% filter(simpson_div_nat != Inf) label_area <- "Watershed~scale~(italic(N) == 64)" wshed_area <- ggplot(watersheds_huc8, aes(acres*0.404686, simpson_div_nat)) + geom_point(shape = 19, size = 2, alpha = 0.6, color = "steelblue4")+ geom_smooth(method = 'gam', color = "black") + ylab("") + xlab("Area (ha)") + ylim(c(0, 30)) + scale_x_continuous(breaks = c(0, 200000, 400000, 600000), labels = c(0, "200,000", "400,000", "600,000")) + annotate("text", x = 0, y = 30 - 30*0.025, size = 2.25, adj = 0, label = label_area, parse = TRUE) + theme(text = element_text(size = 7), plot.margin = unit(c(0.1, 0.1, 0.1, 0), "cm")) #' ### Figure 4 plots.row <- align_plots(point_depth, lake_Secchi, wshed_area, align="hv", axis="tblr") div.rows <- plot_grid(plots.row[[1]], plots.row[[2]], plots.row[[3]], nrow=1, label_size = 7.5, label_fontface = "plain", labels = c("(a)", "(b)", "(c)"), hjust = -0.25, vjust = 2) # png(file = "Fig_DivEnv.png", width = 6.5, height = 2.25, units = "in", res = 1200) div.rows # dev.off() # footer ------------------------------------------------------------------ #' ## Document footer # workspace cleanup ------------------------------------------------------- rm(div.rows, lake_Secchi, max_depth, mn_df, plots.row, point_depth, sad.dat, sad.plot, states, study_map, temporal_accumulation, top1, top2, watersheds, wshed_area, bad_abund_surveys, label_area, label_depth, label_Secchi) #' #' Session Information: #+ sessionInfo sessionInfo() #script runtime Sys.time() - strttime
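#' Appendix note (illustration only): the inverse Simpson's diversity used in the figures above is the "effective number of species" ENSpie = 1 / sum(p_i^2), where p_i is the relative abundance of taxon i. A toy check with hypothetical counts against vegan::diversity():
# hypothetical counts for three taxa
toy_counts <- c(taxon_a = 5, taxon_b = 3, taxon_c = 2)
p <- toy_counts / sum(toy_counts)
1 / sum(p^2)                                       # manual ENSpie (inverse Simpson)
vegan::diversity(toy_counts, index = "invsimpson") # same value via vegan
rm(toy_counts, p) # illustration only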