#' ## Secretive marshbird response to invasive wetland plant management
#' 
#' ### Nina Hill, hillx725@umn.edu
#' David E. Andersen, dea@umn.edu
#' 
#' University of Minnesota, Minnesota Cooperative Fish & Wildlife Research Unit
#' 
#' 135 Skok Hall, Upper Buford Circle, Saint Paul, Minnesota 55108
#' 

format(Sys.Date(), "%Y %B %d")

# Code related to Chapter 2 - 
# Morris bird ~ treatment models
# Morris bird ~ remote + site wetland characteristics

# 0) Set up workspace, echo=FALSE-------------------------
    setwd("F:/forDRUM")
    library(ggplot2); library(gplots)
    library(tidyverse)
    library(stats) # aov(), TukeyHSD()
    library(plotrix)
    # library(MASS); library(lme4)  # linear mixed model
    # library(nlme); library(glmm); library(car)
    # library(sjPlot); library(sjmisc) # for making nice tables of model params
    # library(Hmisc); library(RColorBrewer);  library(foreign)

    bird.broadcast <- c("AMBI","PBGR","SORA","VIRA")
    birdcolors <- c("orange","purple","tomato","dodgerblue")
    
    std <- function(x) sd(x)/sqrt(length(x))

    right = function(text, num_char) {
      substr(text, nchar(text) - (num_char-1), nchar(text))
    }
    
    
    # bird.broadcast2 <- c("AMBI", "PBGR", "SORA", "VIRA")
    # birdnames <- c("American bittern", "Pied-billed grebe", "Sora", "Virginia rail")
    # PTnew2016<-sort(c("73H", "891H", "888H", "699M", "901M", "474M", "449M","1540H","293H","571L","2325H","2345H","2322H","2013H","2014H","2028H"))
    # birdcolors2 <- c("orange","purple","tomato","dodgerblue")
    # treatmentlist <- levels(MPOINT.dens$Treatment)
    
# Part 1 Density ~ Treatment------------------------------------
#************************************************************************************************
# MORRIS STUDY AREA: bird ~ treatment models
#     Make models to test relationship of response variable (adj density birds per point) 
#     to treatment categories.
#           
#************************************************************************************************

# 1) Load data and set up workspace--------
    # MPOINT.dens <- read.csv("data/MPOINT.densities.csv")  # new dataset from script 01_Counts to Densities.R
    MPOINT.dens <- read.csv("data/MPOINT.density.csv") # prepared dataset posted on DRUM
    MPOINT.dens <- MPOINT.dens %>%
      dplyr::select("PointID", "Year", grep("dens", colnames(MPOINT.dens), value=T))  %>%
      filter(!grepl("E", PointID)) %>% # load density data, drop "E" points from this analysis
      mutate(Treatment = factor(right(as.character(PointID), 1), levels=c("L", "M", "H"))) %>%
      pivot_longer(cols=c(AMBI_dens:PBGR2_dens), names_to="Species", values_to="Density") 
    
    ggplot(MPOINT.dens, aes(x = Treatment, y = Density, fill=Treatment, color=Treatment))+
      stat_summary(fun = mean, geom = "point", size=3) + 
      stat_summary(fun.data = mean_se, geom = "errorbar", width=0.3)+
      facet_grid(Species~Year, scales="free")

    
# 2) Make models density ~ treatment-----------------------------------------------------------
    modA <- MPOINT.dens %>% filter(Species == "AMBI_dens")
    A.aov <- aov(Density ~ Treatment, modA)
    summary(A.aov)
    TukeyHSD(A.aov)
    
    modP <- MPOINT.dens %>% filter(Species == "PBGR2_dens") %>% filter(!is.infinite(Density))
    P.aov <- aov(Density ~ Treatment, modP)
    summary(P.aov)
    TukeyHSD(P.aov)
    
    modS <- MPOINT.dens %>% filter(Species == "SORA_dens")
    S.aov <- aov(Density ~ Treatment, modS)
    summary(S.aov)
    TukeyHSD(S.aov)
    
    modV <- MPOINT.dens %>% filter(Species == "VIRA_dens")
    V.aov <- aov(Density ~ Treatment, modV)
    summary(V.aov)
    TukeyHSD(V.aov)
    
    
# 3) Plot Density ~ Treatment!-----------------------------------------------
    for (ii in 1:4){
      par(mfrow=c(1,2))
      plotthis15 <- MPOINT.dens[which(MPOINT.dens$Year=="2015" & MPOINT.dens$Species == paste(bird.broadcast[ii], "dens", sep="_")),]
      # calc lower limit as mean - se (95% CI is mean +/- 1.96*se)    
      plotthis15CIL <-tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T) - 
        1.96*(tapply(plotthis15$Density, plotthis15$Treatment, std.error, na.rm=T)) 
      # calc upper limit as mean - se
      plotthis15CIU <-tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T) + 
        1.96*(tapply(plotthis15$Density, plotthis15$Treatment, std.error, na.rm=T)) 
      
      plotthis16 <- MPOINT.dens[which(MPOINT.dens$Year=="2016" & MPOINT.dens$Species == paste(bird.broadcast[ii], "dens", sep="_")),]
      plotthis16CIL <-tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T) - 1.96*(tapply(plotthis16$Density, plotthis16$Treatment, std.error, na.rm=T))
      plotthis16CIU <-tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T) + 1.96*(tapply(plotthis16$Density, plotthis16$Treatment, std.error, na.rm=T))
  
      barplot2(tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T), col=birdcolors[ii],
               main= paste(bird.broadcast[ii], " 2015",sep=""),
               ylab = "2015 Mean Density", xlab = "Treatment Category",  
               plot.ci=TRUE,  ci.l = plotthis15CIL, ci.u=plotthis15CIU, cex.names=1.5,
               ylim=c(0,(max(c(plotthis16CIU,plotthis15CIU))+0.1)))
      
      barplot2(tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T), col=birdcolors[ii], 
               main= paste(bird.broadcast[ii], " 2016",sep=""),
               ylab = "2016 Mean Density", xlab = "Treatment Category",  
               plot.ci=TRUE, ci.l = plotthis16CIL, ci.u=plotthis16CIU, cex.names=1.5,
               ylim=c(0,(max(c(plotthis16CIU,plotthis15CIU))+0.1)))
      plotname <- paste("output/",bird.broadcast[ii], "densTrtYr.png", sep="")
      dev.copy(png, file=plotname, height=600, width=750)
      dev.off()
      rm(plotthis15, plotthis16 ,plotthis15CIL,plotthis16CIL, plotthis15CIU,plotthis16CIU)
    }
    rm(ii, plotname)
 
    rm(modA, modP, modS, modV)
    rm(A.aov, P.aov, S.aov, V.aov)


# Part 2 Density ~ Wetland characteristics------------------------------------    
#************************************************************************************************
# MORRIS Wetland Characteristics models
# Subset 2/3 data to build model. Test on predicting other 1/3.
# Bird data  111, 128 datapoints in 2015, 2016
# Veg data   111, 57 datapoints in 2015, 2016
# 
# 42 pts with 2 yr bird + 2 yr veg (sample with replacement? i.e. randomly select from 2 years' data?)
# 68 pts with 2 yr bird + 1 yr veg (only 2015 veg)
# 1  pt  with 1 yr bird + 1 yr veg (74E)
# 15 pts with 1 yr bird + 1 yr veg (2016 new)
# 
# 1  pt  with 2 yr bird + 0 yr veg (oops, 109L)
# 1  pt  with 1 yr bird + 0 yr veg (oops, 73H)
# (2  pts dropped, never using again)
#
# Make models to test relationship of response variable (adj density birds per point)
# to vegetation and landscape covariates.
#           
# can use confint()!! and ggcoef()
# am I correctly cal se, sd, and CI?
# ************************************************************************************************

# 1) Import data: bird + GIS + fieldveg---------------------
      
    # Import 2 years' bird density data---
    # MPOINT.dens <- read.csv("data/MPOINT.densities.csv")
    MPOINT.dens <- read.csv("data/MPOINT.density_OLD.csv")
    MPOINT.dens <- MPOINT.dens %>%
      dplyr::filter(!is.na(AMBI_WetHA)) %>%
      dplyr::select("PointID", "Year", grep("dens", colnames(MPOINT.dens), value=T)) %>%
      dplyr::select(!contains("PBGR_")) %>%
      mutate("Treatment" = factor(right(as.character(PointID), 1), levels=c("L", "M", "H", "E"))) %>%
      mutate(PTYR = paste(PointID, Year, sep="_"))
    MPOINT.dens$PBGR2_dens[is.na(MPOINT.dens$PBGR2_dens)] <- 0
    MPOINT.dens$PBGR2_dens[is.infinite(MPOINT.dens$PBGR2_dens)] <- 0
    
    # MPOINT.denslong <- MPOINT.dens %>%
    #   pivot_longer(cols=c(AMBI_dens:PBGR2_dens), names_to="Species", values_to="Density")
    
    # Import covariate datasets---
    MPOINT.veg <- read.csv("data/MPOINT.veg.csv") %>%
      dplyr::mutate("PTYR" = paste(PointID, Year, sep="_")) %>%
      dplyr::select(!c(Treatment, Latitude, Longitude))
      # dplyr::select(PointID, Year, all_of(chosenfield))
    MPOINT.veg$WATDMAX[is.na(MPOINT.veg$WATDMAX)] <- 0
    
    MPOINT.remote <- read.csv("data/MPOINT.remote.csv")
    MPOINT.remote$RatioopenDIVemerg[is.infinite(MPOINT.remote$RatioopenDIVemerg)] <- 0
   
# 2) SELECT COVARIATE VARIABLES-------------------------
    # Make list of chosen covariate data--- 
    # manuscript includes these as reduced set of variables included in modeling:
    
    # FIELD VEG CHARS:
    # veg height mean
    # litter depth
    # cattail density
    # water depth
    # woody species
    # vegetation height variance
    
    # REMOTE CHARS:
    # survey wetland log area
    # shallow marsh prop
    # temp flood prop
    # scrub/shrub present
    # wetland type 1 prop
    # hay/pasture prop
    # open water: emergent ratio
    # all wetland types area
    
    
    chosenfield <- c("WATDMEAN","HGTMAXMEAN0", "LITMAX", "HGTMAXsd", 
                     "WOODYP", # "N.Species", 
                     "TouchTYmean") #, "TouchTYsd")
    
    chosenremote <- c("Logtargwet", 
                      "Wetlandshallow.marsh_prop", # "PTarg.Shallowmarsh",
                      "NWImodA.prop", 
                      "Targ.covShrubwet01",
                      "Wetlandseasonally_prop",
                      "Hayand.Pasture_prop",
                      "RatioopenDIVemerg",
                      "AllWetHA600"
    )
    
    # OLD versions---
    # c("Logtargwet", "AllWetHA600", "wetland woody", "Wetland seasonally", ...)
    # OLD# chosenfield <- c("DEADP","HGTMAXMEAN0","WATDMEAN","LITMAX","LITcv","TouchTYmean","EdgeratioDJ","SORApres")  #,"Targ.covShrub01"
    # OLD# chosennames <- c("Proportion plots with dead veg", "Mean of Max Height of vegetation",
    # "Mean water depth", "Maximum litter depth", "Coeff of variation litter depth", "Density of cattail",
    # "Complexity of shape (Edge [m] / sqrt[area HA])", "SORA present")
    # "LOGwetHA","LOG Area HA of target wetland", 
    # predictors <- c("WOODYP","WATDMEAN","LITMAX","HGTMAXMEAN0", "HGTMAXsd", "TouchTYsd" )
    # predictors <- c("DEADP" , "WATDMEAN" , "LITMAX" , "LITcv" , "HGTMAXMEAN0" , "TouchTYmean")
    # predictors <- c("NWImodA.prop", "AllWetHA600", "Targ.covShrubwet01", "PTarg.Shallowmarsh",
    #                 "Logtargwet", "Wetland.seasonally", "Hayand.Pasture_prop", "RatioopenDIVemerg")
    # thesevars <- c("TargHA100","AllWet600","Mean.Deadhgt","Mean.MaxHgt","P.plotsTYtouch", "N.Species", "Range.WetDepth")
    # 2021/02/14 ug, I lost a bunch of code last summer. looks like it was some PCA finalizations
    # I recreated lost PCA code in other script, not relevant for manuscript docs
    # PCA I ran a whole bunch of times with different vars excluded (ex. just prop vars), come out a little different each time
    # same ones always included: size of target wetland (targetwetarea)
    # wetlands in landscape [vs. row crops] (allwetha600, or wetland emergent)
    # scrub/shrub/woody, others get mixed in
    # some error-checks on GIS ecog data were lost, but I recovered the GIS data
    # lost any transformations I made. I remade proportions of ecog land cover. I combined some groups/names
    
    # library(ellipse)
    # library(RColorBrewer)
    # # Use of the mtcars data proposed by R
    # data <- cor(MPOINT.remote[,-which(colnames(MPOINT.remote) %in% c("PointID", "Treatment"))])
    # 
    # # Build a Pannel of 100 colors with Rcolor Brewer
    # my_colors <- brewer.pal(5, "Spectral")
    # my_colors <- colorRampPalette(my_colors)(100)
    # 
    # # Order the correlation matrix
    # ord <- order(data[1, ])
    # data_ord <- data[ord, ord]
    # plotcorr(data_ord , mar=c(1,1,1,1)  )
    # 
    # library(corrplot)
    # dat <- MPOINT.remote[,-which(colnames(MPOINT.remote) %in% c("PointID", "Treatment", "NVCStype", "Extent",
    #                                                             "DiversSimple2", "DiversPEM", "WetACRE"))]
    # dat[is.na(dat)] <- 0
    # corrplot.mixed(cor(dat),  tl.col="black", upper = "circle", lower="shade", tl.cex=0.5, tl.pos="lt")
    
   
# 3) Subset training data--------------------------------------
    # Subset data into chunks of points, based on how many samples I have for each point
    # PTresample       42 pts with 2 yr bird + 2 yr veg (sample with replacement? i.e. randomly select from 2 years' data?)
    # PTveg2015        69 pts with 2 yr bird + 1 yr veg (only 2015 veg) **including 1 pt  with 1 yr bird + 1 yr veg (74E)
    # PTnew2016        15 pts with 1 yr bird + 1 yr veg (2016 new)
    # -3 pts no veg
    # sample 2/3 of each = [42]+45+10 = 97 traindat
    # save 1/3 = [42]+23+5 = 70 testdat
    
    # Create random sample to use as training data---
    # first make a full list of all the PointID-Year I have bird data, and veg data, and remote data to work with
    # all resampled points + 2015 veg only points + 2016 veg only points - no veg points
    PTresample <- data.frame(PointID=c("101E","101H","108H","108L","139L","223H","23H","24E","252L","281L","2E",
                                       "300H","384L","407L","423M","474E","47L","484E","502L","505L","56M","607L","66M","68M","69L",
                                       "75M","77E","77H","78M","7M","800E","802E","807E","82H","84H","89H","89M","8E",
                                       "90H","90M","92H","96M"), # points with both 2015 and 2016 veg data
                             Year=2016) %>% mutate(PTYR = paste(PointID, Year, sep="_"))
    PTnoveg <- c("109L", "73H", "387L")  # don't be confused. I have 128 pts of bird data, 126 pts of veg data.
    PTnew2016 <- data.frame(PointID=c("1540H","2013H","2014H","2028H","2322H","2325H","2345H","293H",
                   "449M","474M","571L","699M","888H","891H","901M"), # "73H",
                   Year=2016) %>% mutate(PTYR = paste(PointID, Year, sep="_")) # points with only 2016 field veg data
    PTveg2015 <- MPOINT.veg[which(MPOINT.veg$Year == 2015 & 
                                    !MPOINT.veg$PointID %in% PTresample$PointID &
                                    !MPOINT.veg$PointID %in% PTnoveg), c("PointID", "Year")]  %>%  
      mutate(PTYR = paste(PointID, Year, sep="_"))

    # Randomly select 2/3 points for training data---
    set.seed(24)
    samp1 <- unique(PTresample$PointID)
    samp1 <- PTresample %>%
      mutate(Year = sample(c(2015, 2016), length(samp1), replace=T))
    samp2 <- PTnew2016[sample(nrow(PTnew2016),  (2/3)*nrow(PTnew2016), replace=F),]
    samp3 <- PTveg2015[sample(nrow(PTveg2015),  (2/3)*nrow(PTveg2015), replace=F),]
    samplepts <- samp1 %>%
      full_join(samp2) %>%
      full_join(samp3) 
    
    
    # # Full data join
    mpfull <-MPOINT.dens %>%
      dplyr::select("PointID", "Year", grep("_dens", colnames(MPOINT.dens), value=T)) %>%
      left_join(MPOINT.veg, by=c("PointID", "Year")) %>%
      left_join(MPOINT.remote, by=c("PointID"))

    traindat <- mpfull %>%
      filter(PTYR %in% samplepts$PTYR) %>%
      dplyr::select(PointID, Year, grep("dens", colnames(mpfull)), all_of(chosenfield), all_of(chosenremote)) 
    
    testdat <- mpfull %>%
      filter(!PTYR %in% samplepts$PTYR) %>%
      filter(!is.na(AMBI_dens) & !is.na(WOODYP)) %>%
      dplyr::select(PointID, Year, grep("dens", colnames(mpfull)), all_of(chosenfield), all_of(chosenremote)) 
    
    
    rm(samp1, samp2, samp3, PTnoveg, PTnew2016, PTresample, PTveg2015, samplepts)
    
    # Old version in loop---
    # thesepoints <- as.integer(sample(rownames(vegALL), 84, replace=F))
    # sampledat <- data.frame(PointID=vegALL$PointID[c(thesepoints)], Year=vegALL$Year[thesepoints]) # randomly select training data
    # for(ii in 1:nrow(sampledat)){
    #     if(sampledat$PointID[ii] %in% PTresample){
    #             sampledat$Year[ii] <- sample(c("2015","2016"),1)
    #     } else if(sampledat$PointID[ii] %in% PTveg2015){ sampledat$Year[ii] <- "2015"
    #     } else if(sampledat$PointID[ii] %in% PTnew2016){ sampledat$Year[ii] <- "2016"}
    # }
    # sampledat$YRPT <- paste("B", paste(sampledat$Year, sampledat$PointID, sep=""), sep="")
    # # write.csv(sampledat, "data/trainingdataindex2.csv", row.names = F)
    #
   
  
# 4a) Train FIELD VEGETATION models-----------------------------------------
    # Create generalized linear models with training data  # glm(family="gaussian")
    # try forward, backward, both selection using stepAIC()
    # step-wise regression model selection w AIC. get best model for field vegetation variables, then get best model for remote variables,
    # then put the vars from field veg best models with remote vars to see if some drop out or can be improved as explanation for bird density
    
    # firstmodel <- "the function I need" (AMBI_dens ~ covar1 + covar2 + ..., data=traindat, "other arguments for model/curve/parameters/output")
    # stepAIC(fit2, direction="both", scope=list(upper=fit1, lower=fit2))    
    # stepAIC() uses criteria k=2
    # use step() or stepAIC() or stepwise(from internet) or something from library(leaps)
    # the two functions stepAIC() & step() give same result, both default backwards selection
    # caret() includes cross-validation of model selection
    
    # no.na.data <- na.omit(traindat[c(chosenfield, response)]) # takes out missing NA data ?same as complete.cases?
    # sitevarglm <- glm(formula=as.formula(paste(paste(response,'~', sep='') # paste(predictors,collapse='+'), sep='')), no.na.data, family=gaussian)        
    # can use offset with glm() too, family=gaussian is default
    
    library(MASS)
    library(stats)
     
    # log-regression off-set ?(WTF?) - is this what Spencer was talking about? certain curve types can't be at 0 so need to adjust/push data with offset?
    # ns(), lm(), glm(), gls()
    # multimodel <- lmer(thissp ~ TargWetsqkm600 + HGTMAXmean.m + LOVOmean.m + HIVOmean.m + DEADP + WATDmean.m +
    #                      Targ.covShrub01 -1 +(1|Year), data=MPOINT.birdveg, REML=F)
    # multimodel <- lmer(get(thissp) ~ TargHA100 + AllWet600 + Mean.Deadhgt + Mean.MaxHgt + P.plotsTYtouch + N.Species
    #                    + Range.WetDepth -1 +(1|Year), data=results3, REML=F) 
    
    
    # start with field veg covars---
    #' AMBI
    sitevarglm.all <- glm(AMBI_dens ~ ., data = traindat[c(chosenfield, "AMBI_dens")])
    summary(sitevarglm.all)
    sitevarglm.none <- glm(AMBI_dens ~ 1, data = traindat[c(chosenfield, "AMBI_dens")])
    summary(sitevarglm.none)$coeff  # coeff already built into the output
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd , data=traindat[c(chosenfield, "AMBI_dens")])
    
    
    #' PBGR
    sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(chosenfield, "PBGR2_dens")])
    summary(sitevarglm.all)
    sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(chosenfield, "PBGR2_dens")])
    summary(sitevarglm.none) 
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, "PBGR2_dens")])
    
    
    #' SORA
    sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(chosenfield, "SORA_dens")])
    sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, "SORA_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, "SORA_dens")])
    
    
    #' VIRA
    sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(chosenfield, "VIRA_dens")])
    sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, "VIRA_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both", scope=list(upper= sitevarglm.all, lower=sitevarglm.none))
    stepboth$anova
    vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, "VIRA_dens")])

    rm(sitevarglm.all, sitevarglm.none, stepboth)
# Done with veg models*************************************************
    
    
    # Try to bootstrap glm stepwise regression---
    # does not seem to bootstrap sampling though, but rather tests the full dataset
    # https://stackoverflow.com/questions/61333752/bootstrap-with-stepwise-regression-in-r
    library(dplyr)
    library(boot)
    
    ddat <- mpfull %>%
      dplyr::select(AMBI_dens, all_of(chosenfield)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(AMBI_dens ~ ., data=ddat))
    bhatA <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatA) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$AMBI_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$AMBI_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", trace=0)
      bhatA[b,names(coef(fit))] <- coef(fit)
      bhatA[b, "AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    colSums(!is.na(bhatA))
    #
    
    ddat <- mpfull %>%
      dplyr::select(PBGR2_dens, all_of(chosenfield)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(PBGR2_dens ~ ., data=ddat))
    bhatP <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatP) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$PBGR2_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$PBGR2_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", trace=0)
      bhatP[b,names(coef(fit))] <- coef(fit)
      bhatP[b, "AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    colSums(!is.na(bhatP))
    
    # # https://www.statmethods.net/advstats/bootstrapping.html
    # # function to obtain regression weights
    # bs <- function(formula, data, indices) {
    #   d <- data[indices,] # allows boot to select sample
    #   fit <- lm(formula, data=d)
    #   return(coef(fit))
    # }
    # # bootstrapping with 1000 replications
    # results <- boot(data=ddat, statistic=bs,
    #                 R=10, formula=ddat$PBGR2_dens ~ 
    #                   ddat$HGTMAXsd + ddat$WATDMEAN +
    #                   ddat$HGTMAXMEAN0 + ddat$N.Species + 
    #                   ddat$LITMAX + ddat$WOODYP +
    #                   ddat$N.plotsTYtouch + ddat$TouchTYsd)
    # # view results
    # results
    
    
# 4b) Train REMOTELY SENSED models--------------------------------------------
# next with remote covars--- 
    # traindat <- trainremote
    #' AMBI
    sitevarglm.all <- glm(AMBI_dens ~ ., data=traindat[c(chosenremote, "AMBI_dens")])
    sitevarglm.none <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")])
    
    #' PBGR
    sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(chosenremote, "PBGR2_dens")])
    sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(chosenremote, "PBGR2_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop
    
    
    #' SORA
    sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(chosenremote, "SORA_dens")])
    sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(chosenremote, "SORA_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")])
    
    #' VIRA
    sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(chosenremote, "VIRA_dens")])
    sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(chosenremote, "VIRA_dens")])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")])

    rm(sitevarglm.all, sitevarglm.none, stepboth)
    # 5.2.18 so far AMBI and PBGR best with empty model, SORA ~ WATDMEAN, VIRA ~ LITcv. NONE significant pvals.

    
    # Try to bootstrap data sampling and stepwise regression---
    # https://stackoverflow.com/questions/61333752/bootstrap-with-stepwise-regression-in-r
   
    ddat <- mpfull %>%
      dplyr::select(AMBI_dens, all_of(chosenremote)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(AMBI_dens ~ ., data=ddat))
    bhatA <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatA) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$AMBI_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$AMBI_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0)
      bhatA[b,names(coef(fit))] <- coef(fit)
      bhatA[b,"AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    #
    
    
    ddat <- mpfull %>%
      dplyr::select(PBGR2_dens, all_of(chosenremote)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(PBGR2_dens ~ ., data=ddat))
    bhatP <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatP) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$PBGR2_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$PBGR2_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0)
      bhatP[b,names(coef(fit))] <- coef(fit)
      bhatP[b, "AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    #
 
    
    ddat <- mpfull %>%
      dplyr::select(SORA_dens, all_of(chosenremote)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(SORA_dens ~ ., data=ddat))
    bhatS <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatS) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$SORA_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$SORA_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0)
      bhatS[b,names(coef(fit))] <- coef(fit)
      bhatS[b, "AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    #
    
    ddat <- mpfull %>%
      dplyr::select(VIRA_dens, all_of(chosenremote)) 
    ddat <- ddat[complete.cases(ddat),]
    B <- 300 #number of bootstrap samples
    n <- nrow(ddat)
    d <- ncol(ddat) - 1
    full_mdl = colnames(model.matrix(VIRA_dens ~ ., data=ddat))
    bhatV <- matrix(NA, nrow = B, ncol = length(full_mdl)+1)
    colnames(bhatV) = c(full_mdl, "AIC")
    for(b in 1:B) {
      samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))]
      null <- glm(ddat$VIRA_dens ~ 1, data=samp, na.action = "na.omit")
      full <- glm(ddat$VIRA_dens ~ ., data=samp, na.action = "na.omit")
      fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0)
      bhatV[b,names(coef(fit))] <- coef(fit)
      bhatV[b, "AIC"] <- fit$aic
    }
    rm(ddat, B, n, d, full_mdl, samp, null, full, fit)
    
    # # https://www.statmethods.net/advstats/bootstrapping.html
    # # function to obtain regression weights
    # bs <- function(formula, data, indices) {
    #   d <- data[indices,] # allows boot to select sample
    #   fit <- lm(formula, data=d)
    #   return(coef(fit))
    # }
    # # bootstrapping with 1000 replications
    # results <- boot(data=ddat, statistic=bs,
    #                 R=10, formula=ddat$PBGR2_dens ~ 
    #                   ddat$HGTMAXsd + ddat$WATDMEAN +
    #                   ddat$HGTMAXMEAN0 + ddat$N.Species + 
    #                   ddat$LITMAX + ddat$WOODYP +
    #                   ddat$N.plotsTYtouch + ddat$TouchTYsd)
    # # view results
    # results
    
    
    rm(bhatA, bhatP, bhatS, bhatV)
    
# 4c) Train BEST REMOTE + VEG models-------------------------------
    
    # Get variables chosen in best remote models---
    # aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(chosenfield, response)])
    # arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")])
    summary(arem)$coefficients
    arem.var <- names(arem$coefficients)[-1]
    
    # pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, response)])
    # prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop
    summary(prem)$coefficients
    prem.var <- names(prem$coefficients)[-1]
    
    # sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, response)])
    # srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")])
    summary(srem)$coefficients
    srem.var <- names(srem$coefficients)[-1]
    
    # vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, response)])
    # vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")])
    summary(vrem)$coefficients
    vrem.var <- names(vrem$coefficients)[-1]
    
    
    # Finally, train combined models (the best remote model + field veg covars)--- 
    # then, in cross-validation test predication power of just best-remote variables
    # to prediction power of best-remote + adding data by going into the field
    
    # AMBI
    response <- "AMBI_dens"
    sitevarglm.all <- glm(AMBI_dens ~ ., data=traindat[c(arem.var, chosenfield, response)])
    sitevarglm.none <- glm(AMBI_dens ~ 1, data=traindat[c(arem.var, chosenfield, response)])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    aboth <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(arem.var, chosenfield, response)])
    
    # PBGR
    response <- "PBGR2_dens"
    sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(prem.var, chosenfield, response)])
    sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(prem.var, chosenfield, response)])
   
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    pboth <- glm(PBGR2_dens ~ WATDMEAN + AllWetHA600, data=traindat[c(prem.var, chosenfield, response)]) # NWImodA.prop
    
    
    # SORA
    response <- "SORA_dens"
    sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(srem.var, chosenfield, response)])
    sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(srem.var, chosenfield, response)])
    
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    sboth <- glm(SORA_dens ~ HGTMAXsd + AllWetHA600 + RatioopenDIVemerg, data=traindat[c(srem.var, chosenfield, response)])
    
    # VIRA
    response <- "VIRA_dens"
    sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(vrem.var, chosenfield, response)])
    sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(vrem.var, chosenfield, response)])
  
    stepboth <- stepAIC(sitevarglm.all, direction = "both")
    stepboth$anova
    vboth <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(vrem.var, chosenfield, response)])
    
    rm(response, sitevarglm.all, sitevarglm.none, stepboth)
    
    
    # # Try to bootstrap GLM (but does not do stepwise comparisons)
    # # uses 5-fold validation
    # # https://quantdev.ssri.psu.edu/tutorials/cross-validation-tutorial
    # model_caret <- train(SORA_dens ~ HGTMAXMEAN0+LITMAX+ HGTMAXsd+WATDMEAN+WOODYP+
    #                        N.Species+TouchTYsd+Logtargwet+ Wetlandshallow.marsh_prop+ NWImodA.prop+
    #                        Wetlandseasonally_prop+Hayand.Pasture_prop+RatioopenDIVemerg+AllWetHA600+
    #                        WOODYP + LITMAX,   # model to fit
    #                      data = mpfull,                        
    #                      # trControl = data_ctrl,              # folds
    #                      method = "glm",                      # specifying regression model
    #                      na.action = na.pass)                # pass missing data to model - some models will handle this
    # model_caret$finalModel
    # model_caret$resample
    # # randomly select half of the sample
    # set.seed(0123)
    # half_size <- floor(0.50 * nrow(data))
    # random_sample <- sample(seq_len(nrow(data)), size = half_size)
    # first_half_data <- data[random_sample, ]
    # 
    # # run the model
    # first_half <- lm(ACT ~ gender + age + SATV + SATQ, data = first_half_data)
    # summary(first_half)
    # 
    # #select half of data not used in the above model
    # second_half_data <- data[-random_sample, ]
    # 
    # # run the model
    # second_half <- lm(ACT ~ gender + age + SATV + SATQ, data = second_half_data)
    # summary(second_half)
    # rm(model_caret, random_sample, first_half_data, first_half, second_half, second_half_data, half_size)
    
# 4)  Test remote and field veg and both models--------------------------------------       
    library(caret)
    # caret()
    # AIC
    # R-squared
    # RMSE
    
    # Get best models---
    # aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(chosenfield, response)])
    # arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")])
    # aboth <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(arem.var, chosenfield, response)])
       
    # pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, response)])
    # prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop
    # pboth <- glm(PBGR2_dens ~ WATDMEAN + AllWetHA600, data=traindat[c(prem.var, chosenfield, response)]) # NWImodA.prop
    
    # sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, response)])
    # srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")])
    # sboth <- glm(SORA_dens ~ HGTMAXsd + AllWetHA600 + RatioopenDIVemerg, data=traindat[c(srem.var, chosenfield, response)])
    
    # vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, response)])
    # vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")])
    # vboth <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(vrem.var, chosenfield, response)])
   
     
    # Make predictions and compute the R2, RMSE and MAE
    vegpred <- aveg %>% predict(testdat)
    rempred <- arem %>% predict(testdat)
    bothpred<- aboth %>% predict(testdat)
    
    data.frame( RMSEveg = RMSE(vegpred, testdat$AMBI_dens),
                RMSErem = RMSE(rempred, testdat$AMBI_dens),
                RMSEboth = RMSE(bothpred, testdat$AMBI_dens))
    #
    vegpred <- pveg %>% predict(testdat)
    rempred <- prem %>% predict(testdat)
    bothpred<- pboth %>% predict(testdat)
    
    data.frame( RMSEveg = RMSE(vegpred, testdat$PBGR2_dens),
                RMSErem = RMSE(rempred, testdat$PBGR2_dens),
                RMSEboth = RMSE(bothpred, testdat$PBGR2_dens))
    #
    vegpred <- sveg %>% predict(testdat)
    rempred <- srem %>% predict(testdat)
    bothpred<- sboth %>% predict(testdat)
    
    data.frame( RMSEveg = RMSE(vegpred, testdat$SORA_dens),
                RMSErem = RMSE(rempred, testdat$SORA_dens),
                RMSEboth = RMSE(bothpred, testdat$SORA_dens))
    #
    vegpred <- vveg %>% predict(testdat)
    rempred <- vrem %>% predict(testdat)
    bothpred<- vboth %>% predict(testdat)
    
    data.frame( RMSEveg = RMSE(vegpred, testdat$VIRA_dens),
                RMSErem = RMSE(rempred, testdat$VIRA_dens),
                RMSEboth = RMSE(bothpred, testdat$VIRA_dens))
    
    
    
    # # Other model validation code---
    # data.frame( R2 = R2(vegpred, testdat$AMBI_dens),
    #             RMSE=RMSE(vegpred, testdat$AMBI_dens),
    #             MAE =MAE(vegpred, testdat$AMBI_dens))
    # 
    # data.frame( R2 = R2(rempred, testdat$AMBI_dens),
    #             RMSE=RMSE(rempred, testdat$AMBI_dens),
    #             MAE =MAE(rempred, testdat$AMBI_dens))
    # 
    # data.frame( R2 = R2(bothpred, testdat$AMBI_dens),
    #             RMSE=RMSE(bothpred, testdat$AMBI_dens),
    #             MAE =MAE(bothpred, testdat$AMBI_dens))
    # 
    # RMSE(rempred, testdat$AMBI_dens)/mean(testdat$AMBI_dens)
    
    
    # compare <- function(orig_data, i){
    #   # create the resampled data
    #   train_data <- orig_data[i, ]
    #   test_data <- orig_data # ie the full original sample
    #   
    #   # fit the three modelling processes
    #   model_step <- model_process_step(train_data)
    #   model_vif  <- model_process_vif(train_data)
    #   model_full <- lm(MedianIncome ~ ., data = train_data)
    #   
    #   # predict the values on the original, unresampled data
    #   predict_step <- predict(model_step, newdata = test_data)
    #   predict_vif  <- predict(model_vif, newdata = test_data)
    #   predict_full  <- predict(model_full, newdata = test_data)
    #   
    #   # return a vector of 6 summary results
    #   results <- c(
    #     step_R2 = R2(predict_step, test_data$MedianIncome),
    #     vif_R2  = R2(predict_vif, test_data$MedianIncome),
    #     full_R2  = R2(predict_full, test_data$MedianIncome),
    #     step_RMSE = RMSE(predict_step, test_data$MedianIncome),
    #     vif_RMSE  = RMSE(predict_vif, test_data$MedianIncome),
    #     full_RMSE  = RMSE(predict_full, test_data$MedianIncome)
    #   )
    #   return(results)
    # }
    
    
    # is there way to bootstrap data sampling into regression and count the number of times variables in top model?
    # is there a way to change up order of variables in stepwise regression testing?
    # is it a problem to put area wetland in as covar when it is already incorporated in the density?

    # cross validation is to test a model's ability to predict new data 
    # (so we use part of data to build model and then predict, compare predicted to the other unused part of data - did model come close to predicting?)
    # compare the RMSE (root mean square error) of model predictions of bird density to the observed bird density 
    # so observed data is "true" line/relationship. we measure how far away predicted observations are from this truth
  

# # END models*********************************************************************************************************
# 
library(ezknitr)
ezspin("02_Morris_modeling.R", out_dir = "output", fig_dir = "figures", keep_md=FALSE)