#' ## Secretive marshbird response to invasive wetland plant management #' #' ### Nina Hill, hillx725@umn.edu #' David E. Andersen, dea@umn.edu #' #' University of Minnesota, Minnesota Cooperative Fish & Wildlife Research Unit #' #' 135 Skok Hall, Upper Buford Circle, Saint Paul, Minnesota 55108 #' format(Sys.Date(), "%Y %B %d") # Code related to Chapter 2 - # Morris bird ~ treatment models # Morris bird ~ remote + site wetland characteristics # 0) Set up workspace, echo=FALSE------------------------- setwd("F:/forDRUM") library(ggplot2); library(gplots) library(tidyverse) library(stats) # aov(), TukeyHSD() library(plotrix) # library(MASS); library(lme4) # linear mixed model # library(nlme); library(glmm); library(car) # library(sjPlot); library(sjmisc) # for making nice tables of model params # library(Hmisc); library(RColorBrewer); library(foreign) bird.broadcast <- c("AMBI","PBGR","SORA","VIRA") birdcolors <- c("orange","purple","tomato","dodgerblue") std <- function(x) sd(x)/sqrt(length(x)) right = function(text, num_char) { substr(text, nchar(text) - (num_char-1), nchar(text)) } # bird.broadcast2 <- c("AMBI", "PBGR", "SORA", "VIRA") # birdnames <- c("American bittern", "Pied-billed grebe", "Sora", "Virginia rail") # PTnew2016<-sort(c("73H", "891H", "888H", "699M", "901M", "474M", "449M","1540H","293H","571L","2325H","2345H","2322H","2013H","2014H","2028H")) # birdcolors2 <- c("orange","purple","tomato","dodgerblue") # treatmentlist <- levels(MPOINT.dens$Treatment) # Part 1 Density ~ Treatment------------------------------------ #************************************************************************************************ # MORRIS STUDY AREA: bird ~ treatment models # Make models to test relationship of response variable (adj density birds per point) # to treatment categories. # #************************************************************************************************ # 1) Load data and set up workspace-------- # MPOINT.dens <- read.csv("data/MPOINT.densities.csv") # new dataset from script 01_Counts to Densities.R MPOINT.dens <- read.csv("data/MPOINT.density.csv") # prepared dataset posted on DRUM MPOINT.dens <- MPOINT.dens %>% dplyr::select("PointID", "Year", grep("dens", colnames(MPOINT.dens), value=T)) %>% filter(!grepl("E", PointID)) %>% # load density data, drop "E" points from this analysis mutate(Treatment = factor(right(as.character(PointID), 1), levels=c("L", "M", "H"))) %>% pivot_longer(cols=c(AMBI_dens:PBGR2_dens), names_to="Species", values_to="Density") ggplot(MPOINT.dens, aes(x = Treatment, y = Density, fill=Treatment, color=Treatment))+ stat_summary(fun = mean, geom = "point", size=3) + stat_summary(fun.data = mean_se, geom = "errorbar", width=0.3)+ facet_grid(Species~Year, scales="free") # 2) Make models density ~ treatment----------------------------------------------------------- modA <- MPOINT.dens %>% filter(Species == "AMBI_dens") A.aov <- aov(Density ~ Treatment, modA) summary(A.aov) TukeyHSD(A.aov) modP <- MPOINT.dens %>% filter(Species == "PBGR2_dens") %>% filter(!is.infinite(Density)) P.aov <- aov(Density ~ Treatment, modP) summary(P.aov) TukeyHSD(P.aov) modS <- MPOINT.dens %>% filter(Species == "SORA_dens") S.aov <- aov(Density ~ Treatment, modS) summary(S.aov) TukeyHSD(S.aov) modV <- MPOINT.dens %>% filter(Species == "VIRA_dens") V.aov <- aov(Density ~ Treatment, modV) summary(V.aov) TukeyHSD(V.aov) # 3) Plot Density ~ Treatment!----------------------------------------------- for (ii in 1:4){ par(mfrow=c(1,2)) plotthis15 <- MPOINT.dens[which(MPOINT.dens$Year=="2015" & MPOINT.dens$Species == paste(bird.broadcast[ii], "dens", sep="_")),] # calc lower limit as mean - se (95% CI is mean +/- 1.96*se) plotthis15CIL <-tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T) - 1.96*(tapply(plotthis15$Density, plotthis15$Treatment, std.error, na.rm=T)) # calc upper limit as mean - se plotthis15CIU <-tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T) + 1.96*(tapply(plotthis15$Density, plotthis15$Treatment, std.error, na.rm=T)) plotthis16 <- MPOINT.dens[which(MPOINT.dens$Year=="2016" & MPOINT.dens$Species == paste(bird.broadcast[ii], "dens", sep="_")),] plotthis16CIL <-tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T) - 1.96*(tapply(plotthis16$Density, plotthis16$Treatment, std.error, na.rm=T)) plotthis16CIU <-tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T) + 1.96*(tapply(plotthis16$Density, plotthis16$Treatment, std.error, na.rm=T)) barplot2(tapply(plotthis15$Density, plotthis15$Treatment, mean, na.rm=T), col=birdcolors[ii], main= paste(bird.broadcast[ii], " 2015",sep=""), ylab = "2015 Mean Density", xlab = "Treatment Category", plot.ci=TRUE, ci.l = plotthis15CIL, ci.u=plotthis15CIU, cex.names=1.5, ylim=c(0,(max(c(plotthis16CIU,plotthis15CIU))+0.1))) barplot2(tapply(plotthis16$Density, plotthis16$Treatment, mean, na.rm=T), col=birdcolors[ii], main= paste(bird.broadcast[ii], " 2016",sep=""), ylab = "2016 Mean Density", xlab = "Treatment Category", plot.ci=TRUE, ci.l = plotthis16CIL, ci.u=plotthis16CIU, cex.names=1.5, ylim=c(0,(max(c(plotthis16CIU,plotthis15CIU))+0.1))) plotname <- paste("output/",bird.broadcast[ii], "densTrtYr.png", sep="") dev.copy(png, file=plotname, height=600, width=750) dev.off() rm(plotthis15, plotthis16 ,plotthis15CIL,plotthis16CIL, plotthis15CIU,plotthis16CIU) } rm(ii, plotname) rm(modA, modP, modS, modV) rm(A.aov, P.aov, S.aov, V.aov) # Part 2 Density ~ Wetland characteristics------------------------------------ #************************************************************************************************ # MORRIS Wetland Characteristics models # Subset 2/3 data to build model. Test on predicting other 1/3. # Bird data 111, 128 datapoints in 2015, 2016 # Veg data 111, 57 datapoints in 2015, 2016 # # 42 pts with 2 yr bird + 2 yr veg (sample with replacement? i.e. randomly select from 2 years' data?) # 68 pts with 2 yr bird + 1 yr veg (only 2015 veg) # 1 pt with 1 yr bird + 1 yr veg (74E) # 15 pts with 1 yr bird + 1 yr veg (2016 new) # # 1 pt with 2 yr bird + 0 yr veg (oops, 109L) # 1 pt with 1 yr bird + 0 yr veg (oops, 73H) # (2 pts dropped, never using again) # # Make models to test relationship of response variable (adj density birds per point) # to vegetation and landscape covariates. # # can use confint()!! and ggcoef() # am I correctly cal se, sd, and CI? # ************************************************************************************************ # 1) Import data: bird + GIS + fieldveg--------------------- # Import 2 years' bird density data--- # MPOINT.dens <- read.csv("data/MPOINT.densities.csv") MPOINT.dens <- read.csv("data/MPOINT.density_OLD.csv") MPOINT.dens <- MPOINT.dens %>% dplyr::filter(!is.na(AMBI_WetHA)) %>% dplyr::select("PointID", "Year", grep("dens", colnames(MPOINT.dens), value=T)) %>% dplyr::select(!contains("PBGR_")) %>% mutate("Treatment" = factor(right(as.character(PointID), 1), levels=c("L", "M", "H", "E"))) %>% mutate(PTYR = paste(PointID, Year, sep="_")) MPOINT.dens$PBGR2_dens[is.na(MPOINT.dens$PBGR2_dens)] <- 0 MPOINT.dens$PBGR2_dens[is.infinite(MPOINT.dens$PBGR2_dens)] <- 0 # MPOINT.denslong <- MPOINT.dens %>% # pivot_longer(cols=c(AMBI_dens:PBGR2_dens), names_to="Species", values_to="Density") # Import covariate datasets--- MPOINT.veg <- read.csv("data/MPOINT.veg.csv") %>% dplyr::mutate("PTYR" = paste(PointID, Year, sep="_")) %>% dplyr::select(!c(Treatment, Latitude, Longitude)) # dplyr::select(PointID, Year, all_of(chosenfield)) MPOINT.veg$WATDMAX[is.na(MPOINT.veg$WATDMAX)] <- 0 MPOINT.remote <- read.csv("data/MPOINT.remote.csv") MPOINT.remote$RatioopenDIVemerg[is.infinite(MPOINT.remote$RatioopenDIVemerg)] <- 0 # 2) SELECT COVARIATE VARIABLES------------------------- # Make list of chosen covariate data--- # manuscript includes these as reduced set of variables included in modeling: # FIELD VEG CHARS: # veg height mean # litter depth # cattail density # water depth # woody species # vegetation height variance # REMOTE CHARS: # survey wetland log area # shallow marsh prop # temp flood prop # scrub/shrub present # wetland type 1 prop # hay/pasture prop # open water: emergent ratio # all wetland types area chosenfield <- c("WATDMEAN","HGTMAXMEAN0", "LITMAX", "HGTMAXsd", "WOODYP", # "N.Species", "TouchTYmean") #, "TouchTYsd") chosenremote <- c("Logtargwet", "Wetlandshallow.marsh_prop", # "PTarg.Shallowmarsh", "NWImodA.prop", "Targ.covShrubwet01", "Wetlandseasonally_prop", "Hayand.Pasture_prop", "RatioopenDIVemerg", "AllWetHA600" ) # OLD versions--- # c("Logtargwet", "AllWetHA600", "wetland woody", "Wetland seasonally", ...) # OLD# chosenfield <- c("DEADP","HGTMAXMEAN0","WATDMEAN","LITMAX","LITcv","TouchTYmean","EdgeratioDJ","SORApres") #,"Targ.covShrub01" # OLD# chosennames <- c("Proportion plots with dead veg", "Mean of Max Height of vegetation", # "Mean water depth", "Maximum litter depth", "Coeff of variation litter depth", "Density of cattail", # "Complexity of shape (Edge [m] / sqrt[area HA])", "SORA present") # "LOGwetHA","LOG Area HA of target wetland", # predictors <- c("WOODYP","WATDMEAN","LITMAX","HGTMAXMEAN0", "HGTMAXsd", "TouchTYsd" ) # predictors <- c("DEADP" , "WATDMEAN" , "LITMAX" , "LITcv" , "HGTMAXMEAN0" , "TouchTYmean") # predictors <- c("NWImodA.prop", "AllWetHA600", "Targ.covShrubwet01", "PTarg.Shallowmarsh", # "Logtargwet", "Wetland.seasonally", "Hayand.Pasture_prop", "RatioopenDIVemerg") # thesevars <- c("TargHA100","AllWet600","Mean.Deadhgt","Mean.MaxHgt","P.plotsTYtouch", "N.Species", "Range.WetDepth") # 2021/02/14 ug, I lost a bunch of code last summer. looks like it was some PCA finalizations # I recreated lost PCA code in other script, not relevant for manuscript docs # PCA I ran a whole bunch of times with different vars excluded (ex. just prop vars), come out a little different each time # same ones always included: size of target wetland (targetwetarea) # wetlands in landscape [vs. row crops] (allwetha600, or wetland emergent) # scrub/shrub/woody, others get mixed in # some error-checks on GIS ecog data were lost, but I recovered the GIS data # lost any transformations I made. I remade proportions of ecog land cover. I combined some groups/names # library(ellipse) # library(RColorBrewer) # # Use of the mtcars data proposed by R # data <- cor(MPOINT.remote[,-which(colnames(MPOINT.remote) %in% c("PointID", "Treatment"))]) # # # Build a Pannel of 100 colors with Rcolor Brewer # my_colors <- brewer.pal(5, "Spectral") # my_colors <- colorRampPalette(my_colors)(100) # # # Order the correlation matrix # ord <- order(data[1, ]) # data_ord <- data[ord, ord] # plotcorr(data_ord , mar=c(1,1,1,1) ) # # library(corrplot) # dat <- MPOINT.remote[,-which(colnames(MPOINT.remote) %in% c("PointID", "Treatment", "NVCStype", "Extent", # "DiversSimple2", "DiversPEM", "WetACRE"))] # dat[is.na(dat)] <- 0 # corrplot.mixed(cor(dat), tl.col="black", upper = "circle", lower="shade", tl.cex=0.5, tl.pos="lt") # 3) Subset training data-------------------------------------- # Subset data into chunks of points, based on how many samples I have for each point # PTresample 42 pts with 2 yr bird + 2 yr veg (sample with replacement? i.e. randomly select from 2 years' data?) # PTveg2015 69 pts with 2 yr bird + 1 yr veg (only 2015 veg) **including 1 pt with 1 yr bird + 1 yr veg (74E) # PTnew2016 15 pts with 1 yr bird + 1 yr veg (2016 new) # -3 pts no veg # sample 2/3 of each = [42]+45+10 = 97 traindat # save 1/3 = [42]+23+5 = 70 testdat # Create random sample to use as training data--- # first make a full list of all the PointID-Year I have bird data, and veg data, and remote data to work with # all resampled points + 2015 veg only points + 2016 veg only points - no veg points PTresample <- data.frame(PointID=c("101E","101H","108H","108L","139L","223H","23H","24E","252L","281L","2E", "300H","384L","407L","423M","474E","47L","484E","502L","505L","56M","607L","66M","68M","69L", "75M","77E","77H","78M","7M","800E","802E","807E","82H","84H","89H","89M","8E", "90H","90M","92H","96M"), # points with both 2015 and 2016 veg data Year=2016) %>% mutate(PTYR = paste(PointID, Year, sep="_")) PTnoveg <- c("109L", "73H", "387L") # don't be confused. I have 128 pts of bird data, 126 pts of veg data. PTnew2016 <- data.frame(PointID=c("1540H","2013H","2014H","2028H","2322H","2325H","2345H","293H", "449M","474M","571L","699M","888H","891H","901M"), # "73H", Year=2016) %>% mutate(PTYR = paste(PointID, Year, sep="_")) # points with only 2016 field veg data PTveg2015 <- MPOINT.veg[which(MPOINT.veg$Year == 2015 & !MPOINT.veg$PointID %in% PTresample$PointID & !MPOINT.veg$PointID %in% PTnoveg), c("PointID", "Year")] %>% mutate(PTYR = paste(PointID, Year, sep="_")) # Randomly select 2/3 points for training data--- set.seed(24) samp1 <- unique(PTresample$PointID) samp1 <- PTresample %>% mutate(Year = sample(c(2015, 2016), length(samp1), replace=T)) samp2 <- PTnew2016[sample(nrow(PTnew2016), (2/3)*nrow(PTnew2016), replace=F),] samp3 <- PTveg2015[sample(nrow(PTveg2015), (2/3)*nrow(PTveg2015), replace=F),] samplepts <- samp1 %>% full_join(samp2) %>% full_join(samp3) # # Full data join mpfull <-MPOINT.dens %>% dplyr::select("PointID", "Year", grep("_dens", colnames(MPOINT.dens), value=T)) %>% left_join(MPOINT.veg, by=c("PointID", "Year")) %>% left_join(MPOINT.remote, by=c("PointID")) traindat <- mpfull %>% filter(PTYR %in% samplepts$PTYR) %>% dplyr::select(PointID, Year, grep("dens", colnames(mpfull)), all_of(chosenfield), all_of(chosenremote)) testdat <- mpfull %>% filter(!PTYR %in% samplepts$PTYR) %>% filter(!is.na(AMBI_dens) & !is.na(WOODYP)) %>% dplyr::select(PointID, Year, grep("dens", colnames(mpfull)), all_of(chosenfield), all_of(chosenremote)) rm(samp1, samp2, samp3, PTnoveg, PTnew2016, PTresample, PTveg2015, samplepts) # Old version in loop--- # thesepoints <- as.integer(sample(rownames(vegALL), 84, replace=F)) # sampledat <- data.frame(PointID=vegALL$PointID[c(thesepoints)], Year=vegALL$Year[thesepoints]) # randomly select training data # for(ii in 1:nrow(sampledat)){ # if(sampledat$PointID[ii] %in% PTresample){ # sampledat$Year[ii] <- sample(c("2015","2016"),1) # } else if(sampledat$PointID[ii] %in% PTveg2015){ sampledat$Year[ii] <- "2015" # } else if(sampledat$PointID[ii] %in% PTnew2016){ sampledat$Year[ii] <- "2016"} # } # sampledat$YRPT <- paste("B", paste(sampledat$Year, sampledat$PointID, sep=""), sep="") # # write.csv(sampledat, "data/trainingdataindex2.csv", row.names = F) # # 4a) Train FIELD VEGETATION models----------------------------------------- # Create generalized linear models with training data # glm(family="gaussian") # try forward, backward, both selection using stepAIC() # step-wise regression model selection w AIC. get best model for field vegetation variables, then get best model for remote variables, # then put the vars from field veg best models with remote vars to see if some drop out or can be improved as explanation for bird density # firstmodel <- "the function I need" (AMBI_dens ~ covar1 + covar2 + ..., data=traindat, "other arguments for model/curve/parameters/output") # stepAIC(fit2, direction="both", scope=list(upper=fit1, lower=fit2)) # stepAIC() uses criteria k=2 # use step() or stepAIC() or stepwise(from internet) or something from library(leaps) # the two functions stepAIC() & step() give same result, both default backwards selection # caret() includes cross-validation of model selection # no.na.data <- na.omit(traindat[c(chosenfield, response)]) # takes out missing NA data ?same as complete.cases? # sitevarglm <- glm(formula=as.formula(paste(paste(response,'~', sep='') # paste(predictors,collapse='+'), sep='')), no.na.data, family=gaussian) # can use offset with glm() too, family=gaussian is default library(MASS) library(stats) # log-regression off-set ?(WTF?) - is this what Spencer was talking about? certain curve types can't be at 0 so need to adjust/push data with offset? # ns(), lm(), glm(), gls() # multimodel <- lmer(thissp ~ TargWetsqkm600 + HGTMAXmean.m + LOVOmean.m + HIVOmean.m + DEADP + WATDmean.m + # Targ.covShrub01 -1 +(1|Year), data=MPOINT.birdveg, REML=F) # multimodel <- lmer(get(thissp) ~ TargHA100 + AllWet600 + Mean.Deadhgt + Mean.MaxHgt + P.plotsTYtouch + N.Species # + Range.WetDepth -1 +(1|Year), data=results3, REML=F) # start with field veg covars--- #' AMBI sitevarglm.all <- glm(AMBI_dens ~ ., data = traindat[c(chosenfield, "AMBI_dens")]) summary(sitevarglm.all) sitevarglm.none <- glm(AMBI_dens ~ 1, data = traindat[c(chosenfield, "AMBI_dens")]) summary(sitevarglm.none)$coeff # coeff already built into the output stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd , data=traindat[c(chosenfield, "AMBI_dens")]) #' PBGR sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(chosenfield, "PBGR2_dens")]) summary(sitevarglm.all) sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(chosenfield, "PBGR2_dens")]) summary(sitevarglm.none) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, "PBGR2_dens")]) #' SORA sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(chosenfield, "SORA_dens")]) sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, "SORA_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, "SORA_dens")]) #' VIRA sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(chosenfield, "VIRA_dens")]) sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, "VIRA_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both", scope=list(upper= sitevarglm.all, lower=sitevarglm.none)) stepboth$anova vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, "VIRA_dens")]) rm(sitevarglm.all, sitevarglm.none, stepboth) # Done with veg models************************************************* # Try to bootstrap glm stepwise regression--- # does not seem to bootstrap sampling though, but rather tests the full dataset # https://stackoverflow.com/questions/61333752/bootstrap-with-stepwise-regression-in-r library(dplyr) library(boot) ddat <- mpfull %>% dplyr::select(AMBI_dens, all_of(chosenfield)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(AMBI_dens ~ ., data=ddat)) bhatA <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatA) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$AMBI_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$AMBI_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", trace=0) bhatA[b,names(coef(fit))] <- coef(fit) bhatA[b, "AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) colSums(!is.na(bhatA)) # ddat <- mpfull %>% dplyr::select(PBGR2_dens, all_of(chosenfield)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(PBGR2_dens ~ ., data=ddat)) bhatP <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatP) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$PBGR2_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$PBGR2_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", trace=0) bhatP[b,names(coef(fit))] <- coef(fit) bhatP[b, "AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) colSums(!is.na(bhatP)) # # https://www.statmethods.net/advstats/bootstrapping.html # # function to obtain regression weights # bs <- function(formula, data, indices) { # d <- data[indices,] # allows boot to select sample # fit <- lm(formula, data=d) # return(coef(fit)) # } # # bootstrapping with 1000 replications # results <- boot(data=ddat, statistic=bs, # R=10, formula=ddat$PBGR2_dens ~ # ddat$HGTMAXsd + ddat$WATDMEAN + # ddat$HGTMAXMEAN0 + ddat$N.Species + # ddat$LITMAX + ddat$WOODYP + # ddat$N.plotsTYtouch + ddat$TouchTYsd) # # view results # results # 4b) Train REMOTELY SENSED models-------------------------------------------- # next with remote covars--- # traindat <- trainremote #' AMBI sitevarglm.all <- glm(AMBI_dens ~ ., data=traindat[c(chosenremote, "AMBI_dens")]) sitevarglm.none <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")]) #' PBGR sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(chosenremote, "PBGR2_dens")]) sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(chosenremote, "PBGR2_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop #' SORA sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(chosenremote, "SORA_dens")]) sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(chosenremote, "SORA_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")]) #' VIRA sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(chosenremote, "VIRA_dens")]) sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(chosenremote, "VIRA_dens")]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")]) rm(sitevarglm.all, sitevarglm.none, stepboth) # 5.2.18 so far AMBI and PBGR best with empty model, SORA ~ WATDMEAN, VIRA ~ LITcv. NONE significant pvals. # Try to bootstrap data sampling and stepwise regression--- # https://stackoverflow.com/questions/61333752/bootstrap-with-stepwise-regression-in-r ddat <- mpfull %>% dplyr::select(AMBI_dens, all_of(chosenremote)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(AMBI_dens ~ ., data=ddat)) bhatA <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatA) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$AMBI_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$AMBI_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0) bhatA[b,names(coef(fit))] <- coef(fit) bhatA[b,"AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) # ddat <- mpfull %>% dplyr::select(PBGR2_dens, all_of(chosenremote)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(PBGR2_dens ~ ., data=ddat)) bhatP <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatP) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$PBGR2_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$PBGR2_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0) bhatP[b,names(coef(fit))] <- coef(fit) bhatP[b, "AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) # ddat <- mpfull %>% dplyr::select(SORA_dens, all_of(chosenremote)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(SORA_dens ~ ., data=ddat)) bhatS <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatS) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$SORA_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$SORA_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0) bhatS[b,names(coef(fit))] <- coef(fit) bhatS[b, "AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) # ddat <- mpfull %>% dplyr::select(VIRA_dens, all_of(chosenremote)) ddat <- ddat[complete.cases(ddat),] B <- 300 #number of bootstrap samples n <- nrow(ddat) d <- ncol(ddat) - 1 full_mdl = colnames(model.matrix(VIRA_dens ~ ., data=ddat)) bhatV <- matrix(NA, nrow = B, ncol = length(full_mdl)+1) colnames(bhatV) = c(full_mdl, "AIC") for(b in 1:B) { samp <- ddat[sample(1:n, n, replace=FALSE), c(1:ncol(ddat))] null <- glm(ddat$VIRA_dens ~ 1, data=samp, na.action = "na.omit") full <- glm(ddat$VIRA_dens ~ ., data=samp, na.action = "na.omit") fit <- step(null, scope=formula(full), direction="both", k=log(n), trace=0) bhatV[b,names(coef(fit))] <- coef(fit) bhatV[b, "AIC"] <- fit$aic } rm(ddat, B, n, d, full_mdl, samp, null, full, fit) # # https://www.statmethods.net/advstats/bootstrapping.html # # function to obtain regression weights # bs <- function(formula, data, indices) { # d <- data[indices,] # allows boot to select sample # fit <- lm(formula, data=d) # return(coef(fit)) # } # # bootstrapping with 1000 replications # results <- boot(data=ddat, statistic=bs, # R=10, formula=ddat$PBGR2_dens ~ # ddat$HGTMAXsd + ddat$WATDMEAN + # ddat$HGTMAXMEAN0 + ddat$N.Species + # ddat$LITMAX + ddat$WOODYP + # ddat$N.plotsTYtouch + ddat$TouchTYsd) # # view results # results rm(bhatA, bhatP, bhatS, bhatV) # 4c) Train BEST REMOTE + VEG models------------------------------- # Get variables chosen in best remote models--- # aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(chosenfield, response)]) # arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")]) summary(arem)$coefficients arem.var <- names(arem$coefficients)[-1] # pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, response)]) # prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop summary(prem)$coefficients prem.var <- names(prem$coefficients)[-1] # sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, response)]) # srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")]) summary(srem)$coefficients srem.var <- names(srem$coefficients)[-1] # vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, response)]) # vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")]) summary(vrem)$coefficients vrem.var <- names(vrem$coefficients)[-1] # Finally, train combined models (the best remote model + field veg covars)--- # then, in cross-validation test predication power of just best-remote variables # to prediction power of best-remote + adding data by going into the field # AMBI response <- "AMBI_dens" sitevarglm.all <- glm(AMBI_dens ~ ., data=traindat[c(arem.var, chosenfield, response)]) sitevarglm.none <- glm(AMBI_dens ~ 1, data=traindat[c(arem.var, chosenfield, response)]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova aboth <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(arem.var, chosenfield, response)]) # PBGR response <- "PBGR2_dens" sitevarglm.all <- glm(PBGR2_dens ~ ., data=traindat[c(prem.var, chosenfield, response)]) sitevarglm.none <- glm(PBGR2_dens ~ 1, data=traindat[c(prem.var, chosenfield, response)]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova pboth <- glm(PBGR2_dens ~ WATDMEAN + AllWetHA600, data=traindat[c(prem.var, chosenfield, response)]) # NWImodA.prop # SORA response <- "SORA_dens" sitevarglm.all <- glm(SORA_dens ~ ., data=traindat[c(srem.var, chosenfield, response)]) sitevarglm.none <- glm(SORA_dens ~ 1, data=traindat[c(srem.var, chosenfield, response)]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova sboth <- glm(SORA_dens ~ HGTMAXsd + AllWetHA600 + RatioopenDIVemerg, data=traindat[c(srem.var, chosenfield, response)]) # VIRA response <- "VIRA_dens" sitevarglm.all <- glm(VIRA_dens ~ ., data=traindat[c(vrem.var, chosenfield, response)]) sitevarglm.none <- glm(VIRA_dens ~ 1, data=traindat[c(vrem.var, chosenfield, response)]) stepboth <- stepAIC(sitevarglm.all, direction = "both") stepboth$anova vboth <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(vrem.var, chosenfield, response)]) rm(response, sitevarglm.all, sitevarglm.none, stepboth) # # Try to bootstrap GLM (but does not do stepwise comparisons) # # uses 5-fold validation # # https://quantdev.ssri.psu.edu/tutorials/cross-validation-tutorial # model_caret <- train(SORA_dens ~ HGTMAXMEAN0+LITMAX+ HGTMAXsd+WATDMEAN+WOODYP+ # N.Species+TouchTYsd+Logtargwet+ Wetlandshallow.marsh_prop+ NWImodA.prop+ # Wetlandseasonally_prop+Hayand.Pasture_prop+RatioopenDIVemerg+AllWetHA600+ # WOODYP + LITMAX, # model to fit # data = mpfull, # # trControl = data_ctrl, # folds # method = "glm", # specifying regression model # na.action = na.pass) # pass missing data to model - some models will handle this # model_caret$finalModel # model_caret$resample # # randomly select half of the sample # set.seed(0123) # half_size <- floor(0.50 * nrow(data)) # random_sample <- sample(seq_len(nrow(data)), size = half_size) # first_half_data <- data[random_sample, ] # # # run the model # first_half <- lm(ACT ~ gender + age + SATV + SATQ, data = first_half_data) # summary(first_half) # # #select half of data not used in the above model # second_half_data <- data[-random_sample, ] # # # run the model # second_half <- lm(ACT ~ gender + age + SATV + SATQ, data = second_half_data) # summary(second_half) # rm(model_caret, random_sample, first_half_data, first_half, second_half, second_half_data, half_size) # 4) Test remote and field veg and both models-------------------------------------- library(caret) # caret() # AIC # R-squared # RMSE # Get best models--- # aveg <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(chosenfield, response)]) # arem <- glm(AMBI_dens ~ 1, data=traindat[c(chosenremote, "AMBI_dens")]) # aboth <- glm(AMBI_dens ~ WOODYP + HGTMAXsd, data=traindat[c(arem.var, chosenfield, response)]) # pveg <- glm(PBGR2_dens ~ WATDMEAN, data=traindat[c(chosenfield, response)]) # prem <- glm(PBGR2_dens ~ AllWetHA600, data=traindat[c(chosenremote, "PBGR2_dens")]) # NWImodA.prop # pboth <- glm(PBGR2_dens ~ WATDMEAN + AllWetHA600, data=traindat[c(prem.var, chosenfield, response)]) # NWImodA.prop # sveg <- glm(SORA_dens ~ 1, data=traindat[c(chosenfield, response)]) # srem <- glm(SORA_dens ~ AllWetHA600 + RatioopenDIVemerg, data=traindat[c(chosenremote, "SORA_dens")]) # sboth <- glm(SORA_dens ~ HGTMAXsd + AllWetHA600 + RatioopenDIVemerg, data=traindat[c(srem.var, chosenfield, response)]) # vveg <- glm(VIRA_dens ~ 1, data=traindat[c(chosenfield, response)]) # vrem <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(chosenremote, "VIRA_dens")]) # vboth <- glm(VIRA_dens ~ Targ.covShrubwet01, data=traindat[c(vrem.var, chosenfield, response)]) # Make predictions and compute the R2, RMSE and MAE vegpred <- aveg %>% predict(testdat) rempred <- arem %>% predict(testdat) bothpred<- aboth %>% predict(testdat) data.frame( RMSEveg = RMSE(vegpred, testdat$AMBI_dens), RMSErem = RMSE(rempred, testdat$AMBI_dens), RMSEboth = RMSE(bothpred, testdat$AMBI_dens)) # vegpred <- pveg %>% predict(testdat) rempred <- prem %>% predict(testdat) bothpred<- pboth %>% predict(testdat) data.frame( RMSEveg = RMSE(vegpred, testdat$PBGR2_dens), RMSErem = RMSE(rempred, testdat$PBGR2_dens), RMSEboth = RMSE(bothpred, testdat$PBGR2_dens)) # vegpred <- sveg %>% predict(testdat) rempred <- srem %>% predict(testdat) bothpred<- sboth %>% predict(testdat) data.frame( RMSEveg = RMSE(vegpred, testdat$SORA_dens), RMSErem = RMSE(rempred, testdat$SORA_dens), RMSEboth = RMSE(bothpred, testdat$SORA_dens)) # vegpred <- vveg %>% predict(testdat) rempred <- vrem %>% predict(testdat) bothpred<- vboth %>% predict(testdat) data.frame( RMSEveg = RMSE(vegpred, testdat$VIRA_dens), RMSErem = RMSE(rempred, testdat$VIRA_dens), RMSEboth = RMSE(bothpred, testdat$VIRA_dens)) # # Other model validation code--- # data.frame( R2 = R2(vegpred, testdat$AMBI_dens), # RMSE=RMSE(vegpred, testdat$AMBI_dens), # MAE =MAE(vegpred, testdat$AMBI_dens)) # # data.frame( R2 = R2(rempred, testdat$AMBI_dens), # RMSE=RMSE(rempred, testdat$AMBI_dens), # MAE =MAE(rempred, testdat$AMBI_dens)) # # data.frame( R2 = R2(bothpred, testdat$AMBI_dens), # RMSE=RMSE(bothpred, testdat$AMBI_dens), # MAE =MAE(bothpred, testdat$AMBI_dens)) # # RMSE(rempred, testdat$AMBI_dens)/mean(testdat$AMBI_dens) # compare <- function(orig_data, i){ # # create the resampled data # train_data <- orig_data[i, ] # test_data <- orig_data # ie the full original sample # # # fit the three modelling processes # model_step <- model_process_step(train_data) # model_vif <- model_process_vif(train_data) # model_full <- lm(MedianIncome ~ ., data = train_data) # # # predict the values on the original, unresampled data # predict_step <- predict(model_step, newdata = test_data) # predict_vif <- predict(model_vif, newdata = test_data) # predict_full <- predict(model_full, newdata = test_data) # # # return a vector of 6 summary results # results <- c( # step_R2 = R2(predict_step, test_data$MedianIncome), # vif_R2 = R2(predict_vif, test_data$MedianIncome), # full_R2 = R2(predict_full, test_data$MedianIncome), # step_RMSE = RMSE(predict_step, test_data$MedianIncome), # vif_RMSE = RMSE(predict_vif, test_data$MedianIncome), # full_RMSE = RMSE(predict_full, test_data$MedianIncome) # ) # return(results) # } # is there way to bootstrap data sampling into regression and count the number of times variables in top model? # is there a way to change up order of variables in stepwise regression testing? # is it a problem to put area wetland in as covar when it is already incorporated in the density? # cross validation is to test a model's ability to predict new data # (so we use part of data to build model and then predict, compare predicted to the other unused part of data - did model come close to predicting?) # compare the RMSE (root mean square error) of model predictions of bird density to the observed bird density # so observed data is "true" line/relationship. we measure how far away predicted observations are from this truth # # END models********************************************************************************************************* # library(ezknitr) ezspin("02_Morris_modeling.R", out_dir = "output", fig_dir = "figures", keep_md=FALSE)