# Coldwater Habitat Reconstruction Model - MGLP Data \ #recalculated TDO3 using updated data #model selection using cross validation on training set, calculate RMSE on test set #Feb 2022 log transform geometry ratio to deal with wiggliness issues library(mgcv) library(graphics) library(ggplot2) library(gridExtra) library(grid) library(ggmap) library(maps) library(rms) library(dplyr) library(gamclass) library(SiZer) library(readr) library(gratia) library(cowplot) # Input data # ======================================================= T<-read.csv("Data - MGLP TDOx Models2.csv") T$geom_ratio<-(T$surface_area*10000)^0.25/T$max_depth T$watershed_dist<-T$NLCDDEV_PC+T$NLCDPAST_PC+T$NLCDCROP_PC #ag and urban disturbance T$cisco_status<-as.character(T$cisco_status) #U = unknown. Will call absent T$cisco_status[T$cisco_status=="U"]<-"A" T$cisco_status[T$cisco_status==""]<-"A" #replace blanks with "A" for absent T$cisco_status[is.na(T$cisco_status)]<-"A" #also replace NA values T$cisco_status_o<-factor(T$cisco_status,levels=c("V","R","X","A")) T$cisco_status[T$cisco_status=="U"]<-"V" T$cisco_status[T$cisco_status==""]<-"A" #replace blanks with "A" for absent T$cisco_status[is.na(T$cisco_status)]<-"A" #also replace NA values T$cisco_status_o<-factor(T$cisco_status,levels=c("V","R","X","A")) T$cisco_status_name=NA T$cisco_status_name[T$cisco_status_o=="V"]="Viable" T$cisco_status_name[T$cisco_status_o=="R"]="Rare" T$cisco_status_name[T$cisco_status_o=="X"]="Extirpated" T$cisco_status_name[T$cisco_status_o=="A"]="Absent" #log transform geometry ratio T$geom.ratio.original=T$geom_ratio T$geom_ratio=log(T$geom.ratio.original) head(T) nrow(T) # TDO3 Models # ======================================================= M<-subset(T,tdo3.obs>0&!is.na(geom_ratio)&watershed_dist>=0) #records with model data nrow(M) #first, identify appropriate k values using gam.check on training. Get all terms above p-value 0.05 with lowest possible k #then test using cross validation and hold out test set for error #split into test and training for unbiased error estimate #what percent? test.size=.15 set.seed(12) testing=M[sample(nrow(M), test.size*nrow(M)), ] training=anti_join(M, testing) summary(testing) summary(training) ############################################################################### #gam check for tuning k parameter,constrained by the wiggliness not being ecologically unrealistic #build 3 models using different air temp metrics. Tune k for each using gam check. TDO3Mod1<-gam(data=M,tdo3.obs~te(watershed_dist,air_temp_annual, k=3)+s(geom_ratio, bs="cs"), method="REML") # mean annual temp gam.check(TDO3Mod1) draw(TDO3Mod1) summary(TDO3Mod1) TDO3Mod1.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_annual+s(geom_ratio, bs="cs"), method="REML") # mean annual temp linear interaction gam.check(TDO3Mod1.1) TDO3Mod1.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_annual+s(geom_ratio, bs="cs"), method="REML") # mean annual temp linear no interaction gam.check(TDO3Mod1.2) ###################################### TDO3Mod2<-gam(data=training,tdo3.obs~te(watershed_dist,air_temp_summer, bs="tp", k=3)+s(geom_ratio, bs="cs"), method="REML") # mean summer temp gam.check(TDO3Mod2) draw(TDO3Mod2) summary(TDO3Mod2) TDO3Mod2.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_summer+s(geom_ratio, bs="cs"), method="REML") # mean summer temp linear interaction gam.check(TDO3Mod2.1) TDO3Mod2.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_summer+s(geom_ratio, bs="cs"), method="REML") # mean summer temp linear no interaction gam.check(TDO3Mod2.2) ################################################## TDO3Mod3<-gam(data=training,tdo3.obs~te(watershed_dist,air_temp_july, bs="tp", k=3)+s(geom_ratio, bs="cs"), method="REML") # mean july temp gam.check(TDO3Mod3) draw(TDO3Mod3) summary(TDO3Mod3) #linear TDO3Mod3.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_july+s(geom_ratio, bs="cs"), method="REML") # mean july temp linear interaction gam.check(TDO3Mod3.1) summary(TDO3Mod3.1) #linear TDO3Mod3.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_july+s(geom_ratio, bs="cs"), method="REML") # mean july temp linear no interaction gam.check(TDO3Mod3.2) summary(TDO3Mod3.2) #################################################################################################### #k folds cross validation # k-fold cross validation df <- training df <- df[sample(nrow(df)), ] # randomly shuffles the data. k <- nrow(df) # Number of folds. Note k=nrow(df) in the leave-one-out cross validation. folds <- cut(seq(from=1, to=nrow(df)), breaks=k, labels=FALSE) # creates unique numbers for k equally size folds. df$ID <- folds # adds fold IDs. df[paste("pred", 1:9, sep="")] <- NA # adds multiple columns "pred1"..."pred9" to speed up the following loop. for(i in 1:k) { # looping for different models: m1 <- update(TDO3Mod1, method="ML", data=df, subset=(ID != i)) m2 <- update(TDO3Mod1.1, method="ML", data=df, subset=(ID != i)) m3 <- update(TDO3Mod1.2, method="ML", data=df, subset=(ID != i)) m4 <- update(TDO3Mod2, method="ML", data=df, subset=(ID != i)) m5 <-update(TDO3Mod2.1, method="ML", data=df, subset=(ID != i)) m6<-update(TDO3Mod2.2, method="ML", data=df, subset=(ID != i)) m7 <-update(TDO3Mod3, method="ML", data=df, subset=(ID != i)) m8<-update(TDO3Mod3.1, method="ML", data=df, subset=(ID != i)) m9<-update(TDO3Mod3.2, method="ML", data=df, subset=(ID != i)) # looping for predictions: df[df$ID==i, "pred1"] <- predict(m1, df[df$ID==i, ], type="response") df[df$ID==i, "pred2"] <- predict(m2, df[df$ID==i, ], type="response") df[df$ID==i, "pred3"] <- predict(m3, df[df$ID==i, ], type="response") df[df$ID==i, "pred4"] <- predict(m4, df[df$ID==i, ], type="response") df[df$ID==i, "pred5"] <- predict(m5, df[df$ID==i, ], type="response") df[df$ID==i, "pred6"] <- predict(m6, df[df$ID==i, ], type="response") df[df$ID==i, "pred7"] <- predict(m7, df[df$ID==i, ], type="response") df[df$ID==i, "pred8"] <- predict(m8, df[df$ID==i, ], type="response") df[df$ID==i, "pred9"] <- predict(m9, df[df$ID==i, ], type="response") print(i) } # calculating residuals: df$res1 <- with(df, tdo3.obs - pred1) df$res2 <- with(df, tdo3.obs - pred2) df$res3 <- with(df, tdo3.obs - pred3) df$res4 <- with(df, tdo3.obs - pred4) df$res5 <- with(df, tdo3.obs - pred5) df$res6 <- with(df, tdo3.obs - pred6) df$res7 <- with(df, tdo3.obs - pred7) df$res8 <- with(df, tdo3.obs - pred8) df$res9 <- with(df, tdo3.obs - pred9) Model <- paste("m", 1:9, sep="") # creates a vector of model names. # creating a vector of mean-square errors (MSE): MSE <- with(df, c( sqrt(mean(res1^2)), sqrt(mean(res2^2)), sqrt(mean(res3^2)), sqrt(mean(res4^2)), sqrt(mean(res5^2)), sqrt(mean(res6^2)), sqrt(mean(res7^2)), sqrt(mean(res8^2)), sqrt(mean(res9^2)) )) model.mse <- data.frame(Model, MSE) # creates a data frame of model names, mean-square errors and coefficients of determination. #create table write.csv(model.mse, "tdo3_model_selection_rmse_updated.csv", row.names=F) ######################################## #refit with REML -all m1 <- update(TDO3Mod1, method="REML", data=training) m2 <- update(TDO3Mod1.1, method="REML", data=training) m3 <- update(TDO3Mod1.2, method="REML", data=training) m4 <- update(TDO3Mod2, method="REML", data=training) m5 <-update(TDO3Mod2.1, method="REML", data=training) m6<-update(TDO3Mod2.2, method="REML", data=training) m7 <-update(TDO3Mod3, method="REML", data=training) m8<-update(TDO3Mod3.1, method="REML", data=training) m9<-update(TDO3Mod3.2, method="REML", data=training) #unbiased RMSE testing$predictions.1=predict.gam(m1, testing) testing$predictions.2=predict.gam(m2, testing) testing$predictions.3=predict.gam(m3, testing) testing$predictions.4=predict.gam(m4, testing) testing$predictions.5=predict.gam(m5, testing) testing$predictions.6=predict.gam(m6, testing) testing$predictions.7=predict.gam(m7, testing) testing$predictions.8=predict.gam(m8, testing) testing$predictions.9=predict.gam(m9, testing) testing$error1=testing$predictions.1-testing$tdo3.obs testing$error2=testing$predictions.2-testing$tdo3.obs testing$error3=testing$predictions.3-testing$tdo3.obs testing$error4=testing$predictions.4-testing$tdo3.obs testing$error5=testing$predictions.5-testing$tdo3.obs testing$error6=testing$predictions.6-testing$tdo3.obs testing$error7=testing$predictions.7-testing$tdo3.obs testing$error8=testing$predictions.8-testing$tdo3.obs testing$error9=testing$predictions.9-testing$tdo3.obs mse1=sqrt(mean(testing$error1^2)) mse2=sqrt(mean(testing$error2^2)) mse3=sqrt(mean(testing$error3^2)) mse4=sqrt(mean(testing$error4^2)) mse5=sqrt(mean(testing$error5^2)) mse6=sqrt(mean(testing$error6^2)) mse7=sqrt(mean(testing$error7^2)) mse8=sqrt(mean(testing$error8^2)) mse9=sqrt(mean(testing$error9^2)) print(paste(mse1, mse2, mse3, mse4, mse5,mse6, mse7,mse8, mse9, sep=",")) #refit with all data #final model final.model.name=model.mse %>% slice_min(MSE) %>% select(Model) final.model<-update(m8, data=M) summary(final.model) gam.check(final.model) # plot windows() par(mfrow=c(1,2)) plot(final.model,residuals=TRUE,rug=FALSE) windows() par(mfrow=c(2,2)) gam.check(final.model) #################################### #plots summary(final.model) # pulling the prediction and residual data from the model M %<>% mutate(resid = resid(final.model), predict = predict(final.model)) #gratia draw(final.model, residuals=TRUE)&theme_bw()&xlab(expression('Log'[e]*'(geometry ratio)'))&ggtitle(element_blank()) ggsave("geometry_ratio_effect_tdo3.png", height=4, width=4, units="in") ############################## ########################### windows() par(mfrow=c(2,2)) vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=0), main="Geometry ratio=1", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1, n.grid=50, xlab=NA, ylab=expression("Mean July air temperature ("*degree*"C)")) #windows() vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=0.409), main="Geometry ratio=1.5", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, xlab=NA, ylab=NA) #windows() vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=.69), main="Geometry ratio=2", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, ylab=expression("Mean July air temperature ("*degree*"C)"), xlab="Proportion watershed developed") #windows() vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=1.6), main="Geometry ratio=5", zlim=c(3,25), nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, xlab="Proportion watershed developed", ylab=NA) ######################################################################### ############################################################################## # Predict TDO3 for all MGLP lakes with depth and watershed data # ======================================================= IP<-select(T, mglp_id, geom_ratio, watershed_dist, air_temp_july, tdo3.obs, cisco_status_name, cisco_status_o) P<-subset(IP,geom_ratio<=max(M$geom_ratio)& !is.na(watershed_dist)) # minimal extrapolation of geometry ratios nrow(P) #records with complete set of predictor variables # predict TDO3 P$tdo3_est<-predict(final.model,P) # integrate presence absence data P$presence_absence<-P$cisco_status_o # generate presence absence variable P$presence_absence<-factor(P$presence_absence,levels=c("V","R","X","A","P")) P$presence_absence[P$presence_absence=="X"]<-"A" P$presence_absence[P$presence_absence!="A"]<-"P" ########################################################################## #step 2 is identify thresholds separating tiers. #this is done by predicting cisco presence absence from actual TDO3 #and then identifying niche boundaries #again following Jacobson et al. 2010 # integrate presence absence data T$presence_absence<-T$cisco_status_o # generate presence absence variable T$presence_absence<-factor(T$presence_absence,levels=c("V","R","X","A","P")) T$presence_absence[T$presence_absence=="X"]<-"A" T$presence_absence[T$presence_absence!="A"]<-"P" T=T %>% mutate(present=ifelse(presence_absence=="P", tdo3.obs, NA), absent=ifelse(presence_absence=="A", tdo3.obs, NA)) # Models using measured TDO3 ActCiscoMod2<-gam(data=T,presence_absence~s(tdo3.obs, bs="cs"),family=binomial(link=logit), REML=TRUE) summary(ActCiscoMod2) gam.check(ActCiscoMod2) plot(ActCiscoMod2,select=1,ylim=c(-2,2)) draw(ActCiscoMod2) # Heegaard (2002) Central and Outer Bounds xTempAtdo3=seq(1,30,0.1) response<-predict(ActCiscoMod2,data.frame(tdo3.obs=xTempAtdo3),type='response') response_se <- predict(ActCiscoMod2,data.frame(tdo3.obs=xTempAtdo3),type='response', se.fit = TRUE)$se.fit plot(xTempAtdo3, response) max_response<-max(response) central_response<-max(response)*exp(-0.5) outer_response<-max(response)*exp(-2) temp_at_max_response<-xTempAtdo3[which.max(response)] temp_at_central_response<-approx(response,xTempAtdo3,central_response)$y temp_at_outer_response<-approx(response,xTempAtdo3,outer_response)$y temp_at_max_response temp_at_central_response temp_at_outer_response tier.1=temp_at_central_response tier.2=temp_at_outer_response toplot=data.frame(xTempAtdo3, response) #create area for plotting regions #Breaks for background rectangles rects <- data.frame(xstart = c(0,temp_at_central_response,temp_at_outer_response), xend = c(temp_at_central_response,temp_at_outer_response, Inf), col = letters[1:3]) #set up color scheme tier.colors=c("#0571b0", "#f4a582", "#ca0020") windows() ggplot(toplot, aes(xTempAtdo3,response))+ annotate("rect",xmin=temp_at_central_response, xmax=temp_at_outer_response, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#f4a582")+ annotate("rect",xmax=temp_at_central_response, xmin=0, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#0571b0")+geom_ribbon(aes(ymin = response - 2*response_se, ymax = response + 2*response_se), alpha = 0.2)+geom_path(lwd=1.5)+ labs(x=expression(Temperature~at~3~mg/L~O[2]~"("*degree*"C)"), y="Probability of occurrence")+theme_bw()+theme(panel.grid=element_blank())+geom_text(x=6, y=0.1, label="Tier 1")+geom_text(x=19, y=0.1, label="Tier 2") +geom_text(x=27, y=0.1, label="Tier 3")+ annotate("rect",xmax=Inf, xmin=temp_at_outer_response, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#ca0020")+geom_rug(data=T, aes(x=present), sides="t", inherit.aes = FALSE, alpha=.7, colour="darkgrey")+geom_rug(data=T, aes(x=absent), sides="b", inherit.aes=FALSE, alpha=.7, colour="darkgrey") ggsave("tdo3_response_curve_with_tiers_new_profiles_Ribbon.png", height=4, width=4, units="in") ###################### #error plots #################################### #plot predicted and observed from testing #windows() err1=ggplot(testing, aes(tdo3.obs,predictions.8, fill=error8))+geom_abline(slope=1, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+ylab(expression(atop("Predicted temperature", "at 3 mg/L of " *O[2]*" ("*degree*"C)")))+xlab(expression(atop("Observed temperature", "at 3 mg/L of " *O[2]*" ("*degree*"C)"))) #ggsave("predicted_observed_tdo3_withheld_data.png", height=4, width=4, units="in") #windows() err2=ggplot(testing, aes(geom_ratio,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab(expression('Log'[e]*'(geometry ratio)'))+ylab(expression("Model error ("*degree*"C)")) #ggsave("geom_ratio_error_withheld_data.png", height=4, width=4, units="in") #windows() err3=ggplot(testing, aes(watershed_dist,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab("Watershed disturbance")+ylab(expression("Model error ("*degree*"C)")) #ggsave("ws_dist_error_withheld_data.png", height=4, width=4, units="in") #windows() err4=ggplot(testing, aes(air_temp_july ,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab(expression("July air temp ("*degree*"C)"))+ylab(expression("Model error ("*degree*"C)")) #ggsave("july_air_temp_error_withheld_data.png", height=4, width=4, units="in") #read in lat/long mglp=read_csv("MGLP_Centroids_Lake.csv") testing=merge(testing, mglp, by.x="mglp_id", by.y="LAKE_ID") #map errors all_states <- map_data("state") states <- subset(all_states, region %in% c("indiana" , "minnesota", "michigan", "wisconsin", "iowa", "illinois", "south dakota", "north dakota") ) windows() p <- ggplot() p <- p + geom_polygon( data=states, aes(x=long, y=lat, group = group),colour="black", fill="grey90" ) #p=p+geom_point( data=mglp, aes(Lon, Lat), size=.5, colour="grey90") p=p+geom_point( data=testing, aes(Lon, Lat, fill=error8), size=1.5, shape=21, colour="darkgrey") p=p+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse') p=p+theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.ticks = element_blank(), axis.text= element_blank(), legend.position=c(.2,.25) , legend.key = element_blank(),legend.background = element_blank(), legend.title=element_text(size=14),legend.text=element_text(size=14), axis.title.y=element_blank(), axis.title.x=element_blank(), plot.title = element_text(face = "bold", size = (15), hjust=.5, vjust=-1), ) p ggsave("MGLP_error_map_new.png", height=5, width=6, units="in") legend <- get_legend( err1 +guides(color = guide_legend()) + theme(legend.position = "right")) ########################################################## #cowplot for multipanel figure windows() plots1=plot_grid( err2+theme(legend.position = "none"), err3+theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y=element_blank()), err4+theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y=element_blank()),labels=c('A', 'B', 'C'), ncol=3, align='vh') #bottom row plots2=plot_grid(err1+theme(legend.position = "none"), p+theme(legend.position = "none"), legend, ncol=3, rel_widths = c(1,1.5, .2),labels=c('D', 'E', '')) plot_grid(plots1, plots2, rel_heights = c(1, 1), ncol=1) ggsave("error_plots_testing_data.png", height=6, width=8.5, units = "in")