# Coldwater Habitat Reconstruction Model - MGLP Data  \
#recalculated TDO3 using updated data 
#model selection using cross validation on training set, calculate RMSE on test set 
#Feb 2022 log transform geometry ratio to deal with wiggliness issues
library(mgcv)
library(graphics)
library(ggplot2)
library(gridExtra)
library(grid)
library(ggmap)
library(maps)
library(rms)
library(dplyr)
library(gamclass)
library(SiZer)
library(readr)
library(gratia)
library(cowplot)


# Input data
# =======================================================
T<-read.csv("Data - MGLP TDOx Models2.csv")
T$geom_ratio<-(T$surface_area*10000)^0.25/T$max_depth
T$watershed_dist<-T$NLCDDEV_PC+T$NLCDPAST_PC+T$NLCDCROP_PC #ag and urban disturbance
T$cisco_status<-as.character(T$cisco_status) 
#U = unknown. Will call absent
T$cisco_status[T$cisco_status=="U"]<-"A"
T$cisco_status[T$cisco_status==""]<-"A" #replace blanks with "A" for absent
T$cisco_status[is.na(T$cisco_status)]<-"A" #also replace NA values
T$cisco_status_o<-factor(T$cisco_status,levels=c("V","R","X","A"))
T$cisco_status[T$cisco_status=="U"]<-"V"
T$cisco_status[T$cisco_status==""]<-"A" #replace blanks with "A" for absent
T$cisco_status[is.na(T$cisco_status)]<-"A" #also replace NA values
T$cisco_status_o<-factor(T$cisco_status,levels=c("V","R","X","A"))

T$cisco_status_name=NA
T$cisco_status_name[T$cisco_status_o=="V"]="Viable"
T$cisco_status_name[T$cisco_status_o=="R"]="Rare"
T$cisco_status_name[T$cisco_status_o=="X"]="Extirpated"
T$cisco_status_name[T$cisco_status_o=="A"]="Absent"

#log transform geometry ratio
T$geom.ratio.original=T$geom_ratio
T$geom_ratio=log(T$geom.ratio.original)

head(T)
nrow(T)

# TDO3 Models
# =======================================================
M<-subset(T,tdo3.obs>0&!is.na(geom_ratio)&watershed_dist>=0) #records with model data
nrow(M)



#first, identify appropriate k values using gam.check on training. Get all terms above p-value 0.05 with lowest possible k

#then test using cross validation and hold out test set for error

#split into test and training for unbiased error estimate
#what percent? 
test.size=.15

set.seed(12)
testing=M[sample(nrow(M), test.size*nrow(M)), ]
training=anti_join(M, testing)

summary(testing)
summary(training)

###############################################################################
#gam check for tuning k parameter,constrained by the wiggliness not being ecologically unrealistic 
#build 3 models using different air temp metrics. Tune k for each using gam check.
TDO3Mod1<-gam(data=M,tdo3.obs~te(watershed_dist,air_temp_annual, k=3)+s(geom_ratio,  bs="cs"),  method="REML") # mean annual temp

gam.check(TDO3Mod1)
draw(TDO3Mod1)
summary(TDO3Mod1)

TDO3Mod1.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_annual+s(geom_ratio,  bs="cs"),  method="REML") # mean annual temp linear interaction

gam.check(TDO3Mod1.1)

TDO3Mod1.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_annual+s(geom_ratio,  bs="cs"),  method="REML") # mean annual temp linear no interaction

gam.check(TDO3Mod1.2)

######################################

TDO3Mod2<-gam(data=training,tdo3.obs~te(watershed_dist,air_temp_summer, bs="tp", k=3)+s(geom_ratio,  bs="cs"),   method="REML") # mean summer temp

gam.check(TDO3Mod2)
draw(TDO3Mod2)
summary(TDO3Mod2)

TDO3Mod2.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_summer+s(geom_ratio,  bs="cs"),  method="REML") # mean summer temp linear interaction

gam.check(TDO3Mod2.1)

TDO3Mod2.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_summer+s(geom_ratio,  bs="cs"),  method="REML") # mean summer temp linear no interaction

gam.check(TDO3Mod2.2)
##################################################


TDO3Mod3<-gam(data=training,tdo3.obs~te(watershed_dist,air_temp_july, bs="tp", k=3)+s(geom_ratio,  bs="cs"),  method="REML") # mean july temp

gam.check(TDO3Mod3)
draw(TDO3Mod3)
summary(TDO3Mod3)


#linear
TDO3Mod3.1<-gam(data=training,tdo3.obs~watershed_dist*air_temp_july+s(geom_ratio, bs="cs"),  method="REML") # mean july temp linear interaction

gam.check(TDO3Mod3.1)
summary(TDO3Mod3.1)

#linear
TDO3Mod3.2<-gam(data=training,tdo3.obs~watershed_dist+air_temp_july+s(geom_ratio,  bs="cs"),  method="REML") # mean july temp linear no interaction

gam.check(TDO3Mod3.2)
summary(TDO3Mod3.2)

####################################################################################################

#k folds cross validation

# k-fold cross validation
df <- training
df <- df[sample(nrow(df)), ] # randomly shuffles the data.
k <- nrow(df)                      # Number of folds. Note k=nrow(df) in the leave-one-out cross validation.
folds <- cut(seq(from=1, to=nrow(df)), breaks=k, labels=FALSE) # creates unique numbers for k equally size folds.
df$ID <- folds               # adds fold IDs.
df[paste("pred", 1:9, sep="")] <- NA # adds multiple columns "pred1"..."pred9" to speed up the following loop.


for(i in 1:k) {    
  # looping for different models:   
  m1 <- update(TDO3Mod1,  method="ML", data=df, subset=(ID != i)) 
  
  m2 <- update(TDO3Mod1.1,  method="ML", data=df, subset=(ID != i)) 
  
  m3 <- update(TDO3Mod1.2,  method="ML", data=df, subset=(ID != i))  
  
  m4 <- update(TDO3Mod2,  method="ML", data=df, subset=(ID != i))  
  
  m5 <-update(TDO3Mod2.1,  method="ML", data=df, subset=(ID != i)) 
  
  
  m6<-update(TDO3Mod2.2,  method="ML", data=df, subset=(ID != i)) 
  
  m7 <-update(TDO3Mod3,  method="ML", data=df, subset=(ID != i)) 
  
  
  m8<-update(TDO3Mod3.1,  method="ML", data=df, subset=(ID != i)) 
  
  
  m9<-update(TDO3Mod3.2,  method="ML", data=df, subset=(ID != i)) 

  
  # looping for predictions:
  df[df$ID==i, "pred1"] <- predict(m1, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred2"] <- predict(m2, df[df$ID==i, ], type="response")  
  df[df$ID==i, "pred3"] <- predict(m3, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred4"] <- predict(m4, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred5"] <- predict(m5, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred6"] <- predict(m6, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred7"] <- predict(m7, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred8"] <- predict(m8, df[df$ID==i, ], type="response")
  df[df$ID==i, "pred9"] <- predict(m9, df[df$ID==i, ], type="response")
    print(i)
}

# calculating residuals:
df$res1 <- with(df, tdo3.obs - pred1)
df$res2 <- with(df, tdo3.obs - pred2)    
df$res3 <- with(df, tdo3.obs - pred3)
df$res4 <- with(df, tdo3.obs - pred4)
df$res5 <- with(df, tdo3.obs - pred5)
df$res6 <- with(df, tdo3.obs - pred6)
df$res7 <- with(df, tdo3.obs - pred7)
df$res8 <- with(df, tdo3.obs - pred8)
df$res9 <- with(df, tdo3.obs - pred9)



Model <- paste("m", 1:9, sep="") # creates a vector of model names.

# creating a vector of mean-square errors (MSE):
MSE <- with(df, c(
  sqrt(mean(res1^2)), 
  sqrt(mean(res2^2)),
  sqrt(mean(res3^2)),
  sqrt(mean(res4^2)),
  sqrt(mean(res5^2)),
  sqrt(mean(res6^2)),
  sqrt(mean(res7^2)),
  sqrt(mean(res8^2)),
  sqrt(mean(res9^2))
  
))

model.mse <- data.frame(Model, MSE) # creates a data frame of model names, mean-square errors and coefficients of determination.

#create table
write.csv(model.mse, "tdo3_model_selection_rmse_updated.csv", row.names=F)


########################################
#refit with REML -all

m1 <- update(TDO3Mod1,  method="REML", data=training) 

m2 <- update(TDO3Mod1.1,  method="REML", data=training) 

m3 <- update(TDO3Mod1.2,  method="REML", data=training)  

m4 <- update(TDO3Mod2,  method="REML", data=training)  

m5 <-update(TDO3Mod2.1,  method="REML", data=training) 


m6<-update(TDO3Mod2.2,  method="REML", data=training) 

m7 <-update(TDO3Mod3,  method="REML", data=training) 


m8<-update(TDO3Mod3.1,  method="REML", data=training) 


m9<-update(TDO3Mod3.2,  method="REML", data=training) 

#unbiased RMSE
testing$predictions.1=predict.gam(m1, testing)
testing$predictions.2=predict.gam(m2, testing)
testing$predictions.3=predict.gam(m3, testing)
testing$predictions.4=predict.gam(m4, testing)
testing$predictions.5=predict.gam(m5, testing)
testing$predictions.6=predict.gam(m6, testing)
testing$predictions.7=predict.gam(m7, testing)
testing$predictions.8=predict.gam(m8, testing)
testing$predictions.9=predict.gam(m9, testing)

testing$error1=testing$predictions.1-testing$tdo3.obs
testing$error2=testing$predictions.2-testing$tdo3.obs
testing$error3=testing$predictions.3-testing$tdo3.obs
testing$error4=testing$predictions.4-testing$tdo3.obs
testing$error5=testing$predictions.5-testing$tdo3.obs
testing$error6=testing$predictions.6-testing$tdo3.obs
testing$error7=testing$predictions.7-testing$tdo3.obs
testing$error8=testing$predictions.8-testing$tdo3.obs
testing$error9=testing$predictions.9-testing$tdo3.obs


mse1=sqrt(mean(testing$error1^2))
mse2=sqrt(mean(testing$error2^2))
mse3=sqrt(mean(testing$error3^2))
mse4=sqrt(mean(testing$error4^2))
mse5=sqrt(mean(testing$error5^2))
mse6=sqrt(mean(testing$error6^2))
mse7=sqrt(mean(testing$error7^2))
mse8=sqrt(mean(testing$error8^2))
mse9=sqrt(mean(testing$error9^2))


print(paste(mse1, mse2, mse3, mse4, mse5,mse6, mse7,mse8, mse9, sep=","))

#refit with all data

#final model
final.model.name=model.mse %>%  slice_min(MSE) %>% select(Model)

final.model<-update(m8, data=M)
summary(final.model)
gam.check(final.model)

# plot 
windows()
par(mfrow=c(1,2))
plot(final.model,residuals=TRUE,rug=FALSE)


windows()
par(mfrow=c(2,2))
gam.check(final.model)

####################################
#plots
summary(final.model)

# pulling the prediction and residual data from the model
M %<>% mutate(resid = resid(final.model), predict = predict(final.model))

#gratia
draw(final.model, residuals=TRUE)&theme_bw()&xlab(expression('Log'[e]*'(geometry ratio)'))&ggtitle(element_blank())

ggsave("geometry_ratio_effect_tdo3.png",  height=4, width=4, units="in")


##############################


###########################
windows()
par(mfrow=c(2,2))
vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=0), main="Geometry ratio=1", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1, n.grid=50, xlab=NA, ylab=expression("Mean July air temperature ("*degree*"C)"))
#windows()
vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=0.409), main="Geometry ratio=1.5", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, xlab=NA, ylab=NA)
#windows()
vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=.69), main="Geometry ratio=2", zlim=c(3,25),nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, ylab=expression("Mean July air temperature ("*degree*"C)"), xlab="Proportion watershed developed")
#windows()
vis.gam(final.model,plot.type="contour", color="terrain", cond=list(geom_ratio=1.6), main="Geometry ratio=5", zlim=c(3,25), nlevels=11, levels=c(5,7,9,11,13,15,17,19,21,23,25), contour.col="black", lwd=2, labcex=1,n.grid=50, xlab="Proportion watershed developed", ylab=NA)


#########################################################################
##############################################################################

# Predict TDO3 for all MGLP lakes with depth and watershed data
# =======================================================
IP<-select(T, mglp_id, geom_ratio, watershed_dist, air_temp_july, tdo3.obs, cisco_status_name, cisco_status_o)
  

P<-subset(IP,geom_ratio<=max(M$geom_ratio)& !is.na(watershed_dist)) # minimal extrapolation of geometry ratios
nrow(P) #records with complete set of predictor variables

# predict TDO3
P$tdo3_est<-predict(final.model,P)

# integrate presence absence data
P$presence_absence<-P$cisco_status_o # generate presence absence variable
P$presence_absence<-factor(P$presence_absence,levels=c("V","R","X","A","P"))
P$presence_absence[P$presence_absence=="X"]<-"A"
P$presence_absence[P$presence_absence!="A"]<-"P"







##########################################################################
#step 2 is identify thresholds separating tiers.
#this is done by predicting cisco presence absence from actual TDO3 
#and then identifying niche boundaries
#again following Jacobson et al. 2010


# integrate presence absence data
T$presence_absence<-T$cisco_status_o # generate presence absence variable
T$presence_absence<-factor(T$presence_absence,levels=c("V","R","X","A","P"))
T$presence_absence[T$presence_absence=="X"]<-"A"
T$presence_absence[T$presence_absence!="A"]<-"P"
T=T %>% mutate(present=ifelse(presence_absence=="P", tdo3.obs, NA),
               absent=ifelse(presence_absence=="A", tdo3.obs, NA))

# Models using measured TDO3
ActCiscoMod2<-gam(data=T,presence_absence~s(tdo3.obs,  bs="cs"),family=binomial(link=logit), REML=TRUE)

summary(ActCiscoMod2)
gam.check(ActCiscoMod2)

plot(ActCiscoMod2,select=1,ylim=c(-2,2))

draw(ActCiscoMod2)

# Heegaard (2002) Central and Outer Bounds
xTempAtdo3=seq(1,30,0.1)
response<-predict(ActCiscoMod2,data.frame(tdo3.obs=xTempAtdo3),type='response')
response_se <- predict(ActCiscoMod2,data.frame(tdo3.obs=xTempAtdo3),type='response', se.fit = TRUE)$se.fit

plot(xTempAtdo3, response)
max_response<-max(response)
central_response<-max(response)*exp(-0.5)
outer_response<-max(response)*exp(-2)
temp_at_max_response<-xTempAtdo3[which.max(response)]
temp_at_central_response<-approx(response,xTempAtdo3,central_response)$y
temp_at_outer_response<-approx(response,xTempAtdo3,outer_response)$y

temp_at_max_response
temp_at_central_response
temp_at_outer_response

tier.1=temp_at_central_response
tier.2=temp_at_outer_response




toplot=data.frame(xTempAtdo3, response)
#create area for plotting regions
#Breaks for background rectangles
rects <- data.frame(xstart = c(0,temp_at_central_response,temp_at_outer_response), xend = c(temp_at_central_response,temp_at_outer_response, Inf), col = letters[1:3])

#set up color scheme
tier.colors=c("#0571b0", "#f4a582", "#ca0020")

windows()
ggplot(toplot, aes(xTempAtdo3,response))+ annotate("rect",xmin=temp_at_central_response, xmax=temp_at_outer_response, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#f4a582")+ annotate("rect",xmax=temp_at_central_response, xmin=0, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#0571b0")+geom_ribbon(aes(ymin = response - 2*response_se, ymax = response + 2*response_se), alpha = 0.2)+geom_path(lwd=1.5)+ labs(x=expression(Temperature~at~3~mg/L~O[2]~"("*degree*"C)"), y="Probability of occurrence")+theme_bw()+theme(panel.grid=element_blank())+geom_text(x=6, y=0.1, label="Tier 1")+geom_text(x=19, y=0.1, label="Tier 2") +geom_text(x=27, y=0.1, label="Tier 3")+ annotate("rect",xmax=Inf, xmin=temp_at_outer_response, ymin=-Inf, ymax=Inf, alpha = 0.2, fill="#ca0020")+geom_rug(data=T, aes(x=present), sides="t", inherit.aes = FALSE, alpha=.7, colour="darkgrey")+geom_rug(data=T, aes(x=absent), sides="b", inherit.aes=FALSE, alpha=.7, colour="darkgrey")
ggsave("tdo3_response_curve_with_tiers_new_profiles_Ribbon.png", height=4, width=4, units="in")



######################
#error plots
####################################
#plot predicted and observed from testing
#windows()
err1=ggplot(testing, aes(tdo3.obs,predictions.8, fill=error8))+geom_abline(slope=1, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+ylab(expression(atop("Predicted temperature", "at 3 mg/L of " *O[2]*" ("*degree*"C)")))+xlab(expression(atop("Observed temperature", "at 3 mg/L of " *O[2]*" ("*degree*"C)")))
#ggsave("predicted_observed_tdo3_withheld_data.png", height=4, width=4, units="in")

#windows()
err2=ggplot(testing, aes(geom_ratio,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab(expression('Log'[e]*'(geometry ratio)'))+ylab(expression("Model error ("*degree*"C)"))
#ggsave("geom_ratio_error_withheld_data.png", height=4, width=4, units="in")

#windows()
err3=ggplot(testing, aes(watershed_dist,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab("Watershed disturbance")+ylab(expression("Model error ("*degree*"C)"))
#ggsave("ws_dist_error_withheld_data.png", height=4, width=4, units="in")

#windows()
err4=ggplot(testing, aes(air_temp_july ,error8, fill=error8))+geom_abline(slope=0, intercept = 0)+geom_point(size=2, colour="grey",shape=21)+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')+theme_classic()+xlab(expression("July air temp ("*degree*"C)"))+ylab(expression("Model error ("*degree*"C)"))
#ggsave("july_air_temp_error_withheld_data.png", height=4, width=4, units="in")



#read in lat/long
mglp=read_csv("MGLP_Centroids_Lake.csv")
testing=merge(testing, mglp,  by.x="mglp_id", by.y="LAKE_ID")


#map errors
all_states <- map_data("state")
states <- subset(all_states, region %in% c("indiana" , "minnesota", "michigan", "wisconsin", "iowa", "illinois", "south dakota", "north dakota") )

windows()
p <- ggplot()
p <- p + geom_polygon( data=states, aes(x=long, y=lat, group = group),colour="black", fill="grey90" )
#p=p+geom_point( data=mglp, aes(Lon, Lat), size=.5, colour="grey90")
p=p+geom_point( data=testing, aes(Lon, Lat,  fill=error8), size=1.5, shape=21, colour="darkgrey")
p=p+scale_fill_gradient2(expression("Error (" *degree*"C)"), trans='reverse')
p=p+theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  axis.text= element_blank(),
  legend.position=c(.2,.25) , legend.key = element_blank(),legend.background = element_blank(),
  legend.title=element_text(size=14),legend.text=element_text(size=14),
  axis.title.y=element_blank(),
  axis.title.x=element_blank(),
  plot.title = element_text(face = "bold", size = (15), hjust=.5, vjust=-1),
)
p
ggsave("MGLP_error_map_new.png", height=5, width=6, units="in")




legend <- get_legend(
  err1  +guides(color = guide_legend()) +
    theme(legend.position = "right"))

##########################################################
#cowplot for multipanel figure
windows()
plots1=plot_grid( err2+theme(legend.position = "none"), err3+theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y=element_blank()), err4+theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y=element_blank()),labels=c('A', 'B', 'C'), ncol=3,  align='vh')
#bottom row
plots2=plot_grid(err1+theme(legend.position = "none"), p+theme(legend.position = "none"), legend, ncol=3, rel_widths = c(1,1.5, .2),labels=c('D', 'E', ''))

plot_grid(plots1, plots2, rel_heights =  c(1, 1),  ncol=1)
ggsave("error_plots_testing_data.png", height=6, width=8.5, units = "in")