#' ---
#' title: "Case Study II: Combining results from multiple models" 
#' output: html_document
#' ---
#' 
#' ## Objectives
#'    
#' This example demonstrates how:
#' 
#' 1. the bootstrap can be used in applications that involve multiple
#' response measures from the same set of cases.  
#' 
#' 2. the bootstrap can provide estimates of uncertainty for 
#' non-linear functions of model parameters. In such cases, there will
#' usually not be an analytical formula for calculating the SE of our
#' estimator. Alternatives are to use the delta method with a Normal
#' approximation of the sampling distribution or Bayesian methods.  
#'   
#' This example comes from:
#' 
#' Zicus, M. C., D. P. Rave, and J. Fieberg.  2006.  Cost effectiveness of single- vs.
#'  double-cylinder over-water nest structures.  Wildlife Society Bulletin 34:647-655.
#' 
#+ warning=FALSE, message=FALSE 
library(geepack)
library(gmodels)
library(mgcv)
library(splines) 
library(dplyr)
library(ggplot2)
library(ggfortify)

#' Set seed of random number generator
set.seed(10)

#' Read in survival data and duckling data
  ddata<-read.csv("data/costeff.csv")
  names(ddata)<-tolower(names(ddata))
  ddata$deply<-as.factor(ddata$deply)
  ddata$year<-as.factor(ddata$year)

#' Variables of interest:
#' 
#' - deply = 0 for single cylinders and 1 for double cylinders
#' - strtno = structure ID (unique to each nesting structure)
#' - period = (1-4) categorical variable capturing seasonal effects
#' - size = size of the wetland where the structure is placed
#' - year = year of observation
#' - yng = number of ducks produced 
#'
  
#' Make sure the observations are ordered by structure ID (strtno);
  x<-order(ddata$strtno)
  ddata<-ddata[x,]

#' ## Structure survival model
#' 
#'  Place knots at size = 3 and size = 10 and create spline basis vectors when
#'  modeling the (non-linear) effect of wetland size
  bsize<-ns(ddata$size,df=3, knots=c(3,10))
  ddata2<-cbind(ddata,bsize[,1:3])
  names(ddata2)[8:10]<-c("sz1", "sz2", "sz3")

#' Fit the discrete time survival model to capture influence of structure type (deply)
#' and wetland size. 
  glmsurv<-glm(surv~year+deply +sz1+sz2+sz3, family=binomial(link=cloglog), data=ddata2)

#' Now, lets get predictions for both cylinder types, a range of sizes, and all 8 years.
  pdat<-expand.grid(size = seq(0,28,0.1), deply=unique(ddata2$deply), year=unique(ddata2$year))
  
#' Create the spline basis vectors again, using the same set of knots
  pdatsize<-predict(bsize, pdat$size)  
  pdat2<-cbind(pdat,pdatsize[,1:3])
  names(pdat2)[4:6]<-c("sz1", "sz2", "sz3")
  
#' Now, predictions on prob of failure (accounting for the fact that our model is fit on the cloglog scale)
  pdat2$pfail<-predict(glmsurv, newdata=pdat2, type="resp")
  
#' Plot survival for each year and deployment type. We see that structure
#' failure rates increase with the size of the wetland. These failure rates
#' also varied considerably from year to year.
#+ fig.width=8, fig.height=4   
  ggplot(pdat2, aes(x=size, y=pfail, color=year))+geom_line()+facet_wrap(~deply)+
   xlab("Size of wetland (ha)")+ ylab("Probability of Structure Failure")

#' Get predictions averaged across years. We see that failure rates 
#' were slightly higher in double-cylinder than single-cylinder structures.
   pfail<- pdat2 %>% group_by(size, deply) %>% summarize(meanfail=mean(pfail))
   ggplot(pfail, aes(x=size, y=meanfail, color=deply))+geom_line() +
     xlab("Size of wetland (ha)")+ ylab("Probability of Structure Failure")
   
#' Costs were estimated as fixed ($25/cylinder + variable depending on survival and cylinder type)
#' 
#' - $28 if single cylinder fails, $48 if double cylinder fails
   vcost<-ifelse(pfail$deply==0,28,48)
   pfail$ec<-25+vcost*pfail$meanfail  
   ggplot(pfail, aes(x=size, y=ec, color=deply))+geom_line() +
     xlab("Size of wetland (ha)")+ ylab("Expected cost")
   
#' ## Duckling production model

#' Since the predictors do not change across years for this model, Zicus et al. just
#' modeled the mean number of ducks as the response.  Determine the mean   
  tdata<-ddata %>% group_by(strtno) %>% summarize(
    yng= mean(yng), size=mean(size), deply=unique(deply)
  ) 
  
#' We can now model how the number of ducks depends on size of the wetland and cynlinder type
  dmod<-lm(yng~size+I(size^2)+deply, data=tdata)
  summary(dmod)
 
#' Note, the assumptions of linear regression are not met. In particular,
#' the assumption that the residuals are Normally distributed seems suspect
#' as evidenced by points falling off the line in the qqplot (top right).
#' That's OK, we will use a non-parametric bootstrap for inference 
#' (resampling structures with replacement).
#+ fig.width=6, fig.height=6
  autoplot(dmod)     
  
#' Predictions for the same range of wetland sizes and both deployment types
  ducks<-expand.grid(size =seq(0,28,0.1), deply=unique(tdata$deply))
  ducks$phat<-predict(dmod, newdata=ducks)

#' Order the ducks data set the same way as the structure failure data set
pfail
ducks<-ducks %>% arrange(size, deply)
head(ducks)

#' Add plot showing expected ducks versus wetland size. 
#' Here, we see that larger wetlands
#' may result in higher production rates of ducklings, but the data are 
#' highly variable and there is not much data available for structures
#' placed in the largest wetlands.
#+ fig.width=8, fig.height=4 
  ggplot(ducks, aes(x=size, y=phat)) + geom_line() + 
    geom_point(data=tdata, aes(size, y=yng))+ facet_wrap(~deply) +
    xlab("Size of wetland (ha)")+ ylab("Expected number of ducks")

#' ## Get cost effectiveness by talking E[ducks]/E[cost]
  costef<-inner_join(pfail, ducks)
  costef$ce<- costef$ec/costef$phat

#' Plot cost-effectiveness versus wetland size for both cylinder types. We
#' see that cost-effectiveness appears to be highest for deeper wetlands
#' (due to higher duck production in these wetlands, despite higher 
#' failure probabilities associated with larger wetlands).  However, we need to 
#' calculate a measure of uncertainty to help interpret these results.    
  ggplot(costef, aes(x=size, y=ce))+geom_line()+facet_wrap(~deply) +
    xlab("Size of wetland (ha)")+ ylab("Expected cost/expected ducks")
  
#' ## Cluster-level bootstrap to get estimates of uncertainty
#'   
#' Now that we have shown how to get an estimate of everything with the original data, to 
#' determine uncertainty, we just need to:
#' 
#' 1. Resample structures with replacement.,
#' 2. Refit our 2 models.
#' 3. Recalculate our statistic of interest (ducks/expected cost)

#' We will select structures with replacement
  uid <- unique(ddata2$strtno) # unique id for each strtucture
  nstrtno <- length(uid) # number of structures
  
#' Set up matrices to hold bootstrap results  
  nboot<-5000
  pfail.b<-matrix(NA, nboot, nrow(ducks))
  costs.b<-matrix(NA, nboot, nrow(ducks))
  ducks.b<-matrix(NA, nboot, nrow(ducks)) 
  pfail.temp<-pdat2
  
#' Code for the bootstrap is given below. This takes some time to run (and could be sped up by 
#' avoiding loops...).
  for(i in 1:nboot){
    
    # Take a sample from x (uid) of size nstrtno with replacement:
    bootIDs <- data.frame(strtno = sample(x = uid, size = nstrtno, replace = TRUE))
    
    # Use this to sample from original data by beach
    bootDat <- merge(bootIDs,ddata2)
    
    # Now, fit survival model
    glmsurv.b<-glm(surv~year+deply +sz1+sz2+sz3, family=binomial(link=cloglog), data=bootDat)
    
    # Now, predictions on prob of failure (accounting for the
    # fact that our model is fit on the cloglog scale)
    pfail.temp$pfail<-predict(glmsurv.b, newdata=pdat2, type="resp")
    
    # Get predictions averaged across years
    pfail.temp2<- pfail.temp %>% group_by(size, deply) %>%
      summarize(meanfail=mean(pfail))  
    pfail.b[i,]<-pfail.temp2$meanfail
    
    # Costs were estimated as fixed ($25/cylinder + variable depending on survival and cylinder type)
    costs.b[i,]<-25+vcost*pfail.b[i,] 
    
    # ## Duckling production model
    
    # Since the predictors do not change across years for this model, Zicus et al. just
    # modeled the mean number of ducks as the response.  Determine the mean   
    tdata.boot<-merge(bootIDs,tdata)
   
     # We can now model how the number of ducks depends on size
     # of the wetland and cynlinder type
    dmod.b<-lm(yng~size+I(size^2)+deply, data=tdata.boot)
    ducks.b[i,]<-predict(dmod.b, newdata=ducks)
    
} 
  
    
#' Now, calculate pointwise 90% CI and plot.
  ducks$upducks<-apply(ducks.b,2, quantile, probs=0.95) 
  ducks$lowducks<-apply(ducks.b,2, quantile, probs=0.05)
  pfail$upcost<-apply(costs.b, 2,quantile, probs=0.95)
  pfail$lowcost<-apply(costs.b, 2,quantile, probs=0.05)
  costef$upce<-apply(costs.b/ducks.b, 2, quantile, probs=0.95)
  costef$lowce<-apply(costs.b/ducks.b, 2, quantile, probs=0.05)
  costef$meanbootce<-apply(costs.b/ducks.b, 2, mean)
  
#+ fig.width=8, fig.height=4    
  ggplot(ducks, aes(x=size, y=phat)) + 
    geom_ribbon(aes(ymin=lowducks, ymax=upducks), fill="grey70") + 
    geom_line() + geom_point(data=tdata, aes(size, y=yng))+ facet_wrap(~deply) +
    xlab("Size of wetland (ha)")+ ylab("Expected number of ducks")

#+ fig.width=8, fig.height=4       
  ggplot(pfail, aes(x=size, y=ec))+geom_ribbon(aes(ymin=lowcost, ymax=upcost), fill="grey70") + 
    facet_wrap(~deply) + geom_line() + 
    xlab("Size of wetland (ha)")+ ylab("Expected cost")
  
  
#+ fig.width=8, fig.height=4    
  ggplot(costef, aes(x=size, y=ce))+geom_ribbon(aes(ymin=lowce, ymax=upce), fill="grey70") +
    facet_wrap(~deply) +geom_line()+
    xlab("Size of wetland (ha)")+ ylab("Expected cost/expected ducks")

#' ## Use the bootstrap to check for bias
#'
#' We can compare the mean of the bootstrap distribution to the point
#' estimate as a measure of bias. 
costef$boot.bias<-apply(costs.b/ducks.b, 2, mean) - costef$ce

#' Comparing the estimated bias to the estimated SE, we see that the 
#' relative bias is < 0.25, which Efron and Tibshirani (1993) suggest
#' as a general rule of thumb for when it is not worth worrying about.
costef$boot.se<-apply(costs.b/ducks.b, 2, sd) 
summary(costef$boot.bias/costef$boot.se)

#' ## Biological Conclusions
#' 
#' We conclude that cost-effectiveness of nest structures is highest
#' in large wetlands (cost/duck is minimized). Cost-effectiveness also
#' appears to be slightly higher for single-cylinder structures though
#' confidence intervals for single- and double-cylinder structures largely
#' overlap. Single-cylinder structures were less likely to fall over,
#' so were less costly to maintain.
#'
 
#' ## Document footer
#' 
#' Session Information:
#+ sessionInfo
  sessionInfo()