#' --- #' title: "Case Study I: Relaxing the Assumptions of Linear Regression" #' output: html_document #' --- #' ## Objectives #' #' This example demonstrates how: #' #' 1. a cluster-level bootstrap can be used for repeated measures #' data with equal-sized clusters. This approach is most applicable #' to problems where predictors of interest do not vary within a cluster. #' #' 2. to use functions in the boot package to calculate different bootstrap #' confidence intervals, including the BCa interval, which has better #' statistical properties than percentile-based intervals. #' #' ## Document Preamble #+ docPreamble, warning=FALSE, message=FALSE # Load Libraries library(knitr) library(mosaic) library(ggplot2) library(ggfortify) # Set knitr options opts_chunk$set(fig.width = 6, fig.height=5) # Clear Environment (optional) remove(list=ls()) # Set seed set.seed(314159) #' ## Section 1. Bootstrapping RIKZ data #' #' Read in data from .csv and look at first 6 rows #+ dataEntry01 rikzData <- read.csv("data/RIKZdat.csv") head(rikzData) #' Fit a linear regression model relating species richness to exposure level #+ linReg01 # Simple linear regression and summary lm.RIKZ <- lm(Richness~exposure, data=rikzData) summary(lm.RIKZ) #' ## Check Assumptions #' Here, we see that the residuals do not appear Normally distributed. #' We also know the independence assumption is problematic. #+fig.width=8, fig.height=4, echo=FALSE, include=FALSE pdf("Figure6.pdf", width = 8, height = 4) par(mfrow=c(1,2)) with(rikzData, plot(exposure, Richness, pch=16, bty="L", cex.lab=1.4, xlab="Exposure")) abline(lm.RIKZ) title("A)", adj=0) xdat<-range(lm.RIKZ$resid) hist(lm.RIKZ$resid, col="gray", xlab="Residuals", freq=FALSE, bty="L", cex.lab=1.4, main="") title("B)", adj=0) curve(dnorm(x, mean(lm.RIKZ$resid), sd(lm.RIKZ$resid)), from =xdat[1], to=xdat[2], add=TRUE, col="black") dev.off() #+fig.width=8, fig.height=4 par(mfrow=c(1,2)) with(rikzData, plot(exposure, Richness, pch=16, bty="L", cex.lab=1.4)) abline(lm.RIKZ) title("A)", adj=0) xdat<-range(lm.RIKZ$resid) hist(lm.RIKZ$resid, col="gray", xlab="Residuals", freq=FALSE, bty="L", cex.lab=1.4, main="") title("B)", adj=0) curve(dnorm(x, mean(lm.RIKZ$resid), sd(lm.RIKZ$resid)), from =xdat[1], to=xdat[2], add=TRUE, col="black") #' Cluster-level bootstrap example. The code below shows how to create #' a single bootstrap data set where we resample clusters. #+ bootstrap01 # Data processing uid <- unique(rikzData$Beach) # unique id for each beach nBeach <- length(uid) # number of beaches ### One bootstrap: # Take a sample from x (uid) of size nBeach with replacement: bootIDs <- data.frame(Beach = sample(x = uid, size = nBeach, replace = TRUE)) bootIDs # Use this to sample from original data by beach bootDat <- merge(bootIDs, rikzData) table(bootDat$Beach) ## this table shows how many obs are drawn for each beach in bootstrap sample. # Double check sample sizes worked (these should match): length(rikzData$Beach) # original data length(bootDat$Beach) # bootstrap sample #' Now, repeat this process several times and store results. #+ bootstrap02 nBoots <- 5000 # number of bootstraps coef.ests <- matrix(NA, ncol=2, nrow=nBoots) # to hold bootstrap estimates for(i in 1:nBoots){ # create bootstrap bootIDs <- data.frame(Beach = sample(x = uid, size = nBeach, replace = TRUE)) bootDat <- merge(bootIDs, rikzData) # Estimate coefficients lmBoot <- lm(Richness ~ exposure, data=bootDat) coef.ests[i,] <- lmBoot$coefficients } # Look at histograms of the bootstrap estimates par(mfrow=c(1,2)) hist(coef.ests[,1], main="", xlab="intercept") hist(coef.ests[,2], main="", xlab="slope") # Bootstrap confidence intervals using the percentile method # Intercept: quantile(x = coef.ests[,1], probs = c(0.025,0.975), na.rm=TRUE) # Slope: quantile(x = coef.ests[,2], probs = c(0.025,0.975), na.rm=TRUE) # Compare to linear regression confidence intervals: confint(lm.RIKZ) #' #' ## Better bootstrap confidence intervals #' #' Note: Although it is common to use percentile-based bootstrap confidence intervals, #' there are better ways to calculate confidence intervals, particularly when the #' bootstrap distribution is not symmetric (as is the case here). A nice discussion of #' alternative bootstrap confidence intervals is Hesterberg (2015). What Teachers #' Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum, #' The American Statistician 69:371-386. #' #' Below, I illustrate how one can use the boot library to calculate some alternative intervals. #' #' ### Other alternatives using boot.ci in the boot library #' #' - Normal = estimate +/- 1.96*SE(bootstrap distribution) #' - Percentile = percentile-based #' - Basic = calculates differences between theta[boot] and original estimate and uses #' this information to construct the interval #' - BCa = attempts to correct for bias and skewness in the bootstrap distribution. #' #' Load boot library #+warning=FALSE, message=FALSE library(boot) #' To use the functions in the boot library, we have to create a function with first #' two arguments = data, indices #' #' - data = the data that will be resampled #' - indices are the vector of indices that will be resampled #+ bootfunc01 slope.exposure<-function(data, indices, formula, obsdat){ # data will contain the data to be resampled (here, beach ids) # indices = used to select clusters # formula = allows flexibility w/ the model that is fit # obsdat = our data set containing all observations bootbeaches<-data[indices] bootids<-data.frame(Beach=bootbeaches) bootdat<-merge(bootids, obsdat) # Fit the linear model with exposurec and return the fitted coefficients lm.boot<- lm(formula, data = bootdat) return(coef(lm.boot)) } # Call the function to do the bootstrapping results<-boot(data=uid, statistic = slope.exposure, R=9999, formula=Richness ~ exposure, obsdat=rikzData) #' Lets look at the results. Note: the warning, below, tells us that we are unable to calculate #' studentized intervals (these are discussed in Hesterberg (2015)) and tend to work well in many #' situations but require an estimate of variance of the statistic to go along with each bootstrap replicate. #' Nonetheless, we get several other intervals all from the same set of #' bootstrapped coefficients. Of these, the BCa interval is arguably the most defensible approach. #+ Bootresults01 # Look at results: Intercept boot.ci(results, index=1) # Look at results: Slope boot.ci(results, index=2) #' The percentile-based intervals are very similar to the intervals we calculated previously. #' Note, however the BCA interval (preferred) is highly asymmetric. All #' intervals are quite a bit wider than those calculated assuming independence. #' Lets plot the results for a visual comparison using ggplot. #+ message=FALSE, warning=FALSE library(ggplot2) #' Gather the data. #+ datagather01, warning=FALSE intervals<-matrix(NA,10,2) # Interepts intervals[1,]<-confint(lm.RIKZ)[1,] # CI assuming independence intervals[2,]<- boot.ci(results, index=1)$normal[2:3] #normal based intervals[3,]<- boot.ci(results, index=1)$basic[4:5] # basic intervals[4,]<- boot.ci(results, index=1)$percent[4:5] # percentile intervals[5,]<- boot.ci(results, index=1)$bca[4:5] # bias-corrected and accelerated # Slopes intervals[6,]<-confint(lm.RIKZ)[2,] # CI assuming independence intervals[7,]<- boot.ci(results, index=2)$normal[2:3] # normal intervals[8,]<- boot.ci(results, index=2)$basic[4:5] #basic intervals[9,]<- boot.ci(results, index=2)$percent[4:5] #percentil intervals[10,]<- boot.ci(results, index=2)$bca[4:5] #' Reformat the data for plotting intervals.df<-data.frame(lowerCL=intervals[,1], upperCL=intervals[,2], parameter=c(rep("Intercept",5), rep("Slope",5)), type=rep(c("Independence", "Normal", "Basic", "Percentile", "BCa"), 2), PE=rep(coef(lm.RIKZ), each=5)) #' Plot #+ ggplotInt01, fig.width=10, fig.height=6 ggplot(intervals.df, aes(x=type, y=PE, group=type)) + geom_errorbar(width=.1, aes(ymin=lowerCL, ymax=upperCL), colour="blue") + geom_point(shape=21, size=3, fill="white")+ facet_wrap(~parameter, ncol=2, scales="free") + xlab("")+ylab("Estimate and 95% CI") #' ## Conclusions #' #' Using a cluster-level bootstrap results in wider confidence intervals than naive intervals #' that assume independence. Also, the BCa intervals, which have better statistical properties #' than percentile-based intervals when estimators exhibit bias or the sampling distribution is #' skewed, result is a highly asymmetric confidence interval. #' #' ## Document footer #' #' Session Information: #+ sessionInfo sessionInfo()