#####################################################################################
### PLSR models AISA ################################################################
### Anna K Schweiger July 31 2020 ###################################################
### with lines of code and the VIPjh function from Shawn P. Serbin's awesome GitHub repo # 
### https://github.com/serbinsh/Spectra_PLSR_models #################################

library(pls)
library(reshape)
library(agricolae)
library(caret)

VIPjh=function(object, j, h) { 
  if (object$method != "oscorespls") stop("Only implemented for orthogonal scores algorithm. Refit with 'method = \"oscorespls\"'")
  if (nrow(object$Yloadings) > 1) stop("Only implemented for single-response models")
  b=c(object$Yloadings)[1:h]
  T=object$scores[,1:h, drop = FALSE]
  SS=b^2 * colSums(T^2)
  W=object$loading.weights[,1:h, drop = FALSE]
  Wnorm2=colSums(W^2)
  VIP=sqrt(nrow(W) * sum(SS * W[j,]^2 / Wnorm2) / sum(SS)) 
  return(VIP)
}

### Read data 
# allData <- read.csv("./plot_chem_spec.csv")
allData <- read.csv("./subplot_chem_spec.csv")
# allData <- read.csv("./transect_chem_spec.csv")

inBands <- names(allData)[10:187] # bands to work with
inVars <- names(allData[c(3:9)]) # traits to work with


###############################################
### Find number of components
for (i in 1:length(inVars)){
  inVar <- inVars[i]
  iForm <- paste(inBands,collapse="+")
  iForm <- as.formula(paste(inVar,"~",iForm))
  
  ### Separate data into model CAL/VAL and IVAL (independent or external validation) data
  set.seed(19841002+i)
  rand <- createDataPartition(allData[, inVars[i]], p=0.75, list=F) # here: 75% data for model CAL/VAL
  subData <- allData[rand,] ## CAL
  tstData <- allData[-rand,] ## IVAL
  
  ### Calibrate the model
  nComps <- 15 # max no of components to be evaluated 
  nsims <- 50 # no of simulations
  outMat <- matrix(data=NA,nrow=nsims,ncol=nComps)
  outMatRM <- matrix(data=NA,nrow=nsims,ncol=nComps)
  
  for (nsim in seq(nsims)){
    print(nsim)
    flush.console()
    segs <- cvsegments(N = nrow(subData), k = 5, type="random") # k fold CV, 5 folds = 80:20 split
    resNCOMP <- plsr(iForm,data=subData,ncomp=nComps, 
                     validation="CV", segments=segs,
                     method="oscorespls")
    resPRESS <- as.vector(resNCOMP$validation$PRESS)
    outMat[nsim,seq(resNCOMP$validation$ncomp)] <-resPRESS
    resRMSEP <- as.numeric(RMSEP(resNCOMP,estimate="CV",intercept=F)$val)
    outMatRM[nsim,seq(resNCOMP$validation$ncomp)] <-resRMSEP
  }
  
  ### PRESS statistic: Tukey test - sign diff among no of components?
  pressDF <- as.data.frame(outMat)
  names(pressDF) <- as.character(seq(nComps))
  pressDFres <- melt(pressDF)
  
  modi <- lm (value~variable, pressDFres) 
  tuk <- HSD.test (modi,"variable")
  tuk_dat <- as.data.frame(tuk$groups)
  tuk_dat$var <- as.numeric(row.names(tuk_dat))
  tuk_dat <- tuk_dat[order(tuk_dat$var,decreasing = F),]
  letters <- as.character(tuk_dat$groups)
  
  jpeg(paste("./", inVar, "_PRESS.jpg",sep=""),width=6,height=4.5,units="in",res=200)
  par(bty="l")
  boxplot(pressDFres$value~pressDFres$variable, xlab="n Components",ylab="PRESS",main=inVar)
  text(x=1:max(as.numeric(pressDFres$variable)),  y=rep(max(pressDFres$value),15),letters)
  dev.off()
  
  ### RMSEP: Tukey test 
  RMDF <- as.data.frame(outMatRM)
  names(RMDF) <- as.character(seq(nComps))
  RMDFres <- melt(RMDF)
  
  modi <- lm (value~variable, RMDFres) 
  tuk <- HSD.test (modi,"variable")
  tuk_dat <- as.data.frame(tuk$groups)
  tuk_dat$var <- as.numeric(row.names(tuk_dat))
  tuk_dat <- tuk_dat[order(tuk_dat$var,decreasing = F),]
  letters <- as.character(tuk_dat$groups)
  
  jpeg(paste("./", inVar, "_RMSEP.jpg",sep=""),width=6,height=4.5,units="in",res=200)
  par(bty="l")
  boxplot(RMDFres$value~RMDFres$variable, xlab="n Components",ylab="RMSEP",main=inVar)
  text(x=1:max(as.numeric(RMDFres$variable)), y=rep(max(RMDFres$value),15),letters)
  dev.off()
}


#######################################
###### Final models ###################

### Make dataframe containing selected no of components (=data)  per trait
compis <- as.data.frame(matrix(nrow=1, ncol=length(inVars),dimnames = list(NULL, c(inVars)), 
                               data=c(3,4,2,4,2,3,2))) 

### One trait at a time ...
k <- 1 
inVar <- inVars[k]  
nCompss <- compis[, inVar] # nCompss could also be a vector of a number of options for ncomps

### Separate data into CAL and IVAL data, use same seed same as above
set.seed(19841002+k)
rand <- createDataPartition(allData[, inVars[k]], p=0.75, list=F) ## 75% data for calibration
subData <- allData[rand,] ## CAL
tstData <- allData[-rand,] ## IVAL

iForm <- paste(inBands,collapse="+")
iForm <- as.formula(paste(inVar,"~",iForm))

set.seed(1840)
for (i in 1:length(nCompss)){
  nsims <- 500
  nComps <- nCompss[i]
  nCutmod <- floor(0.8*nrow(subData)) ## 80% data for calibration, 20% for CV
  
  coefMat <- matrix(data=NA,nrow=nsims,ncol=length(inBands)+1)
  coefStd <- matrix(data=NA,nrow=nsims,ncol=length(inBands)+1)
  vipMat <- matrix(data=NA,ncol=nsims,nrow=length(inBands))
  statMat <- matrix(data=NA,nrow=nsims,ncol=10)
  
  for (nsim in seq(nsims)){
    print(nsim)
    flush.console()
    
    set.seed (19818411+nsim)
    randx <-createDataPartition(subData[, inVars[k]], p=0.8, list=F)
    calData <- allData[randx,] ## CAL
    valData <- allData[-randx,] ## VAL (internal)
    
    resX <- plsr(iForm,data=calData,ncomp=nComps,method="oscorespls") ###
    resS <- plsr(iForm,data=calData,ncomp=nComps, method="oscorespls",scale=T) ### scaled by SD
    
    ### Coefficients (raw and standardized)
    coefs <- as.vector(coef(resX,ncomp=nComps,intercept=T))
    zcoef <- as.vector(coef(resS,ncomp=nComps,intercept=T))
    coefMat[nsim,] <- coefs
    coefStd[nsim,] <- zcoef ### standardized coeffis for importance of wvls
    
    ### VIP
    vip <- c()
    for (j in seq(length(inBands))){
      vip <- c(vip,VIPjh(resS,j,nComps))
    }
    vipMat[,nsim] <- vip
    
    ### Model stats
    fitX <- as.vector(unlist(resX$fitted.values[,1,nComps])) ### fitted values
    preX <- as.vector(unlist(predict(resX,ncomp=nComps,newdata=valData))) ### internal val
    fitBias <- mean(calData[,inVar]-fitX)
    valBias <- mean(valData[,inVar]-preX)
    fitR2 <- summary(lm(calData[,inVar]~fitX))$r.squared
    valR2 <- summary(lm(valData[,inVar]~preX))$r.squared
    fitP <- summary(lm(calData[,inVar]~fitX))[[4]][[8]]
    valP <- summary(lm(valData[,inVar]~preX))[[4]][[8]]
    fitRMSE <- sqrt(mean((calData[,inVar]-fitX)^2))
    valRMSE <- sqrt(mean((valData[,inVar]-preX)^2))
    fitRMSEperc <- fitRMSE/(max(calData[,inVar])-min(subData[,inVar]))*100
    valRMSEperc <- valRMSE/(max(valData[,inVar])-min(valData[,inVar]))*100
    outVec <- c(fitR2,fitP, fitRMSE,fitBias,fitRMSEperc,
                valR2,valP, valRMSE,valBias,valRMSEperc)
    statMat[nsim,] <- outVec
  }
  
  statMat <- as.data.frame(statMat)
  names(statMat) <- c("fitR2", "fitP","fitRMSE","fitBias","fitRMSEperc",
                      "valR2","valP","valRMSE","valBias","valRMSEperc")
  
  meanStat <- as.data.frame(t(colMeans(statMat)))
  
  write.csv(statMat,paste("./",inVar,"_", nComps,"comps_stats.csv", sep=""),row.names=FALSE)
  write.csv(meanStat, paste("./",inVar,"_", nComps, "_mean_stats.csv", sep=""))
  
  
  ### Coefficients
  coeffis <- data.frame(matrix(nrow = length(inBands)+1, ncol = 3))
  names(coeffis) <- c("bands", "mean","stdv")
  coeffis$bands <- c("Intercept",inBands)
  coeffis$mean <- apply(coefMat,MAR=2,FUN=mean)
  coeffis$stdv <- apply(coefMat,MAR=2,FUN=sd)
  write.table(coeffis, paste("./", inVar, "_",nComps,"comps_coeffMEAN.csv", sep=""),
              sep=",",col.names=T,row.names=F)
  
  ### Predictions
  specMat <- subData[,inBands]
  specMat <- cbind(rep(1,nrow(specMat)),specMat)
  specMat <- as.matrix(specMat)
  
  predMat <- specMat%*%t(coefMat)
  predMean <- apply(predMat,FUN=mean,MAR=1)
  predStdv <- apply(predMat,FUN=sd,MAR=1)
  
  preds <- subData[, !(names(subData) %in% inBands|names(subData)=="RAND")]
  preds[,paste("predMean_",inVar,sep="")] <- predMean
  preds[,paste("predStdv_",inVar,sep="")] <- predStdv
  write.csv(preds,paste("./", inVar, "_", nComps,"comps_preds_mod.csv", sep=""), row.names=FALSE)
  
  ### Plot predictions 
  modCI <- quantile(statMat$fitR2, probs=c(0.05,0.95))
  modCIval <- quantile(statMat$valR2, probs=c(0.05,0.95))
  RMCI <- quantile(statMat$fitRMSE, probs=c(0.05,0.95))
  RMCIperc <- quantile(statMat$fitRMSEperc, probs=c(0.05,0.95))
  RMCIval <- quantile(statMat$valRMSE, probs=c(0.05,0.95))
  RMCIvalperc <- quantile(statMat$valRMSEperc, probs=c(0.05,0.95))
  
  formi <- as.formula(paste(paste(inVar," ~ predMean_",inVar, sep="")))
  lmformi <-   as.formula(paste(paste("predMean_",inVar," ~ ", inVar, sep="")))
  
  jpeg(paste("./",inVar,"_", nComps, "comp_predplot.jpg",sep=""),
       width=6,height=5,units="in",res=300)
  plot(formi, data= preds, pch=16,cex=0.8,ylab="measured",xlab="predicted",
       main=inVar,xlim=c(min(preds[,grepl(paste0("predMean_", inVar),names(preds))] -
                               preds[,grepl(paste0("predStdv_", inVar),names(preds))]),
                         max(preds[,grepl(paste0("predMean_", inVar),names(preds))] +
                               preds[,grepl(paste0("predStdv_", inVar),names(preds))])))
  
  abline(lm(lmformi, data= preds))
  abline(a = 0,b = 1, lty=2)
  arrows(preds[,grepl(paste0("predMean_", inVar),names(preds))],
         preds[,match(inVar,names(preds))],
         preds[,grepl(paste0("predMean_", inVar),names(preds))]+preds[,grepl(paste0("predStdv_", inVar),names(preds))],
         preds[,match(inVar,names(preds))],angle=90,length=0.05, lwd=0.8)
  arrows(preds[,grepl(paste0("predMean_", inVar),names(preds))],
         preds[,match(inVar,names(preds))],
         preds[,grepl(paste0("predMean_", inVar),names(preds))]-preds[,grepl(paste0("predStdv_", inVar),names(preds))],
         preds[,match(inVar,names(preds))],angle=90,length=0.05, lwd=0.8)
   legend("topleft", bty="n", cex=0.8,
         c(paste("R² cal= ", sprintf("%.2f",signif (mean(statMat$fitR2),3))," [",signif(modCI[1],2),",",signif(modCI[2],2),"]", sep = ""),
           paste("R² val= ", sprintf("%.2f",signif (mean(statMat$valR2),3))," [",signif(modCIval[1],2),",",signif(modCIval[2],2),"]", sep = ""),
           paste("RMSEP % cal= ", sprintf("%.2f",signif (mean(statMat$fitRMSEperc),3)), " [",signif(RMCIperc[1],3),",",signif(RMCIperc[2],3),"]", sep=""),
           paste("RMSEP % val= ", sprintf("%.2f",signif (mean(statMat$valRMSEperc),3)), " [",signif(RMCIvalperc[1],3),",",signif(RMCIvalperc[2],3),"]", sep=""),
           paste("ncomps =", nComps, sep=" ")))
  dev.off()
  
  ### VIP for plotting
  vipAggr <- as.data.frame(t(apply(vipMat,MAR=1,FUN=quantile,probs=c(0.05,0.5,0.95))))
  vipAggr$mean_VIP <- apply(vipMat,MAR=1,FUN=mean)
  vipAggr$stdv <- apply(vipMat,MAR=1,FUN=sd)
  serr <- function(x) sqrt(var(x,na.rm=TRUE)/length(na.omit(x)))  ## standard error of mean 
  vipAggr$se <- apply(vipMat,MAR=1,FUN=serr)
  vipAggr$band <- inBands
  
  ### Standardized coefficients for plotting
  coeff_std <- data.frame(matrix(nrow = length(inBands)+1, ncol = 3))
  names(coeff_std) <- c("bands", "mean","stdv")
  coeff_std$bands <- c("Intercept",inBands)
  coeff_std$mean <- apply(coefStd,MAR=2,FUN=mean)
  coeff_std$stdv <- apply(coefStd,MAR=2,FUN=sd)
  
  ### Plot VIP and standardized coefficients
  jpeg(paste("./", inVar, "_", nComps, "comp_varimp.jpg",sep=""),
       width=6,height=7,units="in",res=300)
  par(mfrow=c(2,1), mar=c(1.5,4,2.5,1.5), oma=c(3,0,0,0))
  plot(coeff_std$mean[-1]~as.numeric(substr(coeff_std$bands[-1],2,nchar(coeff_std$bands[-1]))),
       type="p",pch=19, xlab="",ylab="coeff_stdmean",main=paste(inVar,nComps,"comps",sep = "_"),
       ylim=c(-max(abs(coeff_std$mean[-1])),max(abs(coeff_std$mean[-1]))), bty="l")
  abline(h=0)
  points(abs(coeff_std$mean)[-1]~as.numeric(substr(coeff_std$bands[-1],2,nchar(coeff_std$bands[-1]))),
         xlab="wvl",ylab="coeff_stdmean", col=2, pch=16, cex=0.8)
  lines(abs(coeff_std$mean)[-1]~as.numeric(substr(coeff_std$band[-1],2,nchar(coeff_std$band[-1]))), col=2)
  plot(as.numeric(substr(vipAggr$band,2,nchar(vipAggr$band))),vipAggr$mean_VIP, type="l",
       xlab = "wvl",ylab = "VIP", bty="l")
  polygon(x=c(as.numeric(substr(vipAggr$band,2,nchar(vipAggr$band))),
              rev(as.numeric(substr(vipAggr$band,2,nchar(vipAggr$band))))),
          y=c(vipAggr$mean_VIP+vipAggr$stdv*1.96, rev(vipAggr$mean_VIP-vipAggr$stdv*1.96)),  
          col =  adjustcolor("red", alpha.f = 0.2), border = NA)
  mtext("wavelength(nm)",1,outer = T,line = 1)
  dev.off()
  
  ######################################
  ### Independent (external) validation
  specMatIVAL <- tstData[,inBands]
  specMatIVAL <- cbind(rep(1,nrow(specMatIVAL)),specMatIVAL)
  specMatIVAL <- as.matrix(specMatIVAL)
  
  predMatIVAL <- specMatIVAL%*%t(coefMat)
  predMeanIVAL <- apply(predMatIVAL,FUN=mean,MAR=1)
  predStdvIVAL <- apply(predMatIVAL,FUN=sd,MAR=1)
  
  predsIVAL <- tstData[, !(names(tstData) %in% inBands|names(tstData)=="RAND")]
  predsIVAL[,paste("predMeanIVAL_",inVar,sep="")] <- predMeanIVAL
  predsIVAL[,paste("predStdvIVAL_",inVar,sep="")] <- predStdvIVAL
  write.csv(predsIVAL,paste("./", inVar, "_", nComps, "comps_preds_IVAL.csv", sep=""), 
            row.names=FALSE)
  
  ### Model stats
  statMatIVAL <- matrix(data=NA,nrow=nsims,ncol=5)
  for (nsim in seq(nsims)){
    valBias <- mean(tstData[,inVar]-predMatIVAL[,nsim])
    valR2 <- summary(lm(tstData[,inVar]~predMatIVAL[,nsim]))$r.squared
    valRMSE <- sqrt(mean((tstData[,inVar]-predMatIVAL[,nsim])^2))
    valRMSEperc <- valRMSE/(max(tstData[,inVar])-min(tstData[,inVar]))*100
    valP <- summary(lm(tstData[,inVar]~predMatIVAL[,nsim]))[[4]][[8]]
    outVec <- c(valR2,valP,valRMSE,valBias,valRMSEperc)
    statMatIVAL[nsim,] <- outVec
  }
  
  statMatIVAL <- as.data.frame(statMatIVAL)
  names(statMatIVAL) <- c("IvalR2","IvalP","IvalRMSE","IvalBias","IvalRMSEperc")
  meanIVAL <- as.data.frame(t(colMeans(statMatIVAL)))
  
  write.csv(statMatIVAL,paste("./",inVar,"_", nComps, "comps_stats_IVAL.csv", sep=""),
            row.names=FALSE)
  write.csv(meanIVAL, paste("./",inVar,"_", nComps, "_mean_stats_IVAL.csv", sep=""))
  
  ### Plot
  modCIval <- quantile(statMatIVAL$IvalR2, probs=c(0.05,0.95))
  RMCIval <- quantile(statMatIVAL$IvalRMSE, probs=c(0.05,0.95))
  RMCIvalperc <- quantile(statMatIVAL$IvalRMSEperc, probs=c(0.05,0.95))
  
  formi <- as.formula(paste(paste(inVar," ~ predMeanIVAL_",inVar, sep="")))
  lmformi <-   as.formula(paste(paste("predMeanIVAL_",inVar," ~ ", inVar, sep="")))
  
  jpeg(paste("./", inVar,"_", nComps, "comp_predplot_IVAL.jpg",sep=""),
       width=6,height=5,units="in",res=300)
  plot(formi, data= predsIVAL, pch=16,cex=0.8,ylab="measured",xlab="predicted",
       main=inVar,xlim=c(min(predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))] -
                               predsIVAL[,grepl(paste0("predStdvIVAL_", inVar),names(predsIVAL))]),
                         max(predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))] +
                               predsIVAL[,grepl(paste0("predStdvIVAL_", inVar),names(predsIVAL))])))
  
  abline(lm(lmformi, data= predsIVAL))
  abline(a = 0,b = 1, lty=2)
  arrows(predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))],
         predsIVAL[,match(inVar,names(predsIVAL))],
         predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))]+predsIVAL[,grepl(paste0("predStdvIVAL_", inVar),names(predsIVAL))],
         predsIVAL[,match(inVar,names(predsIVAL))],angle=90,length=0.05, lwd=0.8)
  arrows(predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))],
         predsIVAL[,match(inVar,names(predsIVAL))],
         predsIVAL[,grepl(paste0("predMeanIVAL_", inVar),names(predsIVAL))]-predsIVAL[,grepl(paste0("predStdvIVAL_", inVar),names(predsIVAL))],
         predsIVAL[,match(inVar,names(predsIVAL))],angle=90,length=0.05, lwd=0.8)
  legend("topleft", bty="n", cex=0.8,
        c(paste("R² = ", sprintf("%.2f",signif (mean(statMatIVAL$IvalR2),3))," [",signif(modCIval[1],2),",",signif(modCIval[2],2),"]", sep = ""),
          paste("RMSEP % = ", sprintf("%.2f",signif (mean(statMatIVAL$IvalRMSEperc),3)), " [",signif(RMCIvalperc[1],2),",",signif(RMCIvalperc[2],2),"]", sep=""),
          paste("ncomps =", nComps, sep=" ")))
  dev.off()
}

###### END #########