#Jeannine Cavender-Bares, 5/24/20 - 6/5/2021
#PLSR model procedures (examples from Wood River)

library(pls)
library(reshape)
library(agricolae)
require("caret")

f1 <- function(x) SD = sd(x)
rm()
setwd("")
WRdat <- read.csv("processed/PD.FD.traits.BM.FG.spec.subplots.csv")
WRplot <- aggregate(. ~ Plot, data=WRdat, mean)

#for plot analyses
WRdat <- WRplot

#subplot
soil                  <- data.frame(SampN = c(1:nrow(WRdat)))

soil$biomass          <- WRdat$biomass_g_m2
soil$Lp               <- WRdat$Lp
soil$C3p              <- WRdat$C3p
soil$C4p              <- WRdat$C4p
soil$Fp               <- WRdat$Fp
soil$dicot            <- soil$Lp + soil$Fp
soil$monocot          <- soil$C3p + soil$C4p
REF                   <- WRdat[,57:230] #spectral reflectance data
soil$REF              <- as.matrix(REF)

#############
soil$Plot             <- WRdat$Plot.t.sp #use for subplot analyses

#############
soil$Plot             <- WRdat$Plot #use for plot analyses

###Input numbers of components for each test based on rmsep and press randomizations 

biomasscomp  <- 15 
Lpcomp       <- 10 
C4pcomp      <- 11
C3pcomp      <- 12 
Fpcomp       <- 11
monocotcomp  <- 12
dicotcomp    <- 11

#for PLOTS
biomasscomp  <- 4 
Lpcomp       <- 9 
C4pcomp      <- 6 
C3pcomp      <- 6
Fpcomp       <- 5

#####################################

# function for standard error of the mean
sem <- function(x){
  sd(x)/sqrt(length(x))
}

#Extract rownames for the coefficients
mod                     <-  plsr(biomass ~ soil$REF, data=soil, ncomp=biomasscompi, validation="LOO") #using LOO
cf                      <-  as.data.frame(coef(mod,ncomp=biomasscomp,intercept=T))
bands                   <-  as.data.frame(rownames(cf))

#Set up for each model: biomass, legumes, C4, C3, Fb and Lp, C4p, C3p, Fp

#Cut the NAs out of the data set before partitioning the data
soil_biomass            <-  soil[!is.na(soil$biomass), ]
soil_Lp                 <-  soil[!is.na(soil$Lp), ]
soil_Fp                 <-  soil[!is.na(soil$Fp), ]
soil_C4p                <-  soil[!is.na(soil$C4p), ]
soil_C3p                <-  soil[!is.na(soil$C3p), ]
soil_monocot 			<-  soil[!is.na(soil$monocot), ]
soil_dicot   			<-  soil[!is.na(soil$dicot), ]


################  FG PROPORTIONS ####################
#########################################################

############################ LEGUME Proportions ############################
#legume proportion

soil_Lp           			<- soil_Lp2

set.seed(35633399)
subset 						<- createDataPartition(soil_Lp$Lp, p=0.75, list=F, times=1) #identifies rows to use
testData 					<- soil_Lp[subset,] #keep data to run plsr models
ivData 						<- soil_Lp[-subset,] #true validation data
nr.iv 						<- nrow(ivData)
nr.all 						<- nrow(soil_Lp)

set.seed(65215792)
subset2 						<- createDataPartition(testData$Lp, p=0.80, list=F, times=nsims) #partitions nsims times - a matrix of random data partitions that is balanced
nr.cal 						<- nrow(subset2)
nr.cv  						<- nrow(testData)-nrow(subset2)

predict.cal 				<- matrix(nrow = nr.cal, ncol = 1)
predict.cal 				<- data.frame(predict.cal)

predict.cv 					<- matrix(nrow=nr.cv,ncol=1)
predict.cv 					<- data.frame(predict.cv)

predict.iv 					<- matrix(nrow=nr.iv,ncol=1) #check nrow
predict.iv 					<- data.frame(predict.iv)

cf 							<- data.frame()
coef 						<- data.frame(bands)

sim.cal 						<- vector()
sim.cv 						<- vector()
sim.iv 						<- vector()

Rsq.cal    					<- matrix(nrow=1,ncol=1)
Rsq.cal 						<- data.frame(Rsq.cal)

Pval.cal   					<- matrix(nrow=1,ncol=1)
Pval.cal 					<- data.frame(Pval.cal)

RMSEP.cal  					<- matrix(nrow=1,ncol=1)
RMSEP.cal 					<- data.frame(RMSEP.cal)

BIAS.cal   					<- matrix(nrow=1,ncol=1)
BIAS.cal 					<- data.frame(BIAS.cal)

PERC_RMSEP.cal 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.cal 				<- data.frame(PERC_RMSEP.cal)

Rsq.cv     					<- matrix(nrow=1,ncol=1)
Rsq.cv 						<- data.frame(Rsq.cv)

Pval.cv    					<- matrix()
Pval.cv 						<- data.frame(Pval.cv)

RMSEP.cv   					<- matrix()
RMSEP.cv 					<- data.frame(RMSEP.cv)

BIAS.cv    					<- matrix()
BIAS.cv 						<- data.frame(BIAS.cv)

PERC_RMSEP.cv 				<- matrix()
PERC_RMSEP.cv 				<- data.frame(PERC_RMSEP.cv)

Rsq.iv    					<- matrix(nrow=1,ncol=1)
Rsq.iv 						<- data.frame(Rsq.iv)

Pval.iv   					<- matrix(nrow=1,ncol=1)
Pval.iv 						<- data.frame(Pval.iv)

RMSEP.iv  					<- matrix(nrow=1,ncol=1)
RMSEP.iv 					<- data.frame(RMSEP.iv)

BIAS.iv   					<- matrix(nrow=1,ncol=1)
BIAS.iv 						<- data.frame(BIAS.iv)

PERC_RMSEP.iv 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv 				<- data.frame(PERC_RMSEP.iv)

PERC_RMSEP.iv2 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv2 				<- data.frame(PERC_RMSEP.iv2)

comp 						<- Lpcomp

for (i in seq(nsims)){
  print(i) 
  
  calData 					<- testData[subset2[,i], ] # training data; uses the ith column
  cvData  					<- testData[-subset2[,i], ] # cross validation data
  
  calData 					<- as.data.frame(calData)
  cvData 					<- as.data.frame(cvData)
  
  sim_num                 	<- paste0("sim_",i)
  mod                     	<- plsr(Lp ~ calData$REF, data=calData, ncomp=comp, validation="none") 
  
  sim.cal                 	<- predict(mod, ncomp=comp, which="train", newdata=calData$REF)
  predict.cal[,sim_num]   	<- as.data.frame(sim.cal) # calibration
  
  sim.cv                  	<- predict(mod, ncomp=comp, newdata=cvData$REF)
  predict.cv[,sim_num]    	<- sim.cv # cross validation
  
  sim.iv                  	<- predict(mod, ncomp=comp, newdata=ivData$REF)
  predict.iv[,sim_num]    	<- sim.iv # true validation
  

  cf                      	<-  as.data.frame(coef(mod,ncomp=comp,intercept=TRUE))
  coef[sim_num]           	<-  cf[,1]
  
  
  #calibration results
  y 							<- calData$Lp
  lm1 						<- summary(lm(y ~  sim.cal)) #run to check
  Rsq.cal[sim_num,1] 		<- lm1$r.squared
  Pval.cal[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cal[sim_num,1] 		<- sqrt(mean((sim.cal - y)^2)) 
  BIAS.cal[sim_num,1] 		<- mean(sim.cal)-mean(y)
  PERC_RMSEP.cal[sim_num,1] <- RMSEP.cal[sim_num,1]/(max(y)-min(y))*100
  
  #cross validation results
  y 							<- cvData$Lp
  lm1 						<- summary(lm(y ~  sim.cv)) #run to check
  Rsq.cv[sim_num,1] 			<- lm1$r.squared
  Pval.cv[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cv[sim_num,1] 		<- sqrt(mean((sim.cv - y)^2)) 
  BIAS.cv[sim_num,1] 		<- mean(sim.cv)-mean(y)
  PERC_RMSEP.cv[sim_num,1] 	<- RMSEP.cv[sim_num,1]/(max(y)-min(y))*100
  
  #independent validation results
  y   						<- ivData$Lp
  lm1 						<- summary(lm(y ~  sim.iv)) #run to check
  Rsq.iv 					<- lm1$r.squared
  Pval.iv 					<- lm1$coefficients[2,4]
  RMSEP.iv 					<- sqrt(mean((sim.iv - y)^2)) 
  BIAS.iv 					<- mean(sim.iv)-mean(y)
  PERC_RMSEP.iv 				<- RMSEP.iv/(max(y)-min(y))*100
  PERC_RMSEP.iv2 			<- RMSEP.iv/(max(soil_Lp$Lp)-min(soil_Lp$Lp))*100
}
#Model coefficients to save
Lp.coef <- coef

#Use coefficients to generate model predictions and save


data<-as.data.frame(soil_Lp)

coef 						<- coef[-1]
int  						<- as.matrix(coef[1,])
coef 						<- as.matrix(coef[-1,])
REF  						<- as.matrix(data$REF)

predict.cv 					<- (REF[,] %*% coef[,] + int[,])
predict.cv 					<- data.frame(predict.cv)
predict.cv$Plot 				<- data$Plot
predict.cv$measured 			<- data$Lp
predict.cv$cv.mean.pred 		<- rowMeans(predict.cv[,1:nsims])
predict.cv$cv.std 			<- apply(predict.cv[,1:nsims],1,sd)
predict.cv$cv.se   			<- apply(predict.cv[,1:nsims],1,sem)

predict.iv$Plot 				<- ivData$Plot
predict.iv$measured 			<- ivData$Lp
predict.iv$iv.mean.pred 		<- rowMeans(predict.iv[,3:nsims])
predict.iv$iv.std 			<- apply(predict.iv[,3:nsims],1,sd)
predict.iv$iv.se   			<- apply(predict.iv[,3:nsims],1,sem)

Lp_predict.cv 				<- data.frame(predict.cv)
Lp_predict.iv  				<- data.frame(predict.iv)

mx 							<- max(soil_Lp$Lp)
mn 							<- min(soil_Lp$Lp)

#statistical outputs
Rsq.cal						<- as.vector(Rsq.cal)
Pval.cal    				<- as.vector(Pval.cal)
RMSEP.cal   				<- as.vector(RMSEP.cal)
BIAS.cal    				<- as.vector(BIAS.cal)
PERC_RMSEP.cal 				<- as.vector(PERC_RMSEP.cal)

Rsq.cv  						<- as.vector(Rsq.cv)
Pval.cv 						<- as.vector(Pval.cv)
RMSEP.cv 					<- as.vector(RMSEP.cv)
BIAS.cv    					<- as.vector(BIAS.cv)
PERC_RMSEP.cv 				<- as.vector(PERC_RMSEP.cv)

Rsq.iv 						<- as.vector(Rsq.iv)
Pval.iv 						<- as.vector(Pval.iv)
RMSEP.iv   					<- as.vector(RMSEP.iv)
BIAS.iv    					<- as.vector(BIAS.iv)
PERC_RMSEP.iv 				<- as.vector(PERC_RMSEP.iv)
PERC_RMSEP.iv2 				<- as.vector(PERC_RMSEP.iv2)


Lp_stats 					<- cbind(mx,mn,Rsq.cal, Pval.cal,RMSEP.cal,BIAS.cal,PERC_RMSEP.cal,Rsq.cv,Pval.cv,RMSEP.cv,BIAS.cv,PERC_RMSEP.cv)
Lp_iv_stats 				<- cbind(Rsq.iv,Pval.iv,RMSEP.iv,BIAS.iv,PERC_RMSEP.iv,PERC_RMSEP.iv2)

#save legume stats 
Lp                          <- Lp_stats[-1,]
Lp_stats[1,]                <- colMeans(Lp)
Lp_stats.means              <- Lp_stats[1,]


########## C4 proportion #########################
##################################################

set.seed(93132687)
subset 						<- createDataPartition(soil_C4p$C4p, p=0.75, list=F, times=1) #identifies rows to use
testData 					<- soil_C4p[subset,] #keep data to run plsr models
ivData 						<- soil_C4p[-subset,] #true validation data
nr.iv 						<- nrow(ivData)
nr.all 						<- nrow(soil_C4p)

set.seed(54225756)
subset2 						<- createDataPartition(testData$C4p, p=0.80, list=F, times=nsims) #partitions nsims times - a matrix of random data partitions that is balanced
nr.cal 						<- nrow(subset2)
nr.cv  						<- nrow(testData)-nrow(subset2)

predict.cal 				<- matrix(nrow = nr.cal, ncol = 1)
predict.cal 				<- data.frame(predict.cal)

predict.cv 					<- matrix(nrow=nr.cv,ncol=1)
predict.cv 					<- data.frame(predict.cv)

predict.iv 					<- matrix(nrow=nr.iv,ncol=1) #check nrow
predict.iv 					<- data.frame(predict.iv)

cf 							<- data.frame()
coef 						<- data.frame(bands)

sim.cal 						<- vector()
sim.cv 						<- vector()
sim.iv 						<- vector()

Rsq.cal    					<- matrix(nrow=1,ncol=1)
Rsq.cal 						<- data.frame(Rsq.cal)

Pval.cal   					<- matrix(nrow=1,ncol=1)
Pval.cal 					<- data.frame(Pval.cal)

RMSEP.cal  					<- matrix(nrow=1,ncol=1)
RMSEP.cal 					<- data.frame(RMSEP.cal)

BIAS.cal   					<- matrix(nrow=1,ncol=1)
BIAS.cal 					<- data.frame(BIAS.cal)

PERC_RMSEP.cal 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.cal 				<- data.frame(PERC_RMSEP.cal)

Rsq.cv     					<- matrix(nrow=1,ncol=1)
Rsq.cv 						<- data.frame(Rsq.cv)

Pval.cv    					<- matrix()
Pval.cv 						<- data.frame(Pval.cv)

RMSEP.cv   					<- matrix()
RMSEP.cv 					<- data.frame(RMSEP.cv)

BIAS.cv    					<- matrix()
BIAS.cv 						<- data.frame(BIAS.cv)

PERC_RMSEP.cv 				<- matrix()
PERC_RMSEP.cv 				<- data.frame(PERC_RMSEP.cv)

Rsq.iv    					<- matrix(nrow=1,ncol=1)
Rsq.iv 						<- data.frame(Rsq.iv)

Pval.iv   					<- matrix(nrow=1,ncol=1)
Pval.iv 						<- data.frame(Pval.iv)

RMSEP.iv  					<- matrix(nrow=1,ncol=1)
RMSEP.iv 					<- data.frame(RMSEP.iv)

BIAS.iv   					<- matrix(nrow=1,ncol=1)
BIAS.iv 						<- data.frame(BIAS.iv)

PERC_RMSEP.iv 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv 				<- data.frame(PERC_RMSEP.iv)

PERC_RMSEP.iv2 				<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv2 				<- data.frame(PERC_RMSEP.iv2)

#input data
comp 						<- C4pcomp

for (i in seq(nsims)){
  print(i) 
  #  i=1
  
  calData <- testData[subset2[,i], ] # training data; uses the ith column
  cvData  <- testData[-subset2[,i], ] # cross validation data
  
  calData <-as.data.frame(calData)
  cvData  <-as.data.frame(cvData)
  
  sim_num                 <- paste0("sim_",i)
  mod                     <- plsr(C4p ~ calData$REF, data=calData, ncomp=comp, validation="none") #using CV cross validation
  
  sim.cal                 <- predict(mod, ncomp=comp, which="train", newdata=calData$REF)
  predict.cal[,sim_num]   <- as.data.frame(sim.cal) # calibration
  
  sim.cv                  <- predict(mod, ncomp=comp, newdata=cvData$REF)
  predict.cv[,sim_num]    <-  sim.cv # cross validation
  
  sim.iv                  <- predict(mod, ncomp=comp, newdata=ivData$REF)
  predict.iv[,sim_num]    <- sim.iv # true validation
  
  cf                      <-  as.data.frame(coef(mod,ncomp=comp,intercept=TRUE))
  coef[sim_num]           <-  cf[,1]
  
  
  #calibration results
  y 							<- calData$C4p
  lm1 						<- summary(lm(y ~  sim.cal)) #run to check
  Rsq.cal[sim_num,1] 		<- lm1$r.squared
  Pval.cal[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cal[sim_num,1] 		<- sqrt(mean((sim.cal - y)^2)) 
  BIAS.cal[sim_num,1] 		<- mean(sim.cal)-mean(y)
  PERC_RMSEP.cal[sim_num,1] <- RMSEP.cal[sim_num,1]/(max(y)-min(y))*100
  
  #cross validation results
  y 							<- cvData$C4p
  lm1 						<- summary(lm(y ~  sim.cv)) #run to check
  Rsq.cv[sim_num,1] 			<- lm1$r.squared
  Pval.cv[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cv[sim_num,1] 		<- sqrt(mean((sim.cv - y)^2)) 
  BIAS.cv[sim_num,1] 		<- mean(sim.cv)-mean(y)
  PERC_RMSEP.cv[sim_num,1] 	<- RMSEP.cv[sim_num,1]/(max(y)-min(y))*100
  
  #true validation results
  y   						<- ivData$C4p
  lm1 						<- summary(lm(y ~  sim.iv)) #run to check
  Rsq.iv 					<- lm1$r.squared
  Pval.iv 					<- lm1$coefficients[2,4]
  RMSEP.iv 					<- sqrt(mean((sim.iv - y)^2)) 
  BIAS.iv 					<- mean(sim.iv)-mean(y)
  PERC_RMSEP.iv 				<- RMSEP.iv/(max(y)-min(y))*100
  PERC_RMSEP.iv2 			<- RMSEP.iv/(max(soil_C4p$C4p)-min(soil_C4p$C4p))*100
}

#Model coefficients to save
C4p.coef 					<- coef

data 						<-as.data.frame(soil_C4p) #set

#Use coefficients to generate predictions and save
coef 						<- coef[-1]
int  						<- as.matrix(coef[1,])
coef 						<- as.matrix(coef[-1,])
REF  						<- as.matrix(data$REF)

predict.cv 					<-(REF[,] %*% coef[,] + int[,])
predict.cv 					<-data.frame(predict.cv)
predict.cv$Plot 				<- data$Plot
predict.cv$measured 			<- data$C4p #change
predict.cv$cv.mean.pred 		<- rowMeans(predict.cv[,1:nsims])
predict.cv$cv.std 			<- apply(predict.cv[,1:nsims],1,sd)
predict.cv$cv.se   			<- apply(predict.cv[,1:nsims],1,sem)

predict.iv$Plot 				<- ivData$Plot
predict.iv$measured 			<-ivData$C4p
predict.iv$iv.mean.pred 		<- rowMeans(predict.iv[,3:nsims])
predict.iv$iv.std 			<- apply(predict.iv[,3:nsims],1,sd)
predict.iv$iv.se   			<- apply(predict.iv[,3:nsims],1,sem)

C4p_predict.cv 				<- data.frame(predict.cv)
C4p_predict.iv  				<- data.frame(predict.iv)

#statistical outputs
Rsq.cal						<- as.vector(Rsq.cal)
Pval.cal    				<-as.vector(Pval.cal)
RMSEP.cal   				<- as.vector(RMSEP.cal)
BIAS.cal    				<-as.vector(BIAS.cal)
PERC_RMSEP.cal 				<- as.vector(PERC_RMSEP.cal)

Rsq.cv  						<- as.vector(Rsq.cv)
Pval.cv 						<-as.vector(Pval.cv)
RMSEP.cv 					<- as.vector(RMSEP.cv)
BIAS.cv    					<- as.vector(BIAS.cv)
PERC_RMSEP.cv 				<- as.vector(PERC_RMSEP.cv)

Rsq.iv 						<- as.vector(Rsq.iv)
Pval.iv 						<-as.vector(Pval.iv)
RMSEP.iv   					<- as.vector(RMSEP.iv)
BIAS.iv    					<- as.vector(BIAS.iv)
PERC_RMSEP.iv 				<- as.vector(PERC_RMSEP.iv)
PERC_RMSEP.iv2 				<- as.vector(PERC_RMSEP.iv2)

mx 							<- max(soil_C4p$C4p)
mn 							<- min(soil_C4p$C4p)

C4p_stats 					<- cbind(mx, mn, Rsq.cal, Pval.cal,RMSEP.cal,BIAS.cal,PERC_RMSEP.cal,Rsq.cv,Pval.cv,RMSEP.cv,BIAS.cv,PERC_RMSEP.cv)
C4p_iv_stats 				<- cbind(Rsq.iv,Pval.iv,RMSEP.iv,BIAS.iv,PERC_RMSEP.iv,PERC_RMSEP.iv2)

#print stats and read them back in to calculate means

C4p 							<- C4p_stats[-1,]
C4p_stats[1,]				<-colMeans(C4p)
C4p_stats.means 				<- C4p_stats[1,]


########## monocot proportion #########################
#######################################################

set.seed(93132687)
subset   	<- createDataPartition(soil_monocot$monocot, p=0.75, list=F, times=1) #identifies rows to use
testData 	<- soil_monocot[subset,] #keep data to run plsr models
ivData 		<- soil_monocot[-subset,] #true validation data
nr.iv 		<- nrow(ivData)
nr.all 		<- nrow(soil_monocot)

set.seed(54225756)
subset2 		<-createDataPartition(testData$monocot, p=0.80, list=F, times=nsims) #partitions nsims times - a matrix of random data partitions that is balanced
nr.cal 		<- nrow(subset2)
nr.cv  		<- nrow(testData)-nrow(subset2)

predict.cal <- matrix(nrow = nr.cal, ncol = 1)
predict.cal <- data.frame(predict.cal)

predict.cv 	<- matrix(nrow=nr.cv,ncol=1)
predict.cv 	<- data.frame(predict.cv)

predict.iv 	<- matrix(nrow=nr.iv,ncol=1) #check nrow
predict.iv 	<- data.frame(predict.iv)

cf 			<- data.frame()
coef 		<- data.frame(bands)

sim.cal 		<- vector()
sim.cv 		<- vector()
sim.iv      <- vector()

Rsq.cal    	<- matrix(nrow=1,ncol=1)
Rsq.cal 		<- data.frame(Rsq.cal)

Pval.cal   	<- matrix(nrow=1,ncol=1)
Pval.cal 	<- data.frame(Pval.cal)

RMSEP.cal  	<- matrix(nrow=1,ncol=1)
RMSEP.cal 	<- data.frame(RMSEP.cal)

BIAS.cal   	<- matrix(nrow=1,ncol=1)
BIAS.cal 	<- data.frame(BIAS.cal)

PERC_RMSEP.cal <- matrix(nrow=1,ncol=1)
PERC_RMSEP.cal <- data.frame(PERC_RMSEP.cal)

Rsq.cv     	<- matrix(nrow=1,ncol=1)
Rsq.cv 		<- data.frame(Rsq.cv)

Pval.cv    	<- matrix()
Pval.cv 		<- data.frame(Pval.cv)

RMSEP.cv   	<- matrix()
RMSEP.cv 	<- data.frame(RMSEP.cv)

BIAS.cv    	<- matrix()
BIAS.cv 		<- data.frame(BIAS.cv)

PERC_RMSEP.cv <- matrix()
PERC_RMSEP.cv <- data.frame(PERC_RMSEP.cv)

Rsq.iv    	<- matrix(nrow=1,ncol=1)
Rsq.iv 		<- data.frame(Rsq.iv)

Pval.iv   	<- matrix(nrow=1,ncol=1)
Pval.iv 		<- data.frame(Pval.iv)

RMSEP.iv  	<- matrix(nrow=1,ncol=1)
RMSEP.iv 	<- data.frame(RMSEP.iv)

BIAS.iv   	<- matrix(nrow=1,ncol=1)
BIAS.iv 		<- data.frame(BIAS.iv)

PERC_RMSEP.iv <- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv <- data.frame(PERC_RMSEP.iv)

PERC_RMSEP.iv2 <- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv2 <- data.frame(PERC_RMSEP.iv2)

#input data

comp 			<- monocotcomp

for (i in seq(nsims)){
  print(i) 
  #  i=1
  
  calData 				<- testData[subset2[,i], ] # training data; uses the ith column
  cvData  				<- testData[-subset2[,i], ] # cross validation data
  
  calData 				<-as.data.frame(calData)
  cvData 				<-as.data.frame(cvData)
  
  sim_num                 <- paste0("sim_",i)
  mod                     <- plsr(monocot ~ calData$REF, data=calData, ncomp=comp, validation="none") #using CV cross validation
  
  sim.cal                 <- predict(mod, ncomp=comp, which="train", newdata=calData$REF)
  predict.cal[,sim_num]   <- as.data.frame(sim.cal) # calibration
  
  sim.cv                  <- predict(mod, ncomp=comp, newdata=cvData$REF)
  predict.cv[,sim_num]    <-  sim.cv # cross validation
  
  sim.iv                  <- predict(mod, ncomp=comp, newdata=ivData$REF)
  predict.iv[,sim_num]    <- sim.iv # true validation
  
  cf                      <-  as.data.frame(coef(mod,ncomp=comp,intercept=TRUE))
  coef[sim_num]           <-  cf[,1]
  
  
  #calibration results
  y 							<- calData$monocot
  lm1 						<- summary(lm(y ~  sim.cal)) #run to check
  Rsq.cal[sim_num,1] 		<- lm1$r.squared
  Pval.cal[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cal[sim_num,1] 		<- sqrt(mean((sim.cal - y)^2)) 
  BIAS.cal[sim_num,1] 		<- mean(sim.cal)-mean(y)
  PERC_RMSEP.cal[sim_num,1] <- RMSEP.cal[sim_num,1]/(max(y)-min(y))*100
  
  #cross validation results
  y 							<- cvData$monocot
  lm1 						<- summary(lm(y ~  sim.cv)) #run to check
  Rsq.cv[sim_num,1] 			<- lm1$r.squared
  Pval.cv[sim_num,1] 		<- lm1$coefficients[2,4]
  RMSEP.cv[sim_num,1] 		<- sqrt(mean((sim.cv - y)^2)) 
  BIAS.cv[sim_num,1] 		<- mean(sim.cv)-mean(y)
  PERC_RMSEP.cv[sim_num,1] 	<- RMSEP.cv[sim_num,1]/(max(y)-min(y))*100
  
  #true validation results
  y   						<- ivData$monocot
  lm1 						<- summary(lm(y ~  sim.iv)) #run to check
  Rsq.iv 					<- lm1$r.squared
  Pval.iv 					<- lm1$coefficients[2,4]
  RMSEP.iv 					<- sqrt(mean((sim.iv - y)^2)) 
  BIAS.iv 					<- mean(sim.iv)-mean(y)
  PERC_RMSEP.iv 				<- RMSEP.iv/(max(y)-min(y))*100
  PERC_RMSEP.iv2 			<- RMSEP.iv/(max(soil_monocot$monocot)-min(soil_monocot$monocot))*100
}

#Model coefficients to save
monocot.coef 	<- coef

data 						<-as.data.frame(soil_monocot) #set

#Use coefficients to generate predictions and save
coef 						<- coef[-1]
int  						<- as.matrix(coef[1,])
coef 						<- as.matrix(coef[-1,])
REF  						<- as.matrix(data$REF)

predict.cv 					<-(REF[,] %*% coef[,] + int[,])
predict.cv 					<-data.frame(predict.cv)
predict.cv$Plot 				<- data$Plot
predict.cv$measured 			<- data$monocot #change
predict.cv$cv.mean.pred 		<- rowMeans(predict.cv[,1:nsims])
predict.cv$cv.std 			<- apply(predict.cv[,1:nsims],1,sd)
predict.cv$cv.se   			<- apply(predict.cv[,1:nsims],1,sem)

predict.iv$Plot 				<- ivData$Plot
predict.iv$measured 			<- ivData$monocot
predict.iv$iv.mean.pred 		<- rowMeans(predict.iv[,3:nsims])
predict.iv$iv.std 			<- apply(predict.iv[,3:nsims],1,sd)
predict.iv$iv.se   			<- apply(predict.iv[,3:nsims],1,sem)

monocot_predict.cv 			<- data.frame(predict.cv)
monocot_predict.iv  			<- data.frame(predict.iv)

predict.iv$Plot 				<- ivData$Plot
predict.iv$measured 			<- ivData$monocot
predict.iv$iv.mean.pred 		<- rowMeans(predict.iv[,3:nsims])
predict.iv$iv.std 			<- apply(predict.iv[,3:nsims],1,sd)
predict.iv$iv.se   			<- apply(predict.iv[,3:nsims],1,sem)

monocot_predict.cv 			<- data.frame(predict.cv)
monocot_predict.iv  			<- data.frame(predict.iv)

#statistical outputs
Rsq.cal						<- as.vector(Rsq.cal)
Pval.cal    				<- as.vector(Pval.cal)
RMSEP.cal   				<- as.vector(RMSEP.cal)
BIAS.cal    				<- as.vector(BIAS.cal)
PERC_RMSEP.cal 				<- as.vector(PERC_RMSEP.cal)

Rsq.cv  						<- as.vector(Rsq.cv)
Pval.cv 						<- as.vector(Pval.cv)
RMSEP.cv 					<- as.vector(RMSEP.cv)
BIAS.cv    					<- as.vector(BIAS.cv)
PERC_RMSEP.cv 				<- as.vector(PERC_RMSEP.cv)

Rsq.iv 						<- as.vector(Rsq.iv)
Pval.iv 						<- as.vector(Pval.iv)
RMSEP.iv   					<- as.vector(RMSEP.iv)
BIAS.iv    					<- as.vector(BIAS.iv)
PERC_RMSEP.iv 				<- as.vector(PERC_RMSEP.iv)
PERC_RMSEP.iv2 				<- as.vector(PERC_RMSEP.iv2)

mx <- max(soil_monocot$monocot)
mn <- min(soil_monocot$monocot)

monocot_stats 				<- cbind(mx, mn, Rsq.cal, Pval.cal,RMSEP.cal,BIAS.cal,PERC_RMSEP.cal,Rsq.cv,Pval.cv,RMSEP.cv,BIAS.cv,PERC_RMSEP.cv)
monocot_iv_stats 			<- cbind(Rsq.iv,Pval.iv,RMSEP.iv,BIAS.iv,PERC_RMSEP.iv,PERC_RMSEP.iv2)

monocot 						<- monocot_stats[-1,]
monocot_stats[1,]			<- colMeans(monocot)
monocot_stats.means 			<- monocot_stats[1,]

#
########## dicot proportion #########################
#####################################################

set.seed(93132687)
subset 		<- createDataPartition(soil_dicot$dicot, p=0.75, list=FALSE, times=1) #identifies rows to use
testData 	<- soil_dicot[subset,] #keep data to run plsr models
ivData 		<- soil_dicot[-subset,] #true validation data
nr.iv 		<- nrow(ivData)
nr.all 		<- nrow(soil_dicot)

set.seed(54225756)
subset2 		<- createDataPartition(testData$dicot, p=0.80, list=FALSE, times=nsims) #partitions nsims times - a matrix of random data partitions that is balanced
nr.cal 		<- nrow(subset2)
nr.cv  		<- nrow(testData)-nrow(subset2)

predict.cal <- matrix(nrow = nr.cal, ncol = 1)
predict.cal <- data.frame(predict.cal)

predict.cv 	<- matrix(nrow=nr.cv,ncol=1)
predict.cv 	<- data.frame(predict.cv)

predict.iv 	<- matrix(nrow=nr.iv,ncol=1) #check nrow
predict.iv 	<- data.frame(predict.iv)

predict.all <- matrix(nrow=nr.all,ncol=1)
predict.all <- data.frame(predict.all)

cf 			<- data.frame()
coef 		<- data.frame(bands)

sim.cal 		<- vector()
sim.cv 		<- vector()
sim.iv 		<- vector()

Rsq.cal    	<- matrix(nrow=1,ncol=1)
Rsq.cal 		<- data.frame(Rsq.cal)

Pval.cal   	<- matrix(nrow=1,ncol=1)
Pval.cal 	<- data.frame(Pval.cal)

RMSEP.cal  	<- matrix(nrow=1,ncol=1)
RMSEP.cal 	<- data.frame(RMSEP.cal)

BIAS.cal   	<- matrix(nrow=1,ncol=1)
BIAS.cal 	<- data.frame(BIAS.cal)

PERC_RMSEP.cal <- matrix(nrow=1,ncol=1)
PERC_RMSEP.cal <- data.frame(PERC_RMSEP.cal)

Rsq.cv     	<- matrix(nrow=1,ncol=1)
Rsq.cv 		<- data.frame(Rsq.cv)

Pval.cv    	<- matrix()
Pval.cv 		<- data.frame(Pval.cv)

RMSEP.cv   	<- matrix()
RMSEP.cv 	<- data.frame(RMSEP.cv)

BIAS.cv    	<- matrix()
BIAS.cv 		<- data.frame(BIAS.cv)

PERC_RMSEP.cv <- matrix()
PERC_RMSEP.cv <- data.frame(PERC_RMSEP.cv)

Rsq.iv    	<- matrix(nrow=1,ncol=1)
Rsq.iv 		<- data.frame(Rsq.iv)

Pval.iv   	<- matrix(nrow=1,ncol=1)
Pval.iv 		<- data.frame(Pval.iv)

RMSEP.iv  	<- matrix(nrow=1,ncol=1)
RMSEP.iv 	<- data.frame(RMSEP.iv)

BIAS.iv   	<- matrix(nrow=1,ncol=1)
BIAS.iv 		<- data.frame(BIAS.iv)

PERC_RMSEP.iv <- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv <- data.frame(PERC_RMSEP.iv)

PERC_RMSEP.iv2 <- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv2 <- data.frame(PERC_RMSEP.iv2)

#input data

comp <- dicotcomp
all.data <- soil_dicot

for (i in seq(nsims)){
  print(i) 
  #  i=1
  
  calData <- testData[subset2[,i], ] # training data; uses the ith column
  cvData  <- testData[-subset2[,i], ] # cross validation data
  
  calData <-as.data.frame(calData)
  cvData  <-as.data.frame(cvData)
  
  sim_num                 <- paste0("sim_",i)
  mod                     <- plsr(dicot ~ calData$REF, data=calData, ncomp=comp, validation="none") #using CV cross validation
  
  sim.cal                 <- predict(mod, ncomp=comp, which="train", newdata=calData$REF)
  predict.cal[,sim_num]   <- as.data.frame(sim.cal) # calibration
  
  sim.cv                  <- predict(mod, ncomp=comp, newdata=cvData$REF)
  predict.cv[,sim_num]    <-  sim.cv # cross validation
  
  sim.iv                  <- predict(mod, ncomp=comp, newdata=ivData$REF)
  predict.iv[,sim_num]    <- sim.iv # true validation
  
  sim.all                 <- predict(mod, ncomp=comp, newdata=all.data$REF)
  predict.all[,sim_num]   <- sim.all # true validation
  
  cf                      <-  as.data.frame(coef(mod,ncomp=comp,intercept=TRUE))
  coef[sim_num]           <-  cf[,1]
  
  
  #calibration results
  y 						<- calData$dicot
  lm1 					<- summary(lm(y ~  sim.cal)) #run to check
  Rsq.cal[sim_num,1] 	<- lm1$r.squared
  Pval.cal[sim_num,1] 	<- lm1$coefficients[2,4]
  RMSEP.cal[sim_num,1] 	<- sqrt(mean((sim.cal - y)^2)) 
  BIAS.cal[sim_num,1] 	<- mean(sim.cal)-mean(y)
  PERC_RMSEP.cal[sim_num,1] <- RMSEP.cal[sim_num,1]/(max(y)-min(y))*100
  
  #cross validation results
  y 						<- cvData$dicot
  lm1 					<- summary(lm(y ~  sim.cv)) #run to check
  Rsq.cv[sim_num,1] 		<- lm1$r.squared
  Pval.cv[sim_num,1] 	<- lm1$coefficients[2,4]
  RMSEP.cv[sim_num,1] 	<- sqrt(mean((sim.cv - y)^2)) 
  BIAS.cv[sim_num,1] 	<- mean(sim.cv)-mean(y)
  PERC_RMSEP.cv[sim_num,1] <- RMSEP.cv[sim_num,1]/(max(y)-min(y))*100
  
  #true validation results
  y   					<- ivData$dicot
  lm1 					<- summary(lm(y ~  sim.iv)) #run to check
  Rsq.iv 				<- lm1$r.squared
  Pval.iv 				<- lm1$coefficients[2,4]
  RMSEP.iv 				<- sqrt(mean((sim.iv - y)^2)) 
  BIAS.iv 				<- mean(sim.iv)-mean(y)
  PERC_RMSEP.iv 			<- RMSEP.iv/(max(y)-min(y))*100
  PERC_RMSEP.iv2 		<- RMSEP.iv/(max(soil_dicot$dicot)-min(soil_dicot$dicot))*100
}
#Model coefficients to save
dicot.coef 				<- coef

data <-as.data.frame(soil_dicot) #set

#Use coefficients to generate predictions and save
coef 					<- coef[-1]
int  					<- as.matrix(coef[1,])
coef 					<- as.matrix(coef[-1,])
REF  					<- as.matrix(data$REF)

predict.cv 				<-(REF[,] %*% coef[,] + int[,])
predict.cv 				<-data.frame(predict.cv)
predict.cv$Plot 			<- data$Plot
predict.cv$measured 		<- data$dicot #change
predict.cv$cv.mean.pred <- rowMeans(predict.cv[,1:nsims])
predict.cv$cv.std 		<- apply(predict.cv[,1:nsims],1,sd)
predict.cv$cv.se   		<- apply(predict.cv[,1:nsims],1,sem)

predict.iv$Plot 			<- ivData$Plot
predict.iv$measured 		<-ivData$dicot
predict.iv$iv.mean.pred <- rowMeans(predict.iv[,3:nsims])
predict.iv$iv.std 		<- apply(predict.iv[,3:nsims],1,sd)
predict.iv$iv.se   		<- apply(predict.iv[,3:nsims],1,sem)

dicot_predict.cv 		<- data.frame(predict.cv)
dicot_predict.iv  		<- data.frame(predict.iv)

#statistical outputs
Rsq.cal					<- as.vector(Rsq.cal)
Pval.cal    			<-as.vector(Pval.cal)
RMSEP.cal   			<- as.vector(RMSEP.cal)
BIAS.cal    			<-as.vector(BIAS.cal)
PERC_RMSEP.cal 			<- as.vector(PERC_RMSEP.cal)

Rsq.cv  					<- as.vector(Rsq.cv)
Pval.cv 					<-as.vector(Pval.cv)
RMSEP.cv 				<- as.vector(RMSEP.cv)
BIAS.cv    				<- as.vector(BIAS.cv)
PERC_RMSEP.cv 			<- as.vector(PERC_RMSEP.cv)

Rsq.iv 					<- as.vector(Rsq.iv)
Pval.iv 					<- as.vector(Pval.iv)
RMSEP.iv   				<- as.vector(RMSEP.iv)
BIAS.iv    				<- as.vector(BIAS.iv)
PERC_RMSEP.iv 			<- as.vector(PERC_RMSEP.iv)
PERC_RMSEP.iv2 			<- as.vector(PERC_RMSEP.iv2)

mx <- max(soil_dicot$dicot)
mn <- min(soil_dicot$dicot)

dicot_stats 			<- cbind(mx, mn, Rsq.cal, Pval.cal,RMSEP.cal,BIAS.cal,PERC_RMSEP.cal,Rsq.cv,Pval.cv,RMSEP.cv,BIAS.cv,PERC_RMSEP.cv)
dicot_iv_stats 			<- cbind(Rsq.iv,Pval.iv,RMSEP.iv,BIAS.iv,PERC_RMSEP.iv,PERC_RMSEP.iv2)

#print stats 

dicot 					<- dicot_stats[-1,]
dicot_stats[1,]			<- colMeans(dicot)
dicot_stats.means 		<- dicot_stats[1,]

##############################
##############################
#OUTLIER ANALYSIS
snr =100
nsims=100

#set.seed(25689321)
outlier <- matrix(nrow=nrow(soil_biomass),ncol=0)
outlier <-data.frame(outlier)
outlier$Plot <- soil_biomass$Plot

for (j in seq(snr)){
print(j) 
set.seed(25689321+j)

subset <- createDataPartition(soil_biomass$biomass, p=0.75, list=F, times=1) #identifies rows to use
testData <- soil_biomass[subset,] #keep data to run plsr models
ivData <- soil_biomass[-subset,] #true validation data
nr.iv <-nrow(ivData)

set.seed(24219792+j)
subset2 <-createDataPartition(testData$biomass, p=0.80, list=F, times=nsims) #partitions nsims times - a matrix of random but balanced data partitions 
nr.cal <- nrow(subset2)
nr.cv  <- nrow(testData)-nrow(subset2)

B_predict.iv <- matrix(nrow=nr.iv, ncol=1) #check nrow
B_predict.iv <- data.frame(ivData$biomass)
B_predict.iv$Plot <- ivData$Plot

Bsim.cal <-vector()
Bsim.cv <-vector()
Bsim.iv <-vector()

  for (i in seq(nsims)){

    #i=1
    calData <- testData[subset2[,i], ] # training data; uses the ith column
    cvData  <- testData[-subset2[,i], ] # cross validation data
  
    calData <-as.data.frame(calData)
    cvData <-as.data.frame(cvData)
  
    sim_num                 <- paste0("sim_",i)
    B_mod                   <- plsr(biomass ~ calData$REF, data=calData, ncomp=biomasscompi, validation="none") #using CV cross validation
  
    Bsim.iv                 <- predict(B_mod, ncomp=biomasscompi, newdata=ivData$REF)
    B_predict.iv[,sim_num]  <- Bsim.iv # true validation
  }

B_predict.iv$Plot <- ivData$Plot
B_predict.iv$measured <-ivData$biomass
B_predict.iv$iv.mean.pred <- rowMeans(B_predict.iv[,3:nsims])
B_predict.iv$diff   <-B_predict.iv$iv.mean.pred - B_predict.iv$measured

mat <- B_predict.iv[,c(2,105)]
rep<-paste0("sim_",j)

colnames(mat)[2]<-rep
outlier <- merge(outlier, mat, by="Plot", all.x=TRUE, all.y=TRUE)


}
outlier$mean.diff     <- rowMeans(outlier[,-c(1)], na.rm=TRUE)
outlier$abs.mean.diff <- abs(rowMeans(outlier[,-c(1)], na.rm=TRUE))

cutoff<-(max(soil_biomass$biomass)-min(soil_biomass$biomass))*0.4

outlier_plots <- subset(outlier, outlier$abs.mean.diff>cutoff)
outlier_plots <- outlier_plots[,-c(2:snr+1)]
pc.dat.rem <- round(nrow(outlier_plots)/nrow(soil_biomass),digits =3)
# Write out 
#filename<-paste0("~/WoodRiver.2017/Outlier_test/",rep,".",pc.dat.rem,"outlier.biomass.csv")
write.csv(outlier, filename)
#filename2<-paste0("~/WoodRiver.2017/Outlier_test/",rep,".","outlier.biomass.plots.csv")
write.csv(outlier_plots, filename2)

print(outlier_plots$Plot)

#If outliers from airborne data are present, they can be removed:
soil_biomass2 <- soil_biomass[!(soil_biomass$Plot %in% outlier_plots$Plot),]
                        

###############################
nsims=500 #(or 1000)

soil_biomass <- soil_biomass2
#biomass
set.seed(25689321) #set seed to repeat 
subset 			<- createDataPartition(soil_biomass$biomass, p=0.75, list=FALSE, times=1) #identifies rows to use
testData 		<- soil_biomass[subset,] #keep data to run plsr models
ivData 			<- soil_biomass[-subset,] #true validation data
nr.iv 			<-nrow(ivData)

set.seed(24219792)
subset2 		<-createDataPartition(testData$biomass, p=0.80, list=FALSE, times=nsims) #partitions nsims times - a matrix of random data partitions that is balanced
nr.cal 			<- nrow(subset2)
nr.cv  			<- nrow(testData)-nrow(subset2)

#biomass
B_predict.cal 	<- matrix(nrow = nr.cal, ncol = 1)
B_predict.cal 	<- data.frame(B_predict.cal)

B_predict.cv 	<- matrix(nrow=nr.cv,ncol=1)
B_predict.cv 	<- data.frame(B_predict.cv)

B_predict.iv 	<- matrix(nrow=nr.iv, ncol=1) #check nrow
B_predict.iv 	<- data.frame(ivData$biomass)
B_predict.iv$Plot <- ivData$Plot

Bcf 			<- data.frame()
B_coef 			<- data.frame(bands)

Bsim.cal 		<- vector()
Bsim.cv  		<- vector()
Bsim.iv  		<- vector()

Rsq.cal    		<- matrix(nrow=1,ncol=1)
Rsq.cal    		<- data.frame(Rsq.cal)

Pval.cal   		<- matrix(nrow=1,ncol=1)
Pval.cal   		<- data.frame(Pval.cal)

RMSEP.cal  		<- matrix(nrow=1,ncol=1)
RMSEP.cal  		<- data.frame(RMSEP.cal)

BIAS.cal   		<- matrix(nrow=1,ncol=1)
BIAS.cal   		<- data.frame(BIAS.cal)

PERC_RMSEP.cal 	<- matrix(nrow=1,ncol=1)
PERC_RMSEP.cal 	<- data.frame(PERC_RMSEP.cal)

Rsq.cv     		<- matrix(nrow=1,ncol=1)
Rsq.cv     		<- data.frame(Rsq.cv)

Pval.cv    		<- matrix()
Pval.cv    		<- data.frame(Pval.cv)

RMSEP.cv   		<- matrix()
RMSEP.cv   		<- data.frame(RMSEP.cv)

BIAS.cv    		<- matrix()
BIAS.cv    		<- data.frame(BIAS.cv)

PERC_RMSEP.cv 	<- matrix()
PERC_RMSEP.cv 	<- data.frame(PERC_RMSEP.cv)

Rsq.iv    		<- matrix(nrow=1,ncol=1)
Rsq.iv    		<- data.frame(Rsq.iv)

Pval.iv   		<- matrix(nrow=1,ncol=1)
Pval.iv   		<- data.frame(Pval.iv)

RMSEP.iv  		<- matrix(nrow=1,ncol=1)
RMSEP.iv  		<- data.frame(RMSEP.iv)

BIAS.iv   		<- matrix(nrow=1,ncol=1)
BIAS.iv   		<- data.frame(BIAS.iv)

PERC_RMSEP.iv 	<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv 	<- data.frame(PERC_RMSEP.iv)

PERC_RMSEP.iv2 	<- matrix(nrow=1,ncol=1)
PERC_RMSEP.iv2 	<- data.frame(PERC_RMSEP.iv2)


for (i in seq(nsims)){
  print(i) 
  #i=1
  
  calData <- testData[subset2[,i], ] # training data; uses the ith column
  cvData  <- testData[-subset2[,i], ] # cross validation data
  
  calData <- as.data.frame(calData)
  cvData  <- as.data.frame(cvData)
  
  sim_num                 <- paste0("sim_",i)
  B_mod                   <- plsr(biomass ~ calData$REF, data=calData, ncomp=biomasscompi, validation="none") #using CV cross validation
  
  Bsim.cal                <- predict(B_mod, ncomp=biomasscompi, which="train", newdata=calData$REF)
  B_predict.cal$biomass   <- as.vector(calData$biomass)
  B_predict.cal[,sim_num] <- as.data.frame(Bsim.cal) # calibration
  
  Bsim.cv                 <- predict(B_mod, ncomp=biomasscompi, newdata=cvData$REF)
  B_predict.cv[,sim_num]  <- Bsim.cv # cross validation
  
  Bsim.iv                 <- predict(B_mod, ncomp=biomasscompi, newdata=ivData$REF)
  B_predict.iv[,sim_num]  <- Bsim.iv # true validation
  
  Bcf                     <-  as.data.frame(coef(B_mod,ncomp=biomasscompi,intercept=TRUE))
  B_coef[sim_num]         <-  Bcf[,1]
  
  # #calibration results
  lm1 					<- summary(lm(calData$biomass ~  Bsim.cal)) #run to check
  Rsq.cal[sim_num,1] 	<- lm1$r.squared
  Pval.cal[sim_num,1] 	<- lm1$coefficients[2,4]
  RMSEP.cal[sim_num,1] 	<- sqrt(mean((Bsim.cal - calData$biomass)^2))
  BIAS.cal[sim_num,1] 	<- mean(Bsim.cal)-mean(calData$biomass)
  PERC_RMSEP.cal[sim_num,1] <- RMSEP.cal[sim_num,1]/(max(calData$biomass)-min(calData$biomass))*100

  #cross validation results
  lm1 					<- summary(lm(cvData$biomass ~  Bsim.cv)) #run to check
  Rsq.cv[sim_num,1] 	<- lm1$r.squared
  Pval.cv[sim_num,1] 	<- lm1$coefficients[2,4]
  RMSEP.cv[sim_num,1] 	<- sqrt(mean((Bsim.cv - cvData$biomass)^2))
  BIAS.cv[sim_num,1] 	<- mean(Bsim.cv)-mean(cvData$biomass)
  PERC_RMSEP.cv[sim_num,1] <- RMSEP.cv[sim_num,1]/(max(cvData$biomass)-min(cvData$biomass))*100
  
  #true validation results
  lm1 					<- summary(lm(ivData$biomass ~  Bsim.iv)) #run to check
  Rsq.iv 				<- lm1$r.squared
  Pval.iv 				<- lm1$coefficients[2,4]
  RMSEP.iv 				<- sqrt(mean((Bsim.iv - ivData$biomass)^2)) 
  BIAS.iv 				<- mean(Bsim.iv)-mean(ivData$biomass)
  PERC_RMSEP.iv 		<- RMSEP.iv/(max(ivData$biomass)-min(ivData$biomass))*100
  PERC_RMSEP.iv2 		<- RMSEP.iv/(max(soil_biomass$biomass)-min(soil_biomass$biomass))*100
}

#Model coefficients to save
B.coef 					<- B_coef

data<-as.data.frame(soil_biomass)

coef 					<- B_coef[-1]
int  					<- as.matrix(coef[1,])
coef 					<- as.matrix(coef[-1,])
REF  					<- as.matrix(data$REF)

B_predict.cv          	<-(REF[,] %*% coef[,] + int[,])
B_predict.cv          	<-data.frame(B_predict.cv)
B_predict.cv$Plot     	<- data$Plot
B_predict.cv$measured 	<- data$biomass
B_predict.cv$cv.mean.pred <- rowMeans(B_predict.cv[,1:nsims])
B_predict.cv$cv.std   	<- apply(B_predict.cv[,1:nsims],1,sd)
B_predict.cv$cv.se    	<- apply(B_predict.cv[,1:nsims],1,sem)

B_predict.iv$Plot     	<- ivData$Plot
B_predict.iv$measured 	<-ivData$biomass
B_predict.iv$iv.mean.pred <- rowMeans(B_predict.iv[,3:nsims])
B_predict.iv$iv.std   	<- apply(B_predict.iv[,3:nsims],1,sd)
B_predict.iv$iv.se    	<- apply(B_predict.iv[,3:nsims],1,sem)
B_predict.iv$diff     	<-B_predict.iv$iv.mean.pred - B_predict.iv$measured

mx <- max(soil_biomass$biomass)
mn <- min(soil_biomass$biomass)

#statistical outputs
Rsq.cal               	<- as.vector(Rsq.cal)
Pval.cal              	<-as.vector(Pval.cal)
RMSEP.cal             	<- as.vector(RMSEP.cal)
BIAS.cal              	<-as.vector(BIAS.cal)
PERC_RMSEP.cal        	<- as.vector(PERC_RMSEP.cal)

Rsq.cv        			<- as.vector(Rsq.cv)
Pval.cv 					<-as.vector(Pval.cv)
RMSEP.cv 				<- as.vector(RMSEP.cv)
BIAS.cv    				<- as.vector(BIAS.cv)
PERC_RMSEP.cv 			<- as.vector(PERC_RMSEP.cv)

Rsq.iv 					<- as.vector(Rsq.iv)
Pval.iv 				<-as.vector(Pval.iv)
RMSEP.iv   				<- as.vector(RMSEP.iv)
BIAS.iv    				<- as.vector(BIAS.iv)
PERC_RMSEP.iv 			<- as.vector(PERC_RMSEP.iv)

biomass_stats 			<- cbind(mx,mn,Rsq.cal, Pval.cal,RMSEP.cal,BIAS.cal,PERC_RMSEP.cal,Rsq.cv,Pval.cv,RMSEP.cv,BIAS.cv,PERC_RMSEP.cv)
biomass_iv_stats 		<-cbind(Rsq.iv,Pval.iv,RMSEP.iv,BIAS.iv,PERC_RMSEP.iv,PERC_RMSEP.iv2)

bm 						<- biomass_stats[-1,]
biomass_stats[1,]		<- colMeans(bm)
biomass_stats_means 	<- biomass_stats[1,]


#####################################
##### COLLATE RESULTS ###############

iv.stats <- rbind(biomass_iv_stats, Lp_iv_stats, Fp_iv_stats, C4p_iv_stats, C3p_iv_stats, monocot_iv_stats, dicot_iv_stats)
rownames(iv.stats) <- c("biomass", "Legume_proportion", "Forb_proportion","C4_proportion","C3_proportion", "monocot_prop", "dicot_prop")

stats.means <- rbind(biomass_stats_means, Lp_stats.means, Fp_stats.means, C4p_stats.means, C3p_stats.means, monocot_stats.means, dicot_stats.means)
rownames(stats.means) <- c("biomass", "Legume_proportion", "Forb_proportion","C4_proportion","C3_proportion", "monocot_prop", "dicot_prop")

stats.summary <- cbind(stats.means, iv.stats)


##############################################################################
##### Print out the results
##############################################################################
write.csv(stats.summary, file="~/Dropbox/NSF_NASA_DoB/DoB/R/PLSR.output/July2020/WoodRiver.2017/WoodRiver.2017.BioDIV.FG.model.performance.summary.csv")

write.csv(C4p_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.C4p.iv.csv")
write.csv(C3p_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.C3p.iv.csv")
write.csv(Lp_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.Lp.iv.csv")
write.csv(Fp_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.Fp.iv.csv")
write.csv(monocot_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.monocot.iv.csv")
write.csv(dicot_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.dicot.iv.csv")
write.csv(B_predict.iv, file="~/WoodRiver.2017/WoodRiver.2017.biomass.iv.csv")

write.csv(Lp_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.Lp.cv.csv")
write.csv(C4p_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.C4p.cv.csv")
write.csv(C3p_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.C3p.cv.csv")
write.csv(Fp_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.Fp.cv.csv")
write.csv(monocot_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.monocot.cv.csv")
write.csv(monocot_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.dicot.cv.csv")
write.csv(B_predict.cv, file="~/WoodRiver.2017/WoodRiver.2017.pred.biomass.cv.csv")
# 
write.csv(C4p.coef, file="~/WoodRiver.2017/WoodRiver.2017.C4p.coef.csv")
write.csv(C3p.coef, file="~/WoodRiver.2017/WoodRiver.2017.C3p.coef.csv")
write.csv(Fp.coef, file="~/WoodRiver.2017/WoodRiver.2017.Fp.coef.csv")
write.csv(Lp.coef, file="~/WoodRiver.2017/WoodRiver.2017.Lp.coef.csv")
write.csv(monocot.coef, file="~/WoodRiver.2017/WoodRiver.2017.monocot.coef.csv")
write.csv(dicot.coef, file="~/WoodRiver.2017/WoodRiver.2017.dicot.coef.csv")
write.csv(B.coef, file="~/WoodRiver.2017/WoodRiver.2017.biomass.coef.csv")