############################################################################################################ ################################################################################# #################################################################################################################### # Set seed and load packages set.seed(1234567) library(ggplot2) library(gridExtra) library(corrplot) library(reshape) library(caret) library(caretEnsemble) library(caTools) library(data.table) library(xgboost) library(randomForest) library(e1071) library(naivebayes) # Read data ## # df = forest_polygons_data_2019 str(df) # Remove id and X columns since they are not useful df$COMPT <- NULL df <- df[, order(names(df))] df <- df[, c(1:14, 16:32, 15)] #df <- df[, c(1:6, 8:19, 7)] head(df) # Show summary of data summary(df) ########################################################################################### #Split data into train set and test set # Split data into train set and test set index <- sample(1:nrow(df), 0.7 * nrow(df)) train <- df[index,] test <- df[-index,] # This makes sure that malignant (loss) is positive and benign (gain) is negative df$MORTALITY <- factor(df$MORTALITY, levels = c("LOSS", "GAIN")) train$MORTALITY <- factor(train$MORTALITY, levels = c("LOSS", "GAIN")) test$MORTALITY <- factor(test$MORTALITY, levels = c("LOSS", "GAIN")) econtrol <- trainControl(method="cv", number=10, summaryFunction = twoClassSummary,savePredictions = TRUE, classProbs = TRUE) models <- caretList(MORTALITY ~., data=train, methodList=c("xgbTree", "rf", "naive_bayes"), preProcess = c("center", "scale", "nzv","corr", "pca"), trControl = econtrol, metric = "ROC" ) #Caret rf method calls for the randomForest function from randomForest package. #the warnings arise as caret's one general function 'train' (or other function it calls) feeds higher value to 'mtry'agrument than the number of predictor variables. # But as the model runs a little more, randomForest adjust value of 'mtry' to a lower valid value. The model runs and outcome is valid. results <- resamples(models) summary(results) #Stacked model using boosted tree model on meta-models stack <- caretStack(models, method="xgbTree", metric="Sens", verbose = FALSE, trControl = trainControl(method="boot", number=15, savePredictions="final", classProbs=TRUE, summaryFunction=twoClassSummary) ) test$xgbTree <- predict(models$xgbTree, test) test$rf <- predict(models$rf, test) test$naive_bayes <- predict(models$naive_bayes, test) pred <- predict(stack, test, type="prob") #pred <- predict(stack, test0, type="prob") threshold <- 0.5 test$stack <- ifelse(pred>threshold, "LOSS", "GAIN") test$stack <- factor(test$stack, levels = c("LOSS", "GAIN")) cmstackpreds <- confusionMatrix(test$xgbTree, test$MORTALITY, positive="LOSS", mode="everything") cmstackpreds cmstackpreds <- confusionMatrix(test$rf, test$MORTALITY, positive="LOSS", mode="everything") cmstackpreds cmstackpreds <- confusionMatrix(test$naive_bayes, test$MORTALITY, positive="LOSS", mode="everything") cmstackpreds ## STACK model results cmstackpreds <- confusionMatrix(test$stack, test$MORTALITY, positive="LOSS", mode="everything") cmstackpreds ################################################################################################################################################################### ###########################################################################################################################################################3 ## PREDICTING Tree cover loss in plantations (n=2147) ####################################################################################################################### #test0 = Test_data_2147plantations_2019 head(test0) test0$FID <- NULL test0<- test0[, order(names(test0))] test0$AvailableSWC<-as.numeric(test0$AvailableSWC) test0$TopSoil_BulkDen<-as.numeric(test0$TopSoil_BulkDen) test0$TopSoil_PH<-as.numeric(test0$TopSoil_PH) test0$Number_of_Smallholdings<-as.numeric(test0$Number_of_Smallholdings) head(test0) str(test0) # Show summary of data summary(test0) pred_plantation_data <- predict(stack, test0, type="prob") p<-pred_plantation_data df <- data.frame(plantationID=1:2147) #plantationID: ID for the plantations for which tree cover loss predictions are made df1 <- data.frame(predProbTreeCoverLoss=p) write.table(c(df,df1),file="pred_plantation_data.csv",col.names = c("plantationID","predProbTreeCoverLoss"),sep = ",",row.names = F) ####################################################################################################################