############################################################################################################
#################################################################################
####################################################################################################################

# Set seed and load packages
set.seed(1234567)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(reshape)
library(caret)
library(caretEnsemble)
library(caTools)
library(data.table)
library(xgboost)
library(randomForest)
library(e1071)
library(naivebayes)


# Read data
##
# df = forest_polygons_data_2019 

str(df)

# Remove id and X columns since they are not useful
df$COMPT <- NULL


df <- df[, order(names(df))]
df <- df[, c(1:14, 16:32, 15)]

#df <- df[, c(1:6, 8:19, 7)]
head(df)

# Show summary of data
summary(df)


###########################################################################################
#Split data into train set and test set
# Split data into train set and test set
index <- sample(1:nrow(df), 0.7 * nrow(df))
train <- df[index,]
test <- df[-index,]


# This makes sure that malignant (loss) is positive and benign (gain) is negative
df$MORTALITY <- factor(df$MORTALITY, levels = c("LOSS", "GAIN"))
train$MORTALITY <- factor(train$MORTALITY, levels = c("LOSS", "GAIN"))
test$MORTALITY <- factor(test$MORTALITY, levels = c("LOSS", "GAIN"))


econtrol <- trainControl(method="cv", number=10, summaryFunction = twoClassSummary,savePredictions = TRUE, classProbs = TRUE)


models <- caretList(MORTALITY ~., data=train,
                    methodList=c("xgbTree", "rf", "naive_bayes"),
                    preProcess = c("center", "scale", "nzv","corr", "pca"),
                    trControl = econtrol,
                    metric = "ROC"
)

#Caret rf method calls for the randomForest function from randomForest package.
#the warnings arise as caret's one general function 'train' (or other function it calls) feeds higher value to 'mtry'agrument than the number of predictor variables. 
# But as the model runs a little more, randomForest adjust value of 'mtry' to a lower valid value. The model runs and outcome is valid.


results <- resamples(models)
summary(results)


#Stacked model using boosted tree model on meta-models
stack <- caretStack(models, method="xgbTree", metric="Sens", verbose = FALSE,
                    trControl = trainControl(method="boot", number=15, savePredictions="final",
                                             classProbs=TRUE, summaryFunction=twoClassSummary)
)



test$xgbTree <- predict(models$xgbTree, test)
test$rf <- predict(models$rf, test)
test$naive_bayes <- predict(models$naive_bayes, test)

pred <- predict(stack, test, type="prob")

#pred <- predict(stack, test0, type="prob")


threshold <- 0.5
test$stack <- ifelse(pred>threshold, "LOSS", "GAIN")
test$stack <- factor(test$stack, levels = c("LOSS", "GAIN"))


cmstackpreds <- confusionMatrix(test$xgbTree, test$MORTALITY, positive="LOSS", mode="everything")
cmstackpreds

cmstackpreds <- confusionMatrix(test$rf, test$MORTALITY, positive="LOSS", mode="everything")
cmstackpreds


cmstackpreds <- confusionMatrix(test$naive_bayes, test$MORTALITY, positive="LOSS", mode="everything")
cmstackpreds

## STACK model results

cmstackpreds <- confusionMatrix(test$stack, test$MORTALITY, positive="LOSS", mode="everything")
cmstackpreds

###################################################################################################################################################################

###########################################################################################################################################################3
## PREDICTING Tree cover loss in plantations (n=2147)
#######################################################################################################################
#test0 = Test_data_2147plantations_2019  

head(test0)
test0$FID <- NULL
test0<- test0[, order(names(test0))]

test0$AvailableSWC<-as.numeric(test0$AvailableSWC)
test0$TopSoil_BulkDen<-as.numeric(test0$TopSoil_BulkDen)
test0$TopSoil_PH<-as.numeric(test0$TopSoil_PH)
test0$Number_of_Smallholdings<-as.numeric(test0$Number_of_Smallholdings)
head(test0)


str(test0)

# Show summary of data
summary(test0)

pred_plantation_data <- predict(stack, test0, type="prob")
p<-pred_plantation_data

df <- data.frame(plantationID=1:2147)
#plantationID: ID for the plantations for which tree cover loss predictions are made

df1 <- data.frame(predProbTreeCoverLoss=p)

write.table(c(df,df1),file="pred_plantation_data.csv",col.names = c("plantationID","predProbTreeCoverLoss"),sep = ",",row.names = F)


####################################################################################################################