# Clear up the analysis environment
rm(list=ls())
setwd('/Users/aherman/rashidi_work')
library('glmnet')

# Read in the matrix and arcsin sqrt transform
mat <- read.table('transpose_mphlan2_otu_table.txt', header = T, sep = '\t')
mat <- as.matrix(mat)
mat <- asin(sqrt(mat))

# list the files to iterate over
files <- list.files('/Users/aherman/rashidi_work', pattern = '*dbgap.txt')

#[1] "rs4415345_alt_counts_dbgap.txt" "rs4415345_ref_counts_dbgap.txt" "rs4610776_alt_counts_dbgap.txt"
#[4] "rs4610776_ref_counts_dbgap.txt"

# Do one LASSO each and plot the AUCs for different lambdas
par(mfrow=c(2,2))
emp_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1)), gsub('_counts_dbgap.txt','', files))
pdf('auc_vs_lambas.pdf')
for(file in files){
	gtypes <- as.numeric(scan(file))
	print(sum(gtypes))
	name <- gsub('_counts_dbgap.txt','', file)
	cv.glmmod <- cv.glmnet(mat, y=gtypes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8)
	plot(cv.glmmod)
	#lambda_1se <- cv.glmmod$lambda.1se
	emp_cvms[name] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)]	
}
dev.off()

# Build the null distribution of AUC values and write to file
p_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1000)), gsub('_counts_dbgap.txt','', files))
for(file in files){
	name <- gsub('_counts_dbgap.txt','', file)
	gtypes <- as.numeric(scan(file))
	vect <- c()
	for(i in seq(1,1000)){
		permutes <- sample(gtypes, size = length(gtypes))
		cv.glmmod <- cv.glmnet(mat, y=permutes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8)
		vect[i] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)]
		}
	p_cvms[name] <- vect
}
write.table(p_cvms, file = 'permutation_min_cvms.txt', quote = F, sep = '\t', row.names = F)


# Build the repeated empirical distribution of AUCs
e_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1000)), gsub('_counts_dbgap.txt','', files))
for(file in files){
	name <- gsub('_counts_dbgap.txt','', file)
	gtypes <- as.numeric(scan(file))
	vect <- c()
	for(i in seq(1,1000)){
		cv.glmmod <- cv.glmnet(mat, y=gtypes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8)
		vect[i] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)]
		}
	e_cvms[name] <- vect
}

write.table(e_cvms, file = 'empirical_multirun_min_cvms.txt', quote = F, sep = '\t', row.names = F)


# read in the files generated if done in a different session
p_cvms <- read.table('permutation_min_cvms.txt', header = T, sep = '\t')
e_cvms <- read.table('empirical_multirun_min_cvms.txt', header = T, sep = '\t')

# Do some plotting to show either overlap of AUC distributions or plot of null distribution with vertical line of empirical mean.
par(mfrow=c(2,2))
bins = seq(0,1,0.01)
for(i in seq(1,length(names(p_cvms)))){
	#######  For plotting the full distributions
	#hist(e_cvms[,i],breaks = bins, col = rgb(1,0,0,0.5), main = '', xlab = 'Maximum AUC', ylim = c(0,100))
	#hist(p_cvms[,i],breaks = bins, add = T, col = rgb(0,0,1,0.5), ylim = c(0,100))
	#######
	hist(p_cvms[,i],breaks = bins, main = '', xlab = 'Maximum AUC', ylim = c(0, 60)) #main = gsub('_',' ',names(p_cvms)[i]),
	abline(v = quantile(p_cvms[,i], 0.025), lty = 3)
	abline(v = quantile(p_cvms[,i], 0.975), lty = 3)
	abline(v = mean(e_cvms[,i]), col = 'red')
	#legend('left', col = c(rgb(1,0,0,0.5),rgb(0,0,1,0.5)), pch = 15,  legend = c('Un-permuted', 'Permuted'), bty = 'n')
	#legend('left', col = c('black','red'), lty = c(3,1),  legend = c('Significance cut-off', 'Empirical mean'), bty = 'n')
}
dev.copy2pdf(file="auc_dists_vertline.pdf")