# Clear up the analysis environment rm(list=ls()) setwd('/Users/aherman/rashidi_work') library('glmnet') # Read in the matrix and arcsin sqrt transform mat <- read.table('transpose_mphlan2_otu_table.txt', header = T, sep = '\t') mat <- as.matrix(mat) mat <- asin(sqrt(mat)) # list the files to iterate over files <- list.files('/Users/aherman/rashidi_work', pattern = '*dbgap.txt') #[1] "rs4415345_alt_counts_dbgap.txt" "rs4415345_ref_counts_dbgap.txt" "rs4610776_alt_counts_dbgap.txt" #[4] "rs4610776_ref_counts_dbgap.txt" # Do one LASSO each and plot the AUCs for different lambdas par(mfrow=c(2,2)) emp_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1)), gsub('_counts_dbgap.txt','', files)) pdf('auc_vs_lambas.pdf') for(file in files){ gtypes <- as.numeric(scan(file)) print(sum(gtypes)) name <- gsub('_counts_dbgap.txt','', file) cv.glmmod <- cv.glmnet(mat, y=gtypes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8) plot(cv.glmmod) #lambda_1se <- cv.glmmod$lambda.1se emp_cvms[name] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)] } dev.off() # Build the null distribution of AUC values and write to file p_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1000)), gsub('_counts_dbgap.txt','', files)) for(file in files){ name <- gsub('_counts_dbgap.txt','', file) gtypes <- as.numeric(scan(file)) vect <- c() for(i in seq(1,1000)){ permutes <- sample(gtypes, size = length(gtypes)) cv.glmmod <- cv.glmnet(mat, y=permutes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8) vect[i] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)] } p_cvms[name] <- vect } write.table(p_cvms, file = 'permutation_min_cvms.txt', quote = F, sep = '\t', row.names = F) # Build the repeated empirical distribution of AUCs e_cvms <- setNames(data.frame(matrix(ncol = 4, nrow = 1000)), gsub('_counts_dbgap.txt','', files)) for(file in files){ name <- gsub('_counts_dbgap.txt','', file) gtypes <- as.numeric(scan(file)) vect <- c() for(i in seq(1,1000)){ cv.glmmod <- cv.glmnet(mat, y=gtypes, alpha = 1, family = 'binomial', type.measure = 'auc', nfolds = 8) vect[i] <- cv.glmmod$cvm[match(cv.glmmod$lambda.min, cv.glmmod$lambda)] } e_cvms[name] <- vect } write.table(e_cvms, file = 'empirical_multirun_min_cvms.txt', quote = F, sep = '\t', row.names = F) # read in the files generated if done in a different session p_cvms <- read.table('permutation_min_cvms.txt', header = T, sep = '\t') e_cvms <- read.table('empirical_multirun_min_cvms.txt', header = T, sep = '\t') # Do some plotting to show either overlap of AUC distributions or plot of null distribution with vertical line of empirical mean. par(mfrow=c(2,2)) bins = seq(0,1,0.01) for(i in seq(1,length(names(p_cvms)))){ ####### For plotting the full distributions #hist(e_cvms[,i],breaks = bins, col = rgb(1,0,0,0.5), main = '', xlab = 'Maximum AUC', ylim = c(0,100)) #hist(p_cvms[,i],breaks = bins, add = T, col = rgb(0,0,1,0.5), ylim = c(0,100)) ####### hist(p_cvms[,i],breaks = bins, main = '', xlab = 'Maximum AUC', ylim = c(0, 60)) #main = gsub('_',' ',names(p_cvms)[i]), abline(v = quantile(p_cvms[,i], 0.025), lty = 3) abline(v = quantile(p_cvms[,i], 0.975), lty = 3) abline(v = mean(e_cvms[,i]), col = 'red') #legend('left', col = c(rgb(1,0,0,0.5),rgb(0,0,1,0.5)), pch = 15, legend = c('Un-permuted', 'Permuted'), bty = 'n') #legend('left', col = c('black','red'), lty = c(3,1), legend = c('Significance cut-off', 'Empirical mean'), bty = 'n') } dev.copy2pdf(file="auc_dists_vertline.pdf")