ls *R1_001.fastq | cut -f1–1 -d "" > samples
#!/bin/sh
for sample in $(cat samples)
do
echo "On sample: $sample" | cutadapt -a ^GTGCCAGCMGCCGCGGTAA...ATTAGAWACCCBDGTAGTCC -A ^GGACTACHVGGGTWTCTAAT...TTACCGCGGCKGCTGGCAC -m 215 -M 285 --discard-untrimmed -o ${sample}_R1_001_trimmed.fq.gz -p ${sample}_R2_001_trimmed.fq.gz ${sample}_R1_001.fastq ${sample}_R2_001.fastq >> cutadapt_primer_trimming_stats.txt 2>&1
done
paste samples <(grep «passing» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")») <(grep «filtered» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")»)
R version 4.0.1 (2020–06–06) – «See Things Now» library(«dada2») packageVersion(«dada2»)
[1] д1.16.0е
setwd(«xx») list.files() samples <- scan(«samples», what=«character») Read 7 items
forward_reads <- paste0(samples, «_R1_001_trimmed.fq.gz») reverse_reads <- paste0(samples, «_R2_001_trimmed.fq.gz») filtered_forward_reads <- paste0(samples, «_R1_filtered.fq.gz») filtered_reverse_reads <- paste0(samples, «_R2_filtered.fq.gz»)
filtered_out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE=c(2,2), rm.phix=TRUE, minLen=175, truncLen=c(250,200))
class(filtered_out) [1] «matrix» «array» dim(filtered_out) [1] 7 2 filtered_out
reads.in reads.out
err_forward_reads <- learnErrors(filtered_forward_reads, multithread = TRUE) 169014500 total bases in 676058 reads from 2 samples will be used for learning the error rates. err_reverse_reads <- learnErrors(filtered_reverse_reads, multithread = TRUE) 135211600 total bases in 676058 reads from 2 samples will be used for learning the error rates.
plotErrors(err_reverse_reads, nominalQ=TRUE) derep_forward <- derepFastq(filtered_forward_reads, verbose=TRUE) names(derep_forward) <- samples derep_reverse <- derepFastq(filtered_reverse_reads, verbose=TRUE) names(derep_reverse) <- samples
dada_forward <- dada(derep_forward, err=err_forward_reads, pool=«pseudo») dada_forward[[1]]
dada-class: object describing DADA2 denoising results 3881 sequence variants were inferred from 61370 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dada_reverse <- dada(derep_reverse, err=err_reverse_reads, pool=«pseudo») dada_reverse[[1]] 3806 sequence variants were inferred from 50109 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
merged_amplicons <- mergePairs(dada_forward, derep_forward, dada_reverse, derep_reverse, trimOverhang=TRUE, minOverlap=170, verbose=TRUE)
seqtab <- makeSequenceTable(merged_amplicons) seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=T)
Identified 631 bimeras out of 7806 input sequences.
sum(seqtab.nochim)/sum(seqtab) [1] 0.9578124
getN <- function(x) sum(getUniques(x))
summary_tab <- data.frame(row.names=samples, dada2_input=filtered_out[,1], filtered=filtered_out[,2], dada_f=sapply(dada_forward, getN), dada_r=sapply(dada_reverse, getN), merged=sapply(merged_amplicons, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))
final_perc_reads_retained
taxa <- assignTaxonomy(seqtab.nochim, «/project/geobiology/DADA2_databases/silva_nr_v138_train_set.fa.gz», tryRC=T)
taxa <- addSpecies(taxa, «/project/geobiology/DADA2_databases/silva_species_assignment_v138.fa.gz»)
asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)
for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }
setwd(«~/Desktop/PROJECTS/Corrosion Project/itag_16S/Second_Analyses/») library(«phyloseq») library(«vegan») library(«DESeq2») library(«ggplot2») library(«dendextend») library(«tidyr») library(«viridis») library(«reshape»)
phyloseq") д1.32.0е packageVersion(«vegan») д2.5.6е packageVersion(«DESeq2») д1.28.0е packageVersion(«ggplot2») д3.3.0е packageVersion(«dendextend») д1.13.4е packageVersion(«tidyr») д1.0.2е packageVersion(«viridis») д0.5.1е packageVersion(«reshape») д0.8.8е load(«P20_summer2020_reduced.RData»)
seep_dataframe <- read.csv(«seep_dataframe.csv», header = TRUE, sep=«,»)
ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(seep_dataframe))
table(tax_table(ps2)[, «Kingdom»], exclude = NULL)
Archaea Bacteria Eukaryota
asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)
for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }
asv_fasta <- c(rbind(asv_headers, asv_seqs)) write(asv_fasta, «ASVs.fa»)
asv_tab <- t(seqtab.nochim) row.names(asv_tab) <- sub(">", «», asv_headers) write.table(asv_tab, «ASVs_counts.tsv», sep=«\t», quote=F, col.names=NA)
asv_tax <- taxa row.names(asv_tax) <- sub(">", «», asv_headers) write.table(asv_tax, «ASVs_taxonomy.tsv», sep=«\t», quote=F, col.names=NA)
library(decontam) packageVersion(«decontam») [1] д1.8.0е
colnames(asv_tab) vector_for_decontam <- c(rep(FALSE, 3), rep(TRUE, 1), rep(FALSE, 3))
contam_df <- isContaminant(t(asv_tab), method=«prevalence», neg=vector_for_decontam, threshold=0.50) table(contam_df$contaminant) FALSE TRUE 7171 4
contam_asvs <- row.names(contam_df[contam_df$contaminant == TRUE, ]) asv_tax[row.names(asv_tax) %in% contam_asvs, ]
Kingdom Phylum Class Order Family
contam_indices <- which(asv_fasta %in% paste0(">", contam_asvs)) dont_want <- sort(c(contam_indices, contam_indices + 1)) asv_fasta_no_contam <- asv_fasta[- dont_want] asv_tab_no_contam <- asv_tab[!row.names(asv_tab) %in% contam_asvs, ] asv_tax_no_contam <- asv_tax[!row.names(asv_tax) %in% contam_asvs, ]
write(asv_fasta_no_contam, «ASVs-no-contam.fa») write.table(asv_tab_no_contam, «ASVs_counts-no-contam.tsv», sep=«\t», quote=F, col.names=NA) write.table(asv_tax_no_contam, «ASVs_taxonomy-no-contam.tsv», sep=«\t», quote=F, col.names=NA)
count_tab <- read.table(«ASVs_counts-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)[ , -c(4)] tax_tab <- as.matrix(read.table(«ASVs_taxonomy-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)) sample_info_tab <- read.table(«seep_dataframe_no_controls.tsv», header=T, row.names=1, check.names=F, sep=«\t»)
sample_info_tab$Color <- as.character(sample_info_tab$Color)
deseq_counts <- DESeqDataSetFromMatrix(count_tab, colData = sample_info_tab, design = Location)
deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) vst_trans_count_tab <- assay(deseq_counts_vst) euc_dist <- dist(t(vst_trans_count_tab)) euc_clust <- hclust(euc_dist, method=«ward.D2») plot(euc_clust) euc_dend <- as.dendrogram(euc_clust, hang=0.1) dend_cols <- as.character(sample_info_tab$Color[order.dendrogram(euc_dend)]) labels_colors(euc_dend) <- dend_cols
plot(euc_dend, ylab=«VST Euc. dist.»)
vst_count_phy <- otu_table(vst_trans_count_tab, taxa_are_rows=T) sample_info_tab_phy <- sample_data(sample_info_tab) vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy)
vst_pcoa <- ordinate(vst_physeq, method=«MDS», distance=«euclidean») eigen_vals <- vst_pcoa$values$Eigenvalues
plot_ordination(vst_physeq, vst_pcoa, color=«Location») + geom_point(size=2) + labs(col=«Location») + geom_text(aes(label=rownames(sample_info_tab), hjust=0.3, vjust=-0.4)) + coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle(«PCoA») + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.position=«none»)
rarecurve(t(count_tab), step=100, col=sample_info_tab$Color, lwd=2, ylab=«ASVs», label=T)
abline(v=(min(rowSums(t(count_tab)))))
count_tab_phy <- otu_table(count_tab, taxa_are_rows=T) tax_tab_phy <- tax_table(tax_tab)
ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy)
plot_richness(ASV_physeq, color=«Location», measures=c(«Chao1», «Shannon»)) + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.title = element_blank())
plot_bar(ASV_physeq, fill=«Phylum») plot_bar(ASV_physeq, fill=«Class»)
phyla_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Phylum»))
phyla_tax_vec <- as.vector(tax_table(tax_glom(ASV_physeq, taxrank=«Phylum»))[,2])
rownames(phyla_counts_tab) <- as.vector(phyla_tax_vec)
unclassified_tax_counts <- colSums(count_tab) - colSums(phyla_counts_tab)
phyla_and_unidentified_counts_tab <- rbind(phyla_counts_tab, «Unclassified»=unclassified_tax_counts)
temp_major_taxa_counts_tab <- phyla_and_unidentified_counts_tab[!row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ]
class_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Class»))
class_tax_phy_tab <- tax_table(tax_glom(ASV_physeq, taxrank=«Class»))
phy_tmp_vec <- class_tax_phy_tab[,2] class_tmp_vec <- class_tax_phy_tab[,3] rows_tmp <- row.names(class_tax_phy_tab) class_tax_tab <- data.frame(«Phylum»=phy_tmp_vec, «Class»=class_tmp_vec, row.names = rows_tmp)
proteo_no_class_annotated_counts <- phyla_and_unidentified_counts_tab[row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ] - colSums(proteo_class_counts_tab)
major_taxa_counts_tab <- rbind(temp_major_taxa_counts_tab, proteo_class_counts_tab, «Unresolved_Proteobacteria»=proteo_no_class_annotated_counts)
identical(colSums(major_taxa_counts_tab), colSums(count_tab))
major_taxa_proportions_tab <- apply(major_taxa_counts_tab, 2, function(x) x/sum(x)*100)
proteo_classes_vec <- as.vector(class_tax_tab[class_tax_tab$Phylum == «Proteobacteria», «Class»])
phyla_counts_tab3 <- otu_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))
phyla_tax_vec3 <- as.vector(tax_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))[,4])
rownames(phyla_counts_tab3) <- as.vector(phyla_tax_vec3) class_tax_tab5 <- data.frame(phyla_counts_tab3)
major_taxa_proportions_tab3 <- apply(class_tax_tab5, 2, function(x) x/sum(x)*100)
temp_filt_major_taxa_proportions_tab5 <- data.frame(major_taxa_proportions_tab3[apply(major_taxa_proportions_tab3, 1, max) > 2.0, ])
dim(temp_filt_major_taxa_proportions_tab5) [1] 15 6
filtered_proportions5 <- colSums(major_taxa_proportions_tab3) - colSums(temp_filt_major_taxa_proportions_tab5)
filt_major_taxa_proportions_tab5 <- rbind(temp_filt_major_taxa_proportions_tab5, «Other»=filtered_proportions5)
filt_major_taxa_proportions_tab_for_plot5 <- filt_major_taxa_proportions_tab5
filt_major_taxa_proportions_tab_for_plot5$Major_Taxa <- row.names(filt_major_taxa_proportions_tab_for_plot5)
filt_major_taxa_proportions_tab_for_plot5.g <- gather(filt_major_taxa_proportions_tab_for_plot5, Sample, Proportion, -Major_Taxa)
sample_info_for_merge5<-data.frame(«Sample»=row.names(sample_info_tab), «Location»=sample_info_tab$Location, «Color»=sample_info_tab$Color, stringsAsFactors=F)
filt_major_taxa_proportions_tab_for_plot5.g4 <- merge(filt_major_taxa_proportions_tab_for_plot5.g, sample_info_for_merge5)
ggplot(filt_major_taxa_proportions_tab_for_plot5.g4, aes(x=Sample, y=Proportion, fill=Major_Taxa)) + geom_bar(stat=«identity», width=0.9) + scale_fill_brewer(palette=«Spectral») + theme_classic() + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x=«Sample», y=«% of 16S rRNA gene copies recovered», title=«All samples»)