Methane seep carbonate iTag Data

Make a list of filenames

ls *R1_001.fastq | cut -f1–1 -d "" > samples

Run bash script for cutadapt

#!/bin/sh

for sample in $(cat samples)
do
echo "On sample: $sample" | cutadapt -a ^GTGCCAGCMGCCGCGGTAA...ATTAGAWACCCBDGTAGTCC -A ^GGACTACHVGGGTWTCTAAT...TTACCGCGGCKGCTGGCAC -m 215 -M 285 --discard-untrimmed -o ${sample}_R1_001_trimmed.fq.gz -p ${sample}_R2_001_trimmed.fq.gz ${sample}_R1_001.fastq ${sample}_R2_001.fastq >> cutadapt_primer_trimming_stats.txt 2>&1

done

paste samples <(grep «passing» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")») <(grep «filtered» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")»)

R version 4.0.1 (2020–06–06) – «See Things Now» library(«dada2») packageVersion(«dada2»)

[1] д1.16.0е

setwd(«xx») list.files() samples <- scan(«samples», what=«character») Read 7 items

forward_reads <- paste0(samples, «_R1_001_trimmed.fq.gz») reverse_reads <- paste0(samples, «_R2_001_trimmed.fq.gz») filtered_forward_reads <- paste0(samples, «_R1_filtered.fq.gz») filtered_reverse_reads <- paste0(samples, «_R2_filtered.fq.gz»)

filtered_out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE=c(2,2), rm.phix=TRUE, minLen=175, truncLen=c(250,200))

class(filtered_out) [1] «matrix» «array» dim(filtered_out) [1] 7 2 filtered_out

reads.in reads.out

err_forward_reads <- learnErrors(filtered_forward_reads, multithread = TRUE) 169014500 total bases in 676058 reads from 2 samples will be used for learning the error rates. err_reverse_reads <- learnErrors(filtered_reverse_reads, multithread = TRUE) 135211600 total bases in 676058 reads from 2 samples will be used for learning the error rates.

plotErrors(err_reverse_reads, nominalQ=TRUE) derep_forward <- derepFastq(filtered_forward_reads, verbose=TRUE) names(derep_forward) <- samples derep_reverse <- derepFastq(filtered_reverse_reads, verbose=TRUE) names(derep_reverse) <- samples

dada_forward <- dada(derep_forward, err=err_forward_reads, pool=«pseudo») dada_forward[[1]]

dada-class: object describing DADA2 denoising results 3881 sequence variants were inferred from 61370 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

dada_reverse <- dada(derep_reverse, err=err_reverse_reads, pool=«pseudo») dada_reverse[[1]] 3806 sequence variants were inferred from 50109 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

merged_amplicons <- mergePairs(dada_forward, derep_forward, dada_reverse, derep_reverse, trimOverhang=TRUE, minOverlap=170, verbose=TRUE)

seqtab <- makeSequenceTable(merged_amplicons) seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=T)

Identified 631 bimeras out of 7806 input sequences.

sum(seqtab.nochim)/sum(seqtab) [1] 0.9578124

getN <- function(x) sum(getUniques(x))

summary_tab <- data.frame(row.names=samples, dada2_input=filtered_out[,1], filtered=filtered_out[,2], dada_f=sapply(dada_forward, getN), dada_r=sapply(dada_reverse, getN), merged=sapply(merged_amplicons, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))

final_perc_reads_retained

taxa <- assignTaxonomy(seqtab.nochim, «/project/geobiology/DADA2_databases/silva_nr_v138_train_set.fa.gz», tryRC=T)

taxa <- addSpecies(taxa, «/project/geobiology/DADA2_databases/silva_species_assignment_v138.fa.gz»)

asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }

setwd(«~/Desktop/PROJECTS/Corrosion Project/itag_16S/Second_Analyses/») library(«phyloseq») library(«vegan») library(«DESeq2») library(«ggplot2») library(«dendextend») library(«tidyr») library(«viridis») library(«reshape»)

phyloseq") д1.32.0е packageVersion(«vegan») д2.5.6е packageVersion(«DESeq2») д1.28.0е packageVersion(«ggplot2») д3.3.0е packageVersion(«dendextend») д1.13.4е packageVersion(«tidyr») д1.0.2е packageVersion(«viridis») д0.5.1е packageVersion(«reshape») д0.8.8е load(«P20_summer2020_reduced.RData»)

seep_dataframe <- read.csv(«seep_dataframe.csv», header = TRUE, sep=«,»)

ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(seep_dataframe))

table(tax_table(ps2)[, «Kingdom»], exclude = NULL)

Archaea Bacteria Eukaryota 425 6697 8 45

New count files from seqtab.nochim

asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }

asv_fasta <- c(rbind(asv_headers, asv_seqs)) write(asv_fasta, «ASVs.fa»)

asv_tab <- t(seqtab.nochim) row.names(asv_tab) <- sub(">", «», asv_headers) write.table(asv_tab, «ASVs_counts.tsv», sep=«\t», quote=F, col.names=NA)

asv_tax <- taxa row.names(asv_tax) <- sub(">", «», asv_headers) write.table(asv_tax, «ASVs_taxonomy.tsv», sep=«\t», quote=F, col.names=NA)

library(decontam) packageVersion(«decontam») [1] д1.8.0е

Negative controls in column number 3 need to count out positionally as below

colnames(asv_tab) vector_for_decontam <- c(rep(FALSE, 3), rep(TRUE, 1), rep(FALSE, 3))

contam_df <- isContaminant(t(asv_tab), method=«prevalence», neg=vector_for_decontam, threshold=0.50) table(contam_df$contaminant) FALSE TRUE 7171 4

Vector holding the identified contaminant IDs

contam_asvs <- row.names(contam_df[contam_df$contaminant == TRUE, ]) asv_tax[row.names(asv_tax) %in% contam_asvs, ]

Kingdom Phylum Class Order Family

contam_indices <- which(asv_fasta %in% paste0(">", contam_asvs)) dont_want <- sort(c(contam_indices, contam_indices + 1)) asv_fasta_no_contam <- asv_fasta[- dont_want] asv_tab_no_contam <- asv_tab[!row.names(asv_tab) %in% contam_asvs, ] asv_tax_no_contam <- asv_tax[!row.names(asv_tax) %in% contam_asvs, ]

write(asv_fasta_no_contam, «ASVs-no-contam.fa») write.table(asv_tab_no_contam, «ASVs_counts-no-contam.tsv», sep=«\t», quote=F, col.names=NA) write.table(asv_tax_no_contam, «ASVs_taxonomy-no-contam.tsv», sep=«\t», quote=F, col.names=NA)

count_tab <- read.table(«ASVs_counts-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)[ , -c(4)] tax_tab <- as.matrix(read.table(«ASVs_taxonomy-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)) sample_info_tab <- read.table(«seep_dataframe_no_controls.tsv», header=T, row.names=1, check.names=F, sep=«\t»)

sample_info_tab$Color <- as.character(sample_info_tab$Color)

deseq_counts <- DESeqDataSetFromMatrix(count_tab, colData = sample_info_tab, design = Location)

deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) vst_trans_count_tab <- assay(deseq_counts_vst) euc_dist <- dist(t(vst_trans_count_tab)) euc_clust <- hclust(euc_dist, method=«ward.D2») plot(euc_clust) euc_dend <- as.dendrogram(euc_clust, hang=0.1) dend_cols <- as.character(sample_info_tab$Color[order.dendrogram(euc_dend)]) labels_colors(euc_dend) <- dend_cols

plot(euc_dend, ylab=«VST Euc. dist.»)

Phyloseq object with transformed table

vst_count_phy <- otu_table(vst_trans_count_tab, taxa_are_rows=T) sample_info_tab_phy <- sample_data(sample_info_tab) vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy)

PCoA

vst_pcoa <- ordinate(vst_physeq, method=«MDS», distance=«euclidean») eigen_vals <- vst_pcoa$values$Eigenvalues

plot_ordination(vst_physeq, vst_pcoa, color=«Location») + geom_point(size=2) + labs(col=«Location») + geom_text(aes(label=rownames(sample_info_tab), hjust=0.3, vjust=-0.4)) + coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle(«PCoA») + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.position=«none»)

Transforms table for rarefaction curve

rarecurve(t(count_tab), step=100, col=sample_info_tab$Color, lwd=2, ylab=«ASVs», label=T)

Adding a vertical line at the fewest seqs in any sample

abline(v=(min(rowSums(t(count_tab)))))

Richness and diversity estimates using un-transformed count table

count_tab_phy <- otu_table(count_tab, taxa_are_rows=T) tax_tab_phy <- tax_table(tax_tab)

ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy)

Plot_richness() function on our phyloseq object

plot_richness(ASV_physeq, color=«Location», measures=c(«Chao1», «Shannon»)) + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.title = element_blank())

Taxonomic Studies

plot_bar(ASV_physeq, fill=«Phylum») plot_bar(ASV_physeq, fill=«Class»)

Bar chart of 2% most abundant clades

A count table that has summed all ASVs that were in the same phylum

phyla_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Phylum»))

Making a vector of phyla names to set as row names

phyla_tax_vec <- as.vector(tax_table(tax_glom(ASV_physeq, taxrank=«Phylum»))[,2])

rownames(phyla_counts_tab) <- as.vector(phyla_tax_vec)

To account for sequences that weren’t assigned any taxonomy even at the phylum level

unclassified_tax_counts <- colSums(count_tab) - colSums(phyla_counts_tab)

phyla_and_unidentified_counts_tab <- rbind(phyla_counts_tab, «Unclassified»=unclassified_tax_counts)

Remove the Proteobacteria, so we can next add them back in broken down by class

temp_major_taxa_counts_tab <- phyla_and_unidentified_counts_tab[!row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ]

Count table broken down by class

class_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Class»))

Table holds the phylum and class level info

class_tax_phy_tab <- tax_table(tax_glom(ASV_physeq, taxrank=«Class»))

phy_tmp_vec <- class_tax_phy_tab[,2] class_tmp_vec <- class_tax_phy_tab[,3] rows_tmp <- row.names(class_tax_phy_tab) class_tax_tab <- data.frame(«Phylum»=phy_tmp_vec, «Class»=class_tmp_vec, row.names = rows_tmp)

proteo_no_class_annotated_counts <- phyla_and_unidentified_counts_tab[row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ] - colSums(proteo_class_counts_tab)

Combining the tables

major_taxa_counts_tab <- rbind(temp_major_taxa_counts_tab, proteo_class_counts_tab, «Unresolved_Proteobacteria»=proteo_no_class_annotated_counts)

identical(colSums(major_taxa_counts_tab), colSums(count_tab))

Proportions table for summarizing

major_taxa_proportions_tab <- apply(major_taxa_counts_tab, 2, function(x) x/sum(x)*100)

Add in the Proteobacteria & plot

proteo_classes_vec <- as.vector(class_tax_tab[class_tax_tab$Phylum == «Proteobacteria», «Class»])

phyla_counts_tab3 <- otu_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))

phyla_tax_vec3 <- as.vector(tax_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))[,4])

rownames(phyla_counts_tab3) <- as.vector(phyla_tax_vec3) class_tax_tab5 <- data.frame(phyla_counts_tab3)

major_taxa_proportions_tab3 <- apply(class_tax_tab5, 2, function(x) x/sum(x)*100)

temp_filt_major_taxa_proportions_tab5 <- data.frame(major_taxa_proportions_tab3[apply(major_taxa_proportions_tab3, 1, max) > 2.0, ])

dim(temp_filt_major_taxa_proportions_tab5) [1] 15 6

filtered_proportions5 <- colSums(major_taxa_proportions_tab3) - colSums(temp_filt_major_taxa_proportions_tab5)

filt_major_taxa_proportions_tab5 <- rbind(temp_filt_major_taxa_proportions_tab5, «Other»=filtered_proportions5)

filt_major_taxa_proportions_tab_for_plot5 <- filt_major_taxa_proportions_tab5

filt_major_taxa_proportions_tab_for_plot5$Major_Taxa <- row.names(filt_major_taxa_proportions_tab_for_plot5)

filt_major_taxa_proportions_tab_for_plot5.g <- gather(filt_major_taxa_proportions_tab_for_plot5, Sample, Proportion, -Major_Taxa)

sample_info_for_merge5<-data.frame(«Sample»=row.names(sample_info_tab), «Location»=sample_info_tab$Location, «Color»=sample_info_tab$Color, stringsAsFactors=F)

filt_major_taxa_proportions_tab_for_plot5.g4 <- merge(filt_major_taxa_proportions_tab_for_plot5.g, sample_info_for_merge5)

ggplot(filt_major_taxa_proportions_tab_for_plot5.g4, aes(x=Sample, y=Proportion, fill=Major_Taxa)) + geom_bar(stat=«identity», width=0.9) + scale_fill_brewer(palette=«Spectral») + theme_classic() + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x=«Sample», y=«% of 16S rRNA gene copies recovered», title=«All samples»)