Carbonate_iTag

Methane seep carbonate iTag Data

Make a list of filenames

ls *R1_001.fastq | cut -f1–1 -d "" > samples

Run bash script for cutadapt

#!/bin/sh

for sample in $(cat samples)
do
echo "On sample: $sample" | cutadapt -a ^GTGCCAGCMGCCGCGGTAA...ATTAGAWACCCBDGTAGTCC -A ^GGACTACHVGGGTWTCTAAT...TTACCGCGGCKGCTGGCAC -m 215 -M 285 --discard-untrimmed -o ${sample}_R1_001_trimmed.fq.gz -p ${sample}_R2_001_trimmed.fq.gz ${sample}_R1_001.fastq ${sample}_R2_001.fastq >> cutadapt_primer_trimming_stats.txt 2>&1

done

paste samples <(grep «passing» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")») <(grep «filtered» cutadapt_primer_trimming_stats.txt | cut -f3 -d «(" | tr -d ")»)

Bottom1 97.4% 82.4%
Bottom2 97.0% 82.0%
Bottom3 97.3% 82.3%
Control 86.5% 73.5%
Top1 97.3% 82.3%
Top2 97.4% 82.3%
Top3 97.3% 82.2%

R version 4.0.1 (2020–06–06) – «See Things Now» library(«dada2») packageVersion(«dada2»)

[1] д1.16.0е

setwd(«xx») list.files() samples <- scan(«samples», what=«character») Read 7 items

forward_reads <- paste0(samples, «_R1_001_trimmed.fq.gz») reverse_reads <- paste0(samples, «_R2_001_trimmed.fq.gz») filtered_forward_reads <- paste0(samples, «_R1_filtered.fq.gz») filtered_reverse_reads <- paste0(samples, «_R2_filtered.fq.gz»)

filtered_out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE=c(2,2), rm.phix=TRUE, minLen=175, truncLen=c(250,200))

class(filtered_out) [1] «matrix» «array» dim(filtered_out) [1] 7 2 filtered_out

reads.in reads.out

Bottom1_R1_001_trimmed.fq.gz 341361 294058
Bottom2_R1_001_trimmed.fq.gz 442551 382000
Bottom3_R1_001_trimmed.fq.gz 299111 259541
Control_R1_001_trimmed.fq.gz 1376 946
Top1_R1_001_trimmed.fq.gz 376948 322343
Top2_R1_001_trimmed.fq.gz 266800 232707
Top3_R1_001_trimmed.fq.gz 389678 344021

err_forward_reads <- learnErrors(filtered_forward_reads, multithread = TRUE) 169014500 total bases in 676058 reads from 2 samples will be used for learning the error rates. err_reverse_reads <- learnErrors(filtered_reverse_reads, multithread = TRUE) 135211600 total bases in 676058 reads from 2 samples will be used for learning the error rates.

plotErrors(err_reverse_reads, nominalQ=TRUE) derep_forward <- derepFastq(filtered_forward_reads, verbose=TRUE) names(derep_forward) <- samples derep_reverse <- derepFastq(filtered_reverse_reads, verbose=TRUE) names(derep_reverse) <- samples

dada_forward <- dada(derep_forward, err=err_forward_reads, pool=«pseudo») dada_forward[[1]]

dada-class: object describing DADA2 denoising results 3881 sequence variants were inferred from 61370 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

dada_reverse <- dada(derep_reverse, err=err_reverse_reads, pool=«pseudo») dada_reverse[[1]] 3806 sequence variants were inferred from 50109 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

merged_amplicons <- mergePairs(dada_forward, derep_forward, dada_reverse, derep_reverse, trimOverhang=TRUE, minOverlap=170, verbose=TRUE)

281501 paired-reads (in 3589 unique pairings) successfully merged out of 289843 (in 5281 pairings) input.
367662 paired-reads (in 3505 unique pairings) successfully merged out of 377294 (in 5762 pairings) input.
248426 paired-reads (in 3379 unique pairings) successfully merged out of 255002 (in 5270 pairings) input.
855 paired-reads (in 30 unique pairings) successfully merged out of 901 (in 49 pairings) input.
313371 paired-reads (in 2871 unique pairings) successfully merged out of 319689 (in 4721 pairings) input.
225517 paired-reads (in 2759 unique pairings) successfully merged out of 230512 (in 4010 pairings) input.
333119 paired-reads (in 2772 unique pairings) successfully merged out of 340885 (in 4645 pairings) input.

seqtab <- makeSequenceTable(merged_amplicons) seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=T)

Identified 631 bimeras out of 7806 input sequences.

sum(seqtab.nochim)/sum(seqtab) [1] 0.9578124

getN <- function(x) sum(getUniques(x))

summary_tab <- data.frame(row.names=samples, dada2_input=filtered_out[,1], filtered=filtered_out[,2], dada_f=sapply(dada_forward, getN), dada_r=sapply(dada_reverse, getN), merged=sapply(merged_amplicons, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))

dada2_input filtered dada_f dada_r merged nonchim
Bottom1 341361 294058 290823 291260 281501 272979
Bottom2 442551 382000 378412 378914 367662 349900
Bottom3 299111 259541 255938 256590 248426 235631
Control 1376 946 919 906 855 855
Top1 376948 322343 320362 320719 313371 300673
Top2 266800 232707 231144 231306 225517 218223
Top3 389678 344021 341626 342152 333119 317499

final_perc_reads_retained

Bottom1 80.0
Bottom2 79.1
Bottom3 78.8
Control 62.1
Top1 79.8
Top2 81.8
Top3 81.5

taxa <- assignTaxonomy(seqtab.nochim, «/project/geobiology/DADA2_databases/silva_nr_v138_train_set.fa.gz», tryRC=T)

taxa <- addSpecies(taxa, «/project/geobiology/DADA2_databases/silva_species_assignment_v138.fa.gz»)

asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }

setwd(«~/Desktop/PROJECTS/Corrosion Project/itag_16S/Second_Analyses/») library(«phyloseq») library(«vegan») library(«DESeq2») library(«ggplot2») library(«dendextend») library(«tidyr») library(«viridis») library(«reshape»)

phyloseq") д1.32.0е packageVersion(«vegan») д2.5.6е packageVersion(«DESeq2») д1.28.0е packageVersion(«ggplot2») д3.3.0е packageVersion(«dendextend») д1.13.4е packageVersion(«tidyr») д1.0.2е packageVersion(«viridis») д0.5.1е packageVersion(«reshape») д0.8.8е load(«P20_summer2020_reduced.RData»)

seep_dataframe <- read.csv(«seep_dataframe.csv», header = TRUE, sep=«,»)

ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(seep_dataframe))

table(tax_table(ps2)[, «Kingdom»], exclude = NULL)

Archaea Bacteria Eukaryota 425 6697 8 45

New count files from seqtab.nochim

asv_seqs <- colnames(seqtab.nochim) asv_headers <- vector(dim(seqtab.nochim)[2], mode=«character»)

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep=«_») }

asv_fasta <- c(rbind(asv_headers, asv_seqs)) write(asv_fasta, «ASVs.fa»)

asv_tab <- t(seqtab.nochim) row.names(asv_tab) <- sub(">", «», asv_headers) write.table(asv_tab, «ASVs_counts.tsv», sep=«\t», quote=F, col.names=NA)

asv_tax <- taxa row.names(asv_tax) <- sub(">", «», asv_headers) write.table(asv_tax, «ASVs_taxonomy.tsv», sep=«\t», quote=F, col.names=NA)

library(decontam) packageVersion(«decontam») [1] д1.8.0е

Negative controls in column number 3 need to count out positionally as below

colnames(asv_tab) vector_for_decontam <- c(rep(FALSE, 3), rep(TRUE, 1), rep(FALSE, 3))

contam_df <- isContaminant(t(asv_tab), method=«prevalence», neg=vector_for_decontam, threshold=0.50) table(contam_df$contaminant) FALSE TRUE 7171 4

Vector holding the identified contaminant IDs

contam_asvs <- row.names(contam_df[contam_df$contaminant == TRUE, ]) asv_tax[row.names(asv_tax) %in% contam_asvs, ]

Kingdom Phylum Class Order Family

ASV_782 «Bacteria» «Proteobacteria» «Gammaproteobacteria» «Enterobacterales» «Enterobacteriaceae»
ASV_1291 «Bacteria» «Firmicutes» «Bacilli» «Lactobacillales» «Streptococcaceae»
ASV_1362 «Bacteria» «Proteobacteria» «Gammaproteobacteria» «Pasteurellales» «Pasteurellaceae»
ASV_1636 «Bacteria» «Chloroflexi» «Dehalococcoidia» «S085» NA
Genus Species
ASV_782 «Escherichia/Shigella» NA
ASV_1291 «Streptococcus» NA
ASV_1362 «Haemophilus» NA
ASV_1636 NA NA

contam_indices <- which(asv_fasta %in% paste0(">", contam_asvs)) dont_want <- sort(c(contam_indices, contam_indices + 1)) asv_fasta_no_contam <- asv_fasta[- dont_want] asv_tab_no_contam <- asv_tab[!row.names(asv_tab) %in% contam_asvs, ] asv_tax_no_contam <- asv_tax[!row.names(asv_tax) %in% contam_asvs, ]

write(asv_fasta_no_contam, «ASVs-no-contam.fa») write.table(asv_tab_no_contam, «ASVs_counts-no-contam.tsv», sep=«\t», quote=F, col.names=NA) write.table(asv_tax_no_contam, «ASVs_taxonomy-no-contam.tsv», sep=«\t», quote=F, col.names=NA)

count_tab <- read.table(«ASVs_counts-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)[ , -c(4)] tax_tab <- as.matrix(read.table(«ASVs_taxonomy-no-contam.tsv», header=T, row.names=1, check.names=F, sep=«\t»)) sample_info_tab <- read.table(«seep_dataframe_no_controls.tsv», header=T, row.names=1, check.names=F, sep=«\t»)

sample_info_tab$Color <- as.character(sample_info_tab$Color)

deseq_counts <- DESeqDataSetFromMatrix(count_tab, colData = sample_info_tab, design = _Location)

deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) vst_trans_count_tab <- assay(deseq_counts_vst) euc_dist <- dist(t(vst_trans_count_tab)) euc_clust <- hclust(euc_dist, method=«ward.D2») plot(euc_clust) euc_dend <- as.dendrogram(euc_clust, hang=0.1) dend_cols <- as.character(sample_info_tab$Color[order.dendrogram(euc_dend)]) labels_colors(euc_dend) <- dend_cols

plot(euc_dend, ylab=«VST Euc. dist.»)

Phyloseq object with transformed table

vst_count_phy <- otu_table(vst_trans_count_tab, taxa_are_rows=T) sample_info_tab_phy <- sample_data(sample_info_tab) vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy)

PCoA

vst_pcoa <- ordinate(vst_physeq, method=«MDS», distance=«euclidean») eigen_vals <- vst_pcoa$values$Eigenvalues

plot_ordination(vst_physeq, vst_pcoa, color=«Location») + geom_point(size=2) + labs(col=«Location») + geom_text(aes(label=rownames(sample_info_tab), hjust=0.3, vjust=-0.4)) + coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle(«PCoA») + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.position=«none»)

Transforms table for rarefaction curve

rarecurve(t(count_tab), step=100, col=sample_info_tab$Color, lwd=2, ylab=«ASVs», label=T)

Adding a vertical line at the fewest seqs in any sample

abline(v=(min(rowSums(t(count_tab)))))

Richness and diversity estimates using un-transformed count table

count_tab_phy <- otu_table(count_tab, taxa_are_rows=T) tax_tab_phy <- tax_table(tax_tab)

ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy)

Plot_richness() function on our phyloseq object

plot_richness(ASV_physeq, color=«Location», measures=c(«Chao1», «Shannon»)) + scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Location)])) + theme(legend.title = element_blank())

Taxonomic Studies

plot_bar(ASV_physeq, fill=«Phylum») plot_bar(ASV_physeq, fill=«Class»)

Bar chart of 2% most abundant clades

A count table that has summed all ASVs that were in the same phylum

phyla_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Phylum»))

Making a vector of phyla names to set as row names

phyla_tax_vec <- as.vector(tax_table(tax_glom(ASV_physeq, taxrank=«Phylum»))[,2])

rownames(phyla_counts_tab) <- as.vector(phyla_tax_vec)

To account for sequences that weren’t assigned any taxonomy even at the phylum level

unclassified_tax_counts <- colSums(count_tab) - colSums(phyla_counts_tab)

phyla_and_unidentified_counts_tab <- rbind(phyla_counts_tab, «Unclassified»=unclassified_tax_counts)

Remove the Proteobacteria, so we can next add them back in broken down by class

temp_major_taxa_counts_tab <- phyla_and_unidentified_counts_tab[!row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ]

Count table broken down by class

class_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank=«Class»))

Table holds the phylum and class level info

class_tax_phy_tab <- tax_table(tax_glom(ASV_physeq, taxrank=«Class»))

phy_tmp_vec <- class_tax_phy_tab[,2] class_tmp_vec <- class_tax_phy_tab[,3] rows_tmp <- row.names(class_tax_phy_tab) class_tax_tab <- data.frame(«Phylum»=phy_tmp_vec, «Class»=class_tmp_vec, row.names = rows_tmp)

proteo_no_class_annotated_counts <- phyla_and_unidentified_counts_tab[row.names(phyla_and_unidentified_counts_tab) %in% «Proteobacteria», ] - colSums(proteo_class_counts_tab)

Combining the tables

major_taxa_counts_tab <- rbind(temp_major_taxa_counts_tab, proteo_class_counts_tab, «Unresolved_Proteobacteria»=proteo_no_class_annotated_counts)

identical(colSums(major_taxa_counts_tab), colSums(count_tab))

Proportions table for summarizing

major_taxa_proportions_tab <- apply(major_taxa_counts_tab, 2, function(x) x/sum(x)*100)

Add in the Proteobacteria & plot

proteo_classes_vec <- as.vector(class_tax_tab[class_tax_tab$Phylum == «Proteobacteria», «Class»])

phyla_counts_tab3 <- otu_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))

phyla_tax_vec3 <- as.vector(tax_table(tax_glom(ASV_physeq_0NA, taxrank=«Order»))[,4])

rownames(phyla_counts_tab3) <- as.vector(phyla_tax_vec3) class_tax_tab5 <- data.frame(phyla_counts_tab3)

major_taxa_proportions_tab3 <- apply(class_tax_tab5, 2, function(x) x/sum(x)*100)

temp_filt_major_taxa_proportions_tab5 <- data.frame(major_taxa_proportions_tab3[apply(major_taxa_proportions_tab3, 1, max) > 2.0, ])

dim(temp_filt_major_taxa_proportions_tab5) [1] 15 6

filtered_proportions5 <- colSums(major_taxa_proportions_tab3) - colSums(temp_filt_major_taxa_proportions_tab5)

filt_major_taxa_proportions_tab5 <- rbind(temp_filt_major_taxa_proportions_tab5, «Other»=filtered_proportions5)

filt_major_taxa_proportions_tab_for_plot5 <- filt_major_taxa_proportions_tab5

filt_major_taxa_proportions_tab_for_plot5$Major_Taxa <- row.names(filt_major_taxa_proportions_tab_for_plot5)

filt_major_taxa_proportions_tab_for_plot5.g <- gather(filt_major_taxa_proportions_tab_for_plot5, Sample, Proportion, -Major_Taxa)

sample_info_for_merge5<-data.frame(«Sample»=row.names(sample_info_tab), «Location»=sample_info_tab$Location, «Color»=sample_info_tab$Color, stringsAsFactors=F)

filt_major_taxa_proportions_tab_for_plot5.g4 <- merge(filt_major_taxa_proportions_tab_for_plot5.g, sample_info_for_merge5)

ggplot(filt_major_taxa_proportions_tab_for_plot5.g4, aes(x=Sample, y=Proportion, fill=Major_Taxa)) + geom_bar(stat=«identity», width=0.9) + scale_fill_brewer(palette=«Spectral») + theme_classic() + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x=«Sample», y=«% of 16S rRNA gene copies recovered», title=«All samples»)