P18_processing_4_publication

Processing iTag reads for Namibian Beggiatoceae and host sediments

Simplify file names & make list file

ls  *_R1_001.fastq | cut -f1-2 -d "_" > samples

Run bash script for cutadapt to remove primers

!/bin/sh

for sample in $(cat samples)_
do
echo "On sample: $sample" | cutadapt -a ^GTGCCAGCMGCCGCGGTAA...ATTAGAWACCCBDGTAGTCC -A ^GGACTACHVGGGTWTCTAAT...TTACCGCGGCKGCTGGCAC -m 215 -M 285 --discard-untrimmed -o ${sample}_R1_001_trimmed.fq.gz -p ${sample}_R2_001_trimmed.fq.gz ${sample}_R1_001.fastq ${sample}_R2_001.fastq >> cutadapt_primer_trimming_stats.txt 2>&1
done

paste samples <(grep "passing" cutadapt_primer_trimming_stats.txt | cut -f3 -d "(" | tr -d ")") <(grep "filtered" cutadapt_primer_trimming_stats.txt | cut -f3 -d "(" | tr -d ")")

Cutadapt results

EMPTYE06_S38 91.4% 80.3%
EMPTYH07_S62 82.2% 71.5%
JNN162D45A_S22 96.2% 83.8%
JNN171B_S8 91.9% 79.4%
JNN171D01A_S2 97.2% 84.0%
JNN171D01B_S10 97.1% 83.7%
JNN171D01C_S18 97.2% 84.0%
JNN171D12A_S1 97.4% 84.3%
JNN171D12B_S9 97.1% 83.9%
JNN171D23A_S17 97.4% 84.3%
JNN171D23B_S25 96.3% 84.0%
JNN171D34A_S33 96.7% 84.2%
JNN171D34B_S41 98.0% 85.0%
JNN171D45A_S49 97.5% 84.5%
JNN171D45B_S56 96.1% 83.4%
JNN171SSA1_S26 96.3% 83.7%
JNN171SSA2_S34 96.5% 83.9%
JNN171SSA3_S42 97.1% 84.0%
JNN171T_S46 96.0% 83.0%
JNN1722SO1T1_S54 95.3% 82.6%
JNN1722SO1T2_S61 98.0% 85.0%
JNN1722SO1T3_S7 95.2% 82.6%
JNN1722SO1T4_S15 95.0% 82.2%
JNN1722SO1T5_S23 96.3% 83.4%
JNN1722SO1T6_S31 95.8% 83.7%
JNN172D01A_S13 96.1% 83.0%
JNN172D01B_S21 96.4% 83.4%
JNN172D01C_S29 93.4% 81.8%
JNN172D12A_S37 96.2% 84.0%
JNN172D12B_S45 97.1% 84.2%
JNN172D23A_S53 97.0% 84.1%
JNN172D23B_S60 97.3% 84.3%
JNN172D34A_S6 96.9% 84.1%
JNN172D34B_S14 96.8% 83.8%
JNN172D45B_S30 95.9% 83.8%
JNN172SSA_S50 97.1% 84.2%
JNN172SSB_S57 97.2% 84.2%
JNN172SSC_S3 96.6% 83.8%
JNN1742SB1B08_S16 96.3% 83.3%
JNN1742SB1C08_S24 95.5% 82.7%
JNN1742SB1D08_S32 96.1% 83.9%
JNN174D01_S11 97.0% 84.0%
JNN174D12_S19 97.3% 84.2%
JNN174D23_S27 95.9% 83.5%
JNN174D34A_S35 97.0% 84.3%
JNN174D34B_S43 97.2% 84.1%
JNN174D45_S51 97.2% 84.3%
JNN174SS1A_S58 96.5% 83.5%
JNN174SS1B_S4 96.8% 83.7%
JNN174SS1S1_S12 96.9% 83.8%
JNN174SS1S6_S20 96.9% 84.0%
JNN174SS1S9_S28 90.2% 79.3%
JNN174SS1V_S36 96.0% 84.1%
JNN175RINSEW_S5 92.9% 80.8%
JNN175SED1_S44 98.0% 85.2%
JNN175SED2_S52 97.8% 84.8%
JNN175SED3_S59 97.3% 84.3%
JNN175ZBS_S40 94.9% 82.6%
JNN175ZBW_S48 39.3% 34.3%
JNN175ZSOA_S39 95.5% 83.4%
JNN175ZSOB_S47 95.7% 82.9%
JNN175ZSOC_S55 94.8% 82.0%

R version 3.6.3 (2020–02–29) DADA2 processing

> library("dada2")
> packageVersion("dada2")

‘1.16.0’

> setwd("/project/geobiology/scratch/flood/Simons_Project_18_itag/")
> list.files()  #just to see if it is reading the files in the directory
> samples <- scan("samples", what="character")

Read and filter 62 items

> forward_reads <- paste0(samples, "_R1_001_trimmed.fq.gz")
> reverse_reads <- paste0(samples, "_R2_001_trimmed.fq.gz")
> filtered_forward_reads <- paste0(samples, "_R1_filtered.fq.gz")
> filtered_reverse_reads <- paste0(samples, "_R2_filtered.fq.gz")
> plotQualityProfile(forward_reads)
> filtered_out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE=c(2,2), rm.phix=TRUE, minLen=175, truncLen=c(250,200))

> class(filtered_out)

[1] «matrix»

> dim(filtered_out)

[1] 62 2

> filtered_out

reads.in reads.out
EMPTYE06_S38_R1_001_trimmed.fq.gz 7426 301
EMPTYH07_S62_R1_001_trimmed.fq.gz 3448 1832
JNN162D45A_S22_R1_001_trimmed.fq.gz 222454 66175
JNN171B_S8_R1_001_trimmed.fq.gz 273310 170324
JNN171D01A_S2_R1_001_trimmed.fq.gz 417622 268916
JNN171D01B_S10_R1_001_trimmed.fq.gz 489272 323240
JNN171D01C_S18_R1_001_trimmed.fq.gz 523421 341174
JNN171D12A_S1_R1_001_trimmed.fq.gz 393197 244735
JNN171D12B_S9_R1_001_trimmed.fq.gz 413180 265626
JNN171D23A_S17_R1_001_trimmed.fq.gz 356006 190980
JNN171D23B_S25_R1_001_trimmed.fq.gz 161354 34109
JNN171D34A_S33_R1_001_trimmed.fq.gz 234123 74096
JNN171D34B_S41_R1_001_trimmed.fq.gz 411512 254125
JNN171D45A_S49_R1_001_trimmed.fq.gz 403062 238558
JNN171D45B_S56_R1_001_trimmed.fq.gz 2557 1326
JNN171SSA1_S26_R1_001_trimmed.fq.gz 223593 63556
JNN171SSA2_S34_R1_001_trimmed.fq.gz 172957 48233
JNN171SSA3_S42_R1_001_trimmed.fq.gz 472779 307137
JNN171T_S46_R1_001_trimmed.fq.gz 273852 177579
JNN1722SO1T1_S54_R1_001_trimmed.fq.gz 395365 226807
JNN1722SO1T2_S61_R1_001_trimmed.fq.gz 244705 140656
JNN1722SO1T3_S7_R1_001_trimmed.fq.gz 374299 227895
JNN1722SO1T4_S15_R1_001_trimmed.fq.gz 365867 230573
JNN1722SO1T5_S23_R1_001_trimmed.fq.gz 495281 298117
JNN1722SO1T6_S31_R1_001_trimmed.fq.gz 174921 34480
JNN172D01A_S13_R1_001_trimmed.fq.gz 471320 308275
JNN172D01B_S21_R1_001_trimmed.fq.gz 478900 310242
JNN172D01C_S29_R1_001_trimmed.fq.gz 87820 11840
JNN172D12A_S37_R1_001_trimmed.fq.gz 257790 92729
JNN172D12B_S45_R1_001_trimmed.fq.gz 554695 349848
JNN172D23A_S53_R1_001_trimmed.fq.gz 448829 273993
JNN172D23B_S60_R1_001_trimmed.fq.gz 272519 175609
JNN172D34A_S6_R1_001_trimmed.fq.gz 447083 266743
JNN172D34B_S14_R1_001_trimmed.fq.gz 451128 280289
JNN172D45B_S30_R1_001_trimmed.fq.gz 185486 40805
JNN172SSA_S50_R1_001_trimmed.fq.gz 404028 224929
JNN172SSB_S57_R1_001_trimmed.fq.gz 156574 99258
JNN172SSC_S3_R1_001_trimmed.fq.gz 457678 276162
JNN1742SB1B08_S16_R1_001_trimmed.fq.gz 490239 316150
JNN1742SB1C08_S24_R1_001_trimmed.fq.gz 460028 291147
JNN1742SB1D08_S32_R1_001_trimmed.fq.gz 176081 37981
JNN174D01_S11_R1_001_trimmed.fq.gz 501094 318456
JNN174D12_S19_R1_001_trimmed.fq.gz 509244 315585
JNN174D23_S27_R1_001_trimmed.fq.gz 229119 57758
JNN174D34A_S35_R1_001_trimmed.fq.gz 248700 80705
JNN174D34B_S43_R1_001_trimmed.fq.gz 450990 286264
JNN174D45_S51_R1_001_trimmed.fq.gz 388617 229464
JNN174SS1A_S58_R1_001_trimmed.fq.gz 181673 116713
JNN174SS1B_S4_R1_001_trimmed.fq.gz 338386 222699
JNN174SS1S1_S12_R1_001_trimmed.fq.gz 442587 284169
JNN174SS1S6_S20_R1_001_trimmed.fq.gz 445877 282301
JNN174SS1S9_S28_R1_001_trimmed.fq.gz 42942 4784
JNN174SS1V_S36_R1_001_trimmed.fq.gz 34146 11737
JNN175RINSEW_S5_R1_001_trimmed.fq.gz 13388 7177
JNN175SED1_S44_R1_001_trimmed.fq.gz 383382 228497
JNN175SED2_S52_R1_001_trimmed.fq.gz 371816 228629
JNN175SED3_S59_R1_001_trimmed.fq.gz 183482 117726
JNN175ZBS_S40_R1_001_trimmed.fq.gz 160205 50854
JNN175ZBW_S48_R1_001_trimmed.fq.gz 2470 834
JNN175ZSOA_S39_R1_001_trimmed.fq.gz 108634 18892
JNN175ZSOB_S47_R1_001_trimmed.fq.gz 419243 264062
JNN175ZSOC_S55_R1_001_trimmed.fq.gz 394909 237964

Make error models

> err_forward_reads <- learnErrors(filtered_forward_reads, multithread = TRUE)
> err_reverse_reads <- learnErrors(filtered_reverse_reads, multithread = TRUE)
> plotErrors(err_forward_reads, nominalQ=TRUE)
> plotErrors(err_reverse_reads, nominalQ=TRUE)

Dereplication

> derep_forward <- derepFastq(filtered_forward_reads, verbose=TRUE)
> names(derep_forward) <- samples
> derep_reverse <- derepFastq(filtered_reverse_reads, verbose=TRUE)
> names(derep_reverse) <- samples

> dada_forward <- dada(derep_forward, err=err_forward_reads, pool="pseudo")

Generate ASVs

> dada_forward[[1]]

dada-class: object describing DADA2 denoising results 31 sequence variants were inferred from 248 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

> dada_reverse <- dada(derep_reverse, err=err_reverse_reads, pool="pseudo")
> dada_reverse[[1]]

33 sequence variants were inferred from 178 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge reads

> merged_amplicons <- mergePairs(dada_forward, derep_forward, dada_reverse, derep_reverse, trimOverhang=TRUE, minOverlap=170, verbose=TRUE)

Make a counts table

> seqtab <- makeSequenceTable(merged_amplicons)

Remove chimeras

> seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=T)

> sum(seqtab.nochim)/sum(seqtab)

[1] 0.9870203 lastest package [1] 0.987158

> getN <- function(x) sum(getUniques(x))

> summary_tab <- data.frame(row.names=samples, dada2_input=filtered_out[,1], filtered=filtered_out[,2], dada_f=sapply(dada_forward, getN), dada_r=sapply(dada_reverse, getN), merged=sapply(merged_amplicons, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))

Assign taxonomy

> taxa <- assignTaxonomy(seqtab.nochim, "/project/geobiology/DADA2_databases/silva_nr_v138_train_set.fa.gz", tryRC=T)

> taxa <- addSpecies(taxa, "/project/geobiology/DADA2_databases/silva_species_assignment_v138.fa.gz")

Prepare for Phyloseq2

Giving our seq headers more manageable names (ASV_1, ASV_2…)

> asv_seqs <- colnames(seqtab.nochim)
> asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}

making and writing out a fasta of our final ASV seqs:

> asv_fasta <- c(rbind(asv_headers, asv_seqs))
> write(asv_fasta, "ASVs2.fa")

count table:

> asv_tab <- t(seqtab.nochim)
> row.names(asv_tab) <- sub(">", "", asv_headers)
> write.table(asv_tab, "ASVs_counts2.tsv", sep="\t", quote=F, col.names=NA)

taxonomic table:

> asv_tax <- taxa
> row.names(asv_tax) <- sub(">", "", asv_headers)
> write.table(asv_tax, "ASVs_taxonomy2.tsv", sep="\t", quote=F, col.names=NA)

In excel merge the two tables to a single dataframe

> library("phyloseq")
> library("vegan")
> library("DESeq2")
> library("ggplot2")
> library("dendextend")
> library("tidyr")
> library("viridis")
> library("reshape")
> library("decontam")

> packageVersion("phyloseq") ‘1.32.0’
> packageVersion("vegan") ‘2.5.6’
> packageVersion("DESeq2") ‘1.28.0’
> packageVersion("ggplot2") ‘3.3.0’
> packageVersion("dendextend") ‘1.13.4’
> packageVersion("tidyr") ‘1.0.2’
> packageVersion("viridis") ‘0.5.1’
> packageVersion("reshape") ‘0.8.8’
> packageVersion("decontam") [1] ‘1.8.0’

> load("P18_spring2020.RData")

Too big to open on desktop, need to remove large objects

> objects()  #lists all objects
> rm()

> load("P18_spring2020_red.RData")

Load in dataframe with sample metadata

> p18dataframe2 <- read.csv("p18dataframe2.csv", header = TRUE, sep=",")

> ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(p18dataframe2))

Didn’t work, new names have the _## after name. But fixing didn’t work

> rownames(p18dataframe2) <- samples.out

Ok this works. But now an extra column called names. Should remove.

> p18dataframe3 <- p18dataframe2[ -c(1) ]

> ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(p18dataframe3))

> table(tax_table(ps2)[, "Kingdom"], exclude = NULL)

Archaea Bacteria Eukaryota 11327 29002 21 170

Start with creating new count files from seqtab.nochim

> asv_seqs <- colnames(seqtab.nochim)
> asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep="_") }

Making and writing out a fasta of our ASV seqs:

> asv_fasta <- c(rbind(asv_headers, asv_seqs))
> write(asv_fasta, "ASVs.fa")

Making and writing out a fasta of our count table:

> asv_tab <- t(seqtab.nochim)
> row.names(asv_tab) <- sub(">", "", asv_headers)
> write.table(asv_tab, "ASVs_counts.tsv", sep="\t", quote=F, col.names=NA)

Making and writing out a fasta of our tax table:

> asv_tax <- taxa
> row.names(asv_tax) <- sub(">", "", asv_headers)
> write.table(asv_tax, "ASVs_taxonomy.tsv", sep="\t", quote=F, col.names=NA)

To identify contaminates we need to identify which columns have the sequencing center blacks and our negative controls. Negative controls in column numbers 1,2,54,59

> colnames(asv_tab)
> vector_for_decontam <- c(rep(TRUE, 2), rep(FALSE, 51), rep(TRUE, 1), rep(FALSE, 4), rep(TRUE, 1), rep(FALSE, 3))

> contam_df <- isContaminant(t(asv_tab), neg=vector_for_decontam)

> table(contam_df$contaminant)

Getting vector holding the identified contaminant IDs

> contam_asvs <- row.names(contam_df[contam_df$contaminant == TRUE, ])

> asv_tax[row.names(asv_tax) %in% contam_asvs, ]

Kingdom Phylum Class
ASV_1497 «Bacteria» «Bacteroidota» «Bacteroidia»
ASV_1660 «Bacteria» «Firmicutes» «Bacilli»
ASV_2331 «Archaea» «Nanoarchaeota» «Nanoarchaeia»
ASV_3776 «Bacteria» «Proteobacteria» «Alphaproteobacteria» ASV_5306 «Bacteria» «Actinobacteriota» «Actinobacteria»
ASV_9182 «Bacteria» «Proteobacteria» «Gammaproteobacteria» ASV_23524 «Bacteria» «Desulfobacterota» «Desulfobulbia»
Order Family
ASV_1497 «Flavobacteriales» «Weeksellaceae»
ASV_1660 «Bacillales» «Planococcaceae»
ASV_2331 «Woesearchaeales» NA
ASV_3776 «Rhizobiales» «Xanthobacteraceae»
ASV_5306 «Propionibacteriales» «Propionibacteriaceae» ASV_9182 «Xanthomonadales» «Xanthomonadaceae»
ASV_23524 «Desulfobulbales» NA
Genus Species
ASV_1497 «Chryseobacterium» NA
ASV_1660 «Lysinibacillus» NA
ASV_2331 NA NA
ASV_3776 «Bradyrhizobium» NA
ASV_5306 «Cutibacterium» NA
ASV_9182 «Stenotrophomonas» «maltophilia» ASV_23524 NA NA

> contam_indices <- which(asv_fasta %in% paste0(">", contam_asvs))
> dont_want <- sort(c(contam_indices, contam_indices + 1))
> asv_fasta_no_contam <- asv_fasta[- dont_want]
> asv_tab_no_contam <- asv_tab[!row.names(asv_tab) %in% contam_asvs, ]
> asv_tax_no_contam <- asv_tax[!row.names(asv_tax) %in% contam_asvs, ]

 
> write(asv_fasta_no_contam, "ASVs-no-contam.fa")
> write.table(asv_tab_no_contam, "ASVs_counts-no-contam.tsv",
        sep="\t", quote=F, col.names=NA)
> write.table(asv_tax_no_contam, "ASVs_taxonomy-no-contam.tsv",
        sep="\t", quote=F, col.names=NA)

Start new Rdata for simplicity (removing negative control and very poor libraries)

> count_tab <- read.table("ASVs_counts-no-contam.tsv", header=T, row.names=1,
         check.names=F, sep="\t")[ , -c(1:2,15,28,52:54,59)]
> tax_tab <- as.matrix(read.table("ASVs_taxonomy-no-contam.tsv", header=T,
       row.names=1, check.names=F, sep="\t"))
> sample_info_tab <- read.table("P18dataframe3_no_controls.tsv", header=T, row.names=1, check.names=F, sep="\t")

> sample_info_tab$Color <- as.character(sample_info_tab$Color)
> sample_info_tab$NewColor <- as.character(sample_info_tab$NewColor)
> sample_info_tab$LocationColor <- as.character(sample_info_tab$LocationColor)


> deseq_counts <- DESeqDataSetFromMatrix(count_tab, colData = sample_info_tab, design = ~Location)

> deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
> vst_trans_count_tab <- assay(deseq_counts_vst)
> euc_dist <- dist(t(vst_trans_count_tab))
> euc_clust <- hclust(euc_dist, method="ward.D2")
> plot(euc_clust) 
> euc_dend <- as.dendrogram(euc_clust, hang=0.1)
> dend_cols <- as.character(sample_info_tab$Color[order.dendrogram(euc_dend)])
> labels_colors(euc_dend) <- dend_cols

> plot(euc_dend, ylab="VST Euc. dist.")

Making our phyloseq object with transformed table (will use vst_physeq for many analyses)

> vst_count_phy <- otu_table(vst_trans_count_tab, taxa_are_rows=T)
> sample_info_tab_phy <- sample_data(sample_info_tab)
> vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy)

Generating and visualizing the PCoA with phyloseq

> vst_pcoa <- ordinate(vst_physeq, method="MDS", distance="euclidean")
> eigen_vals <- vst_pcoa$values$Eigenvalues

Plot_ordination(vst_physeq, vst_pcoa, color=«Type», shape=«Location») +

> geom_point(size=2) + labs(col="Location") + 
geom_text(aes(label=rownames(sample_info_tab), hjust=0.3, vjust=-0.4)) + 
coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA") + 
scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Type)])) + 
theme(legend.position="none")

Transforms table

> rarecurve(t(count_tab), step=100, col=sample_info_tab$Color, lwd=2, ylab="ASVs", label=T)
  abline(v=(min(rowSums(t(count_tab)))))

Richness and diversity estimates (first we need to create a phyloseq object using our un-transformed count table)

> count_tab_phy <- otu_table(count_tab, taxa_are_rows=T)
> tax_tab_phy <- tax_table(tax_tab)

> ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy)

Plot_richness

> plot_richness(ASV_physeq, color="Type", measures=c("Chao1", "Shannon")) + 	scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Type)])) +	theme(legend.title = element_blank())
  
> plot_richness(ASV_physeq, x="Type", color="Location", measures=c("Chao1", "Shannon")) + 	scale_color_manual(values=unique(sample_info_tab$LocationColor[order(sample_info_tab$Location)])) +	theme(legend.title = element_blank())

> plot_richness(ASV_physeq, x="Type", color="Depth", measures=c("Chao1", "Shannon")) + 
scale_color_manual(values=unique(sample_info_tab$NewColor[order(sample_info_tab$Depth)])) +
theme(legend.title = element_blank())

Using phyloseq to make a count table that has summed all ASVs that were in the same phylum

> phyla_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank="Phylum"))

Making a vector of phyla names to set as row names

> phyla_tax_vec <- as.vector(tax_table(tax_glom(ASV_physeq, taxrank="Phylum"))[,2]) 
> rownames(phyla_counts_tab) <- as.vector(phyla_tax_vec)

To account for sequences that weren’t assigned any taxonomy even at the phylum level

These came into R as ‘NAs’ in the taxonomy table, but their counts are still in the count table so we can get that value for each sample by substracting the column sums of this new table (that has everything that had a phylum assigned to it from the column sums of the starting count table (that has all representative sequences)

> unclassified_tax_counts <- colSums(count_tab) - colSums(phyla_counts_tab)

Add this row to our phylum count table:

> phyla_and_unidentified_counts_tab <- rbind(phyla_counts_tab, "Unclassified"=unclassified_tax_counts)

Remove the Proteobacteria, so we can next add them back in broken down by class

> temp_major_taxa_counts_tab <- phyla_and_unidentified_counts_tab[!row.names(phyla_and_unidentified_counts_tab) %in% "Proteobacteria", ]

Making count table broken down by class (contains classes beyond the Proteobacteria too at this point)

> class_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank="Class"))

Making a table that holds the phylum and class level info

> class_tax_phy_tab <- tax_table(tax_glom(ASV_physeq, taxrank="Class")) 

> phy_tmp_vec <- class_tax_phy_tab[,2]
> class_tmp_vec <- class_tax_phy_tab[,3]
> rows_tmp <- row.names(class_tax_phy_tab)
> class_tax_tab <- data.frame("Phylum"=phy_tmp_vec, "Class"=class_tmp_vec, row.names = rows_tmp)

Making a vector of just the Proteobacteria classes

> proteo_classes_vec <- as.vector(class_tax_tab[class_tax_tab$Phylum == "Proteobacteria", "Class"])

Changing the row names like above so that they correspond to the taxonomy,rather than an ASV identifier

> rownames(class_counts_tab) <- as.vector(class_tax_tab$Class)

Making a table of the counts of the Proteobacterial classes

> proteo_class_counts_tab <- class_counts_tab[row.names(class_counts_tab) %in% proteo_classes_vec, ]

There are also possibly some sequences that were resolved to the level of Proteobacteria, but not any further, and therefore would be missing from our class table. The sum of them by subtracting the proteo class count table from just the Proteobacteria row from the original phylum-level count table

> proteo_no_class_annotated_counts <- phyla_and_unidentified_counts_tab[row.names(phyla_and_unidentified_counts_tab) %in% "Proteobacteria", ] - colSums(proteo_class_counts_tab)

Combining the tables:

> major_taxa_counts_tab <- rbind(temp_major_taxa_counts_tab, proteo_class_counts_tab, "Unresolved_Proteobacteria"=proteo_no_class_annotated_counts)

To check we didn’t miss any other sequences, we can compare the column sums to see if they are the same if «TRUE», we know nothing fell through the cracks

> identical(colSums(major_taxa_counts_tab), colSums(count_tab))

A proportions table for summarizing:

> major_taxa_proportions_tab <- apply(major_taxa_counts_tab, 2, function(x) x/sum(x)*100)

Check the dimensions of this table

> dim(major_taxa_proportions_tab)

[1] 85 54

There are currently 85 rows, which might be a little busy for a summary figure many of these taxa make up a very small percentage, so we’re going to filter some out

> temp_filt_major_taxa_proportions_tab <- data.frame(major_taxa_proportions_tab[apply(major_taxa_proportions_tab, 1, max) > 5, ])

Checking how many we have that were above this threshold

> dim(temp_filt_major_taxa_proportions_tab)

Now have 13, much more manageable for an overview figure. Sum rest as «other»

> filtered_proportions <- colSums(major_taxa_proportions_tab) - colSums(temp_filt_major_taxa_proportions_tab)
> filt_major_taxa_proportions_tab <- rbind(temp_filt_major_taxa_proportions_tab, "Other"=filtered_proportions)

Make a copy of our table that’s safe for manipulating for ggplot

> filt_major_taxa_proportions_tab_for_plot <- filt_major_taxa_proportions_tab

For ggplot, add a column of the taxa names so that it is within the table, rather than just as row names

> filt_major_taxa_proportions_tab_for_plot$Major_Taxa <- row.names(filt_major_taxa_proportions_tab_for_plot)

For ggplot, transform the table into narrow, or long, format

> filt_major_taxa_proportions_tab_for_plot.g <- gather(filt_major_taxa_proportions_tab_for_plot, Sample, Proportion, -Major_Taxa)

Compare new table and compare it with the old one

> head(filt_major_taxa_proportions_tab_for_plot.g)
> head(filt_major_taxa_proportions_tab_for_plot)

Making a new table by pulling what we want from the sample & information table

> sample_info_for_merge<-data.frame("Sample"=row.names(sample_info_tab), "Type"=sample_info_tab$Type, 	"color"=sample_info_tab$Color, stringsAsFactors=F)
> sample_info_for_mergeN<-data.frame("Sample"=row.names(sample_info_tab), "Depth"=sample_info_tab$Depth, 	"color"=sample_info_tab$NewColor, stringsAsFactors=F)
> sample_info_for_mergeL<-data.frame("Sample"=row.names(sample_info_tab), "Location"=sample_info_tab$Location, 	"color"=sample_info_tab$LocationColor, stringsAsFactors=F)

Merging this table with the plotting table

> filt_major_taxa_proportions_tab_for_plot.g2 <- merge(filt_major_taxa_proportions_tab_for_plot.g, sample_info_for_merge)

> ggplot(filt_major_taxa_proportions_tab_for_plot.g2, aes(x=Sample, y=Proportion, fill=Major_Taxa)) +
geom_bar(width=0.6, stat="identity") +	theme_bw() + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), 	legend.title=element_blank()) +	 labs(x="Sample", y="% of 16S rRNA gene copies recovered", title="All samples")

Phylum Box Plot by Type

> ggplot(filt_major_taxa_proportions_tab_for_plot.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Type)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plot.g2$color[order(filt_major_taxa_proportions_tab_f	or_plot.g2$Type)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", title="All samples")

Phylum Box Plot by Depth

> ggplot(filt_major_taxa_proportions_tab_for_plotN.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Depth)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plotN.g2$color[order(filt_major_taxa_proportions_tab_	for_plotN.g2$Depth)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, vjust=0.4, 	hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", title="All 	samples")

Phylum Box Plot by Location

> ggplot(filt_major_taxa_proportions_tab_for_plotL.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Location)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plotL.g2$color[order(filt_major_taxa_proportions_tab_	for_plotL.g2$Location)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, 	vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", 	title="All samples")

Betadisper and permutational ANOVA

> anova(betadisper(euc_dist, sample_info_tab$Type))

#Response: Distances Df Sum Sq Mean Sq F value Pr(>F)
Groups 3 25138 8379.4 2.7634 0.05157 . Residuals 50 151613 3032.3
Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Not significantly differnt, could do a permutation ANOVA (adonis)

> adonis(euc_dist~sample_info_tab$Type)

Terms added sequentially (first to last) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
sample_info_tab$Type 3 781797 260599 1.9516 0.10482 0.008 ** Residuals 50 6676437 133529 0.89518
Total 53 7458234 1.00000

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

adonis(euc_dist_sample_info_tab$Depth) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) sample_info_tab$Depth 6 730765 121794 0.85089 0.09798 0.806 Residuals 47 6727469 143138 0.90202
Total 53 7458234 1.00000

adonis(euc_dist_sample_info_tab$Location) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
sample_info_tab$Location 3 2789750 929917 9.9595 0.37405 0.001 *** Residuals 50 4668484 93370 0.62595
Total 53 7458234 1.00000

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Group Specific Analyses: Thiomargarita vs. Picking Sediment

> Tmarg_sample_IDs <- row.names(sample_info_tab)[sample_info_tab$Type == "Thiomargarita"]

New distance matrix of only

> Tmarg_euc_dist <- dist(t(vst_trans_count_tab[ , colnames(vst_trans_count_tab) %in% Tmarg_sample_IDs]))

Sample info table with just Thiomargarita

> Tmarg_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% Tmarg_sample_IDs, ]

anova(betadisper(Tmarg_euc_dist, Tmarg_sample_info_tab$Location)) #Response: Distances Df Sum Sq Mean Sq F value Pr(>F)
Groups 2 63923 31961 15.829 0.002526 ** Residuals 7 14134 2019 Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Sample info table with just Beggiatoa

> Begg_sample_IDs <- row.names(sample_info_tab)[sample_info_tab$Type == "Beggiatoa"]
> Begg_euc_dist <- dist(t(vst_trans_count_tab[ , colnames(vst_trans_count_tab) %in% Begg_sample_IDs]))
> Begg_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% Begg_sample_IDs, ]

anova(betadisper(Begg_euc_dist, Begg_sample_info_tab$Location)) Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 46025 46025 15.572 0.05863 . Residuals 2 5911 2956

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

> sample_IDs_23002 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_23002"]
> sample_IDs_23020 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_23020"]
> sample_IDs_20020 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_20020"]
> sample_IDs_Geochem21 <- row.names(sample_info_tab)[sample_info_tab$Location == "Geochem_21"]

Make an object for each site

> sample_IDs_23002_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_23002], taxa_are_rows=T)
> sample_IDs_23020_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_23020], taxa_are_rows=T)
> sample_IDs_20020_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_20020], taxa_are_rows=T)
> sample_IDs_Geochem21_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_Geochem21], taxa_are_rows=T)

Make a sample info table with just the locations and run DeSeq2

> sample_IDs_23002_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_23002, ]
> sample_IDs_23020_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_23020, ]
> sample_IDs_20020_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_20020, ]
> sample_IDs_Geochem21_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_Geochem21, ]

> sample_IDs_23002_sample_info_tab_phy <- sample_data(sample_IDs_23002_sample_info_tab)
> sample_IDs_23020_sample_info_tab_phy <- sample_data(sample_IDs_23020_sample_info_tab)
> sample_IDs_20020_sample_info_tab_phy <- sample_data(sample_IDs_20020_sample_info_tab)
> sample_IDs_Geochem21_sample_info_tab_phy <- sample_data(sample_IDs_Geochem21_sample_info_tab)

> p23002_physeq <- phyloseq(sample_IDs_23002_count_phy, sample_IDs_23002_sample_info_tab_phy)
> p23020_physeq <- phyloseq(sample_IDs_23020_count_phy, sample_IDs_23020_sample_info_tab_phy)
> p20020_physeq <- phyloseq(sample_IDs_20020_count_phy, sample_IDs_20020_sample_info_tab_phy)
> Geochem21_physeq <- phyloseq(sample_IDs_Geochem21_count_phy, sample_IDs_Geochem21_sample_info_tab_phy)

> p23002_deseq <- phyloseq_to_deseq2(p23002_physeq, ~Type)
> p23020_deseq <- phyloseq_to_deseq2(p23020_physeq, ~Type)
> p20020_deseq <- phyloseq_to_deseq2(p20020_physeq, ~Type)
> Geochem21_deseq <- phyloseq_to_deseq2(Geochem21_physeq, ~Type)

> p23002_deseq <- DESeq(p23002_deseq)
> p23020_deseq <- DESeq(p23020_deseq)
> p20020_deseq <- DESeq(p20020_deseq)
> Geochem21_deseq <- DESeq(Geochem21_deseq)

Examining the results

> deseq_res_Beggiatoa_p20020 <- results(p20020_deseq, alpha=0.01, contrast=c("Type", "Beggiatoa", "Live Sediment LSB 	Removed"))
> summary(deseq_res_Beggiatoa_p20020)

Positive hits for > 0.01

> sigtab_deseq_res_Beggiatoa_p20020 <- deseq_res_Beggiatoa_p20020[which(deseq_res_Beggiatoa_p20020$padj < 0.01), ]
> summary(sigtab_deseq_res_Beggiatoa_p20020)
> sigtab_deseq_res_Beggiatoa_p20020_with_tax <- cbind(as(sigtab_deseq_res_Beggiatoa_p20020, "data.frame"), as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Beggiatoa_p20020), ], "matrix"))
> sigtab_deseq_res_Beggiatoa_p20020_with_tax[order(sigtab_deseq_res_Beggiatoa_p20020_with_tax$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Beggiatoa_p20020_with_tax, "sigtab_deseq_res_Beggiatoa_p20020_with_tax.tsv", sep="\t", quote=F, col.names=NA)

> deseq_res_Thiom_Geochem21 <- results(Geochem21_deseq, alpha=0.01, contrast=c("Type", "Thiomargarita", "Live Sediment 	LSB Removed"))
> summary(deseq_res_Thiom_Geochem21)

out of 17652 with nonzero total read count adjusted p-value < 0.01 LFC > 0 (up) : 57, 0.32% LFC < 0 (down) : 1, 0.0057% outliers [1] : 67, 0.38% low counts [2] : 15273, 87% (mean count < 7)

> sigtab_deseq_res_Thiom_Geochem21 <- deseq_res_Thiom_Geochem21[which(deseq_res_Thiom_Geochem21$padj < 0.01), ]
> summary(sigtab_deseq_res_Thiom_Geochem21)
> sigtab_deseq_res_Thiom_Geochem21_with_tax <- cbind(as(sigtab_deseq_res_Thiom_Geochem21, "data.frame"), as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Thiom_Geochem21), ], "matrix"))
> sigtab_deseq_res_Thiom_Geochem21_with_tax[order(sigtab_deseq_res_Thiom_Geochem21_with_tax$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Thiom_Geochem21_with_tax, "sigtab_deseq_res_Thiom_Geochem21_with_tax.tsv", sep="\t", quote=F, col.names=NA)


> deseq_res_Thiom_p23020_deseq <- results(p23020_deseq, alpha=0.01, contrast=c("Type", "Thiomargarita", "Live Sediment 	LSB Removed"))
> summary(deseq_res_Thiom_p23020_deseq)

out of 22661 with nonzero total read count adjusted p-value < 0.01 LFC > 0 (up) : 102, 0.45% LFC < 0 (down) : 6, 0.026% outliers [1] : 80, 0.35% low counts [2] : 16425, 72% (mean count < 3)

> sigtab_deseq_res_Thiom_p23020 <- deseq_res_Thiom_p23020_deseq[which(deseq_res_Thiom_p23020_deseq$padj < 0.01), ]
summary(sigtab_deseq_res_Thiom_p23020)
> sigtab_deseq_res_Thiom_p23020_with_tax <- cbind(as(sigtab_deseq_res_Thiom_p23020, "data.frame"), 	as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Thiom_p23020), ], "matrix"))
>sigtab_deseq_res_Thiom_p23020_with_tax[order(sigtab_deseq_res_Thiom_p23020$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Thiom_p23020_with_tax, "sigtab_deseq_res_Thiom_p23020_with_tax.tsv", sep="\t", quote=F, col.names=NA)