Processing iTag reads for Namibian Beggiatoceae and host sediments

Simplify file names & make list file

ls  *_R1_001.fastq | cut -f1-2 -d "_" > samples

Run bash script for cutadapt to remove primers

!/bin/sh

for sample in $(cat samples)_
do
echo "On sample: $sample" | cutadapt -a ^GTGCCAGCMGCCGCGGTAA...ATTAGAWACCCBDGTAGTCC -A ^GGACTACHVGGGTWTCTAAT...TTACCGCGGCKGCTGGCAC -m 215 -M 285 --discard-untrimmed -o ${sample}_R1_001_trimmed.fq.gz -p ${sample}_R2_001_trimmed.fq.gz ${sample}_R1_001.fastq ${sample}_R2_001.fastq >> cutadapt_primer_trimming_stats.txt 2>&1
done

paste samples <(grep "passing" cutadapt_primer_trimming_stats.txt | cut -f3 -d "(" | tr -d ")") <(grep "filtered" cutadapt_primer_trimming_stats.txt | cut -f3 -d "(" | tr -d ")")

Cutadapt results

R version 3.6.3 (2020–02–29) DADA2 processing

> library("dada2")
> packageVersion("dada2") 

‘1.16.0’

> setwd("/project/geobiology/scratch/flood/Simons_Project_18_itag/")
> list.files()  #just to see if it is reading the files in the directory
> samples <- scan("samples", what="character")

Read and filter 62 items

> forward_reads <- paste0(samples, "_R1_001_trimmed.fq.gz")
> reverse_reads <- paste0(samples, "_R2_001_trimmed.fq.gz")
> filtered_forward_reads <- paste0(samples, "_R1_filtered.fq.gz")
> filtered_reverse_reads <- paste0(samples, "_R2_filtered.fq.gz")
> plotQualityProfile(forward_reads)
> filtered_out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE=c(2,2), rm.phix=TRUE, minLen=175, truncLen=c(250,200))

> class(filtered_out)

[1] «matrix»

> dim(filtered_out)

[1] 62 2

> filtered_out

Make error models

> err_forward_reads <- learnErrors(filtered_forward_reads, multithread = TRUE)
> err_reverse_reads <- learnErrors(filtered_reverse_reads, multithread = TRUE)
> plotErrors(err_forward_reads, nominalQ=TRUE)
> plotErrors(err_reverse_reads, nominalQ=TRUE)

Dereplication

> derep_forward <- derepFastq(filtered_forward_reads, verbose=TRUE)
> names(derep_forward) <- samples
> derep_reverse <- derepFastq(filtered_reverse_reads, verbose=TRUE)
> names(derep_reverse) <- samples

> dada_forward <- dada(derep_forward, err=err_forward_reads, pool="pseudo")

Generate ASVs

> dada_forward[[1]]

dada-class: object describing DADA2 denoising results 31 sequence variants were inferred from 248 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

> dada_reverse <- dada(derep_reverse, err=err_reverse_reads, pool="pseudo")
> dada_reverse[[1]]

33 sequence variants were inferred from 178 input unique sequences. Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge reads

> merged_amplicons <- mergePairs(dada_forward, derep_forward, dada_reverse, derep_reverse, trimOverhang=TRUE, minOverlap=170, verbose=TRUE)

Make a counts table

> seqtab <- makeSequenceTable(merged_amplicons)

Remove chimeras

> seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=T)

> sum(seqtab.nochim)/sum(seqtab)

[1] 0.9870203 lastest package [1] 0.987158

> getN <- function(x) sum(getUniques(x))

> summary_tab <- data.frame(row.names=samples, dada2_input=filtered_out[,1], filtered=filtered_out[,2], dada_f=sapply(dada_forward, getN), dada_r=sapply(dada_reverse, getN), merged=sapply(merged_amplicons, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))

Assign taxonomy

> taxa <- assignTaxonomy(seqtab.nochim, "/project/geobiology/DADA2_databases/silva_nr_v138_train_set.fa.gz", tryRC=T)

> taxa <- addSpecies(taxa, "/project/geobiology/DADA2_databases/silva_species_assignment_v138.fa.gz")

Prepare for Phyloseq2

Giving our seq headers more manageable names (ASV_1, ASV_2…)

> asv_seqs <- colnames(seqtab.nochim)
> asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}

making and writing out a fasta of our final ASV seqs:

> asv_fasta <- c(rbind(asv_headers, asv_seqs))
> write(asv_fasta, "ASVs2.fa")

count table:

> asv_tab <- t(seqtab.nochim)
> row.names(asv_tab) <- sub(">", "", asv_headers)
> write.table(asv_tab, "ASVs_counts2.tsv", sep="\t", quote=F, col.names=NA)

taxonomic table:

> asv_tax <- taxa
> row.names(asv_tax) <- sub(">", "", asv_headers)
> write.table(asv_tax, "ASVs_taxonomy2.tsv", sep="\t", quote=F, col.names=NA)

In excel merge the two tables to a single dataframe

> library("phyloseq")
> library("vegan")
> library("DESeq2")
> library("ggplot2")
> library("dendextend")
> library("tidyr")
> library("viridis")
> library("reshape")
> library("decontam")

> packageVersion("phyloseq") ‘1.32.0’
> packageVersion("vegan") ‘2.5.6’
> packageVersion("DESeq2") ‘1.28.0’
> packageVersion("ggplot2") ‘3.3.0’
> packageVersion("dendextend") ‘1.13.4’
> packageVersion("tidyr") ‘1.0.2’
> packageVersion("viridis") ‘0.5.1’
> packageVersion("reshape") ‘0.8.8’
> packageVersion("decontam") [1] ‘1.8.0’

> load("P18_spring2020.RData")

Too big to open on desktop, need to remove large objects

> objects()  #lists all objects
> rm()

> load("P18_spring2020_red.RData")

Load in dataframe with sample metadata

> p18dataframe2 <- read.csv("p18dataframe2.csv", header = TRUE, sep=",")

> ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(p18dataframe2))

Didn’t work, new names have the _## after name. But fixing didn’t work

> rownames(p18dataframe2) <- samples.out

Ok this works. But now an extra column called names. Should remove.

> p18dataframe3 <- p18dataframe2[ -c(1) ]

> ps2 <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), tax_table(taxa), sample_data(p18dataframe3))

> table(tax_table(ps2)[, "Kingdom"], exclude = NULL)

Archaea Bacteria Eukaryota 11327 29002 21 170

Start with creating new count files from seqtab.nochim

> asv_seqs <- colnames(seqtab.nochim)
> asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

for (i in 1:dim(seqtab.nochim)[2]) { asv_headers[i] <- paste(">ASV", i, sep="_") }

Making and writing out a fasta of our ASV seqs:

> asv_fasta <- c(rbind(asv_headers, asv_seqs))
> write(asv_fasta, "ASVs.fa")

Making and writing out a fasta of our count table:

> asv_tab <- t(seqtab.nochim)
> row.names(asv_tab) <- sub(">", "", asv_headers)
> write.table(asv_tab, "ASVs_counts.tsv", sep="\t", quote=F, col.names=NA)

Making and writing out a fasta of our tax table:

> asv_tax <- taxa
> row.names(asv_tax) <- sub(">", "", asv_headers)
> write.table(asv_tax, "ASVs_taxonomy.tsv", sep="\t", quote=F, col.names=NA)

To identify contaminates we need to identify which columns have the sequencing center blacks and our negative controls. Negative controls in column numbers 1,2,54,59

> colnames(asv_tab)
> vector_for_decontam <- c(rep(TRUE, 2), rep(FALSE, 51), rep(TRUE, 1), rep(FALSE, 4), rep(TRUE, 1), rep(FALSE, 3))

> contam_df <- isContaminant(t(asv_tab), neg=vector_for_decontam)

> table(contam_df$contaminant) 

Getting vector holding the identified contaminant IDs

> contam_asvs <- row.names(contam_df[contam_df$contaminant == TRUE, ])

> asv_tax[row.names(asv_tax) %in% contam_asvs, ]

Kingdom Phylum Class
ASV_1497 «Bacteria» «Bacteroidota» «Bacteroidia»
ASV_1660 «Bacteria» «Firmicutes» «Bacilli»
ASV_2331 «Archaea» «Nanoarchaeota» «Nanoarchaeia»
ASV_3776 «Bacteria» «Proteobacteria» «Alphaproteobacteria» ASV_5306 «Bacteria» «Actinobacteriota» «Actinobacteria»
ASV_9182 «Bacteria» «Proteobacteria» «Gammaproteobacteria» ASV_23524 «Bacteria» «Desulfobacterota» «Desulfobulbia»
Order Family
ASV_1497 «Flavobacteriales» «Weeksellaceae»
ASV_1660 «Bacillales» «Planococcaceae»
ASV_2331 «Woesearchaeales» NA
ASV_3776 «Rhizobiales» «Xanthobacteraceae»
ASV_5306 «Propionibacteriales» «Propionibacteriaceae» ASV_9182 «Xanthomonadales» «Xanthomonadaceae»
ASV_23524 «Desulfobulbales» NA
Genus Species
ASV_1497 «Chryseobacterium» NA
ASV_1660 «Lysinibacillus» NA
ASV_2331 NA NA
ASV_3776 «Bradyrhizobium» NA
ASV_5306 «Cutibacterium» NA
ASV_9182 «Stenotrophomonas» «maltophilia» ASV_23524 NA NA

> contam_indices <- which(asv_fasta %in% paste0(">", contam_asvs))
> dont_want <- sort(c(contam_indices, contam_indices + 1))
> asv_fasta_no_contam <- asv_fasta[- dont_want]
> asv_tab_no_contam <- asv_tab[!row.names(asv_tab) %in% contam_asvs, ]
> asv_tax_no_contam <- asv_tax[!row.names(asv_tax) %in% contam_asvs, ]

 
> write(asv_fasta_no_contam, "ASVs-no-contam.fa")
> write.table(asv_tab_no_contam, "ASVs_counts-no-contam.tsv",
        sep="\t", quote=F, col.names=NA)
> write.table(asv_tax_no_contam, "ASVs_taxonomy-no-contam.tsv",
        sep="\t", quote=F, col.names=NA)

Start new Rdata for simplicity (removing negative control and very poor libraries)

> count_tab <- read.table("ASVs_counts-no-contam.tsv", header=T, row.names=1,
         check.names=F, sep="\t")[ , -c(1:2,15,28,52:54,59)]
> tax_tab <- as.matrix(read.table("ASVs_taxonomy-no-contam.tsv", header=T,
       row.names=1, check.names=F, sep="\t"))
> sample_info_tab <- read.table("P18dataframe3_no_controls.tsv", header=T, row.names=1, check.names=F, sep="\t")

> sample_info_tab$Color <- as.character(sample_info_tab$Color)
> sample_info_tab$NewColor <- as.character(sample_info_tab$NewColor)
> sample_info_tab$LocationColor <- as.character(sample_info_tab$LocationColor)


> deseq_counts <- DESeqDataSetFromMatrix(count_tab, colData = sample_info_tab, design = ~Location)

> deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
> vst_trans_count_tab <- assay(deseq_counts_vst)
> euc_dist <- dist(t(vst_trans_count_tab))
> euc_clust <- hclust(euc_dist, method="ward.D2")
> plot(euc_clust) 
> euc_dend <- as.dendrogram(euc_clust, hang=0.1)
> dend_cols <- as.character(sample_info_tab$Color[order.dendrogram(euc_dend)])
> labels_colors(euc_dend) <- dend_cols

> plot(euc_dend, ylab="VST Euc. dist.")

Making our phyloseq object with transformed table (will use vst_physeq for many analyses)

> vst_count_phy <- otu_table(vst_trans_count_tab, taxa_are_rows=T)
> sample_info_tab_phy <- sample_data(sample_info_tab)
> vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy)

Generating and visualizing the PCoA with phyloseq

> vst_pcoa <- ordinate(vst_physeq, method="MDS", distance="euclidean")
> eigen_vals <- vst_pcoa$values$Eigenvalues

Plot_ordination(vst_physeq, vst_pcoa, color=«Type», shape=«Location») +

> geom_point(size=2) + labs(col="Location") + 
geom_text(aes(label=rownames(sample_info_tab), hjust=0.3, vjust=-0.4)) + 
coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA") + 
scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Type)])) + 
theme(legend.position="none")

Transforms table

> rarecurve(t(count_tab), step=100, col=sample_info_tab$Color, lwd=2, ylab="ASVs", label=T)
  abline(v=(min(rowSums(t(count_tab)))))

Richness and diversity estimates (first we need to create a phyloseq object using our un-transformed count table)

> count_tab_phy <- otu_table(count_tab, taxa_are_rows=T)
> tax_tab_phy <- tax_table(tax_tab)

> ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy)

Plot_richness

> plot_richness(ASV_physeq, color="Type", measures=c("Chao1", "Shannon")) + 	scale_color_manual(values=unique(sample_info_tab$Color[order(sample_info_tab$Type)])) +	theme(legend.title = element_blank())
  
> plot_richness(ASV_physeq, x="Type", color="Location", measures=c("Chao1", "Shannon")) + 	scale_color_manual(values=unique(sample_info_tab$LocationColor[order(sample_info_tab$Location)])) +	theme(legend.title = element_blank())

> plot_richness(ASV_physeq, x="Type", color="Depth", measures=c("Chao1", "Shannon")) + 
scale_color_manual(values=unique(sample_info_tab$NewColor[order(sample_info_tab$Depth)])) +
theme(legend.title = element_blank())

Using phyloseq to make a count table that has summed all ASVs that were in the same phylum

> phyla_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank="Phylum")) 

Making a vector of phyla names to set as row names

> phyla_tax_vec <- as.vector(tax_table(tax_glom(ASV_physeq, taxrank="Phylum"))[,2]) 
> rownames(phyla_counts_tab) <- as.vector(phyla_tax_vec)

To account for sequences that weren’t assigned any taxonomy even at the phylum level

These came into R as ‘NAs’ in the taxonomy table, but their counts are still in the count table so we can get that value for each sample by substracting the column sums of this new table (that has everything that had a phylum assigned to it from the column sums of the starting count table (that has all representative sequences)

> unclassified_tax_counts <- colSums(count_tab) - colSums(phyla_counts_tab)

Add this row to our phylum count table:

> phyla_and_unidentified_counts_tab <- rbind(phyla_counts_tab, "Unclassified"=unclassified_tax_counts)

Remove the Proteobacteria, so we can next add them back in broken down by class

> temp_major_taxa_counts_tab <- phyla_and_unidentified_counts_tab[!row.names(phyla_and_unidentified_counts_tab) %in% "Proteobacteria", ]

Making count table broken down by class (contains classes beyond the Proteobacteria too at this point)

> class_counts_tab <- otu_table(tax_glom(ASV_physeq, taxrank="Class")) 

Making a table that holds the phylum and class level info

> class_tax_phy_tab <- tax_table(tax_glom(ASV_physeq, taxrank="Class")) 

> phy_tmp_vec <- class_tax_phy_tab[,2]
> class_tmp_vec <- class_tax_phy_tab[,3]
> rows_tmp <- row.names(class_tax_phy_tab)
> class_tax_tab <- data.frame("Phylum"=phy_tmp_vec, "Class"=class_tmp_vec, row.names = rows_tmp)

Making a vector of just the Proteobacteria classes

> proteo_classes_vec <- as.vector(class_tax_tab[class_tax_tab$Phylum == "Proteobacteria", "Class"])

Changing the row names like above so that they correspond to the taxonomy,rather than an ASV identifier

> rownames(class_counts_tab) <- as.vector(class_tax_tab$Class) 

Making a table of the counts of the Proteobacterial classes

> proteo_class_counts_tab <- class_counts_tab[row.names(class_counts_tab) %in% proteo_classes_vec, ] 

There are also possibly some sequences that were resolved to the level of Proteobacteria, but not any further, and therefore would be missing from our class table. The sum of them by subtracting the proteo class count table from just the Proteobacteria row from the original phylum-level count table

> proteo_no_class_annotated_counts <- phyla_and_unidentified_counts_tab[row.names(phyla_and_unidentified_counts_tab) %in% "Proteobacteria", ] - colSums(proteo_class_counts_tab)

Combining the tables:

> major_taxa_counts_tab <- rbind(temp_major_taxa_counts_tab, proteo_class_counts_tab, "Unresolved_Proteobacteria"=proteo_no_class_annotated_counts)

To check we didn’t miss any other sequences, we can compare the column sums to see if they are the same if «TRUE», we know nothing fell through the cracks

> identical(colSums(major_taxa_counts_tab), colSums(count_tab)) 

A proportions table for summarizing:

> major_taxa_proportions_tab <- apply(major_taxa_counts_tab, 2, function(x) x/sum(x)*100)

Check the dimensions of this table

> dim(major_taxa_proportions_tab)

[1] 85 54

There are currently 85 rows, which might be a little busy for a summary figure many of these taxa make up a very small percentage, so we’re going to filter some out

> temp_filt_major_taxa_proportions_tab <- data.frame(major_taxa_proportions_tab[apply(major_taxa_proportions_tab, 1, max) > 5, ])

Checking how many we have that were above this threshold

> dim(temp_filt_major_taxa_proportions_tab) 

Now have 13, much more manageable for an overview figure. Sum rest as «other»

> filtered_proportions <- colSums(major_taxa_proportions_tab) - colSums(temp_filt_major_taxa_proportions_tab)
> filt_major_taxa_proportions_tab <- rbind(temp_filt_major_taxa_proportions_tab, "Other"=filtered_proportions)

Make a copy of our table that’s safe for manipulating for ggplot

> filt_major_taxa_proportions_tab_for_plot <- filt_major_taxa_proportions_tab

For ggplot, add a column of the taxa names so that it is within the table, rather than just as row names

> filt_major_taxa_proportions_tab_for_plot$Major_Taxa <- row.names(filt_major_taxa_proportions_tab_for_plot)

For ggplot, transform the table into narrow, or long, format

> filt_major_taxa_proportions_tab_for_plot.g <- gather(filt_major_taxa_proportions_tab_for_plot, Sample, Proportion, -Major_Taxa)

Compare new table and compare it with the old one

> head(filt_major_taxa_proportions_tab_for_plot.g)
> head(filt_major_taxa_proportions_tab_for_plot)

Making a new table by pulling what we want from the sample & information table

> sample_info_for_merge<-data.frame("Sample"=row.names(sample_info_tab), "Type"=sample_info_tab$Type, 	"color"=sample_info_tab$Color, stringsAsFactors=F)
> sample_info_for_mergeN<-data.frame("Sample"=row.names(sample_info_tab), "Depth"=sample_info_tab$Depth, 	"color"=sample_info_tab$NewColor, stringsAsFactors=F)
> sample_info_for_mergeL<-data.frame("Sample"=row.names(sample_info_tab), "Location"=sample_info_tab$Location, 	"color"=sample_info_tab$LocationColor, stringsAsFactors=F)

Merging this table with the plotting table

> filt_major_taxa_proportions_tab_for_plot.g2 <- merge(filt_major_taxa_proportions_tab_for_plot.g, sample_info_for_merge)

> ggplot(filt_major_taxa_proportions_tab_for_plot.g2, aes(x=Sample, y=Proportion, fill=Major_Taxa)) +
geom_bar(width=0.6, stat="identity") +	theme_bw() + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), 	legend.title=element_blank()) +	 labs(x="Sample", y="% of 16S rRNA gene copies recovered", title="All samples")

Phylum Box Plot by Type

> ggplot(filt_major_taxa_proportions_tab_for_plot.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Type)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plot.g2$color[order(filt_major_taxa_proportions_tab_f	or_plot.g2$Type)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", title="All samples")

Phylum Box Plot by Depth

> ggplot(filt_major_taxa_proportions_tab_for_plotN.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Depth)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plotN.g2$color[order(filt_major_taxa_proportions_tab_	for_plotN.g2$Depth)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, vjust=0.4, 	hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", title="All 	samples")

Phylum Box Plot by Location

> ggplot(filt_major_taxa_proportions_tab_for_plotL.g2, aes(Major_Taxa, Proportion)) +
geom_jitter(aes(color=factor(Location)), size=2, width=0.15, height=0) + scale_color_manual(values=unique(filt_major_taxa_proportions_tab_for_plotL.g2$color[order(filt_major_taxa_proportions_tab_	for_plotL.g2$Location)])) + geom_boxplot(fill=NA, outlier.color=NA) + theme(axis.text.x=element_text(angle=90, 	vjust=0.4, hjust=1), legend.title=element_blank()) + labs(x="Major Taxa", y="% of 16S rRNA gene copies recovered", 	title="All samples")

Betadisper and permutational ANOVA

> anova(betadisper(euc_dist, sample_info_tab$Type))

#Response: Distances Df Sum Sq Mean Sq F value Pr(>F)
Groups 3 25138 8379.4 2.7634 0.05157 . Residuals 50 151613 3032.3
Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Not significantly differnt, could do a permutation ANOVA (adonis)

> adonis(euc_dist~sample_info_tab$Type)  

Terms added sequentially (first to last) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
sample_info_tab$Type 3 781797 260599 1.9516 0.10482 0.008 ** Residuals 50 6676437 133529 0.89518
Total 53 7458234 1.00000

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

adonis(euc_distsample_info_tab$Depth) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) sample_info_tab$Depth 6 730765 121794 0.85089 0.09798 0.806 Residuals 47 6727469 143138 0.90202
Total 53 7458234 1.00000

adonis(euc_distsample_info_tab$Location) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
sample_info_tab$Location 3 2789750 929917 9.9595 0.37405 0.001 *** Residuals 50 4668484 93370 0.62595
Total 53 7458234 1.00000

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Group Specific Analyses: Thiomargarita vs. Picking Sediment

> Tmarg_sample_IDs <- row.names(sample_info_tab)[sample_info_tab$Type == "Thiomargarita"]

New distance matrix of only

> Tmarg_euc_dist <- dist(t(vst_trans_count_tab[ , colnames(vst_trans_count_tab) %in% Tmarg_sample_IDs]))

Sample info table with just Thiomargarita

> Tmarg_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% Tmarg_sample_IDs, ]

anova(betadisper(Tmarg_euc_dist, Tmarg_sample_info_tab$Location)) #Response: Distances Df Sum Sq Mean Sq F value Pr(>F)
Groups 2 63923 31961 15.829 0.002526 ** Residuals 7 14134 2019 Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Sample info table with just Beggiatoa

> Begg_sample_IDs <- row.names(sample_info_tab)[sample_info_tab$Type == "Beggiatoa"]
> Begg_euc_dist <- dist(t(vst_trans_count_tab[ , colnames(vst_trans_count_tab) %in% Begg_sample_IDs]))
> Begg_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% Begg_sample_IDs, ]

anova(betadisper(Begg_euc_dist, Begg_sample_info_tab$Location)) Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 46025 46025 15.572 0.05863 . Residuals 2 5911 2956

Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

> sample_IDs_23002 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_23002"]
> sample_IDs_23020 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_23020"]
> sample_IDs_20020 <- row.names(sample_info_tab)[sample_info_tab$Location == "St_20020"]
> sample_IDs_Geochem21 <- row.names(sample_info_tab)[sample_info_tab$Location == "Geochem_21"]

Make an object for each site

> sample_IDs_23002_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_23002], taxa_are_rows=T)
> sample_IDs_23020_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_23020], taxa_are_rows=T)
> sample_IDs_20020_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_20020], taxa_are_rows=T)
> sample_IDs_Geochem21_count_phy <- otu_table(count_tab[, colnames(count_tab) %in% sample_IDs_Geochem21], taxa_are_rows=T)

Make a sample info table with just the locations and run DeSeq2

> sample_IDs_23002_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_23002, ]
> sample_IDs_23020_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_23020, ]
> sample_IDs_20020_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_20020, ]
> sample_IDs_Geochem21_sample_info_tab <- sample_info_tab[row.names(sample_info_tab) %in% sample_IDs_Geochem21, ]

> sample_IDs_23002_sample_info_tab_phy <- sample_data(sample_IDs_23002_sample_info_tab)
> sample_IDs_23020_sample_info_tab_phy <- sample_data(sample_IDs_23020_sample_info_tab)
> sample_IDs_20020_sample_info_tab_phy <- sample_data(sample_IDs_20020_sample_info_tab)
> sample_IDs_Geochem21_sample_info_tab_phy <- sample_data(sample_IDs_Geochem21_sample_info_tab)

> p23002_physeq <- phyloseq(sample_IDs_23002_count_phy, sample_IDs_23002_sample_info_tab_phy)
> p23020_physeq <- phyloseq(sample_IDs_23020_count_phy, sample_IDs_23020_sample_info_tab_phy)
> p20020_physeq <- phyloseq(sample_IDs_20020_count_phy, sample_IDs_20020_sample_info_tab_phy)
> Geochem21_physeq <- phyloseq(sample_IDs_Geochem21_count_phy, sample_IDs_Geochem21_sample_info_tab_phy)

> p23002_deseq <- phyloseq_to_deseq2(p23002_physeq, ~Type)
> p23020_deseq <- phyloseq_to_deseq2(p23020_physeq, ~Type)
> p20020_deseq <- phyloseq_to_deseq2(p20020_physeq, ~Type)
> Geochem21_deseq <- phyloseq_to_deseq2(Geochem21_physeq, ~Type)

> p23002_deseq <- DESeq(p23002_deseq)
> p23020_deseq <- DESeq(p23020_deseq)
> p20020_deseq <- DESeq(p20020_deseq)
> Geochem21_deseq <- DESeq(Geochem21_deseq)

Examining the results

> deseq_res_Beggiatoa_p20020 <- results(p20020_deseq, alpha=0.01, contrast=c("Type", "Beggiatoa", "Live Sediment LSB 	Removed"))
> summary(deseq_res_Beggiatoa_p20020)

Positive hits for > 0.01

> sigtab_deseq_res_Beggiatoa_p20020 <- deseq_res_Beggiatoa_p20020[which(deseq_res_Beggiatoa_p20020$padj < 0.01), ]
> summary(sigtab_deseq_res_Beggiatoa_p20020)
> sigtab_deseq_res_Beggiatoa_p20020_with_tax <- cbind(as(sigtab_deseq_res_Beggiatoa_p20020, "data.frame"), as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Beggiatoa_p20020), ], "matrix"))
> sigtab_deseq_res_Beggiatoa_p20020_with_tax[order(sigtab_deseq_res_Beggiatoa_p20020_with_tax$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Beggiatoa_p20020_with_tax, "sigtab_deseq_res_Beggiatoa_p20020_with_tax.tsv", sep="\t", quote=F, col.names=NA)

> deseq_res_Thiom_Geochem21 <- results(Geochem21_deseq, alpha=0.01, contrast=c("Type", "Thiomargarita", "Live Sediment 	LSB Removed"))
> summary(deseq_res_Thiom_Geochem21)

out of 17652 with nonzero total read count adjusted p-value < 0.01 LFC > 0 (up) : 57, 0.32% LFC < 0 (down) : 1, 0.0057% outliers [1] : 67, 0.38% low counts [2] : 15273, 87% (mean count < 7)

> sigtab_deseq_res_Thiom_Geochem21 <- deseq_res_Thiom_Geochem21[which(deseq_res_Thiom_Geochem21$padj < 0.01), ]
> summary(sigtab_deseq_res_Thiom_Geochem21)
> sigtab_deseq_res_Thiom_Geochem21_with_tax <- cbind(as(sigtab_deseq_res_Thiom_Geochem21, "data.frame"), as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Thiom_Geochem21), ], "matrix"))
> sigtab_deseq_res_Thiom_Geochem21_with_tax[order(sigtab_deseq_res_Thiom_Geochem21_with_tax$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Thiom_Geochem21_with_tax, "sigtab_deseq_res_Thiom_Geochem21_with_tax.tsv", sep="\t", quote=F, col.names=NA)


> deseq_res_Thiom_p23020_deseq <- results(p23020_deseq, alpha=0.01, contrast=c("Type", "Thiomargarita", "Live Sediment 	LSB Removed"))
> summary(deseq_res_Thiom_p23020_deseq)

out of 22661 with nonzero total read count adjusted p-value < 0.01 LFC > 0 (up) : 102, 0.45% LFC < 0 (down) : 6, 0.026% outliers [1] : 80, 0.35% low counts [2] : 16425, 72% (mean count < 3)

> sigtab_deseq_res_Thiom_p23020 <- deseq_res_Thiom_p23020_deseq[which(deseq_res_Thiom_p23020_deseq$padj < 0.01), ]
summary(sigtab_deseq_res_Thiom_p23020)
> sigtab_deseq_res_Thiom_p23020_with_tax <- cbind(as(sigtab_deseq_res_Thiom_p23020, "data.frame"), 	as(tax_table(ASV_physeq)[row.names(sigtab_deseq_res_Thiom_p23020), ], "matrix"))
>sigtab_deseq_res_Thiom_p23020_with_tax[order(sigtab_deseq_res_Thiom_p23020$baseMean, decreasing=T), ]
> write.table(sigtab_deseq_res_Thiom_p23020_with_tax, "sigtab_deseq_res_Thiom_p23020_with_tax.tsv", sep="\t", quote=F, col.names=NA)