library("BiocStyle")
library("knitr")
library("rmarkdown")
library(tidyverse)
library(dada2); packageVersion("dada2")
## [1] ‘1.16.0’
opts_chunk$set(message = FALSE, error = FALSE, warning = FALSE,
               cache = FALSE, fig.width = 8, fig.height = 5)
"data/MiSeq_SOP" # CHANGE ME to the directory containing the fastq files after unzipping.
path <-list.files(path)
## [1] "F3D0_S188_L001_R1_001.fastq" "F3D0_S188_L001_R2_001.fastq"
## [3] "F3D1_S189_L001_R1_001.fastq" "F3D1_S189_L001_R2_001.fastq"
## [5] "F3D141_S207_L001_R1_001.fastq" "F3D141_S207_L001_R2_001.fastq"
## [7] "F3D142_S208_L001_R1_001.fastq" "F3D142_S208_L001_R2_001.fastq"
## [9] "F3D143_S209_L001_R1_001.fastq" "F3D143_S209_L001_R2_001.fastq"
## [11] "F3D144_S210_L001_R1_001.fastq" "F3D144_S210_L001_R2_001.fastq"
## [13] "F3D145_S211_L001_R1_001.fastq" "F3D145_S211_L001_R2_001.fastq"
## [15] "F3D146_S212_L001_R1_001.fastq" "F3D146_S212_L001_R2_001.fastq"
## [17] "F3D147_S213_L001_R1_001.fastq" "F3D147_S213_L001_R2_001.fastq"
## [19] "F3D148_S214_L001_R1_001.fastq" "F3D148_S214_L001_R2_001.fastq"
## [21] "F3D149_S215_L001_R1_001.fastq" "F3D149_S215_L001_R2_001.fastq"
## [23] "F3D150_S216_L001_R1_001.fastq" "F3D150_S216_L001_R2_001.fastq"
## [25] "F3D2_S190_L001_R1_001.fastq" "F3D2_S190_L001_R2_001.fastq"
## [27] "F3D3_S191_L001_R1_001.fastq" "F3D3_S191_L001_R2_001.fastq"
## [29] "F3D5_S193_L001_R1_001.fastq" "F3D5_S193_L001_R2_001.fastq"
## [31] "F3D6_S194_L001_R1_001.fastq" "F3D6_S194_L001_R2_001.fastq"
## [33] "F3D7_S195_L001_R1_001.fastq" "F3D7_S195_L001_R2_001.fastq"
## [35] "F3D8_S196_L001_R1_001.fastq" "F3D8_S196_L001_R2_001.fastq"
## [37] "F3D9_S197_L001_R1_001.fastq" "F3D9_S197_L001_R2_001.fastq"
## [39] "filtered" "HMP_MOCK.v35.fasta"
## [41] "Mock_S280_L001_R1_001.fastq" "Mock_S280_L001_R2_001.fastq"
## [43] "mouse.dpw.metadata" "mouse.time.design"
## [45] "stability.batch" "stability.files"
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
plotQualityProfile(fnFs[1:2])
plotQualityProfile(fnRs[1:2])
Considerations for your own data: Your reads must still overlap after truncation in order to merge them later! The tutorial is using 2x250 V4 sequence data, so the forward and reverse reads almost completely overlap and our trimming can be completely guided by the quality scores. If you are using a less-overlapping primer set, like V1-V2 or V3-V4, your truncLen must be large enough to maintain 20 + biological.length.variation nucleotides of overlap between them.
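A quick arithmetic check can catch overlap problems before you run the filter. The numbers below are hypothetical (the assumed amplicon length, length variation, and candidate truncLen values are illustrations, not recommendations), so substitute your own amplicon geometry:

# A minimal sketch with assumed values, not recommendations.
amplicon.length <- 460            # assumed V3-V4 amplicon length after primer removal
length.variation <- 10            # assumed biological length variation
truncLen.candidate <- c(280, 220) # candidate forward/reverse truncation lengths
overlap <- sum(truncLen.candidate) - (amplicon.length + length.variation)
overlap >= 20                     # TRUE means merging should still be possible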
Filter and trim
# Place filtered files in filtered/ subdirectory
file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtFs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
filtRs <-names(filtFs) <- sample.names
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
head(out)
## reads.in reads.out
## F3D0_S188_L001_R1_001.fastq 7793 7113
## F3D1_S189_L001_R1_001.fastq 5869 5299
## F3D141_S207_L001_R1_001.fastq 5958 5463
## F3D142_S208_L001_R1_001.fastq 3183 2914
## F3D143_S209_L001_R1_001.fastq 3178 2941
## F3D144_S210_L001_R1_001.fastq 4827 4312
Considerations for your own data: The standard filtering parameters are starting points, not set in stone. If you want to speed up downstream computation, consider tightening maxEE. If too few reads are passing the filter, consider relaxing maxEE, perhaps especially on the reverse reads (eg. maxEE=c(2,5)), and reducing the truncLen to remove low quality tails. Remember though, when choosing truncLen for paired-end reads you must maintain overlap after truncation in order to merge them later.
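As a concrete illustration, a relaxed variant of the filtering call above might look like the sketch below; the looser maxEE on the reverse reads and the shorter truncLen are illustrative assumptions, not recommendations, and out.relaxed is a hypothetical name:

# Hypothetical relaxed filtering: looser reverse-read maxEE, shorter truncLen.
out.relaxed <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(230,150),
                             maxN=0, maxEE=c(2,5), truncQ=2, rm.phix=TRUE,
                             compress=TRUE, multithread=TRUE)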
Considerations for your own data: For ITS sequencing, it is usually undesirable to truncate reads to a fixed length due to the large length variation at that locus. That is OK, just leave out truncLen. See the DADA2 ITS workflow for more information.
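A minimal sketch of such a call, reusing the filename variables defined above (for real ITS data you would also remove primers first, as described in the ITS workflow; out.its is a hypothetical name):

# Sketch for length-variable loci: omit truncLen, keep quality-based filtering.
out.its <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
                         maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                         compress=TRUE, multithread=TRUE)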
Learn the Error Rates
errF <- learnErrors(filtFs, multithread=TRUE)
## 33514080 total bases in 139642 reads from 20 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=TRUE)
## 22342720 total bases in 139642 reads from 20 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)
Sample Inference
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
## Sample 1 - 7113 reads in 1979 unique sequences.
## Sample 2 - 5299 reads in 1639 unique sequences.
## Sample 3 - 5463 reads in 1477 unique sequences.
## Sample 4 - 2914 reads in 904 unique sequences.
## Sample 5 - 2941 reads in 939 unique sequences.
## Sample 6 - 4312 reads in 1267 unique sequences.
## Sample 7 - 6741 reads in 1756 unique sequences.
## Sample 8 - 4560 reads in 1438 unique sequences.
## Sample 9 - 15637 reads in 3590 unique sequences.
## Sample 10 - 11413 reads in 2762 unique sequences.
## Sample 11 - 12017 reads in 3021 unique sequences.
## Sample 12 - 5032 reads in 1566 unique sequences.
## Sample 13 - 18075 reads in 3707 unique sequences.
## Sample 14 - 6250 reads in 1479 unique sequences.
## Sample 15 - 4052 reads in 1195 unique sequences.
## Sample 16 - 7369 reads in 1832 unique sequences.
## Sample 17 - 4765 reads in 1183 unique sequences.
## Sample 18 - 4871 reads in 1382 unique sequences.
## Sample 19 - 6504 reads in 1709 unique sequences.
## Sample 20 - 4314 reads in 897 unique sequences.
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
## Sample 1 - 7113 reads in 1660 unique sequences.
## Sample 2 - 5299 reads in 1349 unique sequences.
## Sample 3 - 5463 reads in 1335 unique sequences.
## Sample 4 - 2914 reads in 853 unique sequences.
## Sample 5 - 2941 reads in 880 unique sequences.
## Sample 6 - 4312 reads in 1286 unique sequences.
## Sample 7 - 6741 reads in 1803 unique sequences.
## Sample 8 - 4560 reads in 1265 unique sequences.
## Sample 9 - 15637 reads in 3414 unique sequences.
## Sample 10 - 11413 reads in 2522 unique sequences.
## Sample 11 - 12017 reads in 2771 unique sequences.
## Sample 12 - 5032 reads in 1415 unique sequences.
## Sample 13 - 18075 reads in 3290 unique sequences.
## Sample 14 - 6250 reads in 1390 unique sequences.
## Sample 15 - 4052 reads in 1134 unique sequences.
## Sample 16 - 7369 reads in 1635 unique sequences.
## Sample 17 - 4765 reads in 1084 unique sequences.
## Sample 18 - 4871 reads in 1161 unique sequences.
## Sample 19 - 6504 reads in 1502 unique sequences.
## Sample 20 - 4314 reads in 732 unique sequences.
dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 128 sequence variants were inferred from 1979 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
Merge paired reads
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
## sequence
## 1 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG
## 2 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG
## 3 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG
## 4 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG
## 5 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG
## 6 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGATGCCAAGTCAGCGGTAAAAAAGCGGTGCTCAACGCCGTCGAGCCGTTGAAACTGGCGTTCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG
## abundance forward reverse nmatch nmismatch nindel prefer accept
## 1 579 1 1 148 0 0 1 TRUE
## 2 470 2 2 148 0 0 2 TRUE
## 3 449 3 4 148 0 0 1 TRUE
## 4 430 4 3 148 0 0 2 TRUE
## 5 345 5 6 148 0 0 1 TRUE
## 6 282 6 5 148 0 0 2 TRUE
Considerations for your own data: Most of your reads should successfully merge. If that is not the case, upstream parameters may need to be revisited: Did you trim away the overlap between your reads?
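One quick way to check is to compute the fraction of denoised reads that merged in each sample, using objects already defined above (merged.frac is a hypothetical name):

# Fraction of denoised forward reads that successfully merged, per sample.
merged.frac <- sapply(mergers, function(df) sum(df$abundance)) /
  sapply(dadaFs, function(x) sum(getUniques(x)))
summary(merged.frac)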
Construct sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1] 20 293
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
##
## 251 252 253 254 255
## 1 88 196 6 2
Considerations for your own data: Sequences that are much longer or shorter than expected may be the result of non-specific priming. You can remove non-target-length sequences from your sequence table (eg. seqtab2 <- seqtab[,nchar(colnames(seqtab)) %in% 250:256]). This is analogous to “cutting a band” in-silico to get amplicons of the targeted length.
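Written out as runnable code (the 250:256 window is the example range from the text; adjust it to your own amplicon):

# In-silico "band cut": keep only sequences of the expected lengths.
seqtab2 <- seqtab[,nchar(colnames(seqtab)) %in% 250:256]
table(nchar(getSequences(seqtab2)))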
Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
## [1] 20 232
sum(seqtab.nochim)/sum(seqtab)
## [1] 0.964263
Track reads through the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
## input filtered denoisedF denoisedR merged nonchim
## F3D0 7793 7113 6996 6978 6551 6539
## F3D1 5869 5299 5227 5239 5025 5014
## F3D141 5958 5463 5339 5351 4973 4850
## F3D142 3183 2914 2799 2833 2595 2521
## F3D143 3178 2941 2822 2868 2553 2519
## F3D144 4827 4312 4146 4224 3622 3483
Considerations for your own data: This is a great place to do a last sanity check. Outside of filtering, there should be no step in which a majority of reads are lost. If a majority of reads failed to merge, you may need to revisit the truncLen parameter used in the filtering step and make sure that the truncated reads span your amplicon. If a majority of reads were removed as chimeric, you may need to revisit the removal of primers, as the ambiguous nucleotides in unremoved primers interfere with chimera identification.
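A small extension of the table above makes such losses easier to spot (a sketch; track.frac is a hypothetical name):

# Each pipeline step as a fraction of input reads; watch for steps below ~0.5.
track.frac <- round(track / track[,"input"], 2)
head(track.frac)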
Assign taxonomy
taxa <- assignTaxonomy(seqtab.nochim, "data/silva_nr_v132_train_set.fa.gz", multithread=TRUE)
Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.
taxa <- addSpecies(taxa, "~/tax/silva_species_assignment_v132.fa.gz")
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
## Kingdom Phylum Class Order Family
## [1,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [2,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [3,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [4,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## [5,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Bacteroidaceae"
## [6,] "Bacteria" "Bacteroidetes" "Bacteroidia" "Bacteroidales" "Muribaculaceae"
## Genus
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] "Bacteroides"
## [6,] NA
Considerations for your own data: If your reads do not seem to be appropriately assigned, for example lots of your bacterial 16S sequences are being assigned as Eukaryota NA NA NA NA NA, your reads may be in the opposite orientation to the reference database. Tell dada2 to try the reverse-complement orientation with assignTaxonomy(…, tryRC=TRUE) and see if this fixes the assignments. If using DECIPHER for taxonomy, try IdTaxa(…, strand="both").
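For example, the earlier assignTaxonomy call could be retried with the orientation flag added (a sketch; taxa.rc is a hypothetical name):

# Retry taxonomy assignment, also trying the reverse-complement orientation.
taxa.rc <- assignTaxonomy(seqtab.nochim, "data/silva_nr_v132_train_set.fa.gz",
                          multithread=TRUE, tryRC=TRUE)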
Evaluate accuracy
seqtab.nochim["Mock",]
unqs.mock <- sort(unqs.mock[unqs.mock>0], decreasing=TRUE) # Drop ASVs absent in the Mock
unqs.mock <-cat("DADA2 inferred", length(unqs.mock), "sample sequences present in the Mock community.\n")
## DADA2 inferred 20 sample sequences present in the Mock community.
mock.ref <- getSequences(file.path(path, "HMP_MOCK.v35.fasta"))
match.ref <- sum(sapply(names(unqs.mock), function(x) any(grepl(x, mock.ref))))
cat("Of those,", sum(match.ref), "were exact matches to the expected reference sequences.\n")
## Of those, 20 were exact matches to the expected reference sequences.
Bonus: Handoff to phyloseq
library(phyloseq); packageVersion("phyloseq")
## [1] ‘1.32.0’
library(Biostrings); packageVersion("Biostrings")
## [1] ‘2.56.0’
theme_set(theme_bw())
samples.out <- rownames(seqtab.nochim)
subject <- sapply(strsplit(samples.out, "D"), `[`, 1)
gender <- substr(subject,1,1)
subject <- substr(subject,2,999)
day <- as.integer(sapply(strsplit(samples.out, "D"), `[`, 2))
samdf <- data.frame(Subject=subject, Gender=gender, Day=day)
samdf$When <- "Early"
samdf$When[samdf$Day>100] <- "Late"
rownames(samdf) <- samples.out
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
               sample_data(samdf),
               tax_table(taxa))
ps <- prune_samples(sample_names(ps) != "Mock", ps) # Remove mock sample
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 232 taxa and 19 samples ]
## sample_data() Sample Data: [ 19 samples by 4 sample variables ]
## tax_table() Taxonomy Table: [ 232 taxa by 6 taxonomic ranks ]
## refseq() DNAStringSet: [ 232 reference sequences ]
Visualize alpha-diversity:
plot_richness(ps, x="Day", measures=c("Shannon", "Simpson"), color="When")
Ordinate:
# Transform data to proportions as appropriate for Bray-Curtis distances
ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
ord.nmds.bray <- ordinate(ps.prop, method="NMDS", distance="bray")
## Run 0 stress 0.08574537
## Run 1 stress 0.1343832
## Run 2 stress 0.08574537
## ... Procrustes: rmse 2.415046e-06 max resid 5.154431e-06
## ... Similar to previous best
## Run 3 stress 0.08002299
## ... New best solution
## ... Procrustes: rmse 0.04283673 max resid 0.1433523
## Run 4 stress 0.09421601
## Run 5 stress 0.08942866
## Run 6 stress 0.08942865
## Run 7 stress 0.08002299
## ... Procrustes: rmse 3.787492e-06 max resid 1.047028e-05
## ... Similar to previous best
## Run 8 stress 0.09421602
## Run 9 stress 0.08574537
## Run 10 stress 0.0894288
## Run 11 stress 0.08002299
## ... New best solution
## ... Procrustes: rmse 2.532727e-06 max resid 7.479276e-06
## ... Similar to previous best
## Run 12 stress 0.1233096
## Run 13 stress 0.1331726
## Run 14 stress 0.1216669
## Run 15 stress 0.1216669
## Run 16 stress 0.09421602
## Run 17 stress 0.08574537
## Run 18 stress 0.1216669
## Run 19 stress 0.08002299
## ... New best solution
## ... Procrustes: rmse 1.186438e-06 max resid 2.248004e-06
## ... Similar to previous best
## Run 20 stress 0.08574537
## *** Solution reached
plot_ordination(ps.prop, ord.nmds.bray, color="When", title="Bray NMDS")
top20 <- names(sort(taxa_sums(ps), decreasing=TRUE))[1:20]
ps.top20 <- transform_sample_counts(ps, function(OTU) OTU/sum(OTU))
ps.top20 <- prune_taxa(top20, ps.top20)
plot_bar(ps.top20, x="Day", fill="Family") + facet_wrap(~When, scales="free_x")