Skip to content
22 changes: 16 additions & 6 deletions q2_dada2/_denoise.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def _check_featureless_table(fp):
'homopolymer_gap_penalty': _SKIP,
'band_size': _SKIP,
'retain_all_samples': _BOOL,
'retain_unmerged': _BOOL,
'front': _SKIP,
'adapter': _SKIP,
'indels': _SKIP,
Expand Down Expand Up @@ -106,7 +107,7 @@ def _filepath_to_sample_paired(fp):

def _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=False):
paired=False, retain_unmerged=False):

_check_featureless_table(biom_fp)
with open(biom_fp) as fh:
Expand Down Expand Up @@ -182,18 +183,24 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
# reintroduced above!
if not retain_all_samples:
table = table.remove_empty(axis="sample", inplace=False)

def _to_sequence(sequence, metadata):
if retain_unmerged:
return skbio.Sequence(sequence, metadata=metadata)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am 99% sure this will still work even with the new LinkedDNA class.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean the __init__ signature?

return skbio.DNA(sequence, metadata=metadata)

# The feature IDs in DADA2 are the sequences themselves.
if hashed_feature_ids:
# Make feature IDs the md5 sums of the sequences.
fid_map = {id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
for id_ in table.ids(axis='observation')}
table.update_ids(fid_map, axis='observation', inplace=True)

rep_sequences = DNAIterator((skbio.DNA(k, metadata={'id': v})
rep_sequences = DNAIterator((_to_sequence(k, metadata={'id': v})
for k, v in fid_map.items()))
else:
rep_sequences = DNAIterator(
(skbio.DNA(id_, metadata={'id': id_})
(_to_sequence(id_, metadata={'id': id_})
for id_ in table.ids(axis='observation')))

# initalize and populate DADA2 diagnoistic Stats dictionary
Expand Down Expand Up @@ -300,7 +307,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True,
retain_all_samples: bool = True
retain_all_samples: bool = True,
retain_unmerged: bool = False
) -> (biom.Table, DNAIterator,
qiime2.Metadata, qiime2.Metadata):
_check_inputs(**locals())
Expand Down Expand Up @@ -357,7 +365,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
'--min_parental_fold', str(min_fold_parent_over_abundance),
'--allow_one_off', str(allow_one_off),
'--num_threads', str(n_threads),
'--learn_min_reads', str(n_reads_learn)]
'--learn_min_reads', str(n_reads_learn),
'--retain_unmerged', str(retain_unmerged)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
Expand All @@ -378,7 +387,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,

return _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=True)
paired=True,
retain_unmerged=retain_unmerged)


def _remove_barcode(filename):
Expand Down
86 changes: 78 additions & 8 deletions q2_dada2/assets/run_dada.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ option_list = list(
help="The number of threads to use"),
make_option(c("--learn_min_reads"), action="store", default='NULL', type='character',
help="The minimum number of reads to learn the error model from"),
make_option(c("--retain_unmerged"), action="store", default='FALSE', type='character',
help="If TRUE, denoised paired reads that fail merging are retained as space-concatenated sequences."),
make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character',
help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."),
make_option(c("--band_size"), action="store", default='NULL', type='character',
Expand Down Expand Up @@ -244,6 +246,8 @@ minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_
allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
retain.unmerged <- if(opt$retain_unmerged=='NULL') FALSE else as.logical(opt$retain_unmerged)
linked.concat.delim <- "NNNNNNNNNN"
# The following args are not directly exposed to end users in q2-dada2,
# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
if (opt$homopolymer_gap_penalty=='NULL'){
Expand Down Expand Up @@ -309,7 +313,7 @@ cat("DADA2:", as.character(packageVersion("dada2")), "/",
"RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")

### Helper Functions ###
#function to approximate melt function from reshape2 which is not a dependency
#function to approximate melt function from reshape2 which is not a dependency
melter<-function(df){
df<-as.data.frame(df)
melted_df<-data.frame(Var1 = character(), Var2 = numeric(), value = numeric(), stringsAsFactors = TRUE)
Expand All @@ -336,9 +340,9 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
if(!(all(nti %in% ACGT) && all(ntj %in% ACGT)) || any(duplicated(nti)) || any(duplicated(ntj))) {
stop("nti and ntj must be nucleotide(s): A/C/G/T.")
}

dq <- getErrors(dq, detailed=TRUE, enforce=FALSE)

if(!is.null(dq$trans)) {
if(ncol(dq$trans) <= 1) {
stop("plotErrors only supported when using quality scores in the error model (i.e. USE_QUALS=TRUE).")
Expand All @@ -356,7 +360,7 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
}
transdf$from <- substr(transdf$Transition, 1, 1)
transdf$to <- substr(transdf$Transition, 3, 3)

if(!is.null(dq$trans)) {
tot.count <- tapply(transdf$count, list(transdf$from, transdf$Qual), sum)
transdf$tot <- mapply(function(x,y) tot.count[x,y], transdf$from, as.character(transdf$Qual))
Expand Down Expand Up @@ -502,9 +506,11 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
seqtab <- makeSequenceTable(dds)
}else{#for paired read analysis
denoisedF <- rep(0, length(filts))
mergedF <- rep(0, length(filts))
ddsF <- vector("list", length(filts))
ddsR <- vector("list", length(filts))
mergers <- vector("list", length(filts))
unmerged <- vector("list", length(filts))
cat("3) Denoise samples ")

for(j in seq(length(filts))) {
Expand Down Expand Up @@ -545,12 +551,49 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
for(j in seq(length(filts))) {
drpF <- derepFastq(filts[[j]])
drpR <- derepFastq(filtsR[[j]])
mergers[[j]] <- mergePairs(
# we are intentionally not using `justConcatenate = TRUE` here; that sets
# the "accept" column to TRUE for all pairs in mergePairs and would collapse
# true merged and rescued unmerged reads into the same "merged" count
mp <- mergePairs(
ddsF[[j]], drpF, ddsR[[j]], drpR,
minOverlap=minOverlap,
maxMismatch=maxMergeMismatch,
trimOverhang=trimOverhang
trimOverhang=trimOverhang,
returnRejects=retain.unmerged
)
if(retain.unmerged){
mergedF[j] <- sum(mp[mp$accept, "abundance"])
mergers[[j]] <- mp[mp$accept, c("sequence", "abundance")]
unmerged.j <- mp[!mp$accept, c("forward", "reverse", "abundance")]
if(nrow(unmerged.j) > 0){
# `mergePairs` returns cluster indices in the "forward" and "reverse"
# columns, so we resolve these to denoised forward and
# reverse-complemented reverse sequences before concatenation
unmerged.forward <- as.character(
ddsF[[j]]$clustering$sequence[unmerged.j$forward]
)
unmerged.reverse <- as.character(
rc(ddsR[[j]]$clustering$sequence[unmerged.j$reverse])
)

# manually reconstruct dada2's "N" * 10 separator so that sequences
# are recognized if chimera filtering is performed; this separator
# is later converted to a single space
unmerged[[j]] <- data.frame(
sequence=paste(
unmerged.forward, unmerged.reverse,
sep=linked.concat.delim
),
abundance=unmerged.j$abundance,
stringsAsFactors=FALSE
)
Comment thread
colinvwood marked this conversation as resolved.
}else{
unmerged[[j]] <- data.frame(sequence=character(), abundance=numeric())
}
}else{
mergers[[j]] <- mp
mergedF[j] <- sum(mp[,"abundance"])
}
denoisedF[[j]] <- getN(ddsF[[j]])
cat(".")
}
Expand All @@ -560,15 +603,42 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis

}

# combine the merged and unmerged/concatenated feature tables
if(inp.dirR !='NULL' && retain.unmerged){
Comment thread
colinvwood marked this conversation as resolved.
Outdated
Comment thread
colinvwood marked this conversation as resolved.
Outdated
unmerged.any <- any(vapply(unmerged, nrow, integer(1)) > 0)
if(unmerged.any){
seqtab.unmerged <- makeSequenceTable(unmerged)
unmerged.ids <- colnames(seqtab.unmerged)
if(ncol(seqtab) > 0){
seqtab <- cbind(seqtab, seqtab.unmerged)
}else{
seqtab <- seqtab.unmerged
}
}else{
unmerged.ids <- character()
}
}else{
unmerged.ids <- character()
}

### Remove chimeras
cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
if(chimeraMethod %in% c("pooled", "consensus")) {
if(chimeraMethod %in% c("pooled", "consensus") && ncol(seqtab) > 0) {
Comment thread
colinvwood marked this conversation as resolved.
seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread)
} else { # No chimera removal, copy seqtab to seqtab.nochim
seqtab.nochim <- seqtab
}

# after chimera filtering, convert any retained concatenated IDs to the
# single space-delimited representation
if(length(unmerged.ids) > 0){
unmerged.keep <- intersect(colnames(seqtab.nochim), unmerged.ids)
if(length(unmerged.keep) > 0){
Comment thread
colinvwood marked this conversation as resolved.
colnames(seqtab.nochim)[match(unmerged.keep, colnames(seqtab.nochim))] <-
gsub(linked.concat.delim, " ", unmerged.keep, fixed=TRUE)
}
}

### REPORT READ FRACTIONS THROUGH PIPELINE ###
cat("6) Report read numbers through the pipeline\n")
if(inp.dirR =='NULL'){
Expand All @@ -591,7 +661,7 @@ if(inp.dirR =='NULL'){
colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
passed.filtering <- track[,"filtered"] > 0
track[passed.filtering,"denoised"] <- denoisedF
track[passed.filtering,"merged"] <- rowSums(seqtab)
track[passed.filtering,"merged"] <- mergedF
track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
quote=FALSE)
Expand Down
33 changes: 24 additions & 9 deletions q2_dada2/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from q2_types.per_sample_sequences import (
SequencesWithQuality, PairedEndSequencesWithQuality)
from q2_types.sample_data import SampleData
from q2_types.feature_data import FeatureData, Sequence
from q2_types.feature_data import FeatureData, Sequence, LinkedSequence
from q2_types.feature_table import FeatureTable, Frequency

import q2_dada2
Expand All @@ -24,6 +24,12 @@

_POOL_OPT = {'pseudo', 'independent'}
_CHIM_OPT = {'consensus', 'none'}
P_retain_unmerged, T_paired_representative_sequences = qiime2.plugin.TypeMap({
qiime2.plugin.Bool % qiime2.plugin.Choices(True):
Comment thread
colinvwood marked this conversation as resolved.
Outdated
FeatureData[LinkedSequence],
qiime2.plugin.Bool % qiime2.plugin.Choices(False):
Comment thread
colinvwood marked this conversation as resolved.
Outdated
FeatureData[Sequence],
})

citations = qiime2.plugin.Citations.load('citations.bib', package='q2_dada2')
plugin = qiime2.plugin.Plugin(
Expand Down Expand Up @@ -180,9 +186,10 @@
'n_threads': qiime2.plugin.Threads,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool,
'retain_all_samples': qiime2.plugin.Bool},
'retain_all_samples': qiime2.plugin.Bool,
'retain_unmerged': P_retain_unmerged},
Comment thread
colinvwood marked this conversation as resolved.
Outdated
outputs=[('table', FeatureTable[Frequency]),
('representative_sequences', FeatureData[Sequence]),
('representative_sequences', T_paired_representative_sequences),
('denoising_stats', SampleData[DADA2Stats]),
('base_transition_stats', DADA2BaseTransitionStats)],
input_descriptions={
Expand Down Expand Up @@ -283,15 +290,23 @@
'retain_all_samples': 'If True all samples input to dada2 will be '
'retained in the output of dada2, if false '
'samples with zero total frequency are removed '
'from the table.'
'from the table.',
'retain_unmerged': (
'If True, denoised paired reads that fail merging are retained by '
'encoding each pair as `forward_read<space>reverse_read` and '
'including these features in the table and sequences. Note that '
'the reverse read is reverse-complemented and thus both read '
'directions can be expected to map to the same strand.'
)
},
output_descriptions={
'table': 'The resulting feature table.',
'representative_sequences': ('The resulting feature sequences. Each '
'feature in the feature table will be '
'represented by exactly one sequence, '
'and these sequences will be the joined '
'paired-end sequences.'),
'representative_sequences': (
'The resulting feature sequences. Each feature in the feature '
'table will be represented by exactly one sequence, and these '
'sequences will be the joined paired-end sequences (and retained '
'unmerged pairs if `retain_unmerged` is enabled).'
),
'denoising_stats': DENOISING_STATS_DESCRIPTION,
'base_transition_stats': BASE_TRANSITION_STATS_DESCRIPTION,
},
Expand Down
Loading
Loading