Skip to content
41 changes: 35 additions & 6 deletions q2_dada2/_denoise.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def _check_featureless_table(fp):
'homopolymer_gap_penalty': _SKIP,
'band_size': _SKIP,
'retain_all_samples': _BOOL,
'retain_unmerged': _BOOL,
'front': _SKIP,
'adapter': _SKIP,
'indels': _SKIP,
Expand Down Expand Up @@ -106,7 +107,7 @@ def _filepath_to_sample_paired(fp):

def _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=False):
paired=False, retain_unmerged=False):

_check_featureless_table(biom_fp)
with open(biom_fp) as fh:
Expand All @@ -125,6 +126,9 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,

PASSED_FILTER = 'percentage of input passed filter'
NON_CHIMERIC = 'percentage of input non-chimeric'
CONCATENATED = 'percentage of input concatenated'
NON_CHIMERIC_CONCATENATED = (
'percentage of input non-chimeric concatenated')

round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}

Expand All @@ -142,6 +146,22 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
col_order.insert(4, 'merged')
col_order.insert(5, MERGED)

# only calculate percentage of input concatenated if unmerged read pairs
# were retained
if 'concatenated' in df:
round_cols[CONCATENATED] = 2
df[CONCATENATED] = df['concatenated'] / df['input'] * 100
insert_at = col_order.index('non-chimeric')
col_order.insert(insert_at, 'concatenated')
col_order.insert(insert_at + 1, CONCATENATED)

if 'non-chimeric concatenated' in df:
round_cols[NON_CHIMERIC_CONCATENATED] = 2
df[NON_CHIMERIC_CONCATENATED] = (
df['non-chimeric concatenated'] / df['input'] * 100)
col_order.append('non-chimeric concatenated')
col_order.append(NON_CHIMERIC_CONCATENATED)

# only calculate percentage of input primer-removed if ccs
if 'primer-removed' in df:
PASSED_PRIMERREMOVE = 'percentage of input primer-removed'
Expand Down Expand Up @@ -182,18 +202,24 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
# reintroduced above!
if not retain_all_samples:
table = table.remove_empty(axis="sample", inplace=False)

def _to_sequence(sequence, metadata):
if retain_unmerged:
return skbio.Sequence(sequence, metadata=metadata)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am 99% sure this will still work even with the new LinkedDNA class.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean the __init__ signature?

return skbio.DNA(sequence, metadata=metadata)

# The feature IDs in DADA2 are the sequences themselves.
if hashed_feature_ids:
# Make feature IDs the md5 sums of the sequences.
fid_map = {id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
for id_ in table.ids(axis='observation')}
table.update_ids(fid_map, axis='observation', inplace=True)

rep_sequences = DNAIterator((skbio.DNA(k, metadata={'id': v})
rep_sequences = DNAIterator((_to_sequence(k, metadata={'id': v})
for k, v in fid_map.items()))
else:
rep_sequences = DNAIterator(
(skbio.DNA(id_, metadata={'id': id_})
(_to_sequence(id_, metadata={'id': id_})
for id_ in table.ids(axis='observation')))

# initalize and populate DADA2 diagnoistic Stats dictionary
Expand Down Expand Up @@ -300,7 +326,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True,
retain_all_samples: bool = True
retain_all_samples: bool = True,
retain_unmerged: bool = False
) -> (biom.Table, DNAIterator,
qiime2.Metadata, qiime2.Metadata):
_check_inputs(**locals())
Expand Down Expand Up @@ -357,7 +384,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
'--min_parental_fold', str(min_fold_parent_over_abundance),
'--allow_one_off', str(allow_one_off),
'--num_threads', str(n_threads),
'--learn_min_reads', str(n_reads_learn)]
'--learn_min_reads', str(n_reads_learn),
'--retain_unmerged', str(retain_unmerged)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
Expand All @@ -378,7 +406,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,

return _denoise_helper(biom_fp, track_fp, err_track_fp,
hashed_feature_ids, retain_all_samples,
paired=True)
paired=True,
retain_unmerged=retain_unmerged)


def _remove_barcode(filename):
Expand Down
117 changes: 106 additions & 11 deletions q2_dada2/assets/run_dada.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ option_list = list(
help="The number of threads to use"),
make_option(c("--learn_min_reads"), action="store", default='NULL', type='character',
help="The minimum number of reads to learn the error model from"),
make_option(c("--retain_unmerged"), action="store", default='FALSE', type='character',
help="If TRUE, denoised paired reads that fail merging are retained as space-concatenated sequences."),
make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character',
help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."),
make_option(c("--band_size"), action="store", default='NULL', type='character',
Expand Down Expand Up @@ -244,6 +246,8 @@ minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_
allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
retain.unmerged <- if(opt$retain_unmerged=='NULL') FALSE else as.logical(opt$retain_unmerged)
linked.concat.delim <- "NNNNNNNNNN"
# The following args are not directly exposed to end users in q2-dada2,
# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
if (opt$homopolymer_gap_penalty=='NULL'){
Expand Down Expand Up @@ -309,7 +313,7 @@ cat("DADA2:", as.character(packageVersion("dada2")), "/",
"RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")

### Helper Functions ###
#function to approximate melt function from reshape2 which is not a dependency
#function to approximate melt function from reshape2 which is not a dependency
melter<-function(df){
df<-as.data.frame(df)
melted_df<-data.frame(Var1 = character(), Var2 = numeric(), value = numeric(), stringsAsFactors = TRUE)
Expand All @@ -336,9 +340,9 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
if(!(all(nti %in% ACGT) && all(ntj %in% ACGT)) || any(duplicated(nti)) || any(duplicated(ntj))) {
stop("nti and ntj must be nucleotide(s): A/C/G/T.")
}

dq <- getErrors(dq, detailed=TRUE, enforce=FALSE)

if(!is.null(dq$trans)) {
if(ncol(dq$trans) <= 1) {
stop("plotErrors only supported when using quality scores in the error model (i.e. USE_QUALS=TRUE).")
Expand All @@ -356,7 +360,7 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
}
transdf$from <- substr(transdf$Transition, 1, 1)
transdf$to <- substr(transdf$Transition, 3, 3)

if(!is.null(dq$trans)) {
tot.count <- tapply(transdf$count, list(transdf$from, transdf$Qual), sum)
transdf$tot <- mapply(function(x,y) tot.count[x,y], transdf$from, as.character(transdf$Qual))
Expand Down Expand Up @@ -466,6 +470,8 @@ if(primer.removed.dir!='NULL'){#for CCS read analysis

### PROCESS ALL SAMPLES ###
# Loop over rest in streaming fashion with learned error rates
unmerged.id.map <- data.frame(
temporary=character(), linked=character(), stringsAsFactors=FALSE)

if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
dds <- vector("list", length(filts))
Expand Down Expand Up @@ -502,6 +508,9 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
seqtab <- makeSequenceTable(dds)
}else{#for paired read analysis
denoisedF <- rep(0, length(filts))
mergedF <- rep(0, length(filts))
concatenatedF <- rep(0, length(filts))
nonchimConcatenatedF <- rep(0, length(filts))
ddsF <- vector("list", length(filts))
ddsR <- vector("list", length(filts))
mergers <- vector("list", length(filts))
Expand Down Expand Up @@ -545,12 +554,64 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
for(j in seq(length(filts))) {
drpF <- derepFastq(filts[[j]])
drpR <- derepFastq(filtsR[[j]])
mergers[[j]] <- mergePairs(
# we are intentionally not using `justConcatenate = TRUE` here; that sets
# the "accept" column to TRUE for all pairs in mergePairs and would collapse
# true merged and rescued unmerged reads into the same "merged" count
mp <- mergePairs(
ddsF[[j]], drpF, ddsR[[j]], drpR,
minOverlap=minOverlap,
maxMismatch=maxMergeMismatch,
trimOverhang=trimOverhang
trimOverhang=trimOverhang,
returnRejects=retain.unmerged
)
if(retain.unmerged){
mergedF[j] <- sum(mp[mp$accept, "abundance"])
mergers[[j]] <- mp[mp$accept, c("sequence", "abundance")]
unmerged.j <- mp[!mp$accept, c("forward", "reverse", "abundance")]
concatenatedF[j] <- sum(unmerged.j[,"abundance"])
if(nrow(unmerged.j) > 0){
# `mergePairs` returns cluster indices in the "forward" and "reverse"
# columns, so we resolve these to denoised forward and
# reverse-complemented reverse sequences before concatenation
unmerged.forward <- as.character(
ddsF[[j]]$clustering$sequence[unmerged.j$forward]
)
unmerged.reverse <- as.character(
rc(ddsR[[j]]$clustering$sequence[unmerged.j$reverse])
)

# manually reconstruct dada2's "N" * 10 separator so that sequences
# are recognized if chimera filtering is performed; this separator
# is later converted to a single space
unmerged.temp.seqs <- paste(
unmerged.forward, unmerged.reverse,
sep=linked.concat.delim
)
unmerged.linked.seqs <- paste(
unmerged.forward, unmerged.reverse,
sep=" "
)
unmerged.id.map <- rbind(
unmerged.id.map,
data.frame(
temporary=unmerged.temp.seqs,
linked=unmerged.linked.seqs,
stringsAsFactors=FALSE
)
)
mergers[[j]] <- rbind(
mergers[[j]],
data.frame(
sequence=unmerged.temp.seqs,
abundance=unmerged.j$abundance,
stringsAsFactors=FALSE
)
)
Comment thread
colinvwood marked this conversation as resolved.
}
}else{
mergers[[j]] <- mp
mergedF[j] <- sum(mp[,"abundance"])
}
denoisedF[[j]] <- getN(ddsF[[j]])
cat(".")
}
Expand All @@ -560,15 +621,36 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis

}


### Remove chimeras
cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
if(chimeraMethod %in% c("pooled", "consensus")) {
if(chimeraMethod %in% c("pooled", "consensus") && ncol(seqtab) > 0) {
Comment thread
colinvwood marked this conversation as resolved.
seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread)
} else { # No chimera removal, copy seqtab to seqtab.nochim
seqtab.nochim <- seqtab
}

# after chimera filtering, convert retained concatenated IDs to the single
# space-delimited representation using the exact joins we introduced above
if(nrow(unmerged.id.map) > 0){
unmerged.id.map <- unique(unmerged.id.map)
ambiguous.ids <- unmerged.id.map$temporary[
duplicated(unmerged.id.map$temporary) |
duplicated(unmerged.id.map$temporary, fromLast=TRUE)
]
if(length(ambiguous.ids) > 0){
errQuit("Unable to uniquely map retained unmerged sequences from the temporary DADA2-compatible representation to linked sequences.", status=1)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am quite confused how this would be possible. Can you explain a little more in a comment?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Say we have two linked sequences like so:

>seq1
ACN TG
>seq2
AC NTG

The the unmerged.id.map becomes (imagine we're using a single N separator instead of ten):

ACNNTG -> ACN TG
ACNNTG -> AC NTG

Now it's ambiguous what ACNNTG maps to.

}

unmerged.keep <- intersect(colnames(seqtab.nochim), unmerged.id.map$temporary)
if(length(unmerged.keep) > 0){
Comment thread
colinvwood marked this conversation as resolved.
nonchimConcatenatedF <- rowSums(
seqtab.nochim[, unmerged.keep, drop=FALSE]
)
colnames(seqtab.nochim)[match(unmerged.keep, colnames(seqtab.nochim))] <-
unmerged.id.map$linked[match(unmerged.keep, unmerged.id.map$temporary)]
}
}

### REPORT READ FRACTIONS THROUGH PIPELINE ###
cat("6) Report read numbers through the pipeline\n")
if(inp.dirR =='NULL'){
Expand All @@ -587,11 +669,24 @@ if(inp.dirR =='NULL'){
quote=FALSE)
}else{#for paired end reads
# Handle edge cases: Samples lost in filtering; One sample
track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
if(retain.unmerged){
track <- cbind(out, matrix(0, nrow=nrow(out), ncol=5))
colnames(track) <- c("input", "filtered", "denoised", "merged",
"concatenated", "non-chimeric",
"non-chimeric concatenated")
}else{
track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
colnames(track) <- c("input", "filtered", "denoised", "merged",
"non-chimeric")
}
passed.filtering <- track[,"filtered"] > 0
track[passed.filtering,"denoised"] <- denoisedF
track[passed.filtering,"merged"] <- rowSums(seqtab)
track[passed.filtering,"merged"] <- mergedF
if(retain.unmerged){
track[passed.filtering,"concatenated"] <- concatenatedF
track[passed.filtering,"non-chimeric concatenated"] <-
nonchimConcatenatedF
}
track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
quote=FALSE)
Expand Down
33 changes: 24 additions & 9 deletions q2_dada2/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from q2_types.per_sample_sequences import (
SequencesWithQuality, PairedEndSequencesWithQuality)
from q2_types.sample_data import SampleData
from q2_types.feature_data import FeatureData, Sequence
from q2_types.feature_data import FeatureData, Sequence, LinkedSequence
from q2_types.feature_table import FeatureTable, Frequency

import q2_dada2
Expand All @@ -24,6 +24,12 @@

_POOL_OPT = {'pseudo', 'independent'}
_CHIM_OPT = {'consensus', 'none'}
P_retain_unmerged, T_paired_representative_sequences = qiime2.plugin.TypeMap({
qiime2.plugin.Choices(True):
FeatureData[LinkedSequence],
qiime2.plugin.Choices(False):
FeatureData[Sequence],
})

citations = qiime2.plugin.Citations.load('citations.bib', package='q2_dada2')
plugin = qiime2.plugin.Plugin(
Expand Down Expand Up @@ -181,9 +187,10 @@
'n_threads': qiime2.plugin.Threads,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool,
'retain_all_samples': qiime2.plugin.Bool},
'retain_all_samples': qiime2.plugin.Bool,
'retain_unmerged': qiime2.plugin.Bool % P_retain_unmerged},
outputs=[('table', FeatureTable[Frequency]),
('representative_sequences', FeatureData[Sequence]),
('representative_sequences', T_paired_representative_sequences),
('denoising_stats', SampleData[DADA2Stats]),
('base_transition_stats', DADA2BaseTransitionStats)],
input_descriptions={
Expand Down Expand Up @@ -285,15 +292,23 @@
'retain_all_samples': 'If True all samples input to dada2 will be '
'retained in the output of dada2, if false '
'samples with zero total frequency are removed '
'from the table.'
'from the table.',
'retain_unmerged': (
'If True, denoised paired reads that fail merging are retained by '
'encoding each pair as `forward_read<space>reverse_read` and '
'including these features in the table and sequences. Note that '
'the reverse read is reverse-complemented and thus both read '
'directions can be expected to map to the same strand.'
)
},
output_descriptions={
'table': 'The resulting feature table.',
'representative_sequences': ('The resulting feature sequences. Each '
'feature in the feature table will be '
'represented by exactly one sequence, '
'and these sequences will be the joined '
'paired-end sequences.'),
'representative_sequences': (
'The resulting feature sequences. Each feature in the feature '
'table will be represented by exactly one sequence, and these '
'sequences will be the joined paired-end sequences (and retained '
'unmerged pairs if `retain_unmerged` is enabled).'
),
'denoising_stats': DENOISING_STATS_DESCRIPTION,
'base_transition_stats': BASE_TRANSITION_STATS_DESCRIPTION,
},
Expand Down
Loading
Loading