qiime2 · colinvwood · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Jun 11, 2026
diff --git a/q2_dada2/_denoise.py b/q2_dada2/_denoise.py
@@ -76,6 +76,7 @@ def _check_featureless_table(fp):
     'homopolymer_gap_penalty': _SKIP,
     'band_size': _SKIP,
     'retain_all_samples': _BOOL,
+    'retain_unmerged': _BOOL,
     'front': _SKIP,
     'adapter': _SKIP,
     'indels': _SKIP,
@@ -106,7 +107,7 @@ def _filepath_to_sample_paired(fp):
 
 def _denoise_helper(biom_fp, track_fp, err_track_fp,
                     hashed_feature_ids, retain_all_samples,
-                    paired=False):
+                    paired=False, retain_unmerged=False):
 
     _check_featureless_table(biom_fp)
     with open(biom_fp) as fh:
@@ -125,6 +126,9 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
 
     PASSED_FILTER = 'percentage of input passed filter'
     NON_CHIMERIC = 'percentage of input non-chimeric'
+    CONCATENATED = 'percentage of input concatenated'
+    NON_CHIMERIC_CONCATENATED = (
+        'percentage of input non-chimeric concatenated')
 
     round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}
 
@@ -142,6 +146,22 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
         col_order.insert(4, 'merged')
         col_order.insert(5, MERGED)
 
+    # only calculate percentage of input concatenated if unmerged read pairs
+    # were retained
+    if 'concatenated' in df:
+        round_cols[CONCATENATED] = 2
+        df[CONCATENATED] = df['concatenated'] / df['input'] * 100
+        insert_at = col_order.index('non-chimeric')
+        col_order.insert(insert_at, 'concatenated')
+        col_order.insert(insert_at + 1, CONCATENATED)
+
+    if 'non-chimeric concatenated' in df:
+        round_cols[NON_CHIMERIC_CONCATENATED] = 2
+        df[NON_CHIMERIC_CONCATENATED] = (
+            df['non-chimeric concatenated'] / df['input'] * 100)
+        col_order.append('non-chimeric concatenated')
+        col_order.append(NON_CHIMERIC_CONCATENATED)
+
     # only calculate percentage of input primer-removed if ccs
     if 'primer-removed' in df:
         PASSED_PRIMERREMOVE = 'percentage of input primer-removed'
@@ -182,18 +202,24 @@ def _denoise_helper(biom_fp, track_fp, err_track_fp,
     # reintroduced above!
     if not retain_all_samples:
         table = table.remove_empty(axis="sample", inplace=False)
+
+    def _to_sequence(sequence, metadata):
+        if retain_unmerged:
+            return skbio.Sequence(sequence, metadata=metadata)
+        return skbio.DNA(sequence, metadata=metadata)
+
     # The feature IDs in DADA2 are the sequences themselves.
     if hashed_feature_ids:
         # Make feature IDs the md5 sums of the sequences.
         fid_map = {id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
                    for id_ in table.ids(axis='observation')}
         table.update_ids(fid_map, axis='observation', inplace=True)
 
-        rep_sequences = DNAIterator((skbio.DNA(k, metadata={'id': v})
+        rep_sequences = DNAIterator((_to_sequence(k, metadata={'id': v})
                                      for k, v in fid_map.items()))
     else:
         rep_sequences = DNAIterator(
-            (skbio.DNA(id_, metadata={'id': id_})
+            (_to_sequence(id_, metadata={'id': id_})
              for id_ in table.ids(axis='observation')))
 
     # initalize and populate DADA2 diagnoistic Stats dictionary
@@ -300,7 +326,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
                    allow_one_off: bool = False,
                    n_threads: int = 1, n_reads_learn: int = 1000000,
                    hashed_feature_ids: bool = True,
-                   retain_all_samples: bool = True
+                   retain_all_samples: bool = True,
+                   retain_unmerged: bool = False
                    ) -> (biom.Table, DNAIterator,
                          qiime2.Metadata, qiime2.Metadata):
     _check_inputs(**locals())
@@ -357,7 +384,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
                '--min_parental_fold', str(min_fold_parent_over_abundance),
                '--allow_one_off', str(allow_one_off),
                '--num_threads', str(n_threads),
-               '--learn_min_reads', str(n_reads_learn)]
+               '--learn_min_reads', str(n_reads_learn),
+               '--retain_unmerged', str(retain_unmerged)]
         try:
             run_commands([cmd])
         except subprocess.CalledProcessError as e:
@@ -378,7 +406,8 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
 
         return _denoise_helper(biom_fp, track_fp, err_track_fp,
                                hashed_feature_ids, retain_all_samples,
-                               paired=True)
+                               paired=True,
+                               retain_unmerged=retain_unmerged)
 
 
 def _remove_barcode(filename):

diff --git a/q2_dada2/assets/run_dada.R b/q2_dada2/assets/run_dada.R
@@ -205,6 +205,8 @@ option_list = list(
               help="The number of threads to use"),
   make_option(c("--learn_min_reads"), action="store", default='NULL', type='character',
               help="The minimum number of reads to learn the error model from"),
+  make_option(c("--retain_unmerged"), action="store", default='FALSE', type='character',
+              help="If TRUE, denoised paired reads that fail merging are retained as space-concatenated sequences."),
   make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character',
               help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."),
   make_option(c("--band_size"), action="store", default='NULL', type='character',
@@ -244,6 +246,8 @@ minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_
 allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
 nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
 nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
+retain.unmerged <- if(opt$retain_unmerged=='NULL') FALSE else as.logical(opt$retain_unmerged)
+linked.concat.delim <- "NNNNNNNNNN"
 # The following args are not directly exposed to end users in q2-dada2,
 # but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
 if (opt$homopolymer_gap_penalty=='NULL'){
@@ -309,7 +313,7 @@ cat("DADA2:", as.character(packageVersion("dada2")), "/",
     "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
 
 ### Helper Functions ###
-#function to approximate melt function from reshape2 which is not a dependency 
+#function to approximate melt function from reshape2 which is not a dependency
 melter<-function(df){
   df<-as.data.frame(df)
   melted_df<-data.frame(Var1 = character(), Var2 = numeric(), value = numeric(), stringsAsFactors = TRUE)
@@ -336,9 +340,9 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
   if(!(all(nti %in% ACGT) && all(ntj %in% ACGT)) || any(duplicated(nti)) || any(duplicated(ntj))) {
     stop("nti and ntj must be nucleotide(s): A/C/G/T.")
   }
-  
+
   dq <- getErrors(dq, detailed=TRUE, enforce=FALSE)
-  
+
   if(!is.null(dq$trans)) {
     if(ncol(dq$trans) <= 1) {
       stop("plotErrors only supported when using quality scores in the error model (i.e. USE_QUALS=TRUE).")
@@ -356,7 +360,7 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T
   }
   transdf$from <- substr(transdf$Transition, 1, 1)
   transdf$to <- substr(transdf$Transition, 3, 3)
-  
+
   if(!is.null(dq$trans)) {
     tot.count <- tapply(transdf$count, list(transdf$from, transdf$Qual), sum)
     transdf$tot <- mapply(function(x,y) tot.count[x,y], transdf$from, as.character(transdf$Qual))
@@ -466,6 +470,8 @@ if(primer.removed.dir!='NULL'){#for CCS read analysis
 
 ### PROCESS ALL SAMPLES ###
 # Loop over rest in streaming fashion with learned error rates
+unmerged.id.map <- data.frame(
+  temporary=character(), linked=character(), stringsAsFactors=FALSE)
 
 if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
   dds <- vector("list", length(filts))
@@ -502,6 +508,9 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
   seqtab <- makeSequenceTable(dds)
 }else{#for paired read analysis
   denoisedF <- rep(0, length(filts))
+  mergedF <- rep(0, length(filts))
+  concatenatedF <- rep(0, length(filts))
+  nonchimConcatenatedF <- rep(0, length(filts))
   ddsF <- vector("list", length(filts))
   ddsR <- vector("list", length(filts))
   mergers <- vector("list", length(filts))
@@ -545,12 +554,64 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
   for(j in seq(length(filts))) {
     drpF <- derepFastq(filts[[j]])
     drpR <- derepFastq(filtsR[[j]])
-    mergers[[j]] <- mergePairs(
+    # we intentionally do not use `justConcatenate = TRUE` here; it sets the
+    # "accept" column to TRUE for every pair in mergePairs, which would fold
+    # truly merged and rescued unmerged reads into the same "merged" count
+    mp <- mergePairs(
       ddsF[[j]], drpF, ddsR[[j]], drpR,
       minOverlap=minOverlap,
       maxMismatch=maxMergeMismatch,
-      trimOverhang=trimOverhang
+      trimOverhang=trimOverhang,
+      returnRejects=retain.unmerged
       )
+    if(retain.unmerged){
+      mergedF[j] <- sum(mp[mp$accept, "abundance"])
+      mergers[[j]] <- mp[mp$accept, c("sequence", "abundance")]
+      unmerged.j <- mp[!mp$accept, c("forward", "reverse", "abundance")]
+      concatenatedF[j] <- sum(unmerged.j[,"abundance"])
+      if(nrow(unmerged.j) > 0){
+        # `mergePairs` returns cluster indices in the "forward" and "reverse"
+        # columns, so we resolve these to denoised forward and
+        # reverse-complemented reverse sequences before concatenation
+        unmerged.forward <- as.character(
+          ddsF[[j]]$clustering$sequence[unmerged.j$forward]
+        )
+        unmerged.reverse <- as.character(
+          rc(ddsR[[j]]$clustering$sequence[unmerged.j$reverse])
+        )
+
+        # join each pair with dada2's own separator for concatenated reads
+        # (ten "N"s) so that chimera filtering, if performed, recognizes them;
+        # the separator is replaced with a single space after that step
+        unmerged.temp.seqs <- paste(
+          unmerged.forward, unmerged.reverse,
+          sep=linked.concat.delim
+        )
+        unmerged.linked.seqs <- paste(
+          unmerged.forward, unmerged.reverse,
+          sep=" "
+        )
+        unmerged.id.map <- rbind(
+          unmerged.id.map,
+          data.frame(
+            temporary=unmerged.temp.seqs,
+            linked=unmerged.linked.seqs,
+            stringsAsFactors=FALSE
+          )
+        )
+        mergers[[j]] <- rbind(
+          mergers[[j]],
+          data.frame(
+            sequence=unmerged.temp.seqs,
+            abundance=unmerged.j$abundance,
+            stringsAsFactors=FALSE
+          )
+        )
+      }
+    }else{
+      mergers[[j]] <- mp
+      mergedF[j] <- sum(mp[,"abundance"])
+    }
     denoisedF[[j]] <- getN(ddsF[[j]])
     cat(".")
   }
@@ -560,15 +621,36 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
 
 }
 
-
 ### Remove chimeras
 cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
+if(chimeraMethod %in% c("pooled", "consensus") && ncol(seqtab) > 0) {
   seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread)
 } else { # No chimera removal, copy seqtab to seqtab.nochim
   seqtab.nochim <- seqtab
 }
 
+# after chimera filtering, convert retained concatenated IDs to the single
+# space-delimited representation using the exact joins we introduced above
+if(nrow(unmerged.id.map) > 0){
+  unmerged.id.map <- unique(unmerged.id.map)
+  ambiguous.ids <- unmerged.id.map$temporary[
+    duplicated(unmerged.id.map$temporary) |
+      duplicated(unmerged.id.map$temporary, fromLast=TRUE)
+  ]
+  if(length(ambiguous.ids) > 0){
+    errQuit("Unable to uniquely map retained unmerged sequences from the temporary DADA2-compatible representation to linked sequences.", status=1)
+  }
+
+  unmerged.keep <- intersect(colnames(seqtab.nochim), unmerged.id.map$temporary)
+  if(length(unmerged.keep) > 0){
+    nonchimConcatenatedF <- rowSums(
+      seqtab.nochim[, unmerged.keep, drop=FALSE]
+    )
+    colnames(seqtab.nochim)[match(unmerged.keep, colnames(seqtab.nochim))] <-
+      unmerged.id.map$linked[match(unmerged.keep, unmerged.id.map$temporary)]
+  }
+}
+
 ### REPORT READ FRACTIONS THROUGH PIPELINE ###
 cat("6) Report read numbers through the pipeline\n")
 if(inp.dirR =='NULL'){
@@ -587,11 +669,24 @@ if(inp.dirR =='NULL'){
               quote=FALSE)
 }else{#for paired end reads
   # Handle edge cases: Samples lost in filtering; One sample
-  track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
-  colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
+  if(retain.unmerged){
+    track <- cbind(out, matrix(0, nrow=nrow(out), ncol=5))
+    colnames(track) <- c("input", "filtered", "denoised", "merged",
+                         "concatenated", "non-chimeric",
+                         "non-chimeric concatenated")
+  }else{
+    track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
+    colnames(track) <- c("input", "filtered", "denoised", "merged",
+                         "non-chimeric")
+  }
   passed.filtering <- track[,"filtered"] > 0
   track[passed.filtering,"denoised"] <- denoisedF
-  track[passed.filtering,"merged"] <- rowSums(seqtab)
+  track[passed.filtering,"merged"] <- mergedF
+  if(retain.unmerged){
+    track[passed.filtering,"concatenated"] <- concatenatedF
+    track[passed.filtering,"non-chimeric concatenated"] <-
+      nonchimConcatenatedF
+  }
   track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
   write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
               quote=FALSE)

diff --git a/q2_dada2/plugin_setup.py b/q2_dada2/plugin_setup.py
@@ -11,7 +11,7 @@
 from q2_types.per_sample_sequences import (
     SequencesWithQuality, PairedEndSequencesWithQuality)
 from q2_types.sample_data import SampleData
-from q2_types.feature_data import FeatureData, Sequence
+from q2_types.feature_data import FeatureData, Sequence, LinkedSequence
 from q2_types.feature_table import FeatureTable, Frequency
 
 import q2_dada2
@@ -24,6 +24,12 @@
 
 _POOL_OPT = {'pseudo', 'independent'}
 _CHIM_OPT = {'consensus', 'none'}
+P_retain_unmerged, T_paired_representative_sequences = qiime2.plugin.TypeMap({
+    qiime2.plugin.Choices(True):
+        FeatureData[LinkedSequence],
+    qiime2.plugin.Choices(False):
+        FeatureData[Sequence],
+})
 
 citations = qiime2.plugin.Citations.load('citations.bib', package='q2_dada2')
 plugin = qiime2.plugin.Plugin(
@@ -181,9 +187,10 @@
                 'n_threads': qiime2.plugin.Threads,
                 'n_reads_learn': qiime2.plugin.Int,
                 'hashed_feature_ids': qiime2.plugin.Bool,
-                'retain_all_samples': qiime2.plugin.Bool},
+                'retain_all_samples': qiime2.plugin.Bool,
+                'retain_unmerged': qiime2.plugin.Bool % P_retain_unmerged},
     outputs=[('table', FeatureTable[Frequency]),
-             ('representative_sequences', FeatureData[Sequence]),
+             ('representative_sequences', T_paired_representative_sequences),
              ('denoising_stats', SampleData[DADA2Stats]),
              ('base_transition_stats', DADA2BaseTransitionStats)],
     input_descriptions={
@@ -285,15 +292,23 @@
         'retain_all_samples': 'If True all samples input to dada2 will be '
                               'retained in the output of dada2, if false '
                               'samples with zero total frequency are removed '
-                              'from the table.'
+                              'from the table.',
+        'retain_unmerged': (
+            'If True, denoised paired reads that fail merging are retained by '
+            'encoding each pair as `forward_read<space>reverse_read` and '
+            'including these features in the table and sequences. Note that '
+            'the reverse read is reverse-complemented and thus both read '
+            'directions can be expected to map to the same strand.'
+        )
     },
     output_descriptions={
         'table': 'The resulting feature table.',
-        'representative_sequences': ('The resulting feature sequences. Each '
-                                     'feature in the feature table will be '
-                                     'represented by exactly one sequence, '
-                                     'and these sequences will be the joined '
-                                     'paired-end sequences.'),
+        'representative_sequences': (
+            'The resulting feature sequences. Each feature in the feature '
+            'table will be represented by exactly one sequence, and these '
+            'sequences will be the joined paired-end sequences (and retained '
+            'unmerged pairs if `retain_unmerged` is enabled).'
+        ),
         'denoising_stats': DENOISING_STATS_DESCRIPTION,
         'base_transition_stats': BASE_TRANSITION_STATS_DESCRIPTION,
     },