-
Notifications
You must be signed in to change notification settings - Fork 42
Linked sequences #190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Linked sequences #190
Changes from all commits
833c591
364d2bf
14ac4e4
4b7f4e5
3691024
68a2a18
f118353
80d4c6c
e0afdfe
df6c5f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -205,6 +205,8 @@ option_list = list( | |
| help="The number of threads to use"), | ||
| make_option(c("--learn_min_reads"), action="store", default='NULL', type='character', | ||
| help="The minimum number of reads to learn the error model from"), | ||
| make_option(c("--retain_unmerged"), action="store", default='FALSE', type='character', | ||
| help="If TRUE, denoised paired reads that fail merging are retained as space-concatenated sequences."), | ||
| make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character', | ||
| help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."), | ||
| make_option(c("--band_size"), action="store", default='NULL', type='character', | ||
|
|
@@ -244,6 +246,8 @@ minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_ | |
| allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off) | ||
| nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads) | ||
| nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads) | ||
| retain.unmerged <- if(opt$retain_unmerged=='NULL') FALSE else as.logical(opt$retain_unmerged) | ||
| linked.concat.delim <- "NNNNNNNNNN" | ||
| # The following args are not directly exposed to end users in q2-dada2, | ||
| # but rather indirectly, via the methods `denoise-single` and `denoise-pyro`. | ||
| if (opt$homopolymer_gap_penalty=='NULL'){ | ||
|
|
@@ -309,7 +313,7 @@ cat("DADA2:", as.character(packageVersion("dada2")), "/", | |
| "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n") | ||
|
|
||
| ### Helper Functions ### | ||
| #function to approximate melt function from reshape2 which is not a dependency | ||
| #function to approximate melt function from reshape2 which is not a dependency | ||
| melter<-function(df){ | ||
| df<-as.data.frame(df) | ||
| melted_df<-data.frame(Var1 = character(), Var2 = numeric(), value = numeric(), stringsAsFactors = TRUE) | ||
|
|
@@ -336,9 +340,9 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T | |
| if(!(all(nti %in% ACGT) && all(ntj %in% ACGT)) || any(duplicated(nti)) || any(duplicated(ntj))) { | ||
| stop("nti and ntj must be nucleotide(s): A/C/G/T.") | ||
| } | ||
|
|
||
| dq <- getErrors(dq, detailed=TRUE, enforce=FALSE) | ||
|
|
||
| if(!is.null(dq$trans)) { | ||
| if(ncol(dq$trans) <= 1) { | ||
| stop("plotErrors only supported when using quality scores in the error model (i.e. USE_QUALS=TRUE).") | ||
|
|
@@ -356,7 +360,7 @@ internal_plotErrors <- function(dq, nti=c("A","C","G","T"), ntj=c("A","C","G","T | |
| } | ||
| transdf$from <- substr(transdf$Transition, 1, 1) | ||
| transdf$to <- substr(transdf$Transition, 3, 3) | ||
|
|
||
| if(!is.null(dq$trans)) { | ||
| tot.count <- tapply(transdf$count, list(transdf$from, transdf$Qual), sum) | ||
| transdf$tot <- mapply(function(x,y) tot.count[x,y], transdf$from, as.character(transdf$Qual)) | ||
|
|
@@ -466,6 +470,8 @@ if(primer.removed.dir!='NULL'){#for CCS read analysis | |
|
|
||
| ### PROCESS ALL SAMPLES ### | ||
| # Loop over rest in streaming fashion with learned error rates | ||
| unmerged.id.map <- data.frame( | ||
| temporary=character(), linked=character(), stringsAsFactors=FALSE) | ||
|
|
||
| if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis | ||
| dds <- vector("list", length(filts)) | ||
|
|
@@ -502,6 +508,9 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis | |
| seqtab <- makeSequenceTable(dds) | ||
| }else{#for paired read analysis | ||
| denoisedF <- rep(0, length(filts)) | ||
| mergedF <- rep(0, length(filts)) | ||
| concatenatedF <- rep(0, length(filts)) | ||
| nonchimConcatenatedF <- rep(0, length(filts)) | ||
| ddsF <- vector("list", length(filts)) | ||
| ddsR <- vector("list", length(filts)) | ||
| mergers <- vector("list", length(filts)) | ||
|
|
@@ -545,12 +554,64 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis | |
| for(j in seq(length(filts))) { | ||
| drpF <- derepFastq(filts[[j]]) | ||
| drpR <- derepFastq(filtsR[[j]]) | ||
| mergers[[j]] <- mergePairs( | ||
| # we are intentionally not using `justConcatenate = TRUE` here; that sets | ||
| # the "accept" column to TRUE for all pairs in mergePairs and would collapse | ||
| # true merged and rescued unmerged reads into the same "merged" count | ||
| mp <- mergePairs( | ||
| ddsF[[j]], drpF, ddsR[[j]], drpR, | ||
| minOverlap=minOverlap, | ||
| maxMismatch=maxMergeMismatch, | ||
| trimOverhang=trimOverhang | ||
| trimOverhang=trimOverhang, | ||
| returnRejects=retain.unmerged | ||
| ) | ||
| if(retain.unmerged){ | ||
| mergedF[j] <- sum(mp[mp$accept, "abundance"]) | ||
| mergers[[j]] <- mp[mp$accept, c("sequence", "abundance")] | ||
| unmerged.j <- mp[!mp$accept, c("forward", "reverse", "abundance")] | ||
| concatenatedF[j] <- sum(unmerged.j[,"abundance"]) | ||
| if(nrow(unmerged.j) > 0){ | ||
| # `mergePairs` returns cluster indices in the "forward" and "reverse" | ||
| # columns, so we resolve these to denoised forward and | ||
| # reverse-complemented reverse sequences before concatenation | ||
| unmerged.forward <- as.character( | ||
| ddsF[[j]]$clustering$sequence[unmerged.j$forward] | ||
| ) | ||
| unmerged.reverse <- as.character( | ||
| rc(ddsR[[j]]$clustering$sequence[unmerged.j$reverse]) | ||
| ) | ||
|
|
||
| # manually reconstruct dada2's "N" * 10 separator so that sequences | ||
| # are recognized if chimera filtering is performed; this separator | ||
| # is later converted to a single space | ||
| unmerged.temp.seqs <- paste( | ||
| unmerged.forward, unmerged.reverse, | ||
| sep=linked.concat.delim | ||
| ) | ||
| unmerged.linked.seqs <- paste( | ||
| unmerged.forward, unmerged.reverse, | ||
| sep=" " | ||
| ) | ||
| unmerged.id.map <- rbind( | ||
| unmerged.id.map, | ||
| data.frame( | ||
| temporary=unmerged.temp.seqs, | ||
| linked=unmerged.linked.seqs, | ||
| stringsAsFactors=FALSE | ||
| ) | ||
| ) | ||
| mergers[[j]] <- rbind( | ||
| mergers[[j]], | ||
| data.frame( | ||
| sequence=unmerged.temp.seqs, | ||
| abundance=unmerged.j$abundance, | ||
| stringsAsFactors=FALSE | ||
| ) | ||
| ) | ||
|
colinvwood marked this conversation as resolved.
|
||
| } | ||
| }else{ | ||
| mergers[[j]] <- mp | ||
| mergedF[j] <- sum(mp[,"abundance"]) | ||
| } | ||
| denoisedF[[j]] <- getN(ddsF[[j]]) | ||
| cat(".") | ||
| } | ||
|
|
@@ -560,15 +621,36 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis | |
|
|
||
| } | ||
|
|
||
|
|
||
| ### Remove chimeras | ||
| cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="") | ||
| if(chimeraMethod %in% c("pooled", "consensus")) { | ||
| if(chimeraMethod %in% c("pooled", "consensus") && ncol(seqtab) > 0) { | ||
|
colinvwood marked this conversation as resolved.
|
||
| seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread) | ||
| } else { # No chimera removal, copy seqtab to seqtab.nochim | ||
| seqtab.nochim <- seqtab | ||
| } | ||
|
|
||
| # after chimera filtering, convert retained concatenated IDs to the single | ||
| # space-delimited representation using the exact joins we introduced above | ||
| if(nrow(unmerged.id.map) > 0){ | ||
| unmerged.id.map <- unique(unmerged.id.map) | ||
| ambiguous.ids <- unmerged.id.map$temporary[ | ||
| duplicated(unmerged.id.map$temporary) | | ||
| duplicated(unmerged.id.map$temporary, fromLast=TRUE) | ||
| ] | ||
| if(length(ambiguous.ids) > 0){ | ||
| errQuit("Unable to uniquely map retained unmerged sequences from the temporary DADA2-compatible representation to linked sequences.", status=1) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am quite confused how this would be possible. Can you explain a little more in a comment?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Say we have two linked sequences like so: The the Now it's ambiguous what |
||
| } | ||
|
|
||
| unmerged.keep <- intersect(colnames(seqtab.nochim), unmerged.id.map$temporary) | ||
| if(length(unmerged.keep) > 0){ | ||
|
colinvwood marked this conversation as resolved.
|
||
| nonchimConcatenatedF <- rowSums( | ||
| seqtab.nochim[, unmerged.keep, drop=FALSE] | ||
| ) | ||
| colnames(seqtab.nochim)[match(unmerged.keep, colnames(seqtab.nochim))] <- | ||
| unmerged.id.map$linked[match(unmerged.keep, unmerged.id.map$temporary)] | ||
| } | ||
| } | ||
|
|
||
| ### REPORT READ FRACTIONS THROUGH PIPELINE ### | ||
| cat("6) Report read numbers through the pipeline\n") | ||
| if(inp.dirR =='NULL'){ | ||
|
|
@@ -587,11 +669,24 @@ if(inp.dirR =='NULL'){ | |
| quote=FALSE) | ||
| }else{#for paired end reads | ||
| # Handle edge cases: Samples lost in filtering; One sample | ||
| track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3)) | ||
| colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric") | ||
| if(retain.unmerged){ | ||
| track <- cbind(out, matrix(0, nrow=nrow(out), ncol=5)) | ||
| colnames(track) <- c("input", "filtered", "denoised", "merged", | ||
| "concatenated", "non-chimeric", | ||
| "non-chimeric concatenated") | ||
| }else{ | ||
| track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3)) | ||
| colnames(track) <- c("input", "filtered", "denoised", "merged", | ||
| "non-chimeric") | ||
| } | ||
| passed.filtering <- track[,"filtered"] > 0 | ||
| track[passed.filtering,"denoised"] <- denoisedF | ||
| track[passed.filtering,"merged"] <- rowSums(seqtab) | ||
| track[passed.filtering,"merged"] <- mergedF | ||
| if(retain.unmerged){ | ||
| track[passed.filtering,"concatenated"] <- concatenatedF | ||
| track[passed.filtering,"non-chimeric concatenated"] <- | ||
| nonchimConcatenatedF | ||
| } | ||
| track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim) | ||
| write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA, | ||
| quote=FALSE) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am 99% sure this will still work even with the new LinkedDNA class.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You mean the
__init__signature?