@@ -7,7 +7,7 @@ version="v1"
77
88# #this pipeline has 5 main steps:
99# # STEP 1: Downsampling on raw ONT reads (Filtlong)
10- # # STEP 2: Polishing of raw ONT reads with paired end illumina data (Ratatosk)
10+ # # STEP 2: Polishing of raw ONT reads with paired end illumina data (Ratatosk; optional )
1111# # STEP 3: Assembly of polished ONT reads (Flye; modified to allow for larger minimum overlap values)
1212# # Step 4: Polishing of assembly using Pacbio (NextPolish2; optional)
1313# # Step 5: Filtering, reordering and renaming (Seqkit)
@@ -40,9 +40,9 @@ version="v1"
4040
4141# default values, unless denoted when running MUM&Co
4242nanopore=" "
43+ genomesize=" "
4344pair1=" "
4445pair2=" "
45- genomesize=" "
4646hifi=" "
4747threads=" 1"
4848minsize=" 5000"
@@ -66,18 +66,18 @@ case "$key" in
6666 shift
6767 shift
6868 ;;
69- -1 |--pair1 )
70- pair1 =" $2 "
69+ -g |--genomesize )
70+ genomesize =" $2 "
7171 shift
7272 shift
7373 ;;
74- -2 |--pair2 )
75- pair2 =" $2 "
74+ -1 |--pair1 )
75+ pair1 =" $2 "
7676 shift
7777 shift
7878 ;;
79- -g |--genomesize )
80- genomesize =" $2 "
79+ -2 |--pair2 )
80+ pair2 =" $2 "
8181 shift
8282 shift
8383 ;;
@@ -131,22 +131,22 @@ case "$key" in
131131
132132 fusemblr (version: ${version} )
133133
134- fusemblr.sh -n nanopore.fq.gz -1 illumina.R1.fq.gz -2 illumina.R2 .fq.gz -g 70000000
134+ fusemblr.sh -n nanopore.fq.gz -g 70000000
135135
136136 Required inputs:
137137 -n | --nanopore Nanopore long reads used for assembly in fastq or fasta format (*.fastq / *.fq) and can be gzipped (*.gz)
138- -1 | --pair1 Paired end illumina reads in fastq format; first pair. Used for Ratatosk polishing. Can be gzipped (*.gz)
139- -2 | --pair2 Paired end illumina reads in fastq format; second pair. Used for Ratatosk polishing. Can be gzipped (*.gz)
140138 -g | --genomesize Estimation of genome size, required for downsampling and assembly
141139
142140 Recommended inputs:
141+ -1 | --pair1 Paired end illumina reads in fastq format; first pair. Used for Rataosk polishing. Can be gzipped (*.gz)
142+ -2 | --pair2 Paired end illumina reads in fastq format; second pair. Used for Rataosk polishing. Can be gzipped (*.gz)
143143 -h | --hifi Pacbio HiFi reads required for assembly polishing with NextPolish2 (Recommended if available)
144- -t | --threads Number of threads for tools that accept this option (default : 1)
144+ -t | --threads Number of threads for tools that accept this option (Default : 1)
145145
146146 Optional parameters:
147147 -m | --minsize Minimum size of reads to keep during downsampling (Default: 5000)
148148 -x | --coverage The amount of coverage for downsampling (X), based on genome size, i.e. coverage*genomesize (Default: 100)
149- -v | --minovl Minimum overlap for Flye assembly (Default: Calculated during run as N95 of reads used for assembly)
149+ -v | --minovl Minimum overlap for Flye assembly (Default: Calculated during run as N90 of reads used for assembly)
150150 -w | --weight The weighting used by Filtlong for selecting reads; balancing the length vs the quality (Default: 5)
151151 -p | --prefix Prefix for output (Default: name of nanopore reads file (-a) before the fastq suffix)
152152 -o | --output Name of output folder for all results (Default: fusemblr_output)
@@ -162,10 +162,10 @@ done
162162
163163# creates error message and exits if these values are not/incorrectly assigned
164164[[ $nanopore == " " ]] && echo " ERROR: Path to ONT long-reads not found, assign using -n" && exit
165- [[ $pair1 == " " || $pair2 == " " ]] && echo " ERROR: Missing paired-end short-reads as input, assign using -1 and -2" && exit
166165[[ $genomesize == " " ]] && echo " ERROR: Missing genome size estimation, assign using -g (can be a very rough estimate)" && exit
167166
168167[[ $hifi == " " ]] && echo " WARNING: Running without PacBio Hifi data; skipping final polishing step"
168+ [[ $pair1 == " " || $pair2 == " " ]] && echo " WARNING: Running without paired-end short-reads as input; skipping pre-polishing of reads step"
169169
170170# #############################################################
171171# ###################### 0b. SETTING UP #######################
@@ -178,20 +178,26 @@ done
178178
179179# #paths to raw data
180180nanoporepath=$( realpath ${nanopore} )
181- pair1path=$( realpath ${pair1} )
182- pair2path=$( realpath ${pair2} )
181+
183182
184183# check if the files given actually exist
185184[ ! -f " ${nanoporepath} " ] && echo " ERROR: Cannot find path to nanopore reads provided by -n; check path is correct and file exists" && exit
186- [ ! -f " ${pair1path} " ] && echo " ERROR: Cannot find path to illumina reads provided by -1; check path is correct and file exists" && exit
187- [ ! -f " ${pair2path} " ] && echo " ERROR: Cannot find path to illumina reads provided by -2; check path is correct and file exists" && exit
185+
188186
189187if [[ $hifi != " " ]]
190188then
191189hifipath=$( realpath ${hifi} )
192190[ ! -f " ${hifipath} " ] && echo " ERROR: Cannot find path to Pacbio Hifi reads provided by -h; check path is correct and file exists" && exit
193191fi
194192
193+ if [[ $pair1 != " " ]]
194+ then
195+ pair1path=$( realpath ${pair1} )
196+ pair2path=$( realpath ${pair2} )
197+ [ ! -f " ${pair1path} " ] && echo " ERROR: Cannot find path to illumina reads provided by -1; check path is correct and file exists" && exit
198+ [ ! -f " ${pair2path} " ] && echo " ERROR: Cannot find path to illumina reads provided by -2; check path is correct and file exists" && exit
199+ fi
200+
195201
196202# #newly defining some variables for output writing
197203# # leave as is just setting to kb
@@ -229,11 +235,15 @@ filtlong --min_length ${minsize} -t ${target} --length_weight ${weight} ${nanopo
229235
230236
231237# ##########################################################################
232- # ##################### 2. READ POLISHING WITH RATATOSK #####################
238+ # ##################### 2. READ POLISHING WITH RATAOSK #####################
233239# ##########################################################################
234240
241+
235242echo " ################## fusemblr: Step 2: Polishing ONT reads"
236243
244+ # #run polishing if illumina data is present
245+ if [[ $pair1 != " " ]]
246+ then
237247
238248# # create directory for output of reads
239249mkdir 2.ratatosk_ont
@@ -250,19 +260,38 @@ seqkit stats -N 50,90,95 --threads ${threads} 2.ratatosk_ont/${prefix}.${readsta
250260# ##get the read N90 to set as a variables in flye
251261if [[ $minovl == " " ]]
252262then
253- minovl=$( tail -n1 2.ratatosk_ont/${prefix} .${readstats} .ratatosk.stats.tsv | awk ' {print $11}' | sed ' s/,//g' )
263+ minovl=$( tail -n1 2.ratatosk_ont/${prefix} .${readstats} .ratatosk.stats.tsv | awk ' {print $10}' | sed ' s/,//g' )
264+ minovl2=$( echo ${minovl} | awk ' {print $1/1000}' | awk -F " ." ' {print $1}' )
265+ else
266+ minovl2=$( echo ${minovl} | awk ' {print $1/1000}' | awk -F " ." ' {print $1}' )
267+
268+ fi
269+
270+ else
271+ # #if illunina data isn't present use the downsampled unpolished reads
272+ # #get some stats
273+ seqkit stats -N 50,90,95 --threads ${threads} 1.filtlong_ont/${prefix} .${readstats} .fq.gz > 1.filtlong_ont/${prefix} .${readstats} .stats.tsv
274+
275+ # ##get the read N90 to set as a variables in flye
276+ if [[ $minovl == " " ]]
277+ then
278+ minovl=$( tail -n1 1.filtlong_ont/${prefix} .${readstats} .stats.tsv | awk ' {print $11}' | sed ' s/,//g' )
254279minovl2=$( echo ${minovl} | awk ' {print $1/1000}' | awk -F " ." ' {print $1}' )
255280else
256281minovl2=$( echo ${minovl} | awk ' {print $1/1000}' | awk -F " ." ' {print $1}' )
257282
258283fi
259284
285+ fi
286+
260287# ################################################################
261288# ################# STEP 3a. ASSEMBLY WITH FLYE ##################
262289# ################################################################
263290
264291echo " ################## fusemblr: Step 3a: Assembling ONT reads with Flye"
265292
293+ if [[ $pair1 != " " ]]
294+ then
266295# # create a variable that saves these variable in the naming scheme
267296assembly=" ${prefix} .${readstats} .ratatosk.flye_nanocorr_${size2} Mb_${coverage} X_minovl${minovl2} k"
268297
@@ -271,6 +300,17 @@ flye --nano-corr 2.ratatosk_ont/${prefix}.${readstats}.ratatosk.fq.gz -m ${minov
271300# # make sure the assembly file is named is a simple format
272301cp 3a.flye_assembly/assembly.fasta 3a.flye_assembly/${assembly} .fa
273302
303+ else
304+ # # create a variable that saves these variable in the naming scheme
305+ assembly=" ${prefix} .${readstats} .flye_nanocorr_${size2} Mb_${coverage} X_minovl${minovl2} k"
306+
307+ # # run flye
308+ flye --nano-corr 1.filtlong_ont/${prefix} .${readstats} .fq.gz -m ${minovl} --genome-size ${genomesize} --asm-coverage ${coverage} --threads ${threads} -o 3a.flye_assembly/
309+ # # make sure the assembly file is named is a simple format
310+ cp 3a.flye_assembly/assembly.fasta 3a.flye_assembly/${assembly} .fa
311+
312+ fi
313+
274314# ################################################################
275315# ############### STEP 3b. ASSEMBLY WITH HIFIASM #################
276316# ################################################################
@@ -280,6 +320,8 @@ echo "################## fusemblr: Step 3b: Assembling ONT reads with Hifiasm"
280320# #create directory for the hifiasm output
281321mkdir 3b.hifiasm/
282322
323+ if [[ $pair1 != " " ]]
324+ then
283325if [[ $hifi != " " ]]
284326then
285327# # if hifi data is present; run hifiasm with hifi data and providing ONT reads as an ultralong dataset
@@ -288,6 +330,16 @@ else
288330# # if hifi data is not present; run hifiasm with hifi data only providing ONT reads
289331hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ont 2.ratatosk_ont/${prefix} .${readstats} .ratatosk.fq.gz
290332fi
333+ else
334+ if [[ $hifi != " " ]]
335+ then
336+ # # if hifi data is present; run hifiasm with hifi data and providing ONT reads as an ultralong dataset
337+ hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ul 1.filtlong_ont/${prefix} .${readstats} .fq.gz ${hifipath}
338+ else
339+ # # if hifi data is not present; run hifiasm with hifi data only providing ONT reads
340+ hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ont 1.filtlong_ont/${prefix} .${readstats} .fq.gz
341+ fi
342+ fi
291343# # convert gfa to fasta
292344awk ' /^S/{print ">"$2;print $3}' 3b.hifiasm/${prefix} .bp.p_ctg.gfa > 3b.hifiasm/${prefix} .hifiasm.fa
293345# #clean up
0 commit comments