Skip to content

Commit 31ba3b2

Browse files
authored
Merge pull request #14 from SAMtoBAM/SAMtoBAM-patch-8
Update fusemblr.sh
2 parents 41dae1b + 6101cfe commit 31ba3b2

2 files changed

Lines changed: 73 additions & 21 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ The pipeline uses Nanopore (the longer and higher coverage the better) and paire
5656
#### 3. Genome Assembly
5757
##### 3.a. Assembly with```Flye```
5858
######     -removed the hard coded maximium value for the minimum overlap threshold (previously 10kb)
59-
######     -by default the minimum overlap value is automatically provided as the read N95 after polishing
59+
######     -by default the minimum overlap value is automatically provided as the read N90 after polishing
6060
##### 3.b. Assembly with ```Hifiasm```
6161
######     -if Hifi reads are provided: uses the ```--ul``` option, with both polished ONT and Hifi reads
6262
######     -without Hifi: uses the ```--ont``` option, with only the polished ONT reads

bin/fusemblr.sh

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ version="v1"
77

88
##this pipeline has 5 main steps:
99
## STEP 1: Downsampling on raw ONT reads (Filtlong)
10-
## STEP 2: Polishing of raw ONT reads with paired end illumina data (Ratatosk)
10+
## STEP 2: Polishing of raw ONT reads with paired end illumina data (Ratatosk; optional)
1111
## STEP 3: Assembly of polished ONT reads (Flye; modified to allow for larger minimum overlap values)
1212
## Step 4: Polishing of assembly using Pacbio (NextPolish2; optional)
1313
## Step 5: Filtering, reordering and renaming (Seqkit)
@@ -40,9 +40,9 @@ version="v1"
4040

4141
#default values, unless denoted when running MUM&Co
4242
nanopore=""
43+
genomesize=""
4344
pair1=""
4445
pair2=""
45-
genomesize=""
4646
hifi=""
4747
threads="1"
4848
minsize="5000"
@@ -66,18 +66,18 @@ case "$key" in
6666
shift
6767
shift
6868
;;
69-
-1|--pair1)
70-
pair1="$2"
69+
-g|--genomesize)
70+
genomesize="$2"
7171
shift
7272
shift
7373
;;
74-
-2|--pair2)
75-
pair2="$2"
74+
-1|--pair1)
75+
pair1="$2"
7676
shift
7777
shift
7878
;;
79-
-g|--genomesize)
80-
genomesize="$2"
79+
-2|--pair2)
80+
pair2="$2"
8181
shift
8282
shift
8383
;;
@@ -131,22 +131,22 @@ case "$key" in
131131
132132
fusemblr (version: ${version})
133133
134-
fusemblr.sh -n nanopore.fq.gz -1 illumina.R1.fq.gz -2 illumina.R2.fq.gz -g 70000000
134+
fusemblr.sh -n nanopore.fq.gz -g 70000000
135135
136136
Required inputs:
137137
-n | --nanopore Nanopore long reads used for assembly in fastq or fasta format (*.fastq / *.fq) and can be gzipped (*.gz)
138-
-1 | --pair1 Paired end illumina reads in fastq format; first pair. Used for Ratatosk polishing. Can be gzipped (*.gz)
139-
-2 | --pair2 Paired end illumina reads in fastq format; second pair. Used for Ratatosk polishing. Can be gzipped (*.gz)
140138
-g | --genomesize Estimation of genome size, required for downsampling and assembly
141139
142140
Recommended inputs:
141+
-1 | --pair1 Paired end illumina reads in fastq format; first pair. Used for Rataosk polishing. Can be gzipped (*.gz)
142+
-2 | --pair2 Paired end illumina reads in fastq format; second pair. Used for Rataosk polishing. Can be gzipped (*.gz)
143143
-h | --hifi Pacbio HiFi reads required for assembly polishing with NextPolish2 (Recommended if available)
144-
-t | --threads Number of threads for tools that accept this option (default: 1)
144+
-t | --threads Number of threads for tools that accept this option (Default: 1)
145145
146146
Optional parameters:
147147
-m | --minsize Minimum size of reads to keep during downsampling (Default: 5000)
148148
-x | --coverage The amount of coverage for downsampling (X), based on genome size, i.e. coverage*genomesize (Default: 100)
149-
-v | --minovl Minimum overlap for Flye assembly (Default: Calculated during run as N95 of reads used for assembly)
149+
-v | --minovl Minimum overlap for Flye assembly (Default: Calculated during run as N90 of reads used for assembly)
150150
-w | --weight The weighting used by Filtlong for selecting reads; balancing the length vs the quality (Default: 5)
151151
-p | --prefix Prefix for output (Default: name of nanopore reads file (-a) before the fastq suffix)
152152
-o | --output Name of output folder for all results (Default: fusemblr_output)
@@ -162,10 +162,10 @@ done
162162

163163
#creates error message and exits if these values are not/incorrectly assigned
164164
[[ $nanopore == "" ]] && echo "ERROR: Path to ONT long-reads not found, assign using -n" && exit
165-
[[ $pair1 == "" || $pair2 == "" ]] && echo "ERROR: Missing paired-end short-reads as input, assign using -1 and -2" && exit
166165
[[ $genomesize == "" ]] && echo "ERROR: Missing genome size estimation, assign using -g (can be a very rough estimate)" && exit
167166

168167
[[ $hifi == "" ]] && echo "WARNING: Running without PacBio Hifi data; skipping final polishing step"
168+
[[ $pair1 == "" || $pair2 == "" ]] && echo "WARNING: Running without paired-end short-reads as input; skipping pre-polishing of reads step"
169169

170170
##############################################################
171171
####################### 0b. SETTING UP #######################
@@ -178,20 +178,26 @@ done
178178

179179
##paths to raw data
180180
nanoporepath=$( realpath ${nanopore} )
181-
pair1path=$( realpath ${pair1} )
182-
pair2path=$( realpath ${pair2} )
181+
183182

184183
#check if the files given actually exist
185184
[ ! -f "${nanoporepath}" ] && echo "ERROR: Cannot find path to nanopore reads provided by -n; check path is correct and file exists" && exit
186-
[ ! -f "${pair1path}" ] && echo "ERROR: Cannot find path to illumina reads provided by -1; check path is correct and file exists" && exit
187-
[ ! -f "${pair2path}" ] && echo "ERROR: Cannot find path to illumina reads provided by -2; check path is correct and file exists" && exit
185+
188186

189187
if [[ $hifi != "" ]]
190188
then
191189
hifipath=$( realpath ${hifi} )
192190
[ ! -f "${hifipath}" ] && echo "ERROR: Cannot find path to Pacbio Hifi reads provided by -h; check path is correct and file exists" && exit
193191
fi
194192

193+
if [[ $pair1 != "" ]]
194+
then
195+
pair1path=$( realpath ${pair1} )
196+
pair2path=$( realpath ${pair2} )
197+
[ ! -f "${pair1path}" ] && echo "ERROR: Cannot find path to illumina reads provided by -1; check path is correct and file exists" && exit
198+
[ ! -f "${pair2path}" ] && echo "ERROR: Cannot find path to illumina reads provided by -2; check path is correct and file exists" && exit
199+
fi
200+
195201

196202
##newly defining some variables for output writing
197203
## leave as is just setting to kb
@@ -229,11 +235,15 @@ filtlong --min_length ${minsize} -t ${target} --length_weight ${weight} ${nanopo
229235

230236

231237
###########################################################################
232-
###################### 2. READ POLISHING WITH RATATOSK #####################
238+
###################### 2. READ POLISHING WITH RATAOSK #####################
233239
###########################################################################
234240

241+
235242
echo "################## fusemblr: Step 2: Polishing ONT reads"
236243

244+
##run polishing if illumina data is present
245+
if [[ $pair1 != "" ]]
246+
then
237247

238248
## create directory for output of reads
239249
mkdir 2.ratatosk_ont
@@ -250,19 +260,38 @@ seqkit stats -N 50,90,95 --threads ${threads} 2.ratatosk_ont/${prefix}.${readsta
250260
###get the read N90 to set as a variables in flye
251261
if [[ $minovl == "" ]]
252262
then
253-
minovl=$( tail -n1 2.ratatosk_ont/${prefix}.${readstats}.ratatosk.stats.tsv | awk '{print $11}' | sed 's/,//g' )
263+
minovl=$( tail -n1 2.ratatosk_ont/${prefix}.${readstats}.ratatosk.stats.tsv | awk '{print $10}' | sed 's/,//g' )
264+
minovl2=$( echo ${minovl} | awk '{print $1/1000}' | awk -F "." '{print $1}' )
265+
else
266+
minovl2=$( echo ${minovl} | awk '{print $1/1000}' | awk -F "." '{print $1}' )
267+
268+
fi
269+
270+
else
271+
##if illunina data isn't present use the downsampled unpolished reads
272+
##get some stats
273+
seqkit stats -N 50,90,95 --threads ${threads} 1.filtlong_ont/${prefix}.${readstats}.fq.gz > 1.filtlong_ont/${prefix}.${readstats}.stats.tsv
274+
275+
###get the read N90 to set as a variables in flye
276+
if [[ $minovl == "" ]]
277+
then
278+
minovl=$( tail -n1 1.filtlong_ont/${prefix}.${readstats}.stats.tsv | awk '{print $11}' | sed 's/,//g' )
254279
minovl2=$( echo ${minovl} | awk '{print $1/1000}' | awk -F "." '{print $1}' )
255280
else
256281
minovl2=$( echo ${minovl} | awk '{print $1/1000}' | awk -F "." '{print $1}' )
257282

258283
fi
259284

285+
fi
286+
260287
#################################################################
261288
################## STEP 3a. ASSEMBLY WITH FLYE ##################
262289
#################################################################
263290

264291
echo "################## fusemblr: Step 3a: Assembling ONT reads with Flye"
265292

293+
if [[ $pair1 != "" ]]
294+
then
266295
## create a variable that saves these variable in the naming scheme
267296
assembly="${prefix}.${readstats}.ratatosk.flye_nanocorr_${size2}Mb_${coverage}X_minovl${minovl2}k"
268297

@@ -271,6 +300,17 @@ flye --nano-corr 2.ratatosk_ont/${prefix}.${readstats}.ratatosk.fq.gz -m ${minov
271300
## make sure the assembly file is named is a simple format
272301
cp 3a.flye_assembly/assembly.fasta 3a.flye_assembly/${assembly}.fa
273302

303+
else
304+
## create a variable that saves these variable in the naming scheme
305+
assembly="${prefix}.${readstats}.flye_nanocorr_${size2}Mb_${coverage}X_minovl${minovl2}k"
306+
307+
## run flye
308+
flye --nano-corr 1.filtlong_ont/${prefix}.${readstats}.fq.gz -m ${minovl} --genome-size ${genomesize} --asm-coverage ${coverage} --threads ${threads} -o 3a.flye_assembly/
309+
## make sure the assembly file is named is a simple format
310+
cp 3a.flye_assembly/assembly.fasta 3a.flye_assembly/${assembly}.fa
311+
312+
fi
313+
274314
#################################################################
275315
################ STEP 3b. ASSEMBLY WITH HIFIASM #################
276316
#################################################################
@@ -280,6 +320,8 @@ echo "################## fusemblr: Step 3b: Assembling ONT reads with Hifiasm"
280320
##create directory for the hifiasm output
281321
mkdir 3b.hifiasm/
282322

323+
if [[ $pair1 != "" ]]
324+
then
283325
if [[ $hifi != "" ]]
284326
then
285327
## if hifi data is present; run hifiasm with hifi data and providing ONT reads as an ultralong dataset
@@ -288,6 +330,16 @@ else
288330
## if hifi data is not present; run hifiasm with hifi data only providing ONT reads
289331
hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ont 2.ratatosk_ont/${prefix}.${readstats}.ratatosk.fq.gz
290332
fi
333+
else
334+
if [[ $hifi != "" ]]
335+
then
336+
## if hifi data is present; run hifiasm with hifi data and providing ONT reads as an ultralong dataset
337+
hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ul 1.filtlong_ont/${prefix}.${readstats}.fq.gz ${hifipath}
338+
else
339+
## if hifi data is not present; run hifiasm with hifi data only providing ONT reads
340+
hifiasm -o 3b.hifiasm/${prefix} -t ${threads} -l0 --ont 1.filtlong_ont/${prefix}.${readstats}.fq.gz
341+
fi
342+
fi
291343
## convert gfa to fasta
292344
awk '/^S/{print ">"$2;print $3}' 3b.hifiasm/${prefix}.bp.p_ctg.gfa > 3b.hifiasm/${prefix}.hifiasm.fa
293345
##clean up

0 commit comments

Comments
 (0)