Skip to content

Commit 988ec88

Browse files
author
Andreas Kusalananda Kähäri
authored
Merge pull request #47 from NBISweden/feature/cleanup
Feature/cleanup
2 parents aa247b6 + 792e67b commit 988ec88

File tree

4 files changed

+88
-91
lines changed

4 files changed

+88
-91
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@
22
results*
33
slurm-*
44
CEP-1-7*
5+
test_data
6+
.cache
7+
.nextflow

bin/vt

-4.45 MB
Binary file not shown.

main.nf

Lines changed: 72 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,12 @@ if (!bamindex) {
4545
output:
4646
file 'bamfile.bai' into bamfile_index
4747

48-
module 'bioinfo-tools'
49-
module "$params.modules.samtools"
50-
51-
// We only need one core for this part
5248
executor choose_executor()
5349
queue 'core'
54-
time params.short_job
50+
time params.runtime.simple
51+
52+
module 'bioinfo-tools'
53+
module "$params.modules.samtools"
5554

5655
when: 'indexbam' in workflowSteps
5756

@@ -73,24 +72,24 @@ process manta {
7372
output:
7473
file 'manta.vcf' into manta_vcf
7574

76-
publishDir params.outdir, mode: 'copy'
77-
78-
module 'bioinfo-tools'
79-
module "$params.modules.manta"
75+
publishDir params.outdir, mode: 'copy', saveAs: { "$params.prefix$it" }
8076

8177
errorStrategy { task.exitStatus == 143 ? 'retry' : 'terminate' }
82-
time { params.long_job * 2**(task.attempt-1) }
78+
time { params.runtime.caller * 2**(task.attempt-1) }
8379
maxRetries 3
8480
queue 'core'
8581
cpus 4
8682

83+
module 'bioinfo-tools'
84+
module "$params.modules.manta"
85+
8786
when: 'manta' in workflowSteps
8887

8988
script:
9089
"""
9190
configManta.py --normalBam bamfile --referenceFasta $params.ref_fasta --runDir testRun
9291
cd testRun
93-
./runWorkflow.py -m local -j $params.threads
92+
./runWorkflow.py -m local -j \$SLURM_CPUS_ON_NODE
9493
mv results/variants/diploidSV.vcf.gz ../manta.vcf.gz
9594
cd ..
9695
gunzip -c manta.vcf.gz > manta.vcf
@@ -113,13 +112,12 @@ if (!params.fastq) {
113112
output:
114113
file 'fastq.fq.gz' into fastq
115114

116-
module 'bioinfo-tools'
117-
module "$params.modules.samtools"
118-
119-
// We only need one core for this part
120115
executor choose_executor()
121116
queue 'core'
122-
time params.short_job
117+
time params.runtime.simple
118+
119+
module 'bioinfo-tools'
120+
module "$params.modules.samtools"
123121

124122
when: 'fastq' in workflowSteps
125123

@@ -140,7 +138,12 @@ process fermikit {
140138
output:
141139
file 'fermikit.vcf' into fermi_vcf
142140

143-
publishDir params.outdir, mode: 'copy'
141+
publishDir params.outdir, mode: 'copy', saveAs: { "$params.prefix$it" }
142+
143+
errorStrategy { task.exitStatus == 143 ? 'retry' : 'terminate' }
144+
time { params.runtime.fermikit * 2**( task.attempt - 1 ) }
145+
maxRetries 3
146+
queue 'node'
144147

145148
module 'bioinfo-tools'
146149
module "$params.modules.fermikit"
@@ -152,9 +155,9 @@ process fermikit {
152155

153156
script:
154157
"""
155-
fermi2.pl unitig -s$params.genome_size -t$params.threads -l$params.readlen -p sample sample.fq.gz > sample.mak
158+
fermi2.pl unitig -s3g -t\$SLURM_CPUS_ON_NODE -l150 -p sample sample.fq.gz > sample.mak
156159
make -f sample.mak
157-
run-calling -t$params.threads $params.ref_fasta sample.mag.gz > calling.sh
160+
run-calling -t\$SLURM_CPUS_ON_NODE $params.ref_fasta sample.mag.gz > calling.sh
158161
bash calling.sh
159162
vcf-sort -c sample.sv.vcf.gz > fermikit.vcf
160163
bgzip -c fermikit.vcf > fermikit.vcf.gz
@@ -182,12 +185,9 @@ process mask_beds {
182185
output:
183186
file '*_masked.vcf' into masked_vcfs
184187

185-
publishDir params.outdir, mode: 'copy'
186-
187-
// Does not use many resources, run it locally
188188
executor choose_executor()
189189
queue 'core'
190-
time params.short_job
190+
time params.runtime.simple
191191

192192
module 'bioinfo-tools'
193193
module "$params.modules.bedtools"
@@ -203,32 +203,25 @@ process mask_beds {
203203

204204

205205
// To make intersect files we need to combine them into one channel with
206-
// toSortedList() (fermi is before manta in alphabet). And also figure out if we
207-
// have one or two files, therefore the tap and count_vcfs.
208-
masked_vcfs.tap { count_vcfs_tmp }
209-
.tap { masked_vcfs }
206+
// toSortedList() (fermi is before manta in alphabet).
207+
masked_vcfs.tap { masked_vcfs }
208+
.filter( ~/manta|fermikit/ )
210209
.toSortedList().set { intersect_input }
211-
count_vcfs_tmp.count().set { count_vcfs }
212210

213211
process intersect_files {
214212
input:
215213
set file(fermi_vcf), file(manta_vcf) from intersect_input
216-
val nvcfs from count_vcfs
217214
output:
218215
file "combined_masked.vcf" into intersections
219-
file "combined_masked*.vcf"
220216

221-
publishDir params.outdir, mode: 'copy'
222-
223-
// Does not use many resources, run it locally
224217
executor choose_executor()
225218
queue 'core'
226-
time params.short_job
219+
time params.runtime.simple
227220

228221
module 'bioinfo-tools'
229222
module "$params.modules.bedtools"
230223

231-
when: nvcfs == 2
224+
when: 'make_intersect' in workflowSteps
232225

233226
script:
234227
"""
@@ -255,13 +248,13 @@ process variant_effect_predictor {
255248
input:
256249
file infile from annotate_files.tap { annotate_files }
257250
output:
258-
file '*.vep' into vep_outfiles
251+
file '*.vep.vcf'
259252

260-
publishDir params.outdir, mode: 'copy'
253+
publishDir params.outdir, mode: 'copy', saveAs: { "$params.prefix$it" }
261254

262255
executor choose_executor()
263256
queue 'core'
264-
time params.short_job
257+
time params.runtime.simple
265258

266259
module 'bioinfo-tools'
267260
module "$params.modules.vep"
@@ -270,28 +263,28 @@ process variant_effect_predictor {
270263

271264
script:
272265
"""
273-
infile="$infile"
274-
outfile="\$infile.vep"
275-
vep_cache="/sw/data/uppnex/vep/84"
276-
assembly="$params.vep.assembly"
277-
278-
case "\$infile" in
279-
*vcf) format="vcf" ;;
280-
*bed) format="ensembl" ;;
281-
*) printf "Unrecognized format for '%s'" "\$infile" >&2
266+
INFILE="$infile"
267+
OUTFILE="\${INFILE%.vcf}.vep.vcf"
268+
VEP_CACHE="/sw/data/uppnex/vep/84"
269+
ASSEMBLY="GRCh37"
270+
271+
case "\$INFILE" in
272+
*vcf) FORMAT="vcf" ;;
273+
*bed) FORMAT="ensembl" ;;
274+
*) printf "Unrecognized format for '%s'" "\$INFILE" >&2
282275
exit 1;;
283276
esac
284277
285-
## If the input file is empty, just copy
286-
if [ \$( wc -l "\$infile" | awk '{print \$1}' ) -eq 0 ]; then
287-
cp "\$infile" "\$outfile"
278+
## If the input file is empty, just copy it
279+
if [[ -f "\$INFILE" && -s "\$INFILE" ]]; then
280+
cp "\$INFILE" "\$OUTFILE"
288281
exit
289282
fi
290283
291284
variant_effect_predictor.pl \
292285
-i "\$infile" \
293-
--format "\$format" \
294-
-cache --dir "\$vep_cache" \
286+
--format "\$FORMAT" \
287+
-cache --dir "\$VEP_CACHE" \
295288
-o "\$outfile" \
296289
--vcf \
297290
--merged \
@@ -306,50 +299,51 @@ process variant_effect_predictor {
306299
--canonical \
307300
--ccds \
308301
--fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE \
309-
--assembly "\$assembly" \
302+
--assembly "\$ASSEMBLY" \
310303
--offline
311304
"""
312305
}
313306

314307
process snpEff {
315308
input:
316-
file vcf from annotate_files.tap { annotate_files }
309+
file infile from annotate_files.tap { annotate_files }
317310
output:
318-
file '*.snpeff'
311+
file '*.snpeff.vcf'
319312

320-
publishDir params.outdir, mode: 'copy'
313+
publishDir params.outdir, mode: 'copy', saveAs: { "$params.prefix$it" }
321314

322-
module 'bioinfo-tools'
323-
module "$params.modules.snpeff"
324-
325-
// Does not use many resources, run it locally
326315
executor choose_executor()
327316
queue 'core'
328-
time params.short_job
317+
time params.runtime.simple
318+
319+
module 'bioinfo-tools'
320+
module "$params.modules.snpeff"
321+
module "$params.modules.vt"
329322

330323
when: 'snpeff' in workflowSteps
331324

332325
script:
333326
"""
334-
vcf="$vcf" ## Use bash-semantics for variables
335-
snpeffjar=''
327+
INFILE="$infile" ## Use bash-semantics for variables
328+
OUTFILE="\${INFILE%.vcf}.snpeff.vcf"
329+
SNPEFFJAR=''
336330
337-
for p in \$( tr ':' ' ' <<<"\$CLASSPATH" ); do
338-
if [ -f "\$p/snpEff.jar" ]; then
339-
snpeffjar="\$p/snpEff.jar"
331+
for P in \$( tr ':' ' ' <<<"\$CLASSPATH" ); do
332+
if [ -f "\$P/snpEff.jar" ]; then
333+
SNPEFFJAR="\$P/snpEff.jar"
340334
break
341335
fi
342336
done
343-
if [ -z "\$snpeffjar" ]; then
337+
if [ -z "\$SNPEFFJAR" ]; then
344338
printf "Can't find snpEff.jar in '%s'" "\$CLASSPATH" >&2
345339
exit 1
346340
fi
347341
348-
sed 's/ID=AD,Number=./ID=AD,Number=R/' "\$vcf" \
342+
sed 's/ID=AD,Number=./ID=AD,Number=R/' "\$INFILE" \
349343
| vt decompose -s - \
350344
| vt normalize -r $params.ref_fasta - \
351-
| java -Xmx7G -jar "\$snpeffjar" -formatEff -classic GRCh37.75 \
352-
> "\$vcf.snpeff"
345+
| java -Xmx7G -jar "\$SNPEFFJAR" -formatEff -classic GRCh37.75 \
346+
> "\$OUTFILE"
353347
"""
354348
}
355349

@@ -369,11 +363,10 @@ def usage_message() {
369363
log.info ' --help Show this message and exit'
370364
log.info ' --fastq Input fastqfile (default is bam but with fq as fileending)'
371365
log.info ' --steps Specify what steps to run, comma separated:'
372-
log.info ' Callers: manta, fermikit, cnvnator (choose one or many)'
366+
log.info ' Callers: manta, fermikit (choose one or many)'
373367
log.info ' Annotation: vep OR snpeff'
374-
log.info ' --long_job Running time for long job (callers, fermi and manta)'
375-
log.info ' --short_job Running time for short jobs (bam indexing and bam2fq)'
376368
log.info ' --outdir Directory where resultfiles are stored'
369+
log.info ' --prefix Prefix for result filenames'
377370
log.info ''
378371
}
379372

@@ -439,6 +432,8 @@ def nextflow_running_as_slurmjob() {
439432
return false
440433
}
441434

435+
/* If the nextflow daemon is running as a slurm job, we can use the local CPU
436+
* for a lot of our work */
442437
def choose_executor() {
443438
return nextflow_running_as_slurmjob() ? 'local' : 'slurm'
444439
}
@@ -462,5 +457,9 @@ def processWorkflowSteps(steps) {
462457
workflowSteps.push( 'fastq' )
463458
}
464459

460+
if ('manta' in workflowSteps && 'fermikit' in workflowSteps) {
461+
workflowSteps.push( 'make_intersect' )
462+
}
463+
465464
return workflowSteps
466465
}

nextflow.config

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ params {
22
steps = 'manta,vep' // Change on commandline --steps x,y,z
33
project = "" // Set project or supply on commandline ( --project )
44
outdir = "results"
5+
prefix = ''
6+
7+
ref_fasta = "/sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37/human_g1k_v37.fasta"
58

69
// Modules and their versions on the HPC-system
710
modules {
@@ -13,36 +16,28 @@ params {
1316
bedtools = "BEDTools/2.25.0"
1417
vep = "vep/84"
1518
snpeff = "snpEff/4.2"
19+
vt = "vt/0.5772"
1620
}
1721

18-
// Caller specific options
19-
threads = 16
20-
genome_size = "3g"
21-
readlen = 150
22-
ref_fasta = "/sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37/human_g1k_v37.fasta"
23-
24-
vep {
25-
assembly = "GRCh37"
22+
runtime {
23+
simple = '30m' // Short simple shell jobs
24+
fermikit = '24h' // Fermikit is the longest running of them all
25+
caller = '10h' // The rest are a lot quicker
2626
}
27-
28-
long_job = '10h' // used for the callers (fermikit & manta)
29-
short_job = '30m' // used for bam indexing and bam2fq
3027
}
3128

3229
process {
33-
executor = 'slurm'
34-
time = params.long_job
35-
queue = "node"
36-
clusterOptions = {
37-
"-A $params.project"
38-
}
30+
executor = 'slurm'
31+
clusterOptions = {
32+
"-A $params.project"
33+
}
3934
}
4035

4136
executor {
4237
$slurm {
4338
queueSize = 10
4439
}
4540
$local {
46-
queueSize = 10
41+
queueSize = 1
4742
}
4843
}

0 commit comments

Comments
 (0)