Skip to content

Commit 6818f7f

Browse files
authored
Merge pull request #93 from NBISweden/mask_swegen
Mask swegen feature
2 parents 158015e + 76ee047 commit 6818f7f

File tree

9 files changed

+122
-56
lines changed

9 files changed

+122
-56
lines changed

README.md

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ Options:
6565
--steps Specify what steps to run, comma separated: (default: manta, vep)
6666
Callers: manta, fermikit
6767
Annotation: vep, snpeff
68-
Extra: normalize (with vt)
68+
Extra: normalize (with vt),
69+
filter (with bed files in masks_filters/, by default swegen is used)
70+
--sg_mask_ovlp Fractional overlap for use with the filter option
71+
--no_sg_reciprocal Don't use a reciprocal overlap for the filter option
6972
--outdir Directory where resultfiles are stored (default: results)
7073
--prefix Prefix for result filenames (default: no prefix)
7174
```
@@ -101,14 +104,23 @@ usage message for information on them.
101104
The next section has the reference assembly to use, both as fasta and assembly
102105
name.
103106

104-
You may want to use different versions of the modules used by this workflow, and
105-
the `modules` section contains all of them and their
106-
versions. Customize the modules here, NOT in the `main.nf` file.
107-
108-
Finally the `runtime` section has the different runtimes for the different
109-
parts of the workflow. `fermikit` has it's own timespec since that is a very
110-
long running program, otherwise the workflow differentiates between `callers`
111-
and other supporting `simple` single-core jobs.
107+
You may want to use different versions of the modules used by this workflow,
108+
currently you will have to edit the profiles to do that. On uppmax we have the
109+
milou profile which specifies all the modules and versions, see the
110+
`config/milou.config`.
111+
112+
The runtimes of the different programs is set in the `config/standard.config`
113+
file. That file also specifies how to deal with errors and the interaction
114+
with the Slurm scheduler, you probably don't want to change those unless you
115+
know what you are doing.
116+
117+
The two folders `masks_artifacts` and `masks_filters` contain bed files to
118+
filter the vcf-files from the callers. The artifact directory contains files
119+
that should mask out problematic regions, it removes everything that has an
120+
overlaps at least 25% with a region in the artifact mask. The filter one is
121+
for more stringent filtering of already known variants, and here the default
122+
filter threshold is instead a reciprocal overlap of 95%. It can be customized
123+
with the two options `sg_mask_ovlp` (default 0.95) and `no_sg_reciprocal`.
112124

113125

114126
## Support

config/devcore.config

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
process {
2+
clusterOptions = {
3+
"-A $params.project"
4+
}
25

3-
clusterOptions = {
4-
"-A $params.project"
5-
}
6-
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'terminate' }
7-
queue = 'devcore'
8-
executor = 'slurm'
9-
10-
time = '1h'
6+
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'terminate' }
7+
queue = 'devcore'
8+
executor = 'slurm'
119

10+
time = '1h'
1211
}
1312

1413
executor {

config/milou.config

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,46 @@
11
process {
2+
$index_bamfile {
3+
module = ['bioinfo-tools', 'samtools/1.3']
4+
}
25

3-
$index_bamfile {
4-
module = ['bioinfo-tools', 'samtools/1.3']
5-
}
6+
$manta {
7+
maxRetries = 3
8+
cpus = 16
9+
module = ['bioinfo-tools', 'manta/1.0.3']
10+
}
611

7-
$manta {
8-
maxRetries = 3
9-
cpus = 16
10-
module = ['bioinfo-tools', 'manta/1.0.3']
11-
}
12+
$create_fastq {
13+
module = ['bioinfo-tools', 'samtools/1.3']
14+
}
1215

13-
$create_fastq {
14-
module = ['bioinfo-tools', 'samtools/1.3']
16+
$fermikit {
17+
maxRetries = 3
18+
cpus = 16
19+
module = ['bioinfo-tools', 'samtools/1.3', 'fermikit/r178', "vcftools/0.1.14", "tabix/0.2.6"]
1520
}
1621

17-
$fermikit {
18-
maxRetries = 3
19-
cpus = 16
20-
module = ['bioinfo-tools', 'samtools/1.3', 'fermikit/r178', "vcftools/0.1.14", "tabix/0.2.6"]
22+
$artifact_mask_vcfs {
23+
module = ['bioinfo-tools', 'BEDTools/2.25.0']
2124
}
2225

23-
$mask_vcfs {
24-
module = ['bioinfo-tools', 'BEDTools/2.25.0']
26+
$swegen_mask_vcfs {
27+
module = ['bioinfo-tools', 'BEDTools/2.25.0']
2528
}
2629

27-
$intersect_files {
28-
module = ['bioinfo-tools', 'BEDTools/2.25.0']
30+
$intersect_files {
31+
module = ['bioinfo-tools', 'BEDTools/2.25.0']
2932
}
3033

31-
$normalize_vcf {
32-
module = ['bioinfo-tools', 'vt/0.5772']
34+
$normalize_vcf {
35+
module = ['bioinfo-tools', 'vt/0.5772']
3336
}
3437

35-
$variant_effect_predictor {
36-
cpus = 4
37-
module = ['bioinfo-tools', 'vep/84']
38+
$variant_effect_predictor {
39+
cpus = 4
40+
module = ['bioinfo-tools', 'vep/84']
3841
}
3942

40-
$snpEff {
41-
module = ['bioinfo-tools', 'snpEff/4.2', 'vt/0.5772']
43+
$snpEff {
44+
module = ['bioinfo-tools', 'snpEff/4.2', 'vt/0.5772']
4245
}
4346
}

config/standard.config

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
process {
2-
32
clusterOptions = {
43
"-A $params.project"
54
}
@@ -20,5 +19,5 @@ process {
2019

2120
$fermikit {
2221
time = params.runtime.fermikit * 2**( task.attempt -1 )
23-
22+
}
2423
}

main.nf

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -156,36 +156,75 @@ process fermikit {
156156

157157
// 3. Create summary files
158158

159-
mask_dir = file("$baseDir/masks")
160159
ch_vcfs = ch_manta_vcf.mix( ch_fermi_vcf )
161160

162-
process mask_vcfs {
161+
process artifact_mask_vcfs {
163162
input:
164-
each mask_dir from mask_dir
165163
set file(svfile), val(uuid), val(dir) from ch_vcfs
166164
output:
167-
set file('*_masked.vcf'), val(uuid), val(dir) into ch_masked_vcfs
165+
set file(svfile), val(uuid), val(dir) into ch_artifact_masked_vcfs
168166

169167
tag "$uuid $svfile"
170168

171169
executor choose_executor()
172170

173171
"""
174172
BNAME=\$( echo $svfile | cut -d. -f1 )
175-
MASK_FILE=\${BNAME}_masked.vcf
176-
MASK_DIR=$mask_dir
173+
MASK_DIR=$params.mask_dirs.masks_artifacts
177174
175+
# We don't want to change the filename in this process so we copy the
176+
# infile and remove the symbolic link. And then recreate the file at the
177+
# end.
178178
cp $svfile workfile
179+
rm $svfile # It's a link, should be ok :)
179180
for mask in \$MASK_DIR/*; do
181+
if [ ! -f "\$mask" ]; then
182+
break
183+
fi
180184
cat workfile \
181185
| bedtools intersect -header -v -a stdin -b \$mask -f 0.25 \
182186
> tempfile
183187
mv tempfile workfile
184188
done
189+
mv workfile $svfile
190+
"""
191+
}
192+
193+
194+
ch_noswegen_mask = ch_artifact_masked_vcfs.tap { ch_swegen_mask_in }
195+
reciprocal = params.no_sg_reciprocal ? '': '-r'
196+
197+
process swegen_mask_vcfs {
198+
input:
199+
set file(svfile), val(uuid), val(dir) from ch_swegen_mask_in
200+
output:
201+
set file('*_swegen_masked.vcf'), val(uuid), val(dir) into ch_swegen_masked_vcfs
202+
203+
tag "$uuid $svfile"
204+
205+
executor choose_executor()
206+
when 'filter' in workflowSteps
207+
208+
"""
209+
BNAME=\$( echo $svfile | cut -d. -f1 )
210+
MASK_FILE=\${BNAME}_swegen_masked.vcf
211+
MASK_DIR=$params.mask_dirs.masks_filters
212+
213+
cp $svfile workfile
214+
for mask in \$MASK_DIR/*; do
215+
if [ ! -f "\$mask" ]; then
216+
break
217+
fi
218+
cat workfile \
219+
| bedtools intersect -header -v -a stdin -b \$mask -f $params.sg_mask_ovlp $reciprocal \
220+
> tempfile
221+
mv tempfile workfile
222+
done
185223
mv workfile \$MASK_FILE
186224
"""
187225
}
188226

227+
ch_masked_vcfs = ch_noswegen_mask.mix(ch_swegen_masked_vcfs)
189228

190229
// To make intersect files we need to combine them into one channel with
191230
// toList() and then sort in the map so that fermi is before manta in the
@@ -199,7 +238,7 @@ process intersect_files {
199238
input:
200239
set file(vcfs), val(uuid), val(dir) from ch_intersect_input
201240
output:
202-
set file('combined_masked.vcf'), val(uuid), val("${dir[0]}") into ch_intersections
241+
set file('combined_*.vcf'), val(uuid), val("${dir[0]}") into ch_intersections
203242

204243
tag "$uuid"
205244

@@ -217,21 +256,22 @@ process intersect_files {
217256
manta_vcf=${vcfs[0]}
218257
fi
219258
259+
OUTNAME=`basename \$fermi_vcf|sed 's/fermikit_//;s/.vcf//'`
220260
## Create intersected vcf files
221261
for WORD in DEL INS DUP; do
222262
intersectBed -a <( grep -w "^#.\\+\\|\$WORD" \$fermi_vcf) \
223263
-b <( grep -w "^#.\\+\\|\$WORD" \$manta_vcf) \
224264
-f 0.5 -r \
225-
| sort -k1,1V -k2,2n > combined_masked_\${WORD,,}.vcf
265+
| sort -k1,1V -k2,2n > cmb_\${OUTNAME}_\${WORD,,}.vcf
226266
done
227267
228268
cat <( grep -v -w '^#.\\+\\|DEL\\|INS\\|DUP' \$fermi_vcf ) \
229269
<( grep -v -w '^#.\\+\\|DEL\\|INS\\|DUP' \$manta_vcf ) \
230270
| cut -f 1-8 \
231-
| sort -k1,1V -k2,2n > combined_masked_OTHER.vcf
271+
| sort -k1,1V -k2,2n > cmb_\${OUTNAME}_OTHER.vcf
232272
233273
( grep '^#' \$fermi_vcf; \
234-
sort -k1,1V -k2,2n combined_masked_*.vcf ) >> combined_masked.vcf
274+
sort -k1,1V -k2,2n cmb_\${OUTNAME}_*.vcf ) >> combined_\${OUTNAME}.vcf
235275
"""
236276
}
237277

@@ -333,7 +373,7 @@ process variant_effect_predictor {
333373
--total_length \
334374
--canonical \
335375
--ccds \
336-
--fork "\$SLURM_JOB_CPUS_PER_NODE" \
376+
\$( test "\$SLURM_CPUS_ON_NODE" -gt 1 && echo "--fork \$SLURM_CPUS_ON_NODE" ) \
337377
--fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE \
338378
--assembly "\$ASSEMBLY" \
339379
--offline
@@ -398,7 +438,10 @@ def usage_message() {
398438
log.info ' --steps Specify what steps to run, comma separated: (default: manta, vep)'
399439
log.info ' Callers: manta, fermikit'
400440
log.info ' Annotation: vep, snpeff'
401-
log.info ' Extra: normalize (with vt)'
441+
log.info ' Extra: normalize (with vt),'
442+
log.info ' filter (with bed files in masks_filters/, by default swegen is used)'
443+
log.info ' --sg_mask_ovlp Fractional overlap for use with the filter option'
444+
log.info ' --no_sg_reciprocal Don't use a reciprocal overlap for the filter option'
402445
log.info ' --outdir Directory where resultfiles are stored (default: results)'
403446
log.info ' --prefix Prefix for result filenames (default: no prefix)'
404447
log.info ''
File renamed without changes.

masks_filters/.keep

Whitespace-only changes.

nextflow.config

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ params {
88
prefix = ''
99
help = False
1010

11+
//optional swegen masking
12+
sg_mask_ovlp = 0.95 // --sg_mask_ovlp in case other overlap value than default 0.95 for swegen
13+
no_sg_reciprocal = False // --no_sg_reciprocal in case swegen masking should not include reciprocal overlap option
14+
1115
// Reference assemblies
1216
ref_fasta = "/sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37/human_g1k_v37.fasta"
1317
assembly = 'GRCh37'
@@ -18,6 +22,12 @@ params {
1822
fermikit = '48h' // Fermikit is the longest running of them all
1923
caller = '24h' // The rest are a lot quicker
2024
}
25+
26+
// Dirs that contain BED files for masking
27+
mask_dirs {
28+
masks_artifacts = "$baseDir/masks_artifacts"
29+
masks_filters = "$baseDir/masks_filters"
30+
}
2131
}
2232

2333
profiles {

0 commit comments

Comments
 (0)