Skip to content

Commit 202785c

Browse files
committed
Update by dgwoo@unist.ac.kr
Signed-off-by: Jaewoong Lee <230@fumire.moe>
1 parent 915282d commit 202785c

File tree

8 files changed

+343
-188
lines changed

8 files changed

+343
-188
lines changed

02_Data_pre-processing_for_variant_discovery/02_1_BWA.bash

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,3 @@ set -euo pipefail
33
IFS=$'\n\t'
44

55
python3 02_1_BWA.py /BiO/Store/Standard-Pipeline/cn95N_S0_L009_R1_001.fastq.gz /BiO/Store/Standard-Pipeline/cn95N_S0_L009_R2_001.fastq.gz .
6-
python3 02_1_BWA.py /BiO/Store/Standard-Pipeline/cn95P_S0_L009_R1_001.fastq.gz /BiO/Store/Standard-Pipeline/cn95P_S0_L009_R2_001.fastq.gz .
Lines changed: 70 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,70 @@
#!/usr/bin/env python3
"""
02_1_BWA.py: Mapping with BWA

Writes one Slurm batch script per step (BWA mem -> samtools sort ->
MarkDuplicatesSpark -> BaseRecalibrator -> ApplyBQSR) and submits each with
sbatch, chaining every job on the previous one via --dependency=afterok.
With --dryrun, only the .sh files are written and nothing is submitted.
"""
import argparse
import configparser
import os
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument("input", help="Input FASTQ file", nargs=2)
# nargs="?" makes this positional optional; without it argparse requires the
# argument and the declared default is dead code.
parser.add_argument("output", help="Output directory", nargs="?", default=os.getcwd())
parser.add_argument("-c", "--config", help="config INI file", default="../config.ini")

parser.add_argument("-n", "--dryrun", help="Don't actually run any recipe; just make .SH only", default=False, action="store_true")

args = parser.parse_args()

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
config.read(args.config)

# Sort so R1 precedes R2; the sample name is the prefix before the first "_".
args.input.sort()
name = args.input[0].split("/")[-1].split("_")[0]
args.output = os.path.realpath(args.output)

# Mapping
with open(f"BWA_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    # BUG FIX: the mate file must be R2 (args.input[1]); the original passed
    # args.input[0] twice, so the R2 reads were never mapped.
    sh.write(f"{config['TOOLS']['bwa']} mem -M -t {config['DEFAULT']['threads']} -R '@RG\\tID:{name}\\tPL:ILLUMINA\\tLB:{name}\\tSM:{name}\\tCN:UNIST' -v 3 {config['REFERENCES']['fasta']} {args.input[0]} {args.input[1]} | {config['TOOLS']['samtools']} view --bam --with-header --threads {config['DEFAULT']['threads']} --reference {config['REFERENCES']['fasta']} --output {args.output}/{name}.bam")

if not args.dryrun:
    # NOTE(review): shell=True with interpolated config values — paths from
    # config.ini are treated as trusted here.
    mapping_job_id = subprocess.check_output(f"sbatch --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='BWA_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL BWA_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Sort
with open(f"Sort_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['samtools']} sort -l 9 --threads {config['DEFAULT']['threads']} -m {int(config['DEFAULT']['memory']) // int(config['DEFAULT']['threads'])}G --reference {config['REFERENCES']['fasta']} --write-index -o {args.output}/{name}.Sort.bam {args.output}/{name}.bam")

if not args.dryrun:
    sorting_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{mapping_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='Sort_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL Sort_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Mark Duplicates
with open(f"MarkDup_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} MarkDuplicatesSpark --input {args.output}/{name}.Sort.bam --output {args.output}/{name}.Sort.MarkDuplicates.bam --reference {config['REFERENCES']['fasta']} --metrics-file {args.output}/{name}.Sort.MarkDuplicates.metrics --duplicate-tagging-policy 'OpticalOnly' -- --spark-master 'local[{config['DEFAULT']['threads']}]' --spark-verbosity 'INFO'")

if not args.dryrun:
    markduplicates_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{sorting_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='MarkDup_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL MarkDup_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Base Quality Score Recalibration (BQSR)
with open(f"BQSR_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} BaseRecalibrator --input {args.output}/{name}.Sort.MarkDuplicates.bam --reference {config['REFERENCES']['fasta']} --output {args.output}/{name}.Sort.MarkDuplicates.BQSR.table --create-output-bam-index true")
    # Each known-sites VCF from the config becomes its own --known-sites flag.
    for site in config['REFERENCES']['sites'].split(" "):
        sh.write(f" --known-sites {site}")

if not args.dryrun:
    BQSR_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{markduplicates_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='BQSR_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL BQSR_{name}.sh", encoding="utf-8", shell=True).split()[-1]

with open(f"ApplyBQSR_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} ApplyBQSR --bqsr-recal-file {args.output}/{name}.Sort.MarkDuplicates.BQSR.table --input {args.output}/{name}.Sort.MarkDuplicates.bam --output {args.output}/{name}.Sort.MarkDuplicates.BQSR.bam --reference {config['REFERENCES']['fasta']} --create-output-bam-index true")

if not args.dryrun:
    ApplyBQSR_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{BQSR_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='ApplyBQSR_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL ApplyBQSR_{name}.sh", encoding="utf-8", shell=True).split()[-1]
#!/usr/bin/env python3
"""
02_1_BWA.py: Mapping with BWA
"""
import argparse
import os
import sys

# BUG FIX: the parent directory must be on sys.path BEFORE importing
# pipeline_utils; the original appended it after the import, so the path
# hack could never take effect.
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))

from pipeline_utils import PipelineManagerBase  # noqa: E402
class PipelineManager(PipelineManagerBase):
    """Variant-discovery pre-processing pipeline for one sample.

    Each step (BWA mapping, sort, duplicate marking, BQSR, ApplyBQSR) is
    written as a Slurm batch script via ``create_sh`` and submitted via
    ``submit_job`` (both inherited from PipelineManagerBase), with each job
    chained on the previous one through ``dependency_id``.
    """

    def __init__(self, input_files, output, config_file, dryrun):
        """Set up config/output via the base class and derive the sample name.

        Args:
            input_files: two paired-end FASTQ paths (R1/R2, any order).
            output: output directory.
            config_file: path to the INI config file.
            dryrun: if true, only write the .sh files; do not submit.
        """
        super().__init__(config_file, dryrun, output_dir=output)
        # Sort so R1 precedes R2; sample name is the prefix before the first "_".
        self.input_files = sorted(input_files)
        self.name = os.path.basename(self.input_files[0]).split("_")[0]

    def run_bwa(self, dependency_id=None):
        """Map paired-end reads with bwa mem and pipe straight into a BAM."""
        # BUG FIX: pass R2 (input_files[1]) as the mate file; the original
        # passed input_files[0] twice, so the R2 reads were never mapped.
        command = f"{self.config['TOOLS']['bwa']} mem -M -t {self.config['DEFAULT']['threads']} -R '@RG\\tID:{self.name}\\tPL:ILLUMINA\\tLB:{self.name}\\tSM:{self.name}\\tCN:UNIST' -v 3 {self.config['REFERENCES']['fasta']} {self.input_files[0]} {self.input_files[1]} | {self.config['TOOLS']['samtools']} view --bam --with-header --threads {self.config['DEFAULT']['threads']} --reference {self.config['REFERENCES']['fasta']} --output {self.output_dir}/{self.name}.bam"
        self.create_sh("BWA", command)
        return self.submit_job("BWA")

    def run_sort(self, dependency_id=None):
        """Coordinate-sort the mapped BAM and write an index."""
        command = f"{self.config['TOOLS']['samtools']} sort -l 9 --threads {self.config['DEFAULT']['threads']} -m {int(self.config['DEFAULT']['memory']) // int(self.config['DEFAULT']['threads'])}G --reference {self.config['REFERENCES']['fasta']} --write-index -o {self.output_dir}/{self.name}.Sort.bam {self.output_dir}/{self.name}.bam"
        self.create_sh("Sort", command)
        return self.submit_job("Sort", dependency_id=dependency_id)

    def run_mark_duplicates(self, dependency_id=None):
        """Tag optical duplicates with GATK MarkDuplicatesSpark (local Spark)."""
        command = f"{self.config['TOOLS']['gatk']} MarkDuplicatesSpark --input {self.output_dir}/{self.name}.Sort.bam --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --reference {self.config['REFERENCES']['fasta']} --metrics-file {self.output_dir}/{self.name}.Sort.MarkDuplicates.metrics --duplicate-tagging-policy 'OpticalOnly' -- --spark-master 'local[{self.config['DEFAULT']['threads']}]' --spark-verbosity 'INFO'"
        self.create_sh("MarkDup", command)
        return self.submit_job("MarkDup", dependency_id=dependency_id)

    def run_bqsr(self, dependency_id=None):
        """Build the base-quality recalibration table with BaseRecalibrator."""
        # One --known-sites flag per space-separated VCF path in the config.
        known_sites = " ".join([f"--known-sites {site}" for site in self.config["REFERENCES"]["sites"].split(" ")])
        command = f"{self.config['TOOLS']['gatk']} BaseRecalibrator --input {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --reference {self.config['REFERENCES']['fasta']} --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.table --create-output-bam-index true {known_sites}"
        self.create_sh("BQSR", command)
        return self.submit_job("BQSR", dependency_id=dependency_id)

    def run_apply_bqsr(self, dependency_id=None):
        """Apply the recalibration table, producing the final analysis BAM."""
        command = f"{self.config['TOOLS']['gatk']} ApplyBQSR --bqsr-recal-file {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.table --input {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.bam --reference {self.config['REFERENCES']['fasta']} --create-output-bam-index true"
        self.create_sh("ApplyBQSR", command)
        return self.submit_job("ApplyBQSR", dependency_id=dependency_id)
45+
46+
def parse_arguments():
    """Parse command-line arguments for the BWA pre-processing pipeline.

    Returns:
        argparse.Namespace with ``input`` (two FASTQ paths), ``output``
        (output directory; defaults to the current working directory),
        ``config`` (INI path) and ``dryrun`` (bool).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input", help="Input FASTQ file", nargs=2)
    # BUG FIX: nargs="?" makes this positional optional; without it argparse
    # requires the argument and the declared default is dead code.
    parser.add_argument("output", help="Output directory", nargs="?", default=os.getcwd())
    parser.add_argument("-c", "--config", help="config INI file", default="../config.ini")
    parser.add_argument("-n", "--dryrun", help="Don't actually run any recipe; just make .SH only", default=False, action="store_true")

    return parser.parse_args()
def main():
    """Run the whole pre-processing pipeline for one sample.

    Parses the CLI, builds a PipelineManager, then submits the five steps
    with each Slurm job depending on completion of the previous one.
    """
    args = parse_arguments()

    pipeline = PipelineManager(input_files=args.input, output=args.output, config_file=args.config, dryrun=args.dryrun)

    # Thread the returned job id through the steps so every sbatch submission
    # waits (afterok) on the step before it.
    job_id = pipeline.run_bwa()
    for step in (pipeline.run_sort, pipeline.run_mark_duplicates, pipeline.run_bqsr, pipeline.run_apply_bqsr):
        job_id = step(dependency_id=job_id)


if __name__ == "__main__":
    main()

03_Somatic_short_variant_discovery/03_1_Mutect2.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
set -euo pipefail
33
#IFS=$'\n\t'
44

5-
python3 03_1_Mutect2.py /BiO/Research/Standard-Pipeline/02_Data_pre-processing_for_variant_discovery/cn95N.Sort.MarkDuplicates.BQSR.bam /BiO/Research/Standard-Pipeline/02_Data_pre-processing_for_variant_discovery/cn95P.Sort.MarkDuplicates.BQSR.bam cn95.maf
5+
python3 03_1_Mutect2.py /BiO/Research/Standard-Pipeline/02_Data_pre-processing_for_variant_discovery/cn95N.Sort.MarkDuplicates.BQSR.bam /BiO/Research/Standard-Pipeline/02_Data_pre-processing_for_variant_discovery/cn95P.Sort.MarkDuplicates.BQSR.bam cn95.maf -p

0 commit comments

Comments
 (0)