|
#!/usr/bin/env python3
"""
02_1_BWA.py: Mapping with BWA

Writes one .sh recipe per stage (BWA -> Sort -> MarkDuplicates -> BQSR ->
ApplyBQSR) and, unless --dryrun is given, submits each to Slurm with an
afterok dependency on the previous stage's job id.
"""
import argparse
import configparser
import os
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument("input", help="Input FASTQ file", nargs=2)
# nargs="?" is required for the default to ever apply: a bare positional is
# always mandatory and argparse silently ignores its default otherwise.
parser.add_argument("output", help="Output directory", nargs="?", default=os.getcwd())
parser.add_argument("-c", "--config", help="config INI file", default="../config.ini")

parser.add_argument("-n", "--dryrun", help="Don't actually run any recipe; just make .SH only", default=False, action="store_true")

args = parser.parse_args()

config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
config.read(args.config)

# Sort so R1 precedes R2; the sample name is the first file's prefix up to
# the first underscore (e.g. SAMPLE_R1.fastq.gz -> SAMPLE).
args.input.sort()
name = args.input[0].split("/")[-1].split("_")[0]
args.output = os.path.realpath(args.output)

# Mapping
# BUGFIX: pass BOTH mates (input[0] and input[1]); the previous command
# passed the first FASTQ twice, so read 2 was never mapped.
with open(f"BWA_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['bwa']} mem -M -t {config['DEFAULT']['threads']} -R '@RG\\tID:{name}\\tPL:ILLUMINA\\tLB:{name}\\tSM:{name}\\tCN:UNIST' -v 3 {config['REFERENCES']['fasta']} {args.input[0]} {args.input[1]} | {config['TOOLS']['samtools']} view --bam --with-header --threads {config['DEFAULT']['threads']} --reference {config['REFERENCES']['fasta']} --output {args.output}/{name}.bam")

if not args.dryrun:
    # sbatch prints "Submitted batch job <id>"; the last token is the job id.
    mapping_job_id = subprocess.check_output(f"sbatch --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='BWA_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL BWA_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Sort (per-thread memory = total memory / threads)
with open(f"Sort_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['samtools']} sort -l 9 --threads {config['DEFAULT']['threads']} -m {int(config['DEFAULT']['memory']) // int(config['DEFAULT']['threads'])}G --reference {config['REFERENCES']['fasta']} --write-index -o {args.output}/{name}.Sort.bam {args.output}/{name}.bam")

if not args.dryrun:
    sorting_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{mapping_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='Sort_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL Sort_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Mark Duplicates
with open(f"MarkDup_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} MarkDuplicatesSpark --input {args.output}/{name}.Sort.bam --output {args.output}/{name}.Sort.MarkDuplicates.bam --reference {config['REFERENCES']['fasta']} --metrics-file {args.output}/{name}.Sort.MarkDuplicates.metrics --duplicate-tagging-policy 'OpticalOnly' -- --spark-master 'local[{config['DEFAULT']['threads']}]' --spark-verbosity 'INFO'")

if not args.dryrun:
    markduplicates_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{sorting_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='MarkDup_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL MarkDup_{name}.sh", encoding="utf-8", shell=True).split()[-1]

# Base Quality Score Recalibration (BQSR); known-sites files come from the
# config as a space-separated list.
with open(f"BQSR_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} BaseRecalibrator --input {args.output}/{name}.Sort.MarkDuplicates.bam --reference {config['REFERENCES']['fasta']} --output {args.output}/{name}.Sort.MarkDuplicates.BQSR.table --create-output-bam-index true")
    for site in config['REFERENCES']['sites'].split(" "):
        sh.write(f" --known-sites {site}")

if not args.dryrun:
    BQSR_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{markduplicates_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='BQSR_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL BQSR_{name}.sh", encoding="utf-8", shell=True).split()[-1]

with open(f"ApplyBQSR_{name}.sh", "w") as sh:
    sh.write("#!/bin/bash\n")
    sh.write(f"{config['TOOLS']['gatk']} ApplyBQSR --bqsr-recal-file {args.output}/{name}.Sort.MarkDuplicates.BQSR.table --input {args.output}/{name}.Sort.MarkDuplicates.bam --output {args.output}/{name}.Sort.MarkDuplicates.BQSR.bam --reference {config['REFERENCES']['fasta']} --create-output-bam-index true")

if not args.dryrun:
    ApplyBQSR_job_id = subprocess.check_output(f"sbatch --dependency=afterok:{BQSR_job_id} --chdir=$(realpath .) --cpus-per-task={config['DEFAULT']['threads']} --error='%x-%A.txt' --job-name='ApplyBQSR_{name}' --mem={config['DEFAULT']['memory']}G --output='%x-%A.txt' --export=ALL ApplyBQSR_{name}.sh", encoding="utf-8", shell=True).split()[-1]
#!/usr/bin/env python3
"""
02_1_BWA.py: Mapping with BWA
"""
import argparse
import os
import sys

# BUGFIX: extend sys.path BEFORE importing pipeline_utils. Previously the
# append ran after the import, so it could never help resolve the module.
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))

from pipeline_utils import PipelineManagerBase
| 12 | + |
class PipelineManager(PipelineManagerBase):
    """Slurm pipeline for paired-end BWA mapping and GATK post-processing.

    Stage chain: BWA -> Sort -> MarkDuplicates -> BQSR -> ApplyBQSR; each
    run_* method writes a .sh recipe via create_sh() and submits it via
    submit_job(), returning the Slurm job id used as the next stage's
    afterok dependency. (create_sh/submit_job/config/output_dir come from
    PipelineManagerBase — contract assumed from usage; confirm in
    pipeline_utils.)
    """

    def __init__(self, input_files, output, config_file, dryrun):
        super().__init__(config_file, dryrun, output_dir=output)
        # Sort so R1 precedes R2; sample name is the first file's prefix up
        # to the first underscore (e.g. SAMPLE_R1.fastq.gz -> SAMPLE).
        self.input_files = sorted(input_files)
        self.name = os.path.basename(self.input_files[0]).split("_")[0]

    def run_bwa(self, dependency_id=None):
        """Map the FASTQ pair with bwa mem, piping into an unsorted BAM.

        BUGFIX: pass BOTH mates (input_files[0] and input_files[1]); the
        previous command passed the first FASTQ twice, so read 2 was never
        mapped. `dependency_id` is accepted only for signature uniformity —
        the mapping stage has no upstream job.
        """
        command = f"{self.config['TOOLS']['bwa']} mem -M -t {self.config['DEFAULT']['threads']} -R '@RG\\tID:{self.name}\\tPL:ILLUMINA\\tLB:{self.name}\\tSM:{self.name}\\tCN:UNIST' -v 3 {self.config['REFERENCES']['fasta']} {self.input_files[0]} {self.input_files[1]} | {self.config['TOOLS']['samtools']} view --bam --with-header --threads {self.config['DEFAULT']['threads']} --reference {self.config['REFERENCES']['fasta']} --output {self.output_dir}/{self.name}.bam"
        self.create_sh("BWA", command)
        return self.submit_job("BWA")

    def run_sort(self, dependency_id=None):
        """Coordinate-sort the mapped BAM (per-thread memory = total // threads)."""
        command = f"{self.config['TOOLS']['samtools']} sort -l 9 --threads {self.config['DEFAULT']['threads']} -m {int(self.config['DEFAULT']['memory']) // int(self.config['DEFAULT']['threads'])}G --reference {self.config['REFERENCES']['fasta']} --write-index -o {self.output_dir}/{self.name}.Sort.bam {self.output_dir}/{self.name}.bam"
        self.create_sh("Sort", command)
        return self.submit_job("Sort", dependency_id=dependency_id)

    def run_mark_duplicates(self, dependency_id=None):
        """Tag optical duplicates with GATK MarkDuplicatesSpark (local Spark master)."""
        command = f"{self.config['TOOLS']['gatk']} MarkDuplicatesSpark --input {self.output_dir}/{self.name}.Sort.bam --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --reference {self.config['REFERENCES']['fasta']} --metrics-file {self.output_dir}/{self.name}.Sort.MarkDuplicates.metrics --duplicate-tagging-policy 'OpticalOnly' -- --spark-master 'local[{self.config['DEFAULT']['threads']}]' --spark-verbosity 'INFO'"
        self.create_sh("MarkDup", command)
        return self.submit_job("MarkDup", dependency_id=dependency_id)

    def run_bqsr(self, dependency_id=None):
        """Build the base-quality recalibration table from config's space-separated known-sites list."""
        known_sites = " ".join([f"--known-sites {site}" for site in self.config["REFERENCES"]["sites"].split(" ")])
        command = f"{self.config['TOOLS']['gatk']} BaseRecalibrator --input {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --reference {self.config['REFERENCES']['fasta']} --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.table --create-output-bam-index true {known_sites}"
        self.create_sh("BQSR", command)
        return self.submit_job("BQSR", dependency_id=dependency_id)

    def run_apply_bqsr(self, dependency_id=None):
        """Apply the BQSR table, producing the final recalibrated, indexed BAM."""
        command = f"{self.config['TOOLS']['gatk']} ApplyBQSR --bqsr-recal-file {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.table --input {self.output_dir}/{self.name}.Sort.MarkDuplicates.bam --output {self.output_dir}/{self.name}.Sort.MarkDuplicates.BQSR.bam --reference {self.config['REFERENCES']['fasta']} --create-output-bam-index true"
        self.create_sh("ApplyBQSR", command)
        return self.submit_job("ApplyBQSR", dependency_id=dependency_id)
| 44 | + |
| 45 | + |
def parse_arguments():
    """Parse CLI arguments: two FASTQ paths, optional output dir, config path, dry-run flag.

    BUGFIX: `output` now uses nargs="?" so its default (the current working
    directory) can actually apply; a bare positional is always required and
    argparse ignores its `default` otherwise.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input", help="Input FASTQ file", nargs=2)
    parser.add_argument("output", help="Output directory", nargs="?", default=os.getcwd())
    parser.add_argument("-c", "--config", help="config INI file", default="../config.ini")
    parser.add_argument("-n", "--dryrun", help="Don't actually run any recipe; just make .SH only", default=False, action="store_true")

    return parser.parse_args()
| 55 | + |
| 56 | + |
def main():
    """Run the full mapping pipeline, chaining each Slurm job on its predecessor."""
    args = parse_arguments()

    pipeline = PipelineManager(input_files=args.input, output=args.output, config_file=args.config, dryrun=args.dryrun)

    # Each stage returns a Slurm job id; the next stage waits on it (afterok).
    job_id = pipeline.run_bwa()
    for stage in (pipeline.run_sort, pipeline.run_mark_duplicates, pipeline.run_bqsr, pipeline.run_apply_bqsr):
        job_id = stage(dependency_id=job_id)


if __name__ == "__main__":
    main()
0 commit comments