-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproc_dist_ne.py
More file actions
executable file
·184 lines (150 loc) · 5.36 KB
/
proc_dist_ne.py
File metadata and controls
executable file
·184 lines (150 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#! /usr/bin/env python3
import ibdutils.utils.ibdutils as ibdutils
import ibdutils.runner.ibdne as ibdne
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from pathlib import Path
ibd_jar_default = str(Path(__file__).parent / "ibdne.jar")
# parse arguments
pa = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
pa.add_argument("--ibd_files", type=str, nargs=14, required=True)
pa.add_argument("--vcf_files", type=str, nargs=14, required=True)
pa.add_argument("--genome_set_id", type=str, required=True)
pa.add_argument("--ibdne_jar", type=str, default=ibd_jar_default)
pa.add_argument(
"--peak_validate_meth", type=str, choices=["xirs", "ihs"], default="xirs"
)
pa.add_argument("--ibdne_mincm", type=float, default=2)
pa.add_argument("--ibdne_minregion", type=float, default=10)
pa.add_argument("--ibdne_flatmeth", type=str, default="none")
# This can be used to specify which chromosome to use, especially when selection
# is not established on some chromosomes and we only use those that are
# established to better observe selection effects
pa.add_argument("--ibdne_chrlist", type=int, nargs="+")
pa.add_argument(
"--ibdne_no_diploid_conversion",
type=str,
choices=["true", "false"],
default="false",
)
args = pa.parse_args()
print(args)
assert args.ibdne_jar is not None
assert args.ibdne_flatmeth in ["none", "merge", "keep_hap_1_only"]
idx = args.genome_set_id
ibdne_mincm = args.ibdne_mincm
ibdne_minregion = args.ibdne_minregion
# output prefix string
label_str = "_".join(
[
str(x)
for x in [
args.genome_set_id,
args.ibdne_mincm,
args.ibdne_minregion,
args.ibdne_flatmeth,
]
]
)
# read ibd
genome_14_100 = ibdutils.Genome.get_genome("simu_14chr_100cm")
ibd = ibdutils.IBD(genome=genome_14_100, label=f"{label_str}_orig")
ibd.read_ibd(ibd_fn_lst=args.ibd_files)
ibd.calc_ibd_cov()
ibd.find_peaks()
# output files:
of_ibddist_obj = f"{label_str}.ibddist.ibdobj.gz"
# store combined IBD for IBD distribution analysis
ibd.pickle_dump(of_ibddist_obj)
# remove highly relatedness samples
mat = ibd.make_ibd_matrix()
unrelated_samples = ibd.get_unrelated_samples(ibdmat=mat)
ibd.subset_ibd_by_samples(subset_samples=unrelated_samples)
# ######################################
# prepare input for IBDNe
# remove ibd with tmrca < 1.5 (required according to IBDNe paper)
ibd.filter_ibd_by_time(min_tmrca=1.5)
ibd.filter_ibd_by_length(min_seg_cm=ibdne_mincm)
# calculate XiR,s
if args.peak_validate_meth == "xirs":
ibd.calc_xirs(vcf_fn_lst=args.vcf_files, min_maf=0.01)
elif args.peak_validate_meth == "ihs":
ibd.calc_ihs(vcf_fn_lst=args.vcf_files, min_maf=0.01)
else:
raise NotImplementedError(f"{args.peak_validate_meth} is not valid method")
if args.ibdne_no_diploid_conversion == "true":
print("skip diploid conversion")
elif args.ibdne_no_diploid_conversion == "false":
# convert to heterzygous diploids
# Note: remove_hbd might not remove a lot segments as
# hdb only involves n/2 pairs of a total of n(n-1)/2 pairs (ratio: 1/(n-1)).
# When n is large, the difference is small
ibd.convert_to_heterozygous_diploids(remove_hbd=True)
if args.ibdne_flatmeth != "none":
ibd.flatten_diploid_ibd(method=args.ibdne_flatmeth)
else:
raise NotImplementedError(
f"{args.ibdne_no_diploid_conversion} is not valid value is not a value value for option --ibdne_no_diploid_conversion"
)
# calculate coverage and remove peaks
ibd.calc_ibd_cov()
ibd.find_peaks()
if args.peak_validate_meth == "xirs":
xirs_df = ibd._xirs_df
ibd.filter_peaks_by_xirs(xirs_df)
elif args.peak_validate_meth == "ihs":
ibd.filter_peaks_by_ihs(min_ihs_hits=1)
else:
raise NotImplementedError(f"{args.peak_validate_meth} is not valid method")
ibd2 = ibd.duplicate(f"{label_str}_rmpeaks")
ibd2.remove_peaks()
ibd2._df = ibd2.cut_and_split_ibd()
# filter by chromosome for Ne
if args.ibdne_chrlist is not None:
ibd._df = ibd._df[lambda df: df.Chromosome.isin(args.ibdne_chrlist)]
ibd2._df = ibd2._df[lambda df: df.Chromosome.isin(args.ibdne_chrlist)]
# write ne file
of_orig_ibdne_obj = f"{label_str}_orig.ibdne.ibdobj.gz"
ibd.pickle_dump(of_orig_ibdne_obj)
of_rmpeaks_ibdne_obj = f"{label_str}_rmpeaks.ibdne.ibdobj.gz"
ibd2.pickle_dump(of_rmpeaks_ibdne_obj)
# link ibdne.jar file
if not Path("ibdne.jar").exists():
assert Path(args.ibdne_jar).exists()
this = Path("ibdne.jar")
target = Path(args.ibdne_jar).absolute()
this.symlink_to(target)
print(f"link {this} -> {target}")
# use nerunner (dry_run) to preare inputs and bash scripts
# --- for ibd before removing ibd peaks ------------------------
nerunner = ibdne.IbdNeRunner(
ibd=ibd,
input_folder=".",
output_folder=".",
mincm=ibdne_mincm,
minregion=ibdne_minregion,
)
nerunner.run(nthreads=10, mem_gb=20, dry_run=True)
# --- for ibd after removing ibd peaks ------------------------
nerunner = ibdne.IbdNeRunner(
ibd=ibd2,
input_folder=".",
output_folder=".",
mincm=ibdne_mincm,
minregion=ibdne_minregion,
)
nerunner.run(nthreads=10, mem_gb=20, dry_run=True)
print(
f"""
output files:
{of_ibddist_obj}
ibdne.jar
{idx}_orig.sh
{idx}_orig.map
{idx}_orig.ibd.gz
{idx}_rmpeaks.sh
{idx}_rmpeaks.map
{idx}_rmpeaks.ibd.gz
{of_orig_ibdne_obj}
{of_rmpeaks_ibdne_obj}
"""
)