-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_SMILES.py
More file actions
executable file
·107 lines (85 loc) · 3.29 KB
/
get_SMILES.py
File metadata and controls
executable file
·107 lines (85 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import dill
import numpy as np
import pandas as pd
import requests
import re
import json
# fix mismatch between two mappings
def fix_mismatch(idx2atc, atc2ndc, ndc2atc_original_path):
    """Return ``atc2ndc`` extended with rows for codes it is missing.

    Every value of ``idx2atc`` that does not appear in ``atc2ndc.NDC`` gets a
    new row whose ``NDC_orig`` cell lists the matching ``NDC`` values from the
    CSV (dashes removed). Two marker rows, ``'seperator'`` and
    ``'decoder_point'``, each with an empty list, are always appended last.

    Args:
        idx2atc: Mapping whose values are the codes that must be present in
            the ``NDC`` column of the result.
        atc2ndc: DataFrame with ``NDC`` and ``NDC_orig`` columns. It is not
            modified.
        ndc2atc_original_path: Path to a CSV file with ``NDC`` and ``ATC5``
            columns; ``ATC5`` values are truncated to their first four
            characters before matching.

    Returns:
        A new DataFrame (index renumbered from 0) holding the original rows
        followed by the added rows.
    """
    # Close the CSV handle deterministically instead of leaking it.
    with open(ndc2atc_original_path, 'rb') as csv_file:
        ndc2atc = pd.read_csv(csv_file)
    ndc2atc.ATC5 = ndc2atc.ATC5.apply(lambda code: code[:4])

    # Build the lookup once; the column does not change while we scan.
    known_codes = set(atc2ndc.NDC.tolist())
    missing = [code for code in idx2atc.values() if code not in known_codes]

    new_rows = [
        {
            'NDC': code,
            'NDC_orig': [
                ndc.replace('-', '')
                for ndc in ndc2atc[ndc2atc.ATC5 == code].NDC.tolist()
            ],
        }
        for code in missing
    ]
    new_rows.append({'NDC': 'seperator', 'NDC_orig': []})
    new_rows.append({'NDC': 'decoder_point', 'NDC_orig': []})

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # copying the whole frame once per added row.
    return pd.concat([atc2ndc, pd.DataFrame(new_rows)], ignore_index=True)
def ndc2smiles(NDC, timeout=30):
    """Look up a SMILES string for an NDC code by scraping three web pages.

    The lookup chain is: ndclist.com search page -> proprietary name ->
    DrugBank product-concept search -> first ``DB<digits>`` identifier ->
    DrugBank drug page -> SMILES field.

    Args:
        NDC: NDC code as a string; it is concatenated into the search URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The text captured from the SMILES field of the DrugBank drug page.

    Raises:
        IndexError: If any of the three pages does not contain the expected
            pattern (each lookup takes the first regex match).
        requests.exceptions.RequestException: On connection errors or when a
            request exceeds ``timeout``.
    """
    # Without a timeout a single stalled host would hang the whole scrape.
    listing = requests.get('https://ndclist.com/?s=' + NDC, timeout=timeout)
    name = re.findall(
        r'<td data-title="Proprietary Name">(.+?)</td>', listing.text
    )[0]

    search_url = (
        'https://dev.drugbankplus.com/guides/tutorials/api_request'
        '?request_path=us/product_concepts?q=' + name
    )
    search = requests.get(search_url, timeout=timeout)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    drugbank_id = re.findall(r'(DB\d+)', search.text)[0]

    # The pattern below is tied to DrugBank's current HTML markup and may
    # need updating if the site changes its page layout.
    drug_page = requests.get(
        'https://www.drugbank.ca/drugs/' + drugbank_id, timeout=timeout
    )
    SMILES = re.findall(
        r'SMILES</dt><dd class="col-xl-10 col-md-9 col-sm-8">'
        r'<div class="wrap">(.+?)</div>',
        drug_page.text,
    )[0]
    return SMILES
def atc2smiles(atc2ndc, lookup=None, max_codes=101):
    """Collect the set of SMILES strings found for each code in ``atc2ndc``.

    Args:
        atc2ndc: Object whose ``.values`` iterates as ``(key, codes)`` pairs,
            e.g. a two-column DataFrame; ``codes`` is a sequence of NDC codes.
        lookup: Callable taking one NDC code and returning a SMILES string.
            Defaults to ``ndc2smiles``.
        max_codes: Maximum number of codes tried per key (the first
            ``max_codes`` entries of ``codes``).

    Returns:
        Dict mapping each key to a set of SMILES strings. Keys for which no
        lookup succeeded are absent. A key that already has an entry is not
        processed again if it reappears in a later row.
    """
    if lookup is None:
        lookup = ndc2smiles

    atc2SMILES = {}
    for k, ndc in atc2ndc.values:
        if k in atc2SMILES:
            continue
        for index, code in enumerate(ndc):
            if index >= max_codes:
                break
            try:
                SMILES = lookup(code)
            except Exception:
                # Best-effort scrape: a failed lookup just skips this code.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit stop the run.
                continue
            # A result containing 'href' is markup rather than a SMILES
            # string, so it is discarded.
            if 'href' in SMILES:
                continue
            print(k, index, len(ndc), SMILES)
            atc2SMILES.setdefault(k, set()).add(SMILES)
    return atc2SMILES
def idx2smiles(idx2atc, atc2SMILES, output_path='atc3toSMILES.pkl'):
    """Map vocabulary indices to SMILES sets and dump the mapping with dill.

    Args:
        idx2atc: Mapping from index to ATC code.
        atc2SMILES: Mapping from ATC code to its SMILES entries.
        output_path: File the resulting dict is written to.

    The dumped dict always contains the keys ``'seperator'`` and
    ``'decoder_point'`` (each an empty dict), plus one entry per index whose
    ATC code is present in ``atc2SMILES``. Indices without a match are
    skipped.
    """
    idx2drug = {'seperator': {}, 'decoder_point': {}}
    for idx, atc in idx2atc.items():
        # Only a missing code is an expected miss; test for it explicitly
        # rather than hiding every possible error behind a bare except.
        if atc in atc2SMILES:
            idx2drug[idx] = atc2SMILES[atc]

    # Use a context manager so the pickle is flushed and closed on return.
    with open(output_path, 'wb') as out_file:
        dill.dump(idx2drug, out_file)
if __name__ == '__main__':
    # Step 1: index -> ATC code vocabulary.
    path = './output_COGNet/voc_final.pkl'
    # NOTE: dill.load can execute arbitrary code embedded in the file; only
    # load pickles from a trusted source.
    with open(path, 'rb') as f:
        voc_final = dill.load(f)
    idx2atc = voc_final['med_voc'].idx2word

    # Step 2: ATC -> NDC table. An earlier version loaded './atc2rxnorm.pkl'
    # with dill here instead of parsing this text file.
    path = './output_COGNet/ndc2rxnorm_mapping.txt'
    with open(path, 'r') as f:
        # Only the first line of the file is read.
        data = f.readline()
    # NOTE(review): replace('u', '') removes every letter 'u' in the line,
    # not just a prefix -- presumably meant to strip Python 2 u'' markers;
    # confirm the file format.
    # NOTE(review): json.loads returns plain Python containers, while
    # fix_mismatch accesses atc2ndc.NDC and other DataFrame attributes --
    # confirm the expected type of atc2ndc.
    atc2ndc = json.loads(data.replace('u', ''))

    # Step 3: add rows for vocabulary codes missing from atc2ndc.
    ndc2atc_original_path = './output_COGNet/ndc2atc_level4.csv'
    atc2ndc = fix_mismatch(idx2atc, atc2ndc, ndc2atc_original_path)

    # Step 4: scrape SMILES strings for every code.
    atc2SMILES = atc2smiles(atc2ndc)

    # Step 5: build the index -> SMILES mapping (dumped to disk).
    idx2smiles(idx2atc, atc2SMILES)