Skip to content

Commit 8ef8676

Browse files
committed
Updated preprocess.ipynb and removed tokenizer mapping tables
1 parent 5f63bb6 commit 8ef8676

File tree

7 files changed

+33
-83
lines changed

7 files changed

+33
-83
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ logs/
44
utils/
55
model/
66
.ipynb_checkpoints/
7-
*.npy
7+
*.npy
8+
mapping/*.json

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@
66

77
# Dataset
88
- Download the SMILES dataset from [Kaggle ZINC 250k](https://www.kaggle.com/datasets/lianghsunhuang/zinc-250k). Change the file extension to `.smi` and remove the header row.
9+
- Place the prepared `.smi` file in the `datasets` folder.
10+
11+
# Preprocessing
12+
- Run the `preprocess.ipynb` notebook to preprocess the `.smi` file and generate the tokenization mapping tables.
913

1014
# Running the Main Script
11-
We have provided a token file for this project, so you can skip the `Load Training Data` section and proceed with running the rest of the code.
15+
~~We have provided a token file for this project, so you can skip the `Load Training Data` section and proceed with running the rest of the code.~~
1216

1317
> Note: Due to the nature of Variational Autoencoders (VAE), there might be instances where new compounds are not generated (sampling problem). If this happens, please run the code multiple times to obtain a valid compound.
1418

mapping/.gitkeep

Whitespace-only changes.

mapping/250k_rndm_zinc_drugs_clean/idx_to_word.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

mapping/250k_rndm_zinc_drugs_clean/tokenizer.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

mapping/250k_rndm_zinc_drugs_clean/word_to_idx.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

preprocess.ipynb

Lines changed: 26 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 3,
5+
"execution_count": 1,
66
"metadata": {},
7-
"outputs": [],
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"2024-06-06 10:26:46.074236: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
13+
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
14+
"2024-06-06 10:26:46.608196: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
15+
]
16+
}
17+
],
818
"source": [
919
"import numpy as np\n",
1020
"import tensorflow as tf\n",
@@ -21,12 +31,12 @@
2131
},
2232
{
2333
"cell_type": "code",
24-
"execution_count": 18,
34+
"execution_count": 5,
2535
"metadata": {},
2636
"outputs": [],
2737
"source": [
2838
"# Folder path\n",
29-
"DATASET_DIR = './datasets/zinc-preprocess/'\n",
39+
"DATASET_DIR = './datasets/'\n",
3040
"MAPPING_DIR = './mapping/'\n",
3141
"\n",
3242
"# SMILES path\n",
@@ -40,7 +50,7 @@
4050
},
4151
{
4252
"cell_type": "code",
43-
"execution_count": 19,
53+
"execution_count": 6,
4454
"metadata": {},
4555
"outputs": [
4656
{
@@ -66,21 +76,21 @@
6676
},
6777
{
6878
"cell_type": "code",
69-
"execution_count": 4,
79+
"execution_count": 7,
7080
"metadata": {},
7181
"outputs": [
7282
{
7383
"name": "stderr",
7484
"output_type": "stream",
7585
"text": [
76-
"100%|██████████| 319616985/319616985 [5:27:12<00:00, 16279.95it/s] "
86+
"100%|██████████| 2/2 [00:00<00:00, 31655.12it/s]"
7787
]
7888
},
7989
{
8090
"name": "stdout",
8191
"output_type": "stream",
8292
"text": [
83-
"Total number of SMILES: 39\n"
93+
"Total number of SMILES: 2\n"
8494
]
8595
},
8696
{
@@ -108,47 +118,16 @@
108118
},
109119
{
110120
"cell_type": "code",
111-
"execution_count": 6,
121+
"execution_count": 8,
112122
"metadata": {},
113123
"outputs": [
114124
{
115125
"data": {
116126
"text/plain": [
117-
"{'C': 1,\n",
118-
" '@': 2,\n",
119-
" '[': 3,\n",
120-
" ']': 4,\n",
121-
" 'H': 5,\n",
122-
" '1': 6,\n",
123-
" '2': 7,\n",
124-
" 'O': 8,\n",
125-
" '(': 9,\n",
126-
" ')': 10,\n",
127-
" 'N': 11,\n",
128-
" '=': 12,\n",
129-
" '3': 13,\n",
130-
" 'l': 14,\n",
131-
" 'S': 15,\n",
132-
" '#': 16,\n",
133-
" 'B': 17,\n",
134-
" 'r': 18,\n",
135-
" 'F': 19,\n",
136-
" '4': 20,\n",
137-
" '/': 21,\n",
138-
" 'c': 22,\n",
139-
" '+': 23,\n",
140-
" '\\\\': 24,\n",
141-
" 'P': 25,\n",
142-
" '-': 26,\n",
143-
" 'I': 27,\n",
144-
" 'n': 28,\n",
145-
" 'o': 29,\n",
146-
" '<START>': 30,\n",
147-
" '<PAD>': 31,\n",
148-
" '<EOL>': 32}"
127+
"{'C': 1, 'c': 2, '<START>': 3, '<PAD>': 4, '<EOL>': 5}"
149128
]
150129
},
151-
"execution_count": 6,
130+
"execution_count": 8,
152131
"metadata": {},
153132
"output_type": "execute_result"
154133
}
@@ -165,47 +144,16 @@
165144
},
166145
{
167146
"cell_type": "code",
168-
"execution_count": 8,
147+
"execution_count": 9,
169148
"metadata": {},
170149
"outputs": [
171150
{
172151
"data": {
173152
"text/plain": [
174-
"{1: 'C',\n",
175-
" 2: '@',\n",
176-
" 3: '[',\n",
177-
" 4: ']',\n",
178-
" 5: 'H',\n",
179-
" 6: '1',\n",
180-
" 7: '2',\n",
181-
" 8: 'O',\n",
182-
" 9: '(',\n",
183-
" 10: ')',\n",
184-
" 11: 'N',\n",
185-
" 12: '=',\n",
186-
" 13: '3',\n",
187-
" 14: 'l',\n",
188-
" 15: 'S',\n",
189-
" 16: '#',\n",
190-
" 17: 'B',\n",
191-
" 18: 'r',\n",
192-
" 19: 'F',\n",
193-
" 20: '4',\n",
194-
" 21: '/',\n",
195-
" 22: 'c',\n",
196-
" 23: '+',\n",
197-
" 24: '\\\\',\n",
198-
" 25: 'P',\n",
199-
" 26: '-',\n",
200-
" 27: 'I',\n",
201-
" 28: 'n',\n",
202-
" 29: 'o',\n",
203-
" 30: '<START>',\n",
204-
" 31: '<PAD>',\n",
205-
" 32: '<EOL>'}"
153+
"{1: 'C', 2: 'c', 3: '<START>', 4: '<PAD>', 5: '<EOL>'}"
206154
]
207155
},
208-
"execution_count": 8,
156+
"execution_count": 9,
209157
"metadata": {},
210158
"output_type": "execute_result"
211159
}
@@ -218,7 +166,7 @@
218166
},
219167
{
220168
"cell_type": "code",
221-
"execution_count": 9,
169+
"execution_count": 10,
222170
"metadata": {},
223171
"outputs": [],
224172
"source": [
@@ -252,7 +200,7 @@
252200
"name": "python",
253201
"nbconvert_exporter": "python",
254202
"pygments_lexer": "ipython3",
255-
"version": "3.6.9"
203+
"version": "3.10.14"
256204
}
257205
},
258206
"nbformat": 4,

0 commit comments

Comments
 (0)