lianghsun
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 5 additions & 1 deletion b/‎README.md‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎mapping/.gitkeep‎ b/‎mapping/.gitkeep‎
diff --git a/‎mapping/250k_rndm_zinc_drugs_clean/idx_to_word.json‎
Lines changed: 0 additions & 1 deletion b/‎mapping/250k_rndm_zinc_drugs_clean/idx_to_word.json‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎mapping/250k_rndm_zinc_drugs_clean/tokenizer.json‎
Lines changed: 0 additions & 1 deletion b/‎mapping/250k_rndm_zinc_drugs_clean/tokenizer.json‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎mapping/250k_rndm_zinc_drugs_clean/word_to_idx.json‎
Lines changed: 0 additions & 1 deletion b/‎mapping/250k_rndm_zinc_drugs_clean/word_to_idx.json‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎preprocess.ipynb‎
Lines changed: 26 additions & 78 deletions b/‎preprocess.ipynb‎
Lines changed: 26 additions & 78 deletions
@@ -4,4 +4,5 @@ logs/
 utils/
 model/
 .ipynb_checkpoints/
-*.npy
+*.npy
+mapping/*.json
@@ -6,9 +6,13 @@
 
 # Dataset
 - Download the SMILES dataset from [Kaggle ZINC 250k](https://www.kaggle.com/datasets/lianghsunhuang/zinc-250k). Change the file extension to `.smi` and remove the header row.
+- Place the prepared `.smi` file in the `datasets` folder.
+
+# Preprocessing
+- Run `preprocess.ipynb` to preprocess the `.smi` file and generate the tokenization mapping files (saved under the `mapping` folder).
 
 # Running the Main Script
-We have provided a token file for this project, so you can skip the `Load Training Data` section and proceed with running the rest of the code.
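+~~We have provided a token file for this project, so you can skip the `Load Training Data` section and proceed with running the rest of the code.~~ The token mapping files are no longer included in the repository; complete the Preprocessing step above to generate them before running the main script.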
 
  > Note: Due to the nature of Variational Autoencoders (VAE), there might be instances where new compounds are not generated (sampling problem). If this happens, please run the code multiple times to obtain a valid compound.
 
 
@@ -2,9 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-06-06 10:26:46.074236: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2024-06-06 10:26:46.608196: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+     ]
+    }
+   ],
    "source": [
     "import numpy as np\n",
     "import tensorflow as tf\n",
@@ -21,12 +31,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Folder path\n",
-    "DATASET_DIR = './datasets/zinc-preprocess/'\n",
+    "DATASET_DIR = './datasets/'\n",
     "MAPPING_DIR = './mapping/'\n",
     "\n",
     "# SMILES path\n",
@@ -40,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -66,21 +76,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 319616985/319616985 [5:27:12<00:00, 16279.95it/s]  "
+      "100%|██████████| 2/2 [00:00<00:00, 31655.12it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Total number of SMILES: 39\n"
+      "Total number of SMILES: 2\n"
      ]
     },
     {
@@ -108,47 +118,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'C': 1,\n",
-       " '@': 2,\n",
-       " '[': 3,\n",
-       " ']': 4,\n",
-       " 'H': 5,\n",
-       " '1': 6,\n",
-       " '2': 7,\n",
-       " 'O': 8,\n",
-       " '(': 9,\n",
-       " ')': 10,\n",
-       " 'N': 11,\n",
-       " '=': 12,\n",
-       " '3': 13,\n",
-       " 'l': 14,\n",
-       " 'S': 15,\n",
-       " '#': 16,\n",
-       " 'B': 17,\n",
-       " 'r': 18,\n",
-       " 'F': 19,\n",
-       " '4': 20,\n",
-       " '/': 21,\n",
-       " 'c': 22,\n",
-       " '+': 23,\n",
-       " '\\\\': 24,\n",
-       " 'P': 25,\n",
-       " '-': 26,\n",
-       " 'I': 27,\n",
-       " 'n': 28,\n",
-       " 'o': 29,\n",
-       " '<START>': 30,\n",
-       " '<PAD>': 31,\n",
-       " '<EOL>': 32}"
+       "{'C': 1, 'c': 2, '<START>': 3, '<PAD>': 4, '<EOL>': 5}"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -165,47 +144,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{1: 'C',\n",
-       " 2: '@',\n",
-       " 3: '[',\n",
-       " 4: ']',\n",
-       " 5: 'H',\n",
-       " 6: '1',\n",
-       " 7: '2',\n",
-       " 8: 'O',\n",
-       " 9: '(',\n",
-       " 10: ')',\n",
-       " 11: 'N',\n",
-       " 12: '=',\n",
-       " 13: '3',\n",
-       " 14: 'l',\n",
-       " 15: 'S',\n",
-       " 16: '#',\n",
-       " 17: 'B',\n",
-       " 18: 'r',\n",
-       " 19: 'F',\n",
-       " 20: '4',\n",
-       " 21: '/',\n",
-       " 22: 'c',\n",
-       " 23: '+',\n",
-       " 24: '\\\\',\n",
-       " 25: 'P',\n",
-       " 26: '-',\n",
-       " 27: 'I',\n",
-       " 28: 'n',\n",
-       " 29: 'o',\n",
-       " 30: '<START>',\n",
-       " 31: '<PAD>',\n",
-       " 32: '<EOL>'}"
+       "{1: 'C', 2: 'c', 3: '<START>', 4: '<PAD>', 5: '<EOL>'}"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -218,7 +166,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -252,7 +200,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,