patonlab
diff --git a/analysis/README.md b/analysis/README.md
Lines changed: 3 additions & 3 deletions
diff --git a/analysis/plot_zinc_vs_cawkwell.py b/analysis/plot_zinc_vs_cawkwell.py
Lines changed: 4 additions & 10 deletions
@@ -18,14 +18,14 @@ The training data (`deltahf/data/training_data.csv`) contains 531 molecules acro
 
 ---
 
-## ZINC vs Cawkwell Comparison
+## Comparing typical drug-like molecules with energetic CHNO molecules
 
 **Script:** `plot_zinc_vs_cawkwell.py`
 
 This script compares predicted ΔHf° distributions for two molecule sets:
 
 - **Cawkwell energetic set** (531 molecules) — energetic CHNO molecules from Cawkwell et al. (2021)
-- **ZINC drug-like sample** (1,000 molecules) — randomly sampled from the ZINC 250k drug-like dataset, filtered to supported elements and neutralised
+- **ZINC drug-like sample** (10,000 molecules) — randomly sampled from the ZINC 250k drug-like dataset, filtered to supported elements and neutralised
 
 The comparison assesses how the predicted ΔHf° distributions differ between energetic CHNO molecules and typical drug-like molecules.
 
@@ -60,7 +60,7 @@ The same comparison using xTB + `bondorder_ext` shows a similar pattern. The dis
 | File | Description |
 |------|-------------|
 | `250k_rndm_zinc_drugs_clean_3.csv` | ZINC 250k drug-like dataset (source data) |
-| `zinc_sample_1000.csv` | 1,000-molecule random sample (neutralised, supported elements only) |
+| `zinc_sample_10000.csv` | 10,000-molecule random sample (neutralised, supported elements only) |
 | `cawkwell_energetic.csv` | 531 energetic CHNO molecules from Cawkwell et al. (2021) |
 | `cawkwell_gxtb_predictions.csv` | gXTB + bondorder_ext predictions for Cawkwell energetic set |
 | `zinc_gxtb_predictions.csv` | gXTB + bondorder_ext predictions for ZINC sample |
 
@@ -15,9 +15,8 @@
 PLOTS_DIR = Path(__file__).parent
 REPO_ROOT = PLOTS_DIR.parent
 
-CAWKWELL_CSV = PLOTS_DIR / "cawkwell_si_atom_counts.csv"
 ZINC_CSV = PLOTS_DIR / "250k_rndm_zinc_drugs_clean_3.csv"
-ZINC_SAMPLE_CSV = PLOTS_DIR / "zinc_sample_1000.csv"
+ZINC_SAMPLE_CSV = PLOTS_DIR / "zinc_sample_10000.csv"
 CAWKWELL_INPUT_CSV = PLOTS_DIR / "cawkwell_energetic.csv"
 CAWKWELL_OUT_CSV = PLOTS_DIR / "cawkwell_gxtb_predictions.csv"
 ZINC_OUT_CSV = PLOTS_DIR / "zinc_gxtb_predictions.csv"
@@ -36,12 +35,7 @@ def neutralize_smiles(smiles: str) -> str:
 
 
 def prepare_inputs(seed: int = 42):
-    # Cawkwell: extract smiles + name
-    cawk = pd.read_csv(CAWKWELL_CSV)[["smiles", "name"]]
-    cawk.to_csv(CAWKWELL_INPUT_CSV, index=False)
-    print(f"Cawkwell input: {len(cawk)} molecules -> {CAWKWELL_INPUT_CSV}")
-
-    # ZINC: filter to supported elements, sample 1000, then neutralize
+    # ZINC: filter to supported elements, sample 10000, then neutralize
     zinc = pd.read_csv(ZINC_CSV)
     def has_supported_elements(smi):
         mol = Chem.MolFromSmiles(smi.strip())
@@ -51,7 +45,7 @@ def has_supported_elements(smi):
     zinc["smiles"] = zinc["smiles"].str.strip()
     supported = zinc[zinc["smiles"].apply(has_supported_elements)]
     print(f"ZINC: {len(supported)}/{len(zinc)} molecules have supported elements")
-    sample = supported.sample(n=1000, random_state=seed)[["smiles"]]
+    sample = supported.sample(n=10000, random_state=seed)[["smiles"]]
     sample["smiles"] = sample["smiles"].apply(neutralize_smiles)
     print(f"ZINC sample: {len(sample)} molecules (neutralized) -> {ZINC_SAMPLE_CSV}")
     sample.to_csv(ZINC_SAMPLE_CSV, index=False)
@@ -66,7 +60,7 @@ def run_predict(input_csv: Path, output_csv: Path, label: str):
         "-i", str(input_csv),
         "--model", "bondorder_ext",
         "--use-gxtb",
-        "--xtb-threads", "16",
+        "--xtb-threads", "8",
         "--cache-dir", str(CACHE_DIR / label),
         "-o", str(output_csv),
     ]