Skip to content

Commit 9c383a8

Browse files
committed
add analysis notebook for convenience
1 parent 69eb7f9 commit 9c383a8

File tree

3 files changed

+994
-9
lines changed

3 files changed

+994
-9
lines changed

analysis.ipynb

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "ohuujbmsz7",
6+
"metadata": {},
7+
"source": [
8+
"# Autoresearch Experiment Analysis\n",
9+
"\n",
10+
"Analysis of autonomous hyperparameter tuning results from `results.tsv`."
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"id": "v3r8c77lxhs",
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"import pandas as pd\n",
21+
"import matplotlib.pyplot as plt\n",
22+
"import numpy as np\n",
23+
"\n",
24+
"# Load the TSV (tab-separated, 5 columns: commit, val_bpb, memory_gb, status, description)\n",
25+
"df = pd.read_csv(\"results.tsv\", sep=\"\\t\")\n",
26+
"df[\"val_bpb\"] = pd.to_numeric(df[\"val_bpb\"], errors=\"coerce\")\n",
27+
"df[\"memory_gb\"] = pd.to_numeric(df[\"memory_gb\"], errors=\"coerce\")\n",
28+
"df[\"status\"] = df[\"status\"].str.strip().str.upper()\n",
29+
"\n",
30+
"print(f\"Total experiments: {len(df)}\")\n",
31+
"print(f\"Columns: {list(df.columns)}\")\n",
32+
"df.head(10)"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": null,
38+
"id": "0v37bji707o",
39+
"metadata": {},
40+
"outputs": [],
41+
"source": [
42+
"counts = df[\"status\"].value_counts()\n",
43+
"print(\"Experiment outcomes:\")\n",
44+
"print(counts.to_string())\n",
45+
"\n",
46+
"n_keep = counts.get(\"KEEP\", 0)\n",
47+
"n_discard = counts.get(\"DISCARD\", 0)\n",
48+
"n_crash = counts.get(\"CRASH\", 0)\n",
49+
"n_decided = n_keep + n_discard\n",
50+
"if n_decided > 0:\n",
51+
" print(f\"\\nKeep rate: {n_keep}/{n_decided} = {n_keep / n_decided:.1%}\")"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"id": "j887idiuu5",
58+
"metadata": {},
59+
"outputs": [],
60+
"source": [
61+
"# Show all KEPT experiments (the improvements that stuck)\n",
62+
"kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
63+
"print(f\"KEPT experiments ({len(kept)} total):\\n\")\n",
64+
"for i, row in kept.iterrows():\n",
65+
" bpb = row[\"val_bpb\"]\n",
66+
" desc = row[\"description\"]\n",
67+
" print(f\" #{i:3d} bpb={bpb:.6f} mem={row['memory_gb']:.1f}GB {desc}\")"
68+
]
69+
},
70+
{
71+
"cell_type": "markdown",
72+
"id": "99l0xlw0lv",
73+
"metadata": {},
74+
"source": [
75+
"## Val BPB Over Time\n",
76+
"\n",
77+
"Track how the best (kept) val_bpb evolves as experiments progress. The running minimum shows the \"frontier\" -- the best result achieved so far."
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": null,
83+
"id": "79jh74veqg9",
84+
"metadata": {},
85+
"outputs": [],
86+
"source": [
87+
"fig, ax = plt.subplots(figsize=(16, 8))\n",
88+
"\n",
89+
"# Filter out crashes for plotting\n",
90+
"valid = df[df[\"status\"] != \"CRASH\"].copy()\n",
91+
"valid = valid.reset_index(drop=True)\n",
92+
"\n",
93+
"baseline_bpb = valid.loc[0, \"val_bpb\"]\n",
94+
"\n",
95+
"# Only plot points at or below baseline (the interesting region)\n",
96+
"below = valid[valid[\"val_bpb\"] <= baseline_bpb + 0.0005]\n",
97+
"\n",
98+
"# Plot discarded as faint background dots\n",
99+
"disc = below[below[\"status\"] == \"DISCARD\"]\n",
100+
"ax.scatter(disc.index, disc[\"val_bpb\"],\n",
101+
" c=\"#cccccc\", s=12, alpha=0.5, zorder=2, label=\"Discarded\")\n",
102+
"\n",
103+
"# Plot kept experiments as prominent green dots\n",
104+
"kept_v = below[below[\"status\"] == \"KEEP\"]\n",
105+
"ax.scatter(kept_v.index, kept_v[\"val_bpb\"],\n",
106+
" c=\"#2ecc71\", s=50, zorder=4, label=\"Kept\", edgecolors=\"black\", linewidths=0.5)\n",
107+
"\n",
108+
"# Running minimum step line\n",
109+
"kept_mask = valid[\"status\"] == \"KEEP\"\n",
110+
"kept_idx = valid.index[kept_mask]\n",
111+
"kept_bpb = valid.loc[kept_mask, \"val_bpb\"]\n",
112+
"running_min = kept_bpb.cummin()\n",
113+
"ax.step(kept_idx, running_min, where=\"post\", color=\"#27ae60\",\n",
114+
" linewidth=2, alpha=0.7, zorder=3, label=\"Running best\")\n",
115+
"\n",
116+
"# Label each kept experiment with its description\n",
117+
"for idx, bpb in zip(kept_idx, kept_bpb):\n",
118+
" desc = str(valid.loc[idx, \"description\"]).strip()\n",
119+
" if len(desc) > 45:\n",
120+
" desc = desc[:42] + \"...\"\n",
121+
"\n",
122+
" ax.annotate(desc, (idx, bpb),\n",
123+
" textcoords=\"offset points\",\n",
124+
" xytext=(6, 6), fontsize=6.0,\n",
125+
" color=\"#1a7a3a\", alpha=0.9,\n",
126+
" rotation=30, ha=\"left\", va=\"bottom\")\n",
127+
"\n",
128+
"# Reference lines\n",
129+
"ax.axhline(y=baseline_bpb, color=\"#e74c3c\", linewidth=1,\n",
130+
" linestyle=\"--\", alpha=0.5, label=f\"Baseline ({baseline_bpb:.4f})\")\n",
131+
"best = kept_bpb.min()\n",
132+
"ax.axhline(y=best, color=\"#27ae60\", linewidth=1,\n",
133+
" linestyle=\"--\", alpha=0.5, label=f\"Best ({best:.4f})\")\n",
134+
"\n",
135+
"n_total = len(df)\n",
136+
"n_kept = len(df[df[\"status\"] == \"KEEP\"])\n",
137+
"ax.set_xlabel(\"Experiment #\", fontsize=12)\n",
138+
"ax.set_ylabel(\"Validation BPB (lower is better)\", fontsize=12)\n",
139+
"ax.set_title(f\"Autoresearch Progress: {n_total} Experiments, {n_kept} Kept Improvements\", fontsize=14)\n",
140+
"ax.legend(loc=\"upper right\", fontsize=9)\n",
141+
"ax.grid(True, alpha=0.2)\n",
142+
"\n",
143+
"# Y-axis: from just below best to just above baseline\n",
144+
"margin = (baseline_bpb - best) * 0.15\n",
145+
"ax.set_ylim(best - margin, baseline_bpb + margin)\n",
146+
"\n",
147+
"plt.tight_layout()\n",
148+
"plt.savefig(\"progress.png\", dpi=150, bbox_inches=\"tight\")\n",
149+
"plt.show()\n",
150+
"print(\"Saved to progress.png\")"
151+
]
152+
},
153+
{
154+
"cell_type": "markdown",
155+
"id": "ce48phivyou",
156+
"metadata": {},
157+
"source": [
158+
"## Summary Statistics"
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": null,
164+
"id": "re1f8za8oj9",
165+
"metadata": {},
166+
"outputs": [],
167+
"source": [
168+
"# Summary stats\n",
169+
"kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
170+
"baseline_bpb = df.iloc[0][\"val_bpb\"]\n",
171+
"best_bpb = kept[\"val_bpb\"].min()\n",
172+
"best_row = kept.loc[kept[\"val_bpb\"].idxmin()]\n",
173+
"\n",
174+
"print(f\"Baseline val_bpb: {baseline_bpb:.6f}\")\n",
175+
"print(f\"Best val_bpb: {best_bpb:.6f}\")\n",
176+
"print(f\"Total improvement: {baseline_bpb - best_bpb:.6f} ({(baseline_bpb - best_bpb) / baseline_bpb * 100:.2f}%)\")\n",
177+
"print(f\"Best experiment: {best_row['description']}\")\n",
178+
"print()\n",
179+
"\n",
180+
"# How many experiments to find each improvement\n",
181+
"print(\"Cumulative effort per improvement:\")\n",
182+
"kept_sorted = kept.reset_index()\n",
183+
"for i, (_, row) in enumerate(kept_sorted.iterrows()):\n",
184+
" desc = str(row[\"description\"]).strip()\n",
185+
" print(f\" Experiment #{row['index']:3d}: bpb={row['val_bpb']:.6f} {desc}\")"
186+
]
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"id": "oxri9h5c9gs",
191+
"metadata": {},
192+
"source": [
193+
"## Top Hits (Kept Experiments by Improvement)"
194+
]
195+
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": null,
199+
"id": "q86hxu10djk",
200+
"metadata": {},
201+
"outputs": [],
202+
"source": [
203+
"# Each kept experiment's delta is measured vs the previous kept experiment's bpb\n",
204+
"# (since experiments are cumulative -- each one builds on the last kept state)\n",
205+
"kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
206+
"kept[\"prev_bpb\"] = kept[\"val_bpb\"].shift(1)\n",
207+
"kept[\"delta\"] = kept[\"prev_bpb\"] - kept[\"val_bpb\"]\n",
208+
"\n",
209+
"# Drop baseline (no delta)\n",
210+
"hits = kept.iloc[1:].copy()\n",
211+
"\n",
212+
"# Sort by delta improvement (biggest first)\n",
213+
"hits = hits.sort_values(\"delta\", ascending=False)\n",
214+
"\n",
215+
"print(f\"{'Rank':>4} {'Delta':>8} {'BPB':>10} Description\")\n",
216+
"print(\"-\" * 80)\n",
217+
"for rank, (_, row) in enumerate(hits.iterrows(), 1):\n",
218+
" print(f\"{rank:4d} {row['delta']:+.6f} {row['val_bpb']:.6f} {row['description']}\")\n",
219+
"\n",
220+
"print(f\"\\n{'':>4} {hits['delta'].sum():+.6f} {'':>10} TOTAL improvement over baseline\")"
221+
]
222+
},
223+
{
224+
"cell_type": "code",
225+
"execution_count": null,
226+
"id": "f9bffe89",
227+
"metadata": {},
228+
"outputs": [],
229+
"source": []
230+
}
231+
],
232+
"metadata": {
233+
"kernelspec": {
234+
"display_name": ".venv",
235+
"language": "python",
236+
"name": "python3"
237+
},
238+
"language_info": {
239+
"codemirror_mode": {
240+
"name": "ipython",
241+
"version": 3
242+
},
243+
"file_extension": ".py",
244+
"mimetype": "text/x-python",
245+
"name": "python",
246+
"nbconvert_exporter": "python",
247+
"pygments_lexer": "ipython3",
248+
"version": "3.10.12"
249+
}
250+
},
251+
"nbformat": 4,
252+
"nbformat_minor": 5
253+
}

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ readme = "README.md"
66
requires-python = ">=3.10"
77
dependencies = [
88
"kernels>=0.11.7",
9+
"matplotlib>=3.10.8",
910
"numpy>=2.2.6",
11+
"pandas>=2.3.3",
1012
"pyarrow>=21.0.0",
1113
"requests>=2.32.0",
1214
"rustbpe>=0.1.0",

0 commit comments

Comments
 (0)