bkataru
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 20 additions & 0 deletions
diff --git a/CLAUDE.md b/CLAUDE.md
Lines changed: 7 additions & 2 deletions
diff --git a/build.zig b/build.zig
Lines changed: 20 additions & 1 deletion
diff --git a/build.zig.zon b/build.zig.zon
Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,26 @@
 
 All notable changes to the **powerglide** project will be documented in this file.
 
+## [0.3.0] - 2026-03-05
+
+### Added
+- **4B quant curve completed** — downloaded Qwen3.5-4B-Q4_K_M/Q5_K_M/Q6_K GGUFs and measured the full Q4→BF16 precision curve: 4B saturates at Q4 (13/17), making Q4 the optimal choice (2.6 GB vs 7.9 GB for BF16)
+- **T01–T17 in `trial_quant.zig`** — harness extended from T01–T13 to T01–T17, adding code generation, JSON round-trip, error recovery, and multi-source synthesis tasks across all quantization variants
+- **`examples/bench.zig`** — throughput benchmark: measures tokens/second via igllama `usage.completion_tokens` across Q4/Q8/BF16 for each weight class; reports tok/s, file size, and RAM (RSS); `zig build bench`
+- **`zig build bench` step** — bench harness added to build.zig
+- **igllama v0.3.10 patch** — `usage.completion_tokens` in non-streaming responses now returns real counts (was hardcoded 0); fix upstreamed as igllama PR #82, released as v0.3.10
+
+### Changed
+- **`trial_quant.zig` QUANT_MODELS** — 12 → 16 models: added 4B-Q4/Q5/Q6/Q8 to complete the 4B quant curve alongside 4B-BF16
+- **`build.zig` step description** — updated to reflect T01–T17 and all four weight classes
+- **Showcase** — quantization sensitivity table expanded to include full 4B curve; speed benchmark section added with measured tok/s and RAM data; key finding documented (RAM cliff at 4B-Q8 on ≤6 GB systems)
+- **CLAUDE.md** — version 0.2.9 → 0.3.0; roadmap items 20–23 added; bench harness documented
+- **`src/main.zig` VERSION** — `"0.2.9"` → `"0.3.0"`; test assertion updated
+- **`build.zig.zon`** — version `"0.2.9"` → `"0.3.0"`
+
+### Fixed
+- **bench.zig token counting** — the initial implementation estimated token counts as content-length/4 because igllama returned `completion_tokens:0`; it now prefers the API-reported counts and keeps the estimate as a fallback; the upstream igllama v0.3.10 fix makes the API counts accurate
+
 ## [0.2.9] - 2026-03-05
 
 ### Added
 
@@ -55,11 +55,12 @@ Start with: `igllama api <model> --port <N> --no-think --max-tokens 512 --thread
 - `OpenAIClient.json_mode = true` → forces `response_format: {"type":"json_object"}` for constrained output
 - Doctor scans `:8090–8099` automatically
 - Trial harness: `zig build trial` — runs T01–T17 × all 4 endpoints
-- Quant harness: `zig build trial-quant` — runs T01–T13 × 12 models: 0.8B-BF16 | 2B (Q4/Q5/Q6/Q8/BF16) | 4B-BF16 | 9B (Q4/Q5/Q6/Q8/BF16), sequential on :8090
+- Quant harness: `zig build trial-quant` — runs T01–T17 × 16 models: 0.8B-BF16 | 2B (Q4/Q5/Q6/Q8/BF16) | 4B (Q4/Q5/Q6/Q8/BF16) | 9B (Q4/Q5/Q6/Q8/BF16), sequential on :8090
+- Bench harness: `zig build bench` — tokens/sec throughput benchmark, accurate via igllama v0.3.10 `usage.completion_tokens`
 
 ## Current Version
 
-`0.2.9` — 195/195 tests passing, 0 leaks.
+`0.3.0` — 195/195 tests passing, 0 leaks.
 
 ## Roadmap
 
@@ -82,3 +83,7 @@ Start with: `igllama api <model> --port <N> --no-think --max-tokens 512 --thread
 17. ✅ /security-review pass — MCP input validation hardened, OOM guard on readLine, JSON injection in listAsJson fixed
 18. ✅ 0.8B-BF16 added to quant harness — all four weight classes now have BF16 coverage; 4B-BF16 confirmed 13/13
 19. ✅ MCP server hardened — type assertion panic fixed, OOM guard on stdin buffer, error logging filtered
+20. ✅ 4B quant curve completed — Q4/Q5/Q6 GGUFs downloaded, full Q4→BF16 curve measured; 4B saturated at Q4
+21. ✅ T01–T17 extended to trial_quant.zig — all 17 agentic tasks now in quantization sensitivity harness
+22. ✅ Throughput benchmark (`examples/bench.zig`) — tokens/sec and RAM measurements across Q4/Q8/BF16 per weight class; igllama v0.3.10 `usage.completion_tokens` fix integrated
+23. ✅ igllama v0.3.10 — now populates `usage.completion_tokens` in non-streaming responses (patched upstream, PR #82)
@@ -95,8 +95,27 @@ pub fn build(b: *std.Build) void {
         }),
     });
     b.installArtifact(trial_quant_exe);
-    const trial_quant_step = b.step("trial-quant", "Run the igllama quantization sensitivity harness (Q4/Q5/Q6/Q8 on 2B and 9B)");
+    const trial_quant_step = b.step("trial-quant", "Run the igllama quantization sensitivity harness (T01-T17 x Q4/Q5/Q6/Q8/BF16 across all 4 weight classes)");
     const trial_quant_cmd = b.addRunArtifact(trial_quant_exe);
     trial_quant_cmd.step.dependOn(b.getInstallStep());
     trial_quant_step.dependOn(&trial_quant_cmd.step);
+
+    // Throughput benchmark (examples/bench.zig)
+    const bench_exe = b.addExecutable(.{
+        .name = "bench",
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("examples/bench.zig"),
+            .target = target,
+            .optimize = optimize,
+            .link_libc = true,
+            .imports = &.{
+                .{ .name = "powerglide", .module = mod },
+            },
+        }),
+    });
+    b.installArtifact(bench_exe);
+    const bench_step = b.step("bench", "Run the igllama throughput benchmark (tokens/sec across Q4/Q8/BF16 x all weight classes)");
+    const bench_cmd = b.addRunArtifact(bench_exe);
+    bench_cmd.step.dependOn(b.getInstallStep());
+    bench_step.dependOn(&bench_cmd.step);
 }
@@ -9,7 +9,7 @@
     .name = .powerglide,
     // This is a [Semantic Version](https://semver.org/).
     // In a future version of Zig it will be used for package deduplication.
-    .version = "0.2.9",
+    .version = "0.3.0",
     // Together with name, this represents a globally unique package
     // identifier. This field is generated by the Zig toolchain when the
     // package is first created, and then *never changes*. This allows