From c2058c087b696cb8350bcaa0ebea501705db6edf Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:04:54 +0000 Subject: [PATCH 1/9] plan(codec-sweep): JIT-first sweep via lab infra + index entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operationalises PR #220's "What's Needed to Fix" list (wider codebook, residual PQ, Hadamard pre-rotation, OPQ) as a parameter sweep through the lab endpoint — every codec difference is a JIT kernel, not a cargo rebuild. Phase 0 hardens the Wire surface once; Phases 1-4 run unlimited candidates without further rebuilds; Phase 5 graduates winners to the canonical OrchestrationBridge surface. Structure: Phase 0 — API hardening (one rebuild, then frozen): D0.1 CodecParams in WireCalibrate D0.2 WireTokenAgreement endpoint (I11 cert gate) D0.3 WireSweep streaming + Lance append D0.4 surface freeze Phase 1 — JIT codec kernels (rebuild-free): D1.1 CodecKernelCache via JitCompiler (Cranelift) D1.2 Rotation primitives (Identity / Hadamard / OPQ) D1.3 Residual PQ via JIT composition Phase 2 — Token-agreement harness (the I11 cert gate): D2.1 Reference-model loader (ndarray safetensors) D2.2 Decode-and-compare loop (top-k, per-layer MSE) D2.3 Handler wiring Phase 3 — Sweep driver + Lance logger Phase 4 — DataFusion frontier analysis Phase 5 — Graduation to OrchestrationBridge (per winner only) ~1,920 LOC total; 1 upfront rebuild; unlimited candidates afterwards. Compare to naive path (4 fixes × 8-17 min × N tweaks = hundreds of hours). All work behind --features lab until graduation. INTEGRATION_PLANS.md prepended per APPEND-ONLY rule, citing PR #224 dependency for the architectural framing. 
https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/board/INTEGRATION_PLANS.md | 13 + .claude/plans/codec-sweep-via-lab-infra-v1.md | 396 ++++++++++++++++++ 2 files changed, 409 insertions(+) create mode 100644 .claude/plans/codec-sweep-via-lab-infra-v1.md diff --git a/.claude/board/INTEGRATION_PLANS.md b/.claude/board/INTEGRATION_PLANS.md index ba20c043..4f90d7d8 100644 --- a/.claude/board/INTEGRATION_PLANS.md +++ b/.claude/board/INTEGRATION_PLANS.md @@ -36,6 +36,19 @@ --- +## v1 — Codec Sweep via Lab Infra, JIT-first (authored 2026-04-20) + +**Author:** main-thread session 2026-04-20 +**Scope:** Operationalise PR #220's "What's Needed to Fix" list (wider codebook / residual PQ / Hadamard pre-rotation / OPQ) as a parameter sweep through the lab endpoint, with every codec candidate difference expressed as a JIT-compiled kernel rather than a cargo rebuild — one upfront API hardening rebuild, unlimited candidates afterwards. +**Path:** `.claude/plans/codec-sweep-via-lab-infra-v1.md` (396 lines) +**Deliverables:** D0.1 `CodecParams` in `WireCalibrate`, D0.2 `WireTokenAgreement` endpoint (I11 cert gate), D0.3 `WireSweep` streaming endpoint + Lance append, D0.4 surface freeze. D1.1 `CodecKernelCache` via `JitCompiler`, D1.2 rotation primitives (Identity / Hadamard / OPQ) as JIT kernels, D1.3 residual PQ via JIT composition. D2.1 reference-model loader, D2.2 decode-and-compare loop, D2.3 handler wiring. D3.1 server-side sweep handler, D3.2 curl-driven client. D4.1 DataFusion over Lance log, D4.2 Pareto frontier notebook. D5 graduation bridge (fires only on candidate passing all gates). + +**Status (2026-04-20):** Active. Plan authored; no deliverables shipped yet. Depends on merge of PR #224 (three-part lab-surface framing + I11 measurability invariant) for the architectural grounding. + +**Confidence (2026-04-20):** Pre-execution. 
Risk hot-spots: (a) JIT compile cost for residual PQ composition — needs measurement; (b) token-agreement harness load time on ref model — may dominate latency for small sweeps; (c) Lance append concurrency under streaming writes. Plan assumes these are tractable; D0 surface freeze is deliberate to prevent iterating on the DTO shape mid-sweep. + +--- + ## v1 — Elegant Herding Rocket (authored 2026-04-19) **Author:** main-thread session 2026-04-19 diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md new file mode 100644 index 00000000..67f71793 --- /dev/null +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -0,0 +1,396 @@ +# Plan — Codec Candidate Sweep via Lab Infra (JIT-first, no rebuilds) + +> **Author note (2026-04-20):** Operationalises the #220 "What's +> Needed to Fix" list (wider codebook / residual PQ / Hadamard +> pre-rotation / OPQ) as a parameter sweep through the lab +> endpoint, not as four separate rebuild-heavy branches. + +## Context & Prerequisites (read first) + +- `.claude/knowledge/lab-vs-canonical-surface.md` — especially + "Why the Lab Surface Exists" (three-part stack: API + Planner + + JIT), "The third purpose — thinking harvest", and I11 (measurable + stack, not a black box). +- PR #219 — lab-gated CAM-PQ candidates; ICC 0.9998 was + **synthetic / overfit-on-training**, not tokens. +- PR #220 — honest negative result: reconstruction ICC 0.195 mean, + 0/234 ≥ 0.99 on real Qwen3-TTS-0.6B safetensors; lists 4 fixes + (a, b, c, d) as the way forward. +- PR #221 — REST/gRPC scaffolding + `CodecResearchBridge` / + `PlannerAwareness : OrchestrationBridge`. This is the surface we + extend in Phase 0. +- `crates/lance-graph-contract/src/jit.rs` — `JitCompiler`, + `StyleRegistry`, `KernelHandle`. Already in the contract; we + consume it, don't invent it. 
+ +## Why JIT is the spine + +Everything that changes between candidates is a JIT artefact: + +- **Codec decode kernel** — `(centroid_count, subspace_count, + residual_depth)` changes the kernel shape. +- **Pre-rotation** — Identity / Hadamard (Sylvester 2^k) / learned + OPQ rotation — each is a different SIMD routine. +- **Distance table layout** — Hamming vs cosine vs ADC; u8 vs u16 + entries; 256² vs 1024² size. +- **Token-agreement comparator** — top-k match, per-position + divergence, latency measurement. + +One long-running `shader-lab` binary + `JitCompiler` = the sweep +runs thousands of candidates without a single `cargo` invocation +after Phase 0. `KernelHandle`s are cached by `CodecParams` hash +and reused across calls. + +## Phase 0 — API hardening (one-time rebuild; everything else is +rebuild-free) + +**Rule:** commit all surface changes in Phase 0, rebuild +`shader-lab` exactly once, then freeze the Wire contract for the +duration of the sweep. Any mid-sweep endpoint tweak forfeits the +JIT benefit. + +### D0.1 — Extend `WireCalibrate` with full `CodecParams` + +`crates/cognitive-shader-driver/src/wire.rs` — add: + +```rust +pub struct CodecParams { + pub subspaces: u32, // e.g. 6 + pub centroids: u32, // 256 / 512 / 1024 / 2048 + pub residual_depth: u8, // 0 = no residual, 1+ = residual PQ + pub pre_rotation: Rotation, // Identity / Hadamard / Opq(learned) + pub distance: Distance, // Hamming / Cosine / Adc + pub calibration_rows: u32, // held-out row selection + pub seed: u64, // reproducibility +} + +pub enum Rotation { Identity, Hadamard, Opq { matrix_blob_id: u64 } } +pub enum Distance { Hamming, Cosine, Adc } + +pub struct WireCalibrate { + pub tensor_path: String, + pub params: CodecParams, + pub measure: MeasureSet, // { reconstruction, icc, token_agreement } +} +``` + +Handler stays the same shape; only the payload grows. +~120 LOC (DTO + serde glue + handler wiring). 
+ +### D0.2 — New endpoint `WireTokenAgreement` + +```rust +pub struct WireTokenAgreement { + pub model_path: String, // safetensors root + pub reference: Baseline, // Passthrough by default + pub candidate: CodecParams, // from D0.1 + pub prompt_set_blob_id: u64, // pre-uploaded prompt blobs + pub n_tokens: u32, // how far to decode +} +pub struct WireTokenAgreementResult { + pub top1_rate: f32, + pub top5_rate: f32, + pub divergence_positions: Vec<u32>, // which tokens differ + pub per_layer_mse: Vec<f32>, // pinpoint where error grows + pub candidate_latency_us: u64, + pub reference_latency_us: u64, +} +``` + +This is the **I11 cert gate** from `lab-vs-canonical-surface.md`. +~160 LOC (DTO + handler + ref-model load glue). + +### D0.3 — Sweep streaming endpoint `WireSweep` + +```rust +pub struct WireSweepRequest { + pub tensor_path: String, + pub grid: SweepGrid, // declarative param grid + pub measure: MeasureSet, + pub log_to_lance: Option<String>, // path where per-candidate rows are appended +} +pub struct SweepGrid { + pub subspaces: Vec<u32>, + pub centroids: Vec<u32>, + pub residual_depths: Vec<u8>, + pub rotations: Vec<Rotation>, + pub distances: Vec<Distance>, +} +``` + +Streams one `WireCalibrateResult` + `WireTokenAgreementResult` +pair per grid point via Server-Sent Events or gRPC stream. Server +holds the shader-lab process; no per-candidate curl spin-up. +~200 LOC (streaming handler + Lance append writer). + +### D0.4 — Freeze the surface + +- Commit D0.1 + D0.2 + D0.3 as one PR. +- Rebuild `shader-lab` binary. +- `cargo test -p lance-graph-contract` must still pass (Wire + additions are additive, no contract-level changes). +- No further Wire changes allowed during Phases 1-5. 
+ +**Total Phase 0: ~480 LOC, one rebuild, one PR.** + +## Phase 1 — JIT codec kernels (rebuild-free from here on) + +### D1.1 — `CodecParams → KernelHandle` via `JitCompiler` + +`crates/cognitive-shader-driver/src/codec_research.rs` — add: + +```rust +struct CodecKernelCache { + handles: HashMap<u64, KernelHandle>, + compiler: JitCompiler, // Cranelift via jitson +} + +impl CodecKernelCache { + fn kernel_for(&mut self, params: &CodecParams) -> &KernelHandle { + let key = hash_codec_params(params); + self.handles.entry(key).or_insert_with(|| { + self.compiler.compile(codec_ir(params)) + }) + } +} +``` + +Cranelift emits a decode function specialised to +`(subspaces, centroids, residual_depth, distance)`. Typical +compile time on our hardware: ~5–20 ms per unique shape; cached +forever after. ~180 LOC. + +### D1.2 — Rotation primitives as JIT kernels + +- **Identity** — no-op, 0 LOC runtime. +- **Hadamard** — Sylvester construction at dim = 2^k. JIT emits + XOR / add-subtract butterfly with SIMD vector width + specialisation. ~90 LOC. +- **Opq(matrix_blob_id)** — load learned rotation matrix from + blob store (Lance column), JIT emits unrolled matmul over the + matrix. Matrix is learned offline; blob ID points to it. + ~100 LOC. + +Rotation is a separate KernelHandle composed with the decode +kernel at call time. ~190 LOC total. + +### D1.3 — Residual PQ via JIT composition + +Encode residuals after first-pass decode; second-pass PQ on the +residual. In JIT terms: + +``` +candidate_kernel = compose( + first_pass_decode(CodecParams { residual_depth: 0, .. }), + subtract, + second_pass_decode(CodecParams::residual_shape(params)), + add, +) +``` + +`compose` is a Cranelift function that emits the straight-line +sequence — no runtime function-call overhead. ~150 LOC. 
+ +**Total Phase 1: ~520 LOC; no canonical-surface changes; all +behind `--features lab`.** + +## Phase 2 — Token-agreement harness (the I11 cert gate) + +### D2.1 — Reference model loader + +`crates/cognitive-shader-driver/src/token_agreement.rs` — new +module. Loads a reference model via ndarray: + +- safetensors → `ndarray::hpc::fingerprint` tables + KV cache +- Passthrough baseline uses the untouched weights. +- Candidate path routes the weights through the JIT codec kernel + from D1.1. + +~180 LOC. + +### D2.2 — Decode-and-compare loop + +For each prompt in the prompt set: + +``` +reference_logits = decode(model, prompt, n_tokens, Passthrough) +candidate_logits = decode(model, prompt, n_tokens, candidate_kernel) +top1_match = argmax(ref) == argmax(cand) +top5_match = argmax(ref) in topk(cand, 5) +per_layer_mse = [mse(ref_layer_k, cand_layer_k) for k in 0..n_layers] +``` + +Aggregate across prompts and tokens → `WireTokenAgreementResult`. +~220 LOC. + +### D2.3 — Handler wiring + +`src/serve.rs::token_agreement_handler` — reads +`WireTokenAgreement`, dispatches through D2.1/D2.2, returns +`WireTokenAgreementResult`. ~60 LOC. + +**Total Phase 2: ~460 LOC. Measures the gate that actually +certifies a codec.** + +## Phase 3 — Sweep driver + Lance logger + +### D3.1 — Server-side sweep handler + +`src/serve.rs::sweep_handler` — reads `WireSweepRequest`, +enumerates the grid, calls D0.1 + D0.2 per grid point, appends +each result row to Lance via `lance::write_fragment`. Streams +progress to the client via SSE. + +~200 LOC. + +### D3.2 — Client-side driver (no rebuild; just curl) + +`scripts/codec_sweep.sh` — bash script: + +```bash +curl -N -X POST http://localhost:8080/v1/shader/sweep \ + -d @configs/phase1-centroid-sweep.json +``` + +Each config is a small JSON file declaring the grid. Configs live +under `configs/`; adding a new one is a text edit. + +~20 LOC script + N config files. + +**Total Phase 3: ~220 LOC + configs. 
The sweep runs without +touching cargo for the rest of the project's lifetime.** + +## Phase 4 — Frontier analysis + +### D4.1 — DataFusion queries over the Lance log + +```sql +SELECT centroids, residual_depth, rotation, + AVG(token_agreement_top1) AS t1, + AVG(reconstruction_icc) AS icc, + AVG(bytes_per_row) AS br +FROM sweep_results +GROUP BY centroids, residual_depth, rotation +ORDER BY t1 DESC +LIMIT 20 +``` + +Answers "which of the four #220 fixes actually move token +agreement?" empirically, not speculatively. + +### D4.2 — Pareto-frontier notebook + +`.claude/analyses/codec_frontier.py` — reads the Lance log, +plots `(bytes_per_row × token_agreement_top1)` Pareto frontier, +highlights which `CodecParams` live on it. + +~120 LOC notebook / script. + +**Total Phase 4: ~120 LOC. The empirical answer to "which fix +wins" falls out of the data, not a hand-argued hypothesis.** + +## Phase 5 — Graduation + +Only a candidate that passes these gates graduates: + +1. `reconstruction_icc ≥ 0.99` on held-out rows. +2. `token_agreement_top1 ≥ 0.99` on the prompt set. +3. `token_agreement_top5 ≥ 0.999`. +4. `bytes_per_row ≤ 16` (i.e. still a real compression). + +Graduation means: add a `StepDomain::Codec(CodecParams)` variant +(or the right existing variant) with an `OrchestrationBridge` +impl on the **planner side**, not on the lab side. The lab +endpoint stays live for continued sweep iteration. The canonical +consumer (the real production pipeline) walks `UnifiedStep`, not +`WireCalibrate`. + +~120 LOC for the graduation bridge impl (when a candidate +actually earns it). 
+ +## Totals + +| Phase | LOC | Rebuilds | What it buys | +|---|---|---|---| +| 0 | ~480 | 1 | Hardened Wire surface; no mid-sweep churn | +| 1 | ~520 | 0 | JIT codec + rotation + residual kernels | +| 2 | ~460 | 0 | Token-agreement cert gate (I11) | +| 3 | ~220 | 0 | Sweep driver + Lance append logger | +| 4 | ~120 | 0 | Frontier analysis / winner selection | +| 5 | ~120 | 1 per winner | Graduation to canonical surface | + +**~1,920 LOC, 1 upfront rebuild, unlimited candidates for free +afterwards.** Compare to the naive path: 4 fixes × 8–17 min +rebuild × N parameter tweaks per fix = hundreds of hours. + +## Measurability (I11 enforcement) + +Every JIT kernel emits trace fields through the existing +`thinking_trace` / `ShaderHit` contract: + +- `kernel_hash` — which exact kernel ran. +- `compile_time_us` — JIT compile cost (cached on second hit). +- `reconstruction_per_row` — full histogram, not just mean. +- `token_divergence_positions` — exact indices where candidate + differs from reference. +- `per_layer_mse` — where in the stack error accumulates. + +Any proposal to drop fields from this trace for "perf" violates +I11 and is rejected. The lab surface is the observation port. + +## Explicit Non-Scope + +- No changes to `UnifiedStep` / `OrchestrationBridge` until a + candidate graduates (Phase 5). +- No new `StepDomain` variants during Phases 0-4. +- No changes to the canonical re-exports from + `cognitive-shader-driver::lib`. +- No OpenAI-compatible endpoints for codec sweep — this is + research transport, not production. +- No CAM-PQ-specific assumptions baked into the Wire surface — + `CodecParams` is codec-agnostic so future candidates (non-PQ, + e.g. lattice quantization) plug in without Wire churn. + +## Verification + +- `cargo check -p cognitive-shader-driver --features lab` after + each phase. +- `cargo test -p lance-graph-contract` — 133/133 must pass after + Phase 0; Wire DTO additions are additive. 
+- **Sweep self-test:** a single-point grid with `Identity` + rotation + `centroids=256` + `residual_depth=0` must reproduce + PR #220's reconstruction ICC numbers (≈ 0.195 mean) — + demonstrates the pipeline doesn't silently "fix" the prior + measurement. +- **JIT round-trip self-test:** compile → execute → recompile + same params → handle hit cache; verify identical output and + `compile_time_us == 0` on second call. +- **Token-agreement regression gate:** Passthrough ↔ Passthrough + must return top1_rate = 1.0 exactly. Any drift indicates + non-determinism in the harness. + +## Branch / PR Shape + +- **PR A** (Phase 0): hardens the Wire surface. One rebuild; + should merge quickly since it's additive. +- **PR B** (Phases 1 + 2): JIT kernels + token-agreement. No + further Wire changes. Depends on PR A. +- **PR C** (Phases 3 + 4): Sweep driver + analysis. Pure lab + tooling. Depends on PR B. +- **PR D** (Phase 5): fires only when a candidate graduates — + narrow graduation bridge impl. + +Total 4 PRs over ~1,920 LOC. Each PR has a crisp deliverable and +a clean verification step. + +## What this plan is NOT + +- It is not a commitment to any specific codec "winning." The + four #220 fixes might all fail the token-agreement gate. That's + a valid outcome — the plan is measurement infrastructure, not a + codec proposal. Winners emerge from D4 frontier analysis, not + from the plan author's prior. +- It is not coupled to CAM-PQ. `CodecParams` generalises to + lattice quantization, residual vector quantization, neural + codecs — any codec where decoding is parameterised by a small + struct fits the sweep driver unchanged. 
From 5b82a7bc0dd04d3da5e54460b88daf65a50386b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:09:08 +0000 Subject: [PATCH 2/9] =?UTF-8?q?plan(codec-sweep):=20JIT=20Kernel=20Contrac?= =?UTF-8?q?t=20=E2=80=94=20array=5Fwindow=20+=20ndarray::simd::*=20+=20AMX?= =?UTF-8?q?=20+=20YAML/JSON?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binds four non-negotiable rules on every JIT-emitted kernel in Phases 1-3: Rule A: Tensor access via array_window only. No manual index math, no raw pointer reach, no custom slice offset recompute. ndarray::simd::array_window handles stride / alignment / bounds / lane padding. Rule B: SIMD exclusively via ndarray::simd::*. No std::arch::*, no ndarray::hpc::*, no hand-rolled intrinsics. Missing primitive → add to ndarray first, never bypass the canonical surface from the JIT. Rule C: Backend dispatch via simd_caps() (AMX-ready). JIT emits generic IR calling ndarray::simd primitives. Those resolve to AMX tiles on aarch64-apple-darwin with AMX capability, AVX-512 on x86_64, NEON on aarch64, and scalar fallback otherwise. Rotation and distance-table kernels benefit most from AMX (matmul-heavy paths). JIT never emits AMX intrinsics directly — it calls matmul_tiled / hadamard_butterfly / etc., which dispatch internally. Rule D: Configuration is JSON / YAML / REST only. No codec candidate defined in Rust. One schema (CodecParams) serialised three ways: - YAML under configs/codec/*.yaml (human-authored) - JSON payload (curl / REST) - REST endpoint body at /v1/shader/calibrate New candidate = new YAML/JSON file. Zero Rust changes. Zero rebuilds. Enforcement: Phase 0 ships two new test gates — - kernel_contract_test scans emitted IR for banned symbols (std::arch, ndarray::hpc) and required symbols (array_window). - amx_dispatch_test (aarch64-apple-darwin-only) verifies simd_caps().has_amx() and trace records backend = "amx" for rotation kernels on M-series. 
D1.1-D1.3 body sketches updated to show the contract in practice: every decode / rotation / composition stage reads via array_window and calls ndarray::simd primitives (adc_distances_simd, hadamard_butterfly, matmul_tiled, sub_tiled, add_tiled), never raw intrinsics. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 214 +++++++++++++++--- 1 file changed, 188 insertions(+), 26 deletions(-) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index 67f71793..654bfea7 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -133,6 +133,140 @@ holds the shader-lab process; no per-candidate curl spin-up. **Total Phase 0: ~480 LOC, one rebuild, one PR.** +## JIT Kernel Contract (non-negotiable; binds every kernel in Phases 1-3) + +Every JIT-emitted kernel in this plan obeys four hard rules. Any +kernel that violates one is rejected. + +### Rule A — Tensor access via `array_window` only + +No kernel writes manual index math, no kernel reaches into a +column's backing storage by raw pointer, no kernel recomputes +slice offsets. Every read is: + +```rust +use ndarray::simd::{array_window, F32x16, U8x64, F16x32}; + +// Windowed read, lane-aligned, SIMD-ready: +let window = array_window(tensor, row_start, row_count); +for lane in window.lanes::() { … } +``` + +`array_window` handles stride, alignment, bounds, and lane +padding uniformly. Deviations mean the kernel is re-implementing +what the canonical surface already did correctly. + +### Rule B — SIMD exclusively via `ndarray::simd::*` + +```rust +// Correct: +use ndarray::simd::{simd_caps, F32x16, U8x64, F16x32, MultiLaneColumn}; + +// Wrong (violates I2): +use ndarray::hpc::simd_avx512::F32x16; // rejected +use std::arch::x86_64::_mm512_loadu_ps; // rejected +``` + +No `std::arch::*`, no `ndarray::hpc::*`, no hand-rolled intrinsics. 
+If a primitive is missing from `ndarray::simd::*`, the plan to +add it lands in **ndarray** first; the codec JIT never bypasses +the canonical surface to chase a fast path. + +### Rule C — SIMD backend dispatch via `simd_caps()` (AMX-ready) + +`ndarray::simd::simd_caps()` returns the singleton capability +vector at process start. The JIT emits generic IR; the underlying +`ndarray::simd::*` primitives resolve to the concrete backend: + +| Platform | Backend resolved | +|---|---| +| `x86_64` + AVX-512 | AVX-512 zmm registers | +| `x86_64` + AVX-2 | AVX-2 ymm registers | +| `aarch64-linux` | NEON | +| **`aarch64-apple-darwin` + AMX** | **Apple AMX tiles** (matmul, rotation, Hadamard butterfly) | +| `aarch64-apple-darwin` without AMX | NEON | +| anything else | scalar fallback | + +Rotation kernels (D1.2) and distance-table lookups (D1.1) map to +AMX tile operations when `simd_caps().has_amx()` on Apple silicon +— that's the matmul-heavy path that most wants AMX. The JIT does +not emit AMX intrinsics directly; it calls +`ndarray::simd::matmul_tiled` (or its rotation / butterfly +equivalents), which internally dispatch to AMX when present. + +If `ndarray::simd` lacks an AMX-backed primitive the kernel +needs, the fix is to add it in ndarray, not to emit raw AMX from +the codec JIT. + +### Rule D — Configuration is JSON / YAML / REST only + +No codec candidate is defined in Rust. Every kernel shape is +fully expressed as declarative config. Three equivalent surfaces, +one schema (`CodecParams`): + +**YAML** (human-authored sweeps, under `configs/codec/*.yaml`): + +```yaml +# configs/codec/cam_pq_wide_residual_hadamard.yaml +name: cam_pq_wide_residual_hadamard +subspaces: 6 +centroids: 1024 +residual_depth: 1 +pre_rotation: + kind: hadamard + dim: 4096 +distance: adc +calibration_rows: 2048 +seed: 42 +``` + +**JSON** (REST payload for sweeps, e.g. 
`curl -d @file.json`): + +```json +{ + "name": "cam_pq_wide_residual_hadamard", + "subspaces": 6, + "centroids": 1024, + "residual_depth": 1, + "pre_rotation": { "kind": "hadamard", "dim": 4096 }, + "distance": "adc", + "calibration_rows": 2048, + "seed": 42 +} +``` + +**REST endpoint** (identical schema, SSE-streamed results): + +``` +POST /v1/shader/calibrate +Content-Type: application/json +Body: <CodecParams JSON> +``` + +Adding a new codec candidate means authoring a YAML file or +constructing a JSON body. **Zero Rust changes. Zero rebuilds.** +The JIT kernel cache hashes `CodecParams` and compiles once per +unique shape; everything after is cache hits. + +### Rule enforcement — test gate in Phase 0 + +Phase 0's verification gate adds: + +- `kernel_contract_test` — iterates a list of `CodecParams` (from + `configs/codec/*.yaml`), compiles each, scans emitted IR for + uses of `ndarray::simd::array_window` and `ndarray::simd::*` + symbols, fails if any kernel reaches `std::arch::*` or + `ndarray::hpc::*`. +- `amx_dispatch_test` (aarch64-apple-darwin only, gated with + `#[cfg(all(target_arch = "aarch64", target_os = "macos"))]`) — + verifies `simd_caps().has_amx() == true` on M-series and that a + rotation kernel's trace records `backend = "amx"` for that + call. + +These tests fire as part of `cargo test -p cognitive-shader-driver +--features lab` in Phase 0 CI; any Phase 1+ commit that breaks +them is rejected. + ## Phase 1 — JIT codec kernels (rebuild-free from here on) ### D1.1 — `CodecParams → KernelHandle` via `JitCompiler` @@ -140,56 +274,84 @@ holds the shader-lab process; no per-candidate curl spin-up. 
`crates/cognitive-shader-driver/src/codec_research.rs` — add: ```rust +use ndarray::simd::{array_window, simd_caps, F32x16, U8x64}; + struct CodecKernelCache { handles: HashMap<u64, KernelHandle>, compiler: JitCompiler, // Cranelift via jitson + caps: SimdCaps, // from ndarray::simd::simd_caps() } impl CodecKernelCache { fn kernel_for(&mut self, params: &CodecParams) -> &KernelHandle { let key = hash_codec_params(params); self.handles.entry(key).or_insert_with(|| { - self.compiler.compile(codec_ir(params)) + // codec_ir emits calls to ndarray::simd::* only: + // - array_window(tensor, row, cnt) for tensor access + // - U8x64::from_lanes(...) for centroid index reads + // - F32x16 arithmetic for ADC distance accumulation + // Zero std::arch, zero ndarray::hpc reach. + self.compiler.compile(codec_ir(params, &self.caps)) }) } } -``` -Cranelift emits a decode function specialised to -`(subspaces, centroids, residual_depth, distance)`. Typical -compile time on our hardware: ~5–20 ms per unique shape; cached -forever after. ~180 LOC. - -### D1.2 — Rotation primitives as JIT kernels - -- **Identity** — no-op, 0 LOC runtime. -- **Hadamard** — Sylvester construction at dim = 2^k. JIT emits - XOR / add-subtract butterfly with SIMD vector width - specialisation. ~90 LOC. -- **Opq(matrix_blob_id)** — load learned rotation matrix from - blob store (Lance column), JIT emits unrolled matmul over the - matrix. Matrix is learned offline; blob ID points to it. - ~100 LOC. +fn codec_ir(params: &CodecParams, caps: &SimdCaps) -> KernelIr { + // Emits IR that: + // for each subspace s: + // let w = array_window(input, s * sub_dim, sub_dim); + // let d = adc_distances_simd::<F32x16>(w, codebook[s]); + // accumulate into row_distance via caps-aware reduction + // if params.residual_depth > 0: recurse on residual + ... +} +``` -Rotation is a separate KernelHandle composed with the decode -kernel at call time. ~190 LOC total. +The JIT never emits raw intrinsics; it emits IR calls to +`ndarray::simd::*`. 
Those resolve to AMX / AVX-512 / NEON / +scalar at run time via `simd_caps()`. Compile time: ~5–20 ms +per unique `CodecParams` shape; cached forever after. ~180 LOC. + +### D1.2 — Rotation primitives as JIT kernels (AMX-backed on Apple) + +- **Identity** — no-op. Kernel returns the input window + unchanged. 0 LOC runtime. +- **Hadamard** — Sylvester construction at dim = 2^k. The JIT + emits calls to `ndarray::simd::hadamard_butterfly(window, + caps)`; that primitive dispatches to AMX tile butterflies when + `caps.has_amx()`, AVX-512 permute-add on x86_64+AVX512, NEON + SWAR otherwise. Window iteration uses `array_window` over the + row. ~90 LOC. +- **Opq(matrix_blob_id)** — load the learned rotation matrix from + a Lance blob column (one-time per matrix_blob_id). JIT emits + calls to `ndarray::simd::matmul_tiled(window, rot_matrix, + caps)`; that primitive dispatches to **AMX tile-matmul when + available** (best path on M-series), AVX-512 VNNI / FMA + otherwise. Matrix is learned offline via a separate training + pipeline; blob ID is part of the YAML/JSON config. ~100 LOC. + +Rotation is a separate `KernelHandle` composed with the decode +kernel at call time (see D1.3 for composition). ~190 LOC total. ### D1.3 — Residual PQ via JIT composition Encode residuals after first-pass decode; second-pass PQ on the -residual. In JIT terms: +residual. All four stages (first-decode, subtract, second-decode, +add) are `array_window`-driven and SIMD via `ndarray::simd::*`: ``` -candidate_kernel = compose( +candidate_kernel = jit.compose(&[ first_pass_decode(CodecParams { residual_depth: 0, .. }), - subtract, + // reads via array_window, accumulates via F32x16 + ndarray::simd::sub_tiled, // SIMD subtract, AMX-backed on Apple second_pass_decode(CodecParams::residual_shape(params)), - add, -) + ndarray::simd::add_tiled, // SIMD add +]); ``` -`compose` is a Cranelift function that emits the straight-line -sequence — no runtime function-call overhead. ~150 LOC. 
+`jit.compose` emits a straight-line Cranelift function, inlining +each stage; no runtime function-call overhead. Every stage still +obeys Rules A-D of the kernel contract. ~150 LOC. **Total Phase 1: ~520 LOC; no canonical-surface changes; all behind `--features lab`.** From 389f6f7423d41b037b10c836b1f2bbf5d2e1d177 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:18:32 +0000 Subject: [PATCH 3/9] plan(codec-sweep): verified ndarray surface; SoA wiring; no ndarray changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corrections after hand-grep vs curated knowledge (encoding-ecosystem.md, codec-findings-2026-04-20.md, rotation_vs_error_correction.md) and user directives "its all there, dont touch, just be aware how to use crate::simd", "wire accordingly into the lab infra", "via struct of arrays": - slice::array_windows::<N>() IS real — stdlib, stable since Rust 1.94, const-generic. I conflated it with a missing ndarray::array_window (singular); corrected. - AMX in ndarray is INTEL (Sapphire Rapids TDPBUSD/TDPBF16PS via stable inline asm on Rust 1.94, per src/simd_amx.rs header), NOT Apple. rust-lang #126622 keeps AMX intrinsics nightly; inline asm at src/hpc/amx_matmul.rs is the stable consumer path. Verified on kernel 6.18.5 with XCR0 bits 17+18 set. - Real primitive names (no hallucinated matmul_tiled / hadamard_butterfly): tile_dpbusd, tile_dpbf16ps, vnni_pack_bf16 for tier-1 AMX; vnni_matvec / matvec_dispatch for tier-2 VNNI; F32x16 / U8x64 / Fingerprint for tier-3 AVX-512 baseline. 
- Polyfill hierarchy per user directive (simd_amx > simd_avx512 > simd_avx2 fallback): Tier 1: Intel AMX tile (256 MACs/instr) Tier 2: AVX-512 VNNI (64 MACs/instr) Tier 3: AVX-512 baseline F32x16 (16 MACs/instr, mandatory default per ndarray's .cargo/config.toml target-cpu=x86-64-v4) Tier 4: AVX-2 F32x8 fallback Tier 5: scalar reference - Rule A wires SoA: the &[u8] slice array_windows iterates comes from a BindSpace column (FingerprintColumns / QualiaColumn / MetaColumn / EdgeColumn) per the AGI-as-SoA identity. No new data structures — the SoA column IS the input surface. - Dropped all "Phase 0 ndarray prerequisite" language. Everything the sweep needs exists in ndarray today; this plan wires the existing surface into cognitive-shader-driver (REST handlers + CodecKernelCache + CodecResearchBridge). Zero ndarray changes. - Added reality-check against codec-findings-2026-04-20.md so the sweep does NOT re-derive measured winners: Had-Q5×D-R already ICC ≈ 0.99 with shared codebook; I8-Hadamard leads for per-row- only at ICC ≈ 0.9; zipper serves bundling axis, not argmax; fractal leaf descriptors are DEAD (sign-flip invariant). The sweep focuses on #220's four unmeasured candidates (wider codebook / residual PQ / Hadamard pre-rotation / OPQ) and on the missing axis — token agreement, not reconstruction ICC. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 196 ++++++++++++++---- 1 file changed, 150 insertions(+), 46 deletions(-) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index 654bfea7..de76d855 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -138,65 +138,169 @@ holds the shader-lab process; no per-candidate curl spin-up. Every JIT-emitted kernel in this plan obeys four hard rules. Any kernel that violates one is rejected. 
-### Rule A — Tensor access via `array_window` only +### Rule A — Tensor access via stdlib `slice::array_windows::<N>()` + `ndarray::simd::*` loaders -No kernel writes manual index math, no kernel reaches into a -column's backing storage by raw pointer, no kernel recomputes -slice offsets. Every read is: +Per `ndarray/.claude/rules/data-flow.md` Pattern 1: SIMD reads are +zero-copy `&[u8]` borrows from the backing store (PackedDatabase +/ Arrow buffer / BindSpace column). Fixed-size windowing uses +the **stdlib** const-generic primitive `slice::array_windows::<N>()` +(stable since Rust 1.77), which yields `&[T; N]` array references with +bounds handled by the compiler. No manual index math, no raw +pointer reach, no per-kernel slicing arithmetic. ```rust -use ndarray::simd::{array_window, F32x16, U8x64, F16x32}; +use ndarray::simd::F32x16; -// Windowed read, lane-aligned, SIMD-ready: -let window = array_window(tensor, row_start, row_count); -for lane in window.lanes::() { … } +let row_bytes: &[u8] = column.row_slice(row_idx); // zero-copy borrow, 64-byte aligned + +// Stdlib array_windows (const generic, stable 1.77) — one F32x16 lane per window: +for w in row_bytes.array_windows::<64>() { + // w: &[u8; 64] — bounds guaranteed by the type + let lane = F32x16::from_slice(bytemuck::cast_slice(w)); + // …SIMD accumulate via ndarray::simd::* ops… +} + +// For non-overlapping subspace reads use slice::array_chunks::<N>() (stable 1.88): +for chunk in row_bytes.array_chunks::<SUBSPACE_BYTES>() { + // chunk: &[u8; SUBSPACE_BYTES] + … +} +``` + +Why `array_windows` specifically: the const-generic type +guarantees each window has exactly the lane width the SIMD type +expects, so `from_slice` on it never panics and LLVM can elide +the bounds check. Hand-rolled windowing is rejected. + +**SoA source of the `&[u8]` slice.** The row bytes come from a +`BindSpace` column — `FingerprintColumns`, `QualiaColumn`, +`MetaColumn`, or `EdgeColumn` per the struct-of-arrays identity in +`lab-vs-canonical-surface.md`.
The codec JIT reads from the same +columns the shader sweeps: + +```rust +use cognitive_shader_driver::{BindSpace, FingerprintColumns}; + +let fp_col: &FingerprintColumns = bindspace.fingerprints(); +let row_bytes: &[u8] = fp_col.row_bytes(row_idx); // zero-copy into SoA column +for w in row_bytes.array_windows::<64>() { /* …SIMD accumulate… */ } ``` -`array_window` handles stride, alignment, bounds, and lane -padding uniformly. Deviations mean the kernel is re-implementing -what the canonical surface already did correctly. +No new data structures. The SoA column IS the input surface. + +### Rule B — SIMD exclusively via `ndarray::simd::*` and its AMX sibling modules -### Rule B — SIMD exclusively via `ndarray::simd::*` +All primitives already exist in ndarray. The codec JIT consumes +them as-is; **no ndarray changes**: ```rust -// Correct: -use ndarray::simd::{simd_caps, F32x16, U8x64, F16x32, MultiLaneColumn}; +// Canonical lane types (ndarray::simd re-exports): +use ndarray::simd::{F32x16, U8x64, Fingerprint, hamming_distance_raw, popcount_raw}; + +// AMX + VNNI (sibling top-level module, canonical AMX surface): +use ndarray::simd_amx::{amx_available, vnni_dot_u8_i8, vnni_matvec, matvec_dispatch}; + +// AMX tile primitives (inline-asm stable path; Rust-lang #126622 keeps +// intrinsics nightly, so ndarray ships stable inline asm): +use ndarray::hpc::amx_matmul::{ + tile_loadconfig, tile_zero, tile_load, tile_store, tile_release, + tile_dpbusd, tile_dpbf16ps, vnni_pack_bf16, +}; + +// Runtime caps (at hpc::simd_caps — use the existing path, do not propose +// a re-export; "don't touch ndarray"): +use ndarray::hpc::simd_caps::{simd_caps, SimdCaps}; // Wrong (violates I2): -use ndarray::hpc::simd_avx512::F32x16; // rejected -use std::arch::x86_64::_mm512_loadu_ps; // rejected +use ndarray::hpc::simd_avx512::F32x16; // private backend reach +use std::arch::x86_64::_mm512_loadu_ps; // hand-rolled intrinsic +``` + +Everything the sweep needs is already in ndarray. 
This plan wires +the existing surface into the lab infra (REST handlers + +`CodecKernelCache` + `CodecResearchBridge`); it adds nothing to +ndarray. + +### Rule C — Polyfill hierarchy: Intel AMX → AVX-512 VNNI → AVX-512 baseline → AVX-2 → scalar + +The SIMD tier each JIT-emitted kernel lands on follows this +strict polyfill chain — tier 1 is tried first, each tier falls +through to the next when unavailable: + +| Tier | Primitive | Source | When available | MACs / instr | +|---|---|---|---|---| +| **1 — Intel AMX tiles** (preferred for matmul-heavy paths: OPQ, distance-table build) | `tile_dpbusd` (u8×i8→i32) / `tile_dpbf16ps` (bf16×bf16→f32) | `ndarray::hpc::amx_matmul::*` | `ndarray::simd_amx::amx_available() == true` (Sapphire Rapids+, OS has enabled XCR0 tile bits 17/18, Linux `prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)` succeeded) | **256** | +| 2 — AVX-512 VNNI | `vnni_dot_u8_i8`, `vnni_matvec`, `matvec_dispatch` | `ndarray::simd_amx::*` (VNNI lives one tier down from AMX, stable intrinsics) | AVX-512 VNNI subset | 64 | +| 3 — AVX-512 baseline | `F32x16`, `U8x64`, `F64x8` | `ndarray::simd::*` (mandatory default: ndarray's `.cargo/config.toml` sets `target-cpu=x86-64-v4`) | Always on canonical build targets | 16 | +| 4 — AVX-2 fallback | `F32x8`, `F64x4` | `ndarray::simd::*` (cfg-gated; triggers when build drops to `x86-64-v3`) | Compile-time cfg | 8 | +| 5 — Scalar | pure Rust loops | `ndarray::simd::scalar::*` | non-x86 / short slices / correctness reference | 1 | + +**Dispatch shape the JIT emits (real primitive names only):** + +```rust +use ndarray::simd_amx::amx_available; +use ndarray::hpc::amx_matmul::{tile_dpbusd, tile_dpbf16ps}; +use ndarray::simd::F32x16; + +if amx_available() && kernel_params.is_matmul_heavy() { + // Tier 1: Intel AMX tile matmul. Codebook distance-table build + // drops from 24-48h (scalar/VNNI) to ~1:20h at this tier per + // simd_amx.rs top-of-module measurement. 
+ unsafe { tile_dpbf16ps(); } // or tile_dpbusd for u8×i8 accumulators +} else { + // Tiers 2-5: target-cpu=x86-64-v4 keeps Tier 3 as the always- + // available floor; cfg resolves the specific lane type. + let lane = F32x16::from_slice(…); + /* …accumulate… */ +} ``` -No `std::arch::*`, no `ndarray::hpc::*`, no hand-rolled intrinsics. -If a primitive is missing from `ndarray::simd::*`, the plan to -add it lands in **ndarray** first; the codec JIT never bypasses -the canonical surface to chase a fast path. - -### Rule C — SIMD backend dispatch via `simd_caps()` (AMX-ready) - -`ndarray::simd::simd_caps()` returns the singleton capability -vector at process start. The JIT emits generic IR; the underlying -`ndarray::simd::*` primitives resolve to the concrete backend: - -| Platform | Backend resolved | -|---|---| -| `x86_64` + AVX-512 | AVX-512 zmm registers | -| `x86_64` + AVX-2 | AVX-2 ymm registers | -| `aarch64-linux` | NEON | -| **`aarch64-apple-darwin` + AMX** | **Apple AMX tiles** (matmul, rotation, Hadamard butterfly) | -| `aarch64-apple-darwin` without AMX | NEON | -| anything else | scalar fallback | - -Rotation kernels (D1.2) and distance-table lookups (D1.1) map to -AMX tile operations when `simd_caps().has_amx()` on Apple silicon -— that's the matmul-heavy path that most wants AMX. The JIT does -not emit AMX intrinsics directly; it calls -`ndarray::simd::matmul_tiled` (or its rotation / butterfly -equivalents), which internally dispatch to AMX when present. - -If `ndarray::simd` lacks an AMX-backed primitive the kernel -needs, the fix is to add it in ndarray, not to emit raw AMX from -the codec JIT. +**Why Tier 1 matters for this sweep specifically.** The plan +exercises ~200 codec candidates across (centroids × subspaces +× residual depth × rotation × distance). On Sapphire Rapids +hardware, AMX drops codebook distance-table build from 24-48 h +to ~1 h 20 min (measured; cited in `simd_amx.rs` header). 
For +the four #220 fixes in particular: + +- (a) **wider codebook (1024+ centroids)** — bigger distance + table, so AMX matters more. +- (b) **residual PQ** — two distance-table lookups per row, AMX + helps both. +- (c) **Hadamard pre-rotation** — add/sub butterfly, NOT matmul: + stays at Tier 3 F32x16 (already fast; AMX adds no value here). +- (d) **OPQ** — learned rotation matrix applied as matmul → Tier 1 + AMX is the dominant speedup path. + +**The JIT does NOT emit AMX inline assembly.** It emits IR that +calls `ndarray::hpc::amx_matmul::tile_*` primitives, which are +themselves stable-Rust-1.94 inline asm (verified on real +Sapphire Rapids hardware per the `simd_amx.rs` module header: +LDTILECFG / TILEZERO / TDPBUSD / TDPBF16PS / TILERELEASE all +tested on kernel 6.18.5 with XCR0 bits 17+18 set). Rust-lang +issue #126622 tracks AMX intrinsic stabilization; until it +lands, inline asm is the canonical stable path and the codec +JIT consumes it through `ndarray::hpc::amx_matmul::*`, never +directly. + +### Reality-check against existing codec-findings (do NOT re-derive) + +Per `.claude/knowledge/codec-findings-2026-04-20.md`: + +- **Had-Q5×D-R** (shared codebook) — already ICC ≈ 0.99 at + ~0 per-row bytes on q_proj / k_proj / gate_proj. **Argmax + compression with shared codebook is solved.** +- **I8-Hadamard** (per-row only) — ICC ≈ 0.9 at 9 B/row. Leader + for no-shared-codebook constraint. +- **Zipper family** — tops at ICC ≈ 0.2, serves bundling / + progressive / anti-moiré axis, NOT argmax ICC. +- **Fractal leaf descriptors** — sign-flip invariant (ICC + ≈ −0.999); **DEAD** without breaking the invariance. + +The sweep here does NOT re-explore what's measured. It focuses +on the #220 candidates (wider codebook, residual PQ, Hadamard +pre-rotation with trained codebook, OPQ) and measures their +**token agreement** — the missing axis that reconstruction ICC +alone doesn't close. 
### Rule D — Configuration is JSON / YAML / REST only From 9f1e5afbb8efb341f915abec8655756b2c4a8a1e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:19:37 +0000 Subject: [PATCH 4/9] =?UTF-8?q?plan(codec-sweep):=20iron=20rule=20?= =?UTF-8?q?=E2=80=94=20SoA=20never=20scalarises=20without=20ndarray?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User directive: "i should never have to remind you to use simd because the struct of arrays never ever does scalar without ndarray." Corrections: - Removed consumer-visible "Tier 5 scalar" row from the polyfill table. Scalar fallback (when it exists at all for exotic targets) lives INSIDE ndarray::simd::* — the consumer never hand-rolls a scalar loop on a SoA path. - Added iron rule before the tier table: every tier in the chain calls ndarray::simd::* / ndarray::simd_amx::* / ndarray::hpc::amx_matmul::* — if a kernel runs scalar on the SoA path, the SoA invariant is broken. - Dispatch pseudo-code cleaned: the else branch lands on ndarray::simd::F32x16 (Tier 3 mandatory floor via target-cpu= x86-64-v4). No "else scalar loop" short-circuit exists. If ndarray::simd were unavailable, SoA wouldn't be the right path. 
https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index de76d855..2835e470 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -227,13 +227,26 @@ The SIMD tier each JIT-emitted kernel lands on follows this strict polyfill chain — tier 1 is tried first, each tier falls through to the next when unavailable: -| Tier | Primitive | Source | When available | MACs / instr | +**Iron rule — SoA never scalarises without ndarray.** If a kernel +runs scalar, the SoA invariant is broken. Every tier in the chain +below calls `ndarray::simd::*` or `ndarray::simd_amx::*` or +`ndarray::hpc::amx_matmul::*` — these modules handle their own +internal scalar fallback for exotic targets; the consumer never +hand-rolls a scalar loop. 
+ +| Tier | Primitive | Source | When selected | MACs / instr | |---|---|---|---|---| | **1 — Intel AMX tiles** (preferred for matmul-heavy paths: OPQ, distance-table build) | `tile_dpbusd` (u8×i8→i32) / `tile_dpbf16ps` (bf16×bf16→f32) | `ndarray::hpc::amx_matmul::*` | `ndarray::simd_amx::amx_available() == true` (Sapphire Rapids+, OS has enabled XCR0 tile bits 17/18, Linux `prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)` succeeded) | **256** | | 2 — AVX-512 VNNI | `vnni_dot_u8_i8`, `vnni_matvec`, `matvec_dispatch` | `ndarray::simd_amx::*` (VNNI lives one tier down from AMX, stable intrinsics) | AVX-512 VNNI subset | 64 | | 3 — AVX-512 baseline | `F32x16`, `U8x64`, `F64x8` | `ndarray::simd::*` (mandatory default: ndarray's `.cargo/config.toml` sets `target-cpu=x86-64-v4`) | Always on canonical build targets | 16 | -| 4 — AVX-2 fallback | `F32x8`, `F64x4` | `ndarray::simd::*` (cfg-gated; triggers when build drops to `x86-64-v3`) | Compile-time cfg | 8 | -| 5 — Scalar | pure Rust loops | `ndarray::simd::scalar::*` | non-x86 / short slices / correctness reference | 1 | +| 4 — AVX-2 fallback | `F32x8`, `F64x4` | `ndarray::simd::*` (cfg-gated; triggers only when build cfg drops to `x86-64-v3`) | Compile-time cfg | 8 | + +Note the absence of a consumer-visible "scalar" tier. Scalar +fallback — when it exists at all — lives inside `ndarray::simd` +for non-x86 correctness; the codec JIT never emits it directly +and never short-circuits out of an ndarray call into a hand- +written loop. Any such short-circuit on a SoA path is a contract +violation. **Dispatch shape the JIT emits (real primitive names only):** @@ -244,14 +257,17 @@ use ndarray::simd::F32x16; if amx_available() && kernel_params.is_matmul_heavy() { // Tier 1: Intel AMX tile matmul. Codebook distance-table build - // drops from 24-48h (scalar/VNNI) to ~1:20h at this tier per + // drops from 24-48h (non-AMX) to ~1:20h at this tier per // simd_amx.rs top-of-module measurement. 
unsafe { tile_dpbf16ps(); } // or tile_dpbusd for u8×i8 accumulators } else { - // Tiers 2-5: target-cpu=x86-64-v4 keeps Tier 3 as the always- - // available floor; cfg resolves the specific lane type. - let lane = F32x16::from_slice(…); - /* …accumulate… */ + // Tiers 2-4 via ndarray::simd::* — target-cpu=x86-64-v4 keeps + // Tier 3 (F32x16) as the always-available floor on canonical + // builds. cfg resolves the specific lane type at compile time. + // No hand-rolled scalar "else" branch — if ndarray::simd were + // unavailable the SoA path itself would not be the right one. + let lane = F32x16::from_slice(bytemuck::cast_slice(window)); + /* …accumulate via F32x16 ops… */ } ``` From 26bf9cff7a95fa106c7329e3a806a39193057b75 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:23:51 +0000 Subject: [PATCH 5/9] =?UTF-8?q?plan(codec-sweep):=20Rule=20E=20=E2=80=94?= =?UTF-8?q?=20Wire=20surface=20IS=20the=20SIMD=20surface=20(object-oriente?= =?UTF-8?q?d)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User directive: "the api for lab needs to be simd object oriented surface." Rule E binds the lab Wire DTOs to the SIMD shapes they feed. The Wire types are NOT convenience scalar bags that get reassembled into SIMD structures internally — they ARE the SIMD surface, serialised. Four consequences: (i) Lane-shaped aggregates. LaneWidth enum mirrors ndarray::simd::* lane types (F32x16, U8x64, F64x8, BF16x32). Every tensor-carrying DTO names its lane_width explicitly. (ii) Methods, not bags. WireTensorView exposes row() / row_count() / lanes_f32x16() / subspace(); CodecParams exposes kernel_signature() / lane_width() / is_matmul_heavy(). Consumers never reassemble a tensor from a Vec. (iii) Kernel signature keying. CodecParams::kernel_signature() returns a stable hash only over fields that shape the emitted IR. JIT cache keys on this object-computed signature; adding an unrelated config field does not invalidate entries. 
(iv) Serialisation preserves alignment. Decoded WireTensorView bytes land in a 64-byte-aligned buffer; consumers call slice::array_windows::<64>() + F32x16::from_slice directly, no adapter, no copy, no re-align. Plus three cleanups from prior corrections: - kernel_contract_test now scans IR for the real symbols: ndarray::simd::*, ndarray::simd_amx::*, ndarray::hpc::amx_matmul::* (allowed) and std::arch / simd_avxNNN reach (banned). - amx_dispatch_test corrected: x86_64-gated (not aarch64-apple-darwin), calls ndarray::simd_amx::amx_available(). When true on Sapphire Rapids+ runners, asserts backend = "amx" trace for matmul-heavy candidates; when false, verifies Tier-2 VNNI or Tier-3 F32x16 selection — NEVER scalar. - New wire_object_surface_test round-trips WireCalibrate + WireTensorView through JSON/gRPC and proves the decoded bytes are consumable with zero adapter code via array_windows + F32x16. - D1.1 body sketch cleaned: dropped fictional array_window (singular); imports simd_caps from ndarray::hpc::simd_caps (real path); cache uses RwLock for interior mutability per ndarray data-flow rule ("no &mut self during computation"); kernel_signature comes from CodecParams method (Rule E), not a free-function hash. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 172 ++++++++++++++---- 1 file changed, 140 insertions(+), 32 deletions(-) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index 2835e470..359503d0 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -368,22 +368,120 @@ constructing a JSON body. **Zero Rust changes. Zero rebuilds.** The JIT kernel cache hashes `CodecParams` and compiles once per unique shape; everything after is cache hits. 
-### Rule enforcement — test gate in Phase 0 +### Rule E — Wire surface IS the SIMD surface (object-oriented, not scalar bags) -Phase 0's verification gate adds: +The REST/gRPC/Wire DTOs do not package "convenience" scalars that +internally reassemble into SIMD structures. The Wire types ARE +the SIMD surface, serialised. Four consequences: + +**(i) Lane-shaped aggregates.** Every tensor-carrying DTO names +its lane width explicitly via an enum that mirrors the +`ndarray::simd::*` lane types it will feed: + +```rust +pub enum LaneWidth { + F32x16, // AVX-512 f32 lane — default for codec decode + U8x64, // AVX-512 u8 lane — palette index reads + F64x8, // AVX-512 f64 lane — high-precision calibration + BF16x32, // AVX-512 bf16 lane — tile_dpbf16ps input +} + +pub struct WireTensorView { + pub shape: [u32; 2], // rows × cols + pub lane_width: LaneWidth, + pub bytes_base64: String, // decode target is 64-byte aligned +} +``` + +**(ii) Methods, not bags.** Every DTO exposes methods that mirror +the SoA + SIMD operations the JIT kernel will perform. Consumers +never reassemble a tensor from a `Vec`: + +```rust +impl WireTensorView { + pub fn row(&self, idx: usize) -> &[u8]; // zero-copy slice after decode + pub fn row_count(&self) -> u32 { self.shape[0] } + pub fn lanes_f32x16(&self, row: usize) -> impl Iterator<Item = F32x16> + '_; + pub fn subspace(&self, row: usize, k: u32, sub_bytes: usize) -> &[u8]; +} + +impl CodecParams { + /// Object-computed signature; JIT cache key. + pub fn kernel_signature(&self) -> u64; + /// Expected lane width of the decode kernel this params produces. + pub fn lane_width(&self) -> LaneWidth; + /// True if this kernel benefits from Tier-1 AMX dispatch.
+ pub fn is_matmul_heavy(&self) -> bool; +} +``` + +**(iii) Kernel signature keying.** `CodecParams::kernel_signature` +is the JIT cache key; adding an unrelated config field does not +invalidate existing kernel entries because the method returns a +stable hash only over the fields that actually shape the emitted +IR. + +**(iv) Serialisation preserves alignment.** When the REST handler +decodes `WireTensorView.bytes_base64`, the output lands in a +64-byte-aligned buffer (via the existing `ndarray::hpc` +alignment utilities). Consumers can +`slice::array_windows::<64>()` over the decoded buffer and feed +the result directly to `F32x16::from_slice` — no re-align, no +copy, no adapter. + +**Example — JSON body for `/v1/shader/calibrate` (SIMD-shaped):** + +```json +{ + "candidate": { + "subspaces": 6, + "centroids": 1024, + "residual_depth": 1, + "lane_width": "F32x16", + "pre_rotation": { "kind": "hadamard", "dim": 4096 }, + "distance": "adc", + "calibration_rows": 2048, + "seed": 42 + }, + "tensor_view": { + "shape": [3072, 1024], + "lane_width": "F32x16", + "bytes_base64": "<…>" + } +} +``` + +`lane_width` on both sides is a contract: the JIT kernel compiled +for `F32x16` reads a `WireTensorView` aligned for `F32x16`. No +adapter layer between the DTO and the `ndarray::simd::*` call +site. + +### Rule enforcement — test gates in Phase 0 + +Phase 0's verification adds: - `kernel_contract_test` — iterates a list of `CodecParams` (from - `configs/codec/*.yaml`), compiles each, scans emitted IR for - uses of `ndarray::simd::array_window` and `ndarray::simd::*` - symbols, fails if any kernel reaches `std::arch::*` or - `ndarray::hpc::*`. -- `amx_dispatch_test` (aarch64-apple-darwin only, gated with - `#[cfg(all(target_arch = "aarch64", target_os = "macos"))]`) — - verifies `simd_caps().has_amx() == true` on M-series and that a - rotation kernel's trace records `backend = "amx"` for that - call. 
- -These tests fire as part of `cargo test -p cognitive-shader-driver + `configs/codec/*.yaml`), compiles each, scans the emitted IR + for banned symbols (`std::arch::*`, + `ndarray::hpc::simd_avx{2,512}::*` reach-through) and required + callsites (`ndarray::simd::*`, `ndarray::simd_amx::*`, or + `ndarray::hpc::amx_matmul::*`). Fails if any kernel reaches + around the canonical surface or hand-rolls a scalar loop. +- `amx_dispatch_test` — gated with + `#[cfg(target_arch = "x86_64")]`; calls + `ndarray::simd_amx::amx_available()`, and when `true` on the + runner (Sapphire Rapids+ with OS tile state enabled), verifies + a matmul-heavy candidate's emitted kernel's trace records + `backend = "amx"`. When `false`, verifies Tier-2 VNNI or + Tier-3 F32x16 selection, never scalar. +- `wire_object_surface_test` — round-trips `WireCalibrate` + + `WireTensorView` through JSON and gRPC, asserts decoded bytes + land in a 64-byte-aligned buffer, and calls + `slice::array_windows::<64>()` + `F32x16::from_slice` on the + result to prove the surface is consumable with zero adapter + code. + +All three fire under `cargo test -p cognitive-shader-driver --features lab` in Phase 0 CI; any Phase 1+ commit that breaks them is rejected. @@ -394,35 +492,45 @@ them is rejected. `crates/cognitive-shader-driver/src/codec_research.rs` — add: ```rust -use ndarray::simd::{array_window, simd_caps, F32x16, U8x64}; +use ndarray::simd::{F32x16, U8x64}; +use ndarray::hpc::simd_caps::{simd_caps, SimdCaps}; +use ndarray::simd_amx::amx_available; +use std::sync::RwLock; +// Per ndarray/.claude/rules/data-flow.md: "No &mut self during +// computation." Cache uses interior mutability. 
struct CodecKernelCache { - handles: HashMap, - compiler: JitCompiler, // Cranelift via jitson - caps: SimdCaps, // from ndarray::simd::simd_caps() + handles: RwLock<HashMap<u64, KernelHandle>>, + compiler: JitCompiler, // Cranelift via jitson + caps: SimdCaps, // from ndarray::hpc::simd_caps::simd_caps() } impl CodecKernelCache { - fn kernel_for(&mut self, params: &CodecParams) -> &KernelHandle { - let key = hash_codec_params(params); - self.handles.entry(key).or_insert_with(|| { - // codec_ir emits calls to ndarray::simd::* only: - // - array_window(tensor, row, cnt) for tensor access - // - U8x64::from_lanes(...) for centroid index reads - // - F32x16 arithmetic for ADC distance accumulation - // Zero std::arch, zero ndarray::hpc reach. - self.compiler.compile(codec_ir(params, &self.caps)) - }) + fn kernel_for(&self, params: &CodecParams) -> KernelHandle { + let key = params.kernel_signature(); // object-computed, per Rule E + if let Some(h) = self.handles.read().unwrap().get(&key) { return h.clone(); } + let handle = self.compiler.compile(codec_ir(params, &self.caps)); + self.handles.write().unwrap().insert(key, handle.clone()); + handle } } fn codec_ir(params: &CodecParams, caps: &SimdCaps) -> KernelIr { // Emits IR that: - // for each subspace s: - // let w = array_window(input, s * sub_dim, sub_dim); - // let d = adc_distances_simd::(w, codebook[s]); - // accumulate into row_distance via caps-aware reduction - // if params.residual_depth > 0: recurse on residual + // * Iterates rows via stdlib slice::array_windows::<64>() over + // the 64-byte-aligned WireTensorView buffer (per Rule A). + // * For matmul-heavy rotations with amx_available() && caps: + // ndarray::hpc::amx_matmul::{tile_dpbusd, tile_dpbf16ps} + // (Tier 1, 256 MACs/instr). + // * Otherwise: + // ndarray::simd_amx::{vnni_dot_u8_i8, vnni_matvec} (Tier 2, 64) + // or ndarray::simd::F32x16 / U8x64 (Tier 3, 16 — mandatory floor). + // * Accumulates ADC distances via F32x16 adds and F32x16::reduce_sum.
+ // * If params.residual_depth > 0: compose a second IR block over + // (input − first_pass_decoded) at recursive lane width. + // + // Zero std::arch::*, zero ndarray::hpc::simd_avx{2,512}::* reach, + // zero scalar loops. ... } ``` From 9f8f15a20fb3928d29be05130049b5f0a5a630c3 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:27:24 +0000 Subject: [PATCH 6/9] =?UTF-8?q?plan(codec-sweep):=20Rule=20F=20=E2=80=94?= =?UTF-8?q?=20serialisation=20at=20the=20edge=20only,=20never=20inside?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User directive: "Serialisation only once when touching as rest, no Serialisation EVER inside." Rule F binds serialisation to the two legitimate edges: Ingress (once per request): REST/gRPC handler decodes JSON/protobuf → Rust objects WireTensorView.bytes_base64 → 64-byte-aligned [u8] buffer YAML config file → parsed CodecParams at load time Egress (once per response / per candidate): REST/gRPC response encodes Rust result → JSON/protobuf Lance append writes candidate row → Arrow columnar Everything between ingress and egress is in-memory Rust objects or zero-copy &[u8] SoA slices. No JSON, no YAML, no protobuf, no bincode, no re-encode for "debug dumps." Traces flow as Rust objects through ShaderSink; only the final sink at the egress boundary may serialise. Hard prohibitions inside the pipeline: - serde_json::to_string between layers - bincode::serialize for L1↔L2↔L3 handoffs - prost::Message::encode inside the JIT loop - re-parsing YAML per candidate (parse once at load, cache object) - debug-JSON dumps inside hot paths Why load-bearing: 1. Alignment survives — decoded tensor bytes land once in a 64-byte-aligned buffer; no intermediate re-pack. 2. JIT cache keys stay stable — kernel_signature hashes the Rust object directly, no "same config, different whitespace → different hash → cache miss" trap. 3. 
Token-agreement comparisons stay honest — both Passthrough and candidate paths consume the same decoded buffer; any internal re-encode would introduce precision drift that mimics or masks codec error. 4. Sweep throughput — decode at 2-10 GB/s is fine once; repeated re-serialisation would turn a JIT-fast sweep into serde-bound. Enforcement: new test gate no_internal_serialisation_test in Phase 0 scans codec_research.rs / codec_bridge.rs / token_agreement.rs / markov_bundle.rs for forbidden symbols (serde_json::*, bincode::*, prost encode/decode outside handlers). Fails the build if any such call appears outside src/serve.rs / src/grpc.rs ingress/egress handlers or the Lance append writer. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 83 ++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index 359503d0..9b4fb0d5 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -456,6 +456,75 @@ for `F32x16` reads a `WireTensorView` aligned for `F32x16`. No adapter layer between the DTO and the `ndarray::simd::*` call site. 
+### Rule F — Serialisation at the edge only; never inside + +Serialisation (JSON / YAML / protobuf / bincode / anything that +turns structured bytes into more bytes) happens at exactly two +points per request: + +``` +┌────────────────────────────────────────────────────────────────┐ +│ REST / gRPC ingress │ +│ Wire bytes ──decode ONCE──▶ Rust objects │ +│ JSON body → WireCalibrate + WireTensorView │ +│ bytes_base64 → 64-byte-aligned [u8] buffer │ +│ │ +│ ═══════ EVERYTHING BELOW IS IN-MEMORY RUST OR &[u8] SoA ═══════│ +│ │ +│ CodecKernelCache.kernel_for(params) — operates on Rust object │ +│ codec_ir(params, caps) — emits IR from object │ +│ JIT kernel.call(row_bytes) — reads &[u8], writes &mut│ +│ F32x16::from_slice / tile_dpbusd — SIMD ops on raw bytes │ +│ SoA column reads / writes — Copy microcopies │ +│ ShaderDriver.dispatch(...) — Rust object flow │ +│ │ +│ ═══════ NO JSON, NO YAML, NO PROTOBUF, NO BINCODE HERE ═══════ │ +│ │ +│ Lance append (egress — one serialisation to columnar disk) │ +│ REST / gRPC response (egress — one encode of WireResult out) │ +└────────────────────────────────────────────────────────────────┘ +``` + +**Hard prohibitions inside the pipeline:** + +- No `serde_json::to_string(&params)` between layers. +- No `bincode::serialize(&state)` for L1↔L2↔L3 handoffs. +- No `prost::Message::encode(&cell)` inside the JIT loop. +- No re-parsing a YAML file per candidate (parse once at load, + cache the Rust object). +- No "debug JSON dump" inside hot paths (traces flow as Rust + objects through `ShaderSink`; only the final sink at the + egress boundary may serialise). + +**Why this is load-bearing:** + +1. **Alignment survives.** Decoded `WireTensorView` bytes land + once in a 64-byte-aligned buffer; no intermediate step + re-packs them. `array_windows::<64>()` + `F32x16::from_slice` + see the original decode target. +2.
**JIT cache keys are stable.** `CodecParams::kernel_signature` + hashes the Rust object directly; avoids the "same config, + different JSON whitespace → different hash → cache miss" + trap. +3. **Token-agreement comparisons stay honest.** Both + `Passthrough` and `candidate` paths consume the same decoded + tensor buffer. Any internal re-encode would introduce + precision drift that mimics (or masks) codec error. +4. **Sweep throughput.** Ingest at 2-10 GB/s decode is fine + once; repeated re-serialisation would turn a JIT-fast sweep + into a serde-bound one. + +**The allowed edges (ingress and egress):** + +| Edge | Format | Direction | Frequency | +|---|---|---|---| +| REST/gRPC ingress | JSON / protobuf | in | once per request | +| REST/gRPC response | JSON / protobuf | out | once per response | +| Lance append | Arrow columnar | out (egress) | once per candidate (sweep logger) | +| YAML config load | YAML | in | once per config file at load | + +Anything else — reject the proposal. + ### Rule enforcement — test gates in Phase 0 Phase 0's verification adds: @@ -480,8 +549,18 @@ Phase 0's verification adds: `slice::array_windows::<64>()` + `F32x16::from_slice` on the result to prove the surface is consumable with zero adapter code. - -All three fire under `cargo test -p cognitive-shader-driver +- `no_internal_serialisation_test` — scans the + `codec_research.rs` / `codec_bridge.rs` / `token_agreement.rs` + / `markov_bundle.rs` / any JIT-adjacent module for forbidden + symbols (`serde_json::to_*`, `serde_json::from_*`, + `bincode::*`, `prost::Message::encode`, `prost::Message::decode` + outside ingress/egress handlers). Enforces Rule F: these calls + may appear ONLY in `src/serve.rs::handler_*` and + `src/grpc.rs::*_service` (ingress), the final response write + (egress), and the Lance append writer (egress to disk). Any + other callsite fails the test.
+ +All four fire under `cargo test -p cognitive-shader-driver --features lab` in Phase 0 CI; any Phase 1+ commit that breaks them is rejected. From 6c1d7e0337d316f1a334b5013435a3430dca3df3 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:31:00 +0000 Subject: [PATCH 7/9] =?UTF-8?q?plan(codec-sweep):=20appendix=20A=20?= =?UTF-8?q?=E2=80=94=20starter=20YAML=20configs=20per=20Rule=20D/E/F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nine concrete YAML configs for configs/codec/*.yaml that Phase 0 will consume: 00_baseline_passthrough — regression anchor (top1=1.000 exactly) 01_pr220_baseline — negative control, reproduces #220 ICC 0.195 02_pr219_overfit_reproducer — negative control, split-test must FAIL 10_fix_a_wider_codebook — #220 (a) 1024 centroids 11_fix_b_residual_pq — #220 (b) residual depth=1 12_fix_c_hadamard_rotation — #220 (c) Hadamard pre-rotation 13_fix_d_opq_rotation — #220 (d) OPQ learned rotation 20_composite_a_plus_b — composition probe for combinatorial lift 30_cross_product_sweep — SweepGrid for D3.1 initial sweep Each YAML: - Names lane_width explicitly (Rule E) so the JIT compiles the right SIMD tier. BF16x32 for OPQ (AMX bf16 tile path) — others default to F32x16. - Carries a notes: block stating the expected measurement outcome, so Phase 0's regression detection has ground truth to check against (e.g., baseline reproducer must produce ICC ≈ 0.195, overfit reproducer must FAIL the split-test). - Separates calibration_rows from measurement_rows where relevant (pr219_overfit_reproducer sets them equal so the pipeline refuses to report the ICC, demonstrating the guard that prevents PR #219's overfit-on-training artefact from recurring). 30_cross_product_sweep specifies the initial 54-candidate grid (1 subspace × 3 centroids × 3 residuals × 3 rotations × 1 distance × 2 lane widths). Expected JIT compile budget: ~800 ms one-time; everything after is cache hits per Rule A/B.
Operating principle reiterated at the end: adding a candidate is authoring a YAML; changing params is editing YAML; Rust reads YAML once at ingress (Rule F) and never re-serialises. Sweep logger appends result rows to Lance — the only egress beyond the REST response. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 245 ++++++++++++++++++ 1 file changed, 245 insertions(+) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index 9b4fb0d5..c0896e84 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -863,3 +863,248 @@ a clean verification step. lattice quantization, residual vector quantization, neural codecs — any codec where decoding is parameterised by a small struct fits the sweep driver unchanged. + +--- + +## Appendix A — Starter YAML configs (one per #220 fix + controls) + +These are the concrete inputs Phase 0 consumes once the Wire +surface is hardened. Living at `configs/codec/*.yaml`; new +candidates are YAML edits, not Rust changes (Rule D). Each +explicitly names its `lane_width` per Rule E so the JIT compiles +the right SIMD tier. + +### `configs/codec/00_baseline_passthrough.yaml` — regression anchor + +The null codec: no compression, no rotation. Token-agreement vs +itself must be 1.0 exactly — any drift means the harness is +non-deterministic. + +```yaml +name: baseline_passthrough +codec: passthrough +lane_width: F32x16 +calibration_rows: 0 +seed: 42 +notes: | + Token-agreement gate self-test. top1_rate must be 1.000 exactly. +``` + +### `configs/codec/01_pr220_baseline.yaml` — negative control + +Reproduces PR #220's measured result (reconstruction ICC ≈ 0.195, +0/234 tensors ≥ 0.99) so the sweep pipeline demonstrably doesn't +silently "fix" prior measurements. If this config produces any +number other than ≈ 0.195, the pipeline is broken, not the codec. 
+ +```yaml +name: pr220_baseline +codec: cam_pq +subspaces: 6 +centroids: 256 +residual_depth: 0 +lane_width: F32x16 +pre_rotation: + kind: identity +distance: adc +calibration_rows: 2048 # held-out, NOT training rows +seed: 42 +notes: | + Reproduces PR #220 D5 full-size validation. Expected: mean + reconstruction ICC ≈ 0.195 across Qwen3-TTS-0.6B argmax tensors. + If > 0.2 or < 0.17, the pipeline is broken. +``` + +### `configs/codec/02_pr219_overfit_reproducer.yaml` — negative control + +Reproduces PR #219's trained-and-tested-on-same-128-rows artifact +(ICC 0.9998). Sweep-report must flag this as "training-set fit, +not generalising" via a split-test gate that refuses to report ICC +unless `calibration_rows != measurement_rows`. + +```yaml +name: pr219_overfit_reproducer +codec: cam_pq +subspaces: 6 +centroids: 256 +residual_depth: 0 +lane_width: F32x16 +pre_rotation: + kind: identity +distance: adc +calibration_rows: 128 +measurement_rows: 128 # SAME rows → split-test must FAIL +overfit_probe: true # flag for the pipeline +seed: 42 +notes: | + Reproduces PR #219 D1 128-row benchmark. Expected: ICC ≈ 0.9998 + on the 128-row fit (meaningless) + split-test FAILS (since + calibration_rows == measurement_rows). Demonstrates the pipeline + refuses to report ICC on overlapping training/measurement sets. +``` + +### `configs/codec/10_fix_a_wider_codebook.yaml` — #220 fix (a) + +1024 centroids per subspace (10-bit palette index, 7.5 B/row). +Tier-1 AMX helps: bigger distance-table build benefits most from +`tile_dpbf16ps`. + +```yaml +name: fix_a_wider_codebook_1024 +codec: cam_pq +subspaces: 6 +centroids: 1024 # was 256 +residual_depth: 0 +lane_width: F32x16 +pre_rotation: + kind: identity +distance: adc +calibration_rows: 2048 +seed: 42 +notes: | + PR #220 (a): wider codebook. Expected: reconstruction ICC rises + meaningfully vs 01_pr220_baseline; token-agreement the open + question this sweep exists to answer. 
+``` + +### `configs/codec/11_fix_b_residual_pq.yaml` — #220 fix (b) + +Residual PQ with one refinement pass. First-pass decode uses +256 centroids; residual encoded with a second 256-centroid pass. + +```yaml +name: fix_b_residual_pq_depth1 +codec: cam_pq +subspaces: 6 +centroids: 256 +residual_depth: 1 # second pass over the residual +residual_centroids: 256 +lane_width: F32x16 +pre_rotation: + kind: identity +distance: adc +calibration_rows: 2048 +seed: 42 +notes: | + PR #220 (b): residual PQ. JIT composes two decode kernels per + Rule A (array_windows stage 1 → subtract → stage 2 → add). + Total bytes: 2 × 6 × (log2 256)/8 = 12 B/row — compact. +``` + +### `configs/codec/12_fix_c_hadamard_rotation.yaml` — #220 fix (c) + +Hadamard pre-rotation decorrelates subspaces before PQ. The +rotation is add/sub butterfly — stays on Tier-3 F32x16 (AVX-512 +already fast enough; AMX adds no value for pure add/sub). + +```yaml +name: fix_c_hadamard_pre_rotation +codec: cam_pq +subspaces: 6 +centroids: 256 +residual_depth: 0 +lane_width: F32x16 +pre_rotation: + kind: hadamard + dim: 4096 # must be 2^k for Sylvester construction +distance: adc +calibration_rows: 2048 +seed: 42 +notes: | + PR #220 (c): Hadamard pre-rotation. Rotation is F32x16 butterfly + (Tier 3); decode remains centroids=256. Tests whether + decorrelating subspaces closes the 0.195 → 0.99 gap alone. +``` + +### `configs/codec/13_fix_d_opq_rotation.yaml` — #220 fix (d) + +OPQ learned rotation. Matrix trained offline, stored as a Lance +blob referenced by `matrix_blob_id`. Applied as matmul → Tier-1 +AMX is the dominant speedup path (~44 μs vs ~400 μs F32x16 per +cycle on Sapphire Rapids per `simd_amx.rs`).
+ +```yaml +name: fix_d_opq_rotation +codec: cam_pq +subspaces: 6 +centroids: 256 +residual_depth: 0 +lane_width: BF16x32 # bf16 is the natural AMX tile format +pre_rotation: + kind: opq + matrix_blob_id: 0xDEADBEEF # trained in a separate PR; blob points to weights + dim: 4096 +distance: adc +calibration_rows: 2048 +seed: 42 +notes: | + PR #220 (d): OPQ learned rotation. Matrix baked offline; + applied as tile_dpbf16ps matmul when amx_available(). This is + the config where the AMX polyfill pays off most. +``` + +### `configs/codec/20_composite_a_plus_b.yaml` — combined fixes + +Wider codebook + residual PQ together. If either (a) or (b) alone +fails the token-agreement gate but their composition passes, the +sweep has found the combinatorial lift the #220 author list +implicitly hopes for. + +```yaml +name: composite_wider_plus_residual +codec: cam_pq +subspaces: 6 +centroids: 1024 # from (a) +residual_depth: 1 # from (b) +residual_centroids: 1024 # wider residual too +lane_width: F32x16 +pre_rotation: + kind: identity +distance: adc +calibration_rows: 2048 +seed: 42 +notes: | + Composition test. If this passes and (a), (b) individually fail, + the fix is synergistic — report prominently in D4 frontier. +``` + +### `configs/codec/30_cross_product_sweep.yaml` — the actual grid + +The sweep driver (D3.1) consumes a single `SweepGrid` YAML that +enumerates the cross product explicitly. Phase 0 commits this +file so the first sweep has a known input. 
+ +```yaml +name: phase1_initial_cross_product +tensor_path: models/qwen3-tts-0.6b/q_proj.safetensors +grid: + subspaces: [6] + centroids: [256, 512, 1024] + residual_depths: [0, 1, 2] + rotations: + - { kind: identity } + - { kind: hadamard, dim: 4096 } + - { kind: opq, matrix_blob_id: 0xDEADBEEF, dim: 4096 } + distances: [adc] + lane_widths: [F32x16, BF16x32] # F32x16 for standard, BF16x32 for AMX path +measure: + - reconstruction_error_held_out + - reconstruction_icc_held_out + - token_agreement_top1 + - token_agreement_top5 + - per_layer_mse +log_to_lance: logs/sweep_phase1.lance +notes: | + Phase 1 initial grid: 1 × 3 × 3 × 3 × 1 × 2 = 54 candidates. + Expected JIT compile time: 54 × ~15 ms = ~800 ms total (one-time). + Expected token-agreement runtime: 54 × N_prompts × T_decode. +``` + +**Operating principle for this appendix:** adding a new codec +candidate is authoring a YAML file in this directory. Changing +parameters is editing the YAML. The Rust code in +`cognitive-shader-driver` reads the YAML once at ingress (Rule +F); everything after is in-memory `CodecParams` objects, JIT +kernel cache hits, SoA column sweeps, and SIMD lane ops — none +of which touch serialisation again until the sweep logger +appends the result row to Lance (the one allowed egress). From 33d896436fb2db7fdc183c363ea3235d302ffd77 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:35:03 +0000 Subject: [PATCH 8/9] plan(codec-sweep): audit vs CODING_PRACTICES.md (EmbedAnything patterns) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three gaps found in the 8-item checklist; remediations folded into Phase 0 as new deliverables so they ship from day one, not as follow-up: Gap 1 — auto-detect, don't hardcode. Current plan expects caller to supply lane_width + tensor shape. 
Patch: D0.5 new auto_detect.rs (~140 LOC) reads config.json next to the safetensors and returns ModelFingerprint { architecture, hidden_size, lane_width default, tokenizer_class, … }. Consumed by WireTokenAgreement when tensor_view.lane_width is omitted. Mirrors EmbedAnything auto_detect.rs (6 tests). Gap 3 — builder, not raw struct assembly. Current plan shows CodecParams assembled directly. Patch: D0.6 CodecParamsBuilder fluent API in lance-graph-contract::cam. Used by sweep driver / tests / frontier analysis; YAML ingress still produces CodecParams via serde. Mirrors EmbedAnything builder.rs (7 tests). Gap 5 — u8 vs i8 distance tables. Current plan treats "adc" as one distance variant. Patch: split distance into adc_u8 / adc_i8 at the YAML + Rust enum level. Sign-handling affects bipolar cancellation per codec-findings-2026-04-20.md §I1 sign-flip. Five remain clean: Item 2 (sink pattern) — ShaderSink trait + Lance append are sinks. Item 4 (feature gates) — --features lab / serve / grpc declared. Item 6 (per-role scales) — one role per YAML preserves z-scale. Item 7 (calibration↔runtime boundary) — calibration_rows vs measurement_rows already split; 02_pr219_overfit_reproducer is the explicit test that enforces the boundary. Item 8 (no forward pass) — codebook/tile lookup only, per I6. All 5 anti-patterns dodged: lib.rs stays declarations-only; hot path is zero-copy &[u8] into SoA + Arc'd KernelHandle (no clones); Rust-first API; codebook/tile lookup (no matmul inner loop); precision ladder BF16 calibration → u8/i8 runtime → f32 accumulator (enforced by Rule E's LaneWidth on the Wire DTO matching the JIT kernel input format). New D0.7 — precision-ladder contract. CodecParams validation refuses impossible shapes at ingress (e.g., { lane_width: F32x16, rotation: Opq(…) } — OPQ must use BF16x32 to match tile_dpbf16ps). Validation fires before any JIT compile. Phase 0 LOC bumps: ~480 → ~700. Still one upfront rebuild.
https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- .claude/plans/codec-sweep-via-lab-infra-v1.md | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/.claude/plans/codec-sweep-via-lab-infra-v1.md b/.claude/plans/codec-sweep-via-lab-infra-v1.md index c0896e84..5ea3537e 100644 --- a/.claude/plans/codec-sweep-via-lab-infra-v1.md +++ b/.claude/plans/codec-sweep-via-lab-infra-v1.md @@ -866,6 +866,65 @@ a clean verification step. --- +## Audit against `.claude/CODING_PRACTICES.md` (EmbedAnything patterns) + +Gaps found; remediations folded into the phase deliverables so +Phase 0 ships them from the start — not as follow-up. + +### Checklist results + +| # | Checklist item | Status | Remediation | +|---|---|---|---| +| 1 | Auto-detect model type, not hardcode names | **GAP** | `WireTokenAgreement` loader reads the model's `config.json` and infers (architecture, hidden_size, lane_width default, tokenizer) automatically. Caller does not supply `lane_width` for the tensor_view if `config.json` resolves it. Pattern: `auto_detect.rs` (6 tests). | +| 2 | commit() uses sink pattern | **OK** | `ShaderSink` trait already carries per-cycle emission; Lance append is a sink. | +| 3 | Builder, not raw struct assembly | **GAP** | Add `CodecParamsBuilder` with fluent API in `lance-graph-contract::cam`. Pattern: `builder.rs` (7 tests). YAML serde still produces the raw `CodecParams`; the builder is the **programmatic** entry point (sweep driver, tests). | +| 4 | Heavy deps behind feature gates | **OK** | `--features lab` / `serve` / `grpc` / `with-planner` already declared in Phase 0 D0.x. | +| 5 | Works with BOTH u8 and i8 tables | **GAP** | Split `distance: adc` into `distance: adc_u8` / `distance: adc_i8` at the YAML level. Sign-handling affects bipolar cancellation (per `codec-findings-2026-04-20.md` §I1 sign-flip). JIT is generic over the accumulator primitive. 
| +| 6 | Per-role scale factors preserved | **OK** (structurally) | Each YAML names a single `tensor_path` (one role per run: Q / K / V / Gate / Up / Down). Per-role z-scale survives by virtue of one-role-per-config. D2 token-agreement loader iterates roles separately, never mixing z-scales. | +| 7 | Calibration ↔ runtime boundary clean | **OK** | `calibration_rows` vs `measurement_rows` already separated; `02_pr219_overfit_reproducer` is the explicit test that the pipeline rejects `calibration_rows == measurement_rows`. | +| 8 | No forward passes at runtime | **OK** | The whole design is `tile_dpbusd` / `tile_dpbf16ps` lookup, no matmul inner loop (Invariant I6 — weights are seeds). | + +### Anti-patterns (none triggered) + +| # | Anti-pattern | Plan's position | +|---|---|---| +| 1 | 48KB lib.rs | `cognitive-shader-driver::lib.rs` stays module declarations only. D0 additions land in `wire.rs`, `serve.rs`, `codec_research.rs`, `token_agreement.rs`. | +| 2 | Clone-heavy structs | Hot path: `&[u8]` zero-copy into SoA columns; `WireTensorView` decoded once into aligned buffer; `KernelHandle` is `Arc`-wrapped so clones are refcount bumps (Rule F forbids re-serialise). | +| 3 | Python-first API | Rust-first. PyO3 via `lance-graph-python` is out-of-scope for this plan. | +| 4 | Forward pass at every query | Codebook / tile lookup only. | +| 5 | f32 everywhere | **Precision ladder** enforced: BF16 for calibration input to `tile_dpbf16ps` (OPQ); u8/i8 for palette-index runtime via `tile_dpbusd`; f32 only as the accumulator reduction width before producing a distance scalar. Enforced by `LaneWidth` on the Wire DTO (Rule E) matching the JIT kernel's input format. | + +### Added deliverables (fold into Phase 0 so they ship from day one) + +- **D0.5 — `auto_detect` module.** New file + `crates/cognitive-shader-driver/src/auto_detect.rs` (~140 LOC). 
+ Reads `config.json` next to the safetensors; returns + `ModelFingerprint { architecture, hidden_size, n_layers, + tokenizer_class, vocab_size, default_lane_width, default_distance }`. + 6 tests mirroring EmbedAnything's pattern. Consumed by the + `WireTokenAgreement` handler when `tensor_view.lane_width` is + omitted on ingress. +- **D0.6 — `CodecParamsBuilder`.** Add to + `lance-graph-contract/src/cam.rs`. Fluent API: + `CodecParamsBuilder::new().subspaces(6).centroids(1024) + .residual(ResidualSpec::depth(1)).rotation(Rotation::hadamard(4096)) + .build() -> Result<CodecParams, CodecParamsError>`. 7 tests mirroring + EmbedAnything's `builder.rs` (Lens/TableType/Pooling/Sinks → + Codec/Rotation/Residual/Distance). The sweep driver, tests, + and frontier analysis all use the builder; YAML ingress still + produces `CodecParams` via serde. +- **D0.7 — precision-ladder contract.** `CodecParams` validation + refuses `{ lane_width: F32x16, rotation: Opq(…) }` — OPQ must + use `BF16x32` to match `tile_dpbf16ps`. Validation error lands + at ingress (Rule F); impossible shapes are rejected before any + JIT compile. + +Phase 0 total bumps: ~480 + 140 (auto_detect) + 60 (builder) + +20 (precision validation) = **~700 LOC**. Still one upfront +rebuild. + +--- + ## Appendix A — Starter YAML configs (one per #220 fix + controls) These are the concrete inputs Phase 0 consumes once the Wire From f581839742f5cbe35fe700c0afdfefe65198df8e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 17:40:03 +0000 Subject: [PATCH 9/9] D0.6 + D0.7: CodecParams + CodecParamsBuilder + precision-ladder validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First Phase 0 code deliverable from codec-sweep-via-lab-infra-v1 plan. Zero-dep contract-side types the lab API (cognitive-shader-driver) will carry into JIT compilation.
Adds to crates/lance-graph-contract/src/cam.rs (~290 LOC): Enums (Rule E — Wire surface IS the SIMD surface, object-oriented): LaneWidth { F32x16, U8x64, F64x8, BF16x32 } — mirrors ndarray::simd::* Distance { AdcU8, AdcI8 } — CODING_PRACTICES gap 5 (sign-handling / bipolar cancellation) Rotation { Identity, Hadamard{dim}, Opq{blob,dim} } Structs: ResidualSpec { depth, centroids } CodecParams { subspaces, centroids, residual, lane_width, pre_rotation, distance, calibration_rows, measurement_rows, seed } Builder (CODING_PRACTICES gap 3 — fluent API, not raw-struct): CodecParamsBuilder::new() .subspaces(u32).centroids(u32).residual(ResidualSpec) .lane_width(LaneWidth).rotation(Rotation).distance(Distance) .calibration_rows(u32).measurement_rows(u32).seed(u64) .build() -> Result<CodecParams, CodecParamsError> Validation fires BEFORE any JIT compile (D0.7 precision ladder): - ZeroDimension — subspaces == 0 or centroids == 0 - OpqRequiresBf16 — OPQ routes through tile_dpbf16ps; only LaneWidth::BF16x32 is valid - HadamardDimNotPow2 — Sylvester construction needs dim = 2^k - CalibrationEqualsMeasurement — overfit guard: refuses to emit ICC when calibration_rows == measurement_rows (reproduces PR #219's 128-row trained-and-tested artifact) Methods on CodecParams: kernel_signature() -> u64 — JIT cache key (Rule E); excludes seed so calibration-sample changes don't invalidate cached kernels is_matmul_heavy() -> bool — true for OPQ or centroids > 512; drives Tier-1 AMX dispatch decision (Rule C polyfill hierarchy) Rotation::is_matmul() -> bool — Identity and Hadamard are false (butterfly stays on Tier-3 F32x16); only Opq returns true 14 new tests covering: - builder default matches PR #220 baseline shape - each validation variant fires correctly - OPQ + BF16x32 accepted; OPQ + F32x16 rejected with typed error - Hadamard + non-pow2 dim rejected - overfit guard fires on calibration == measurement - kernel_signature stable across identical builds - kernel_signature excludes seed (cache stays hot) -
kernel_signature changes with centroids / rotation kind - is_matmul_heavy detects OPQ AND wide codebook (≥512 centroids) Zero-dep preserved (stdlib only: std::collections::hash_map:: DefaultHasher for kernel_signature, core::fmt + core::error for error types). No serde in the contract — YAML/JSON deserialisation belongs to the consumer crate, which will produce CodecParams via serde at the REST handler (Rule F — serialisation at edge only). Tests: 147/147 contract suite passing (133 prior + 14 new). https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh --- crates/lance-graph-contract/src/cam.rs | 383 +++++++++++++++++++++++++ 1 file changed, 383 insertions(+) diff --git a/crates/lance-graph-contract/src/cam.rs b/crates/lance-graph-contract/src/cam.rs index 5c9316e4..63db11e4 100644 --- a/crates/lance-graph-contract/src/cam.rs +++ b/crates/lance-graph-contract/src/cam.rs @@ -205,6 +205,389 @@ pub trait IvfContract: Send + Sync { fn probe(&self, query: &[f32], num_probes: usize) -> Vec<(u32, f32)>; } +// ───────────────────────────────────────────────────────────────────── +// Codec sweep parameters (plan: .claude/plans/codec-sweep-via-lab-infra-v1.md) +// +// CodecParams is the sweep-tunable shape the lab API passes to the JIT +// compiler. Consumers (cognitive-shader-driver) serde this from JSON / +// YAML at ingress; everything after ingress is in-memory Rust objects +// (Rule F — serialisation at the edge only). +// +// Zero-dep: no serde derives here. YAML/JSON shape lives in the consumer. +// ───────────────────────────────────────────────────────────────────── + +/// SIMD lane width the codec kernel will run on. Mirrors `ndarray::simd::*` +/// lane types; lab Wire DTOs carry this enum verbatim so the JIT compiles +/// against the width the REST handler decoded for (Rule E — +/// Wire surface IS the SIMD surface). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum LaneWidth { + /// AVX-512 f32 lane — default codec decode / ADC accumulator. 
+ F32x16, + /// AVX-512 u8 lane — palette index reads (`tile_dpbusd` input). + U8x64, + /// AVX-512 f64 lane — high-precision calibration. + F64x8, + /// AVX-512 bf16 lane — required for OPQ rotation (`tile_dpbf16ps`). + BF16x32, +} + +impl Default for LaneWidth { + fn default() -> Self { Self::F32x16 } +} + +/// Distance metric variant. Per CODING_PRACTICES gap 5: split u8/i8 +/// because sign-handling affects bipolar cancellation (codec-findings- +/// 2026-04-20.md §I1 sign-flip). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Distance { + /// Asymmetric distance computation, unsigned palette indices. + AdcU8, + /// Asymmetric distance, signed palette indices (bipolar cancellation). + AdcI8, +} + +impl Default for Distance { + fn default() -> Self { Self::AdcU8 } +} + +/// Pre-rotation applied before PQ encoding. Each variant maps to a +/// specific SIMD tier (Rule C — polyfill hierarchy): +/// +/// - `Identity` — no-op. +/// - `Hadamard { dim }` — Sylvester butterfly; stays on Tier-3 F32x16. +/// - `Opq { matrix_blob_id, dim }` — learned rotation matmul; Tier-1 +/// AMX (`tile_dpbf16ps`) when `ndarray::simd_amx::amx_available()`. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Rotation { + Identity, + Hadamard { dim: u32 }, + Opq { matrix_blob_id: u64, dim: u32 }, +} + +impl Default for Rotation { + fn default() -> Self { Self::Identity } +} + +impl Rotation { + /// True when the rotation is a matmul (OPQ) and therefore + /// benefits from Tier-1 AMX dispatch. Hadamard is add/sub + /// butterfly — no matmul, no AMX speedup. + pub fn is_matmul(&self) -> bool { + matches!(self, Self::Opq { .. }) + } +} + +/// Residual PQ refinement pass. `depth = 0` disables residual; +/// `depth > 0` encodes residuals after first-pass decode through +/// another PQ stage (Rule A — composition via JIT; Rule B — stages +/// themselves are `ndarray::simd::*`). 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct ResidualSpec { + pub depth: u8, + pub centroids: u32, +} + +impl Default for ResidualSpec { + fn default() -> Self { Self { depth: 0, centroids: NUM_CENTROIDS as u32 } } +} + +impl ResidualSpec { + pub fn none() -> Self { Self { depth: 0, centroids: 0 } } + pub fn depth(d: u8, centroids: u32) -> Self { Self { depth: d, centroids } } +} + +/// Full codec parameter shape consumed by the JIT compiler. +/// +/// One `CodecParams` per candidate. The `kernel_signature()` method +/// returns a stable hash keyed over the IR-shaping fields; the +/// JIT kernel cache keys on this hash. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct CodecParams { + pub subspaces: u32, + pub centroids: u32, + pub residual: ResidualSpec, + pub lane_width: LaneWidth, + pub pre_rotation: Rotation, + pub distance: Distance, + pub calibration_rows: u32, + pub measurement_rows: u32, + pub seed: u64, +} + +/// Errors returned by `CodecParamsBuilder::build()` when validation fails. +/// Precision-ladder rejection fires before any JIT compile (D0.7). +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CodecParamsError { + /// `subspaces = 0` or `centroids = 0` — sweep would divide by zero. + ZeroDimension { field: &'static str }, + /// OPQ requires BF16x32 lane to match `tile_dpbf16ps` tile format + /// (Rule C Tier 1; D0.7 precision ladder). + OpqRequiresBf16 { got: LaneWidth }, + /// Hadamard dim must be a power of two (Sylvester construction). + HadamardDimNotPow2 { dim: u32 }, + /// Overfit guard: pipeline refuses to emit ICC when + /// `calibration_rows == measurement_rows` (the PR #219 artifact). 
+ CalibrationEqualsMeasurement { rows: u32 }, +} + +impl core::fmt::Display for CodecParamsError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::ZeroDimension { field } => write!(f, "codec param `{}` must be non-zero", field), + Self::OpqRequiresBf16 { got } => write!(f, "OPQ rotation requires LaneWidth::BF16x32 (tile_dpbf16ps), got {:?}", got), + Self::HadamardDimNotPow2 { dim } => write!(f, "Hadamard dim must be a power of two (Sylvester), got {}", dim), + Self::CalibrationEqualsMeasurement { rows } => write!( + f, + "calibration_rows ({}) must differ from measurement_rows \ + (would silently reproduce PR #219 overfit)", + rows + ), + } + } +} + +impl core::error::Error for CodecParamsError {} + +impl CodecParams { + /// Stable hash over the IR-shaping fields. JIT kernel cache key. + /// + /// Adding an unrelated field (e.g. seed) does NOT invalidate + /// existing kernel entries — seed is excluded because it does + /// not shape the emitted IR (only the calibration sample). + pub fn kernel_signature(&self) -> u64 { + use core::hash::{Hash, Hasher}; + let mut h = std::collections::hash_map::DefaultHasher::new(); + self.subspaces.hash(&mut h); + self.centroids.hash(&mut h); + self.residual.hash(&mut h); + self.lane_width.hash(&mut h); + self.pre_rotation.hash(&mut h); + self.distance.hash(&mut h); + // calibration_rows / measurement_rows / seed intentionally excluded. + h.finish() + } + + /// True when the kernel will benefit from Tier-1 AMX dispatch + /// (matmul-heavy: OPQ pre-rotation, or wide codebook > 512). + pub fn is_matmul_heavy(&self) -> bool { + self.pre_rotation.is_matmul() || self.centroids > 512 + } +} + +/// Fluent builder for `CodecParams`. CODING_PRACTICES gap 3 remediation. +/// +/// Programmatic entry point used by sweep driver, tests, and frontier +/// analysis. YAML ingress produces `CodecParams` via serde (in the +/// consumer crate, not here) and does NOT need the builder. 
+#[derive(Debug, Clone)] +pub struct CodecParamsBuilder { + subspaces: u32, + centroids: u32, + residual: ResidualSpec, + lane_width: LaneWidth, + pre_rotation: Rotation, + distance: Distance, + calibration_rows: u32, + measurement_rows: u32, + seed: u64, +} + +impl Default for CodecParamsBuilder { + fn default() -> Self { Self::new() } +} + +impl CodecParamsBuilder { + pub fn new() -> Self { + Self { + subspaces: NUM_SUBSPACES as u32, + centroids: NUM_CENTROIDS as u32, + residual: ResidualSpec::default(), + lane_width: LaneWidth::default(), + pre_rotation: Rotation::default(), + distance: Distance::default(), + calibration_rows: 2048, + measurement_rows: 0, // 0 means "use held-out complement" + seed: 42, + } + } + pub fn subspaces(mut self, n: u32) -> Self { self.subspaces = n; self } + pub fn centroids(mut self, n: u32) -> Self { self.centroids = n; self } + pub fn residual(mut self, spec: ResidualSpec) -> Self { self.residual = spec; self } + pub fn lane_width(mut self, lw: LaneWidth) -> Self { self.lane_width = lw; self } + pub fn rotation(mut self, r: Rotation) -> Self { self.pre_rotation = r; self } + pub fn distance(mut self, d: Distance) -> Self { self.distance = d; self } + pub fn calibration_rows(mut self, n: u32) -> Self { self.calibration_rows = n; self } + pub fn measurement_rows(mut self, n: u32) -> Self { self.measurement_rows = n; self } + pub fn seed(mut self, s: u64) -> Self { self.seed = s; self } + + /// Build with precision-ladder validation (D0.7). + pub fn build(self) -> Result<CodecParams, CodecParamsError> { + if self.subspaces == 0 { return Err(CodecParamsError::ZeroDimension { field: "subspaces" }); } + if self.centroids == 0 { return Err(CodecParamsError::ZeroDimension { field: "centroids" }); } + // Precision ladder: OPQ routes through tile_dpbf16ps → BF16x32 only. + if matches!(self.pre_rotation, Rotation::Opq { ..
}) && self.lane_width != LaneWidth::BF16x32 {
+            return Err(CodecParamsError::OpqRequiresBf16 { got: self.lane_width });
+        }
+        // Hadamard Sylvester construction needs dim = 2^k.
+        if let Rotation::Hadamard { dim } = &self.pre_rotation {
+            if *dim == 0 || !dim.is_power_of_two() {
+                return Err(CodecParamsError::HadamardDimNotPow2 { dim: *dim });
+            }
+        }
+        // Overfit guard: reject calibration_rows == measurement_rows (PR #219 pattern).
+        if self.measurement_rows != 0 && self.calibration_rows == self.measurement_rows {
+            return Err(CodecParamsError::CalibrationEqualsMeasurement { rows: self.calibration_rows });
+        }
+        Ok(CodecParams {
+            subspaces: self.subspaces,
+            centroids: self.centroids,
+            residual: self.residual,
+            lane_width: self.lane_width,
+            pre_rotation: self.pre_rotation,
+            distance: self.distance,
+            calibration_rows: self.calibration_rows,
+            measurement_rows: self.measurement_rows,
+            seed: self.seed,
+        })
+    }
+}
+
+/// Unit tests for the validation performed by `CodecParamsBuilder::build`
+/// and for the `kernel_signature` / `is_matmul_heavy` helpers on the built
+/// `CodecParams`.
+#[cfg(test)]
+mod codec_params_tests {
+    use super::*;
+
+    /// A builder left at its defaults yields the PR #220 baseline shape:
+    /// 6 subspaces, 256 centroids, residual depth 0, identity rotation,
+    /// `AdcU8` distance and `F32x16` lanes.
+    #[test]
+    fn builder_default_matches_pr220_baseline_shape() {
+        let built = CodecParamsBuilder::new().build();
+        let baseline = built.unwrap();
+        assert_eq!(baseline.subspaces, 6);
+        assert_eq!(baseline.centroids, 256);
+        assert_eq!(baseline.residual.depth, 0);
+        assert_eq!(baseline.pre_rotation, Rotation::Identity);
+        assert_eq!(baseline.distance, Distance::AdcU8);
+        assert_eq!(baseline.lane_width, LaneWidth::F32x16);
+    }
+
+    /// `subspaces == 0` is refused with `ZeroDimension` naming the field.
+    #[test]
+    fn builder_zero_subspaces_rejected() {
+        let outcome = CodecParamsBuilder::new().subspaces(0).build();
+        let rejection = outcome.unwrap_err();
+        assert!(matches!(rejection, CodecParamsError::ZeroDimension { field: "subspaces" }));
+    }
+
+    /// `centroids == 0` is refused with `ZeroDimension` naming the field.
+    #[test]
+    fn builder_zero_centroids_rejected() {
+        let outcome = CodecParamsBuilder::new().centroids(0).build();
+        let rejection = outcome.unwrap_err();
+        assert!(matches!(rejection, CodecParamsError::ZeroDimension { field: "centroids" }));
+    }
+
+    /// An OPQ rotation combined with `F32x16` lanes is refused, and the error
+    /// reports the offending lane width.
+    #[test]
+    fn opq_with_f32x16_rejected_precision_ladder() {
+        // OPQ routes through tile_dpbf16ps, so BF16x32 is the only lane it may use.
+        let outcome = CodecParamsBuilder::new()
+            .lane_width(LaneWidth::F32x16)
+            .rotation(Rotation::Opq { matrix_blob_id: 0xDEAD, dim: 4096 })
+            .build();
+        let rejection = outcome.unwrap_err();
+        assert!(matches!(rejection, CodecParamsError::OpqRequiresBf16 { got: LaneWidth::F32x16 }));
+    }
+
+    /// The same OPQ rotation on `BF16x32` lanes builds, and the result is
+    /// classified as matmul-heavy.
+    #[test]
+    fn opq_with_bf16x32_accepted() {
+        let outcome = CodecParamsBuilder::new()
+            .lane_width(LaneWidth::BF16x32)
+            .rotation(Rotation::Opq { matrix_blob_id: 0xDEAD, dim: 4096 })
+            .build();
+        let params = outcome.unwrap();
+        assert!(params.is_matmul_heavy());
+    }
+
+    /// A Hadamard dimension that is not a power of two (3000) is refused, and
+    /// the error carries the rejected dimension.
+    #[test]
+    fn hadamard_non_pow2_rejected() {
+        let outcome = CodecParamsBuilder::new()
+            .rotation(Rotation::Hadamard { dim: 3000 })
+            .build();
+        let rejection = outcome.unwrap_err();
+        assert!(matches!(rejection, CodecParamsError::HadamardDimNotPow2 { dim: 3000 }));
+    }
+
+    /// A power-of-two Hadamard dimension (4096) builds, and the rotation does
+    /// not report itself as a matmul.
+    #[test]
+    fn hadamard_pow2_accepted_stays_on_tier3() {
+        let outcome = CodecParamsBuilder::new()
+            .rotation(Rotation::Hadamard { dim: 4096 })
+            .build();
+        let params = outcome.unwrap();
+        // Hadamard is an add/sub butterfly: no matmul, hence no AMX benefit.
+        let rotation_is_matmul = params.pre_rotation.is_matmul();
+        assert!(!rotation_is_matmul);
+    }
+
+    /// Equal, non-zero calibration and measurement row counts are refused,
+    /// and the error carries that row count.
+    #[test]
+    fn overfit_guard_rejects_calibration_equal_measurement() {
+        // PR #219 pattern: the codec was trained and tested on the same rows.
+        // The pipeline must refuse to emit ICC for such a configuration.
+        let outcome = CodecParamsBuilder::new()
+            .calibration_rows(128)
+            .measurement_rows(128)
+            .build();
+        let rejection = outcome.unwrap_err();
+        assert!(matches!(rejection, CodecParamsError::CalibrationEqualsMeasurement { rows: 128 }));
+    }
+
+    /// Differing calibration and measurement row counts pass the overfit guard.
+    #[test]
+    fn overfit_guard_allows_distinct_row_sets() {
+        let outcome = CodecParamsBuilder::new()
+            .calibration_rows(2048)
+            .measurement_rows(512)
+            .build();
+        let params = outcome.unwrap();
+        assert_ne!(params.calibration_rows, params.measurement_rows);
+    }
+
+    /// Two identically configured builds produce the same kernel signature.
+    #[test]
+    fn kernel_signature_stable_within_process() {
+        let first = CodecParamsBuilder::new().centroids(1024).build().unwrap();
+        let second = CodecParamsBuilder::new().centroids(1024).build().unwrap();
+        let first_sig = first.kernel_signature();
+        let second_sig = second.kernel_signature();
+        assert_eq!(first_sig, second_sig);
+    }
+
+    /// Builds that differ only in `seed` share a kernel signature.
+    #[test]
+    fn kernel_signature_excludes_seed() {
+        // The seed alters the calibration sample but NOT the emitted IR, so
+        // the kernel cache must still hit.
+        let seeded_one = CodecParamsBuilder::new().seed(1).build().unwrap();
+        let seeded_two = CodecParamsBuilder::new().seed(2).build().unwrap();
+        let sig_one = seeded_one.kernel_signature();
+        let sig_two = seeded_two.kernel_signature();
+        assert_eq!(sig_one, sig_two);
+    }
+
+    /// Changing the centroid count (256 vs 1024) changes the kernel signature.
+    #[test]
+    fn kernel_signature_changes_with_centroids() {
+        let small = CodecParamsBuilder::new().centroids(256).build().unwrap();
+        let large = CodecParamsBuilder::new().centroids(1024).build().unwrap();
+        let small_sig = small.kernel_signature();
+        let large_sig = large.kernel_signature();
+        assert_ne!(small_sig, large_sig);
+    }
+
+    /// Switching the rotation from `Identity` to `Hadamard` changes the
+    /// kernel signature.
+    #[test]
+    fn kernel_signature_changes_with_rotation_kind() {
+        let identity = CodecParamsBuilder::new().rotation(Rotation::Identity).build().unwrap();
+        let hadamard = CodecParamsBuilder::new().rotation(Rotation::Hadamard { dim: 4096 }).build().unwrap();
+        let identity_sig = identity.kernel_signature();
+        let hadamard_sig = hadamard.kernel_signature();
+        assert_ne!(identity_sig, hadamard_sig);
+    }
+
+    /// `is_matmul_heavy` is true for an OPQ rotation and for a 1024-centroid
+    /// codebook, and false for the 256-centroid identity configuration.
+    #[test]
+    fn matmul_heavy_detects_opq_and_wide_codebook() {
+        let with_opq = CodecParamsBuilder::new()
+            .lane_width(LaneWidth::BF16x32)
+            .rotation(Rotation::Opq { matrix_blob_id: 1, dim: 4096 })
+            .build()
+            .unwrap();
+        assert!(with_opq.is_matmul_heavy(), "OPQ is matmul-heavy");
+
+        let wide_codebook = CodecParamsBuilder::new().centroids(1024).build().unwrap();
+        assert!(wide_codebook.is_matmul_heavy(), "centroids=1024 is matmul-heavy");
+
+        let narrow_codebook = CodecParamsBuilder::new().centroids(256).build().unwrap();
+        assert!(!narrow_codebook.is_matmul_heavy(), "narrow codebook + identity is not matmul-heavy");
+    }
+}
+
 #[cfg(test)]
 mod route_tests {
     use super::*;