diff --git a/benchmarks/e2e_ppl_validation.py b/benchmarks/e2e_ppl_validation.py
new file mode 100644
index 00000000..df0ced0f
--- /dev/null
+++ b/benchmarks/e2e_ppl_validation.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+"""End-to-end downstream-quality validation of the v1.3 codec on WikiText-103.
+
+Experimental design
+-------------------
+For each WikiText-103 passage of length >= ctx_len + n_eval:
+  1. Prefill ctx_len tokens into a reference DynamicCache (bf16).
+  2. Clone the reference cache and round-trip every full-attention layer
+     through the v1.3 Rust codec (encode -> decode, real reconstructed KV).
+  3. On both caches, compute next-token logits for the n_eval evaluation
+     tokens (teacher-forced).
+  4. Compare logit distributions: mean/max KL, top-1 agreement, PPL ratio.
+
+Both caches start identical; only step 2 perturbs the alt cache. The
+only source of divergence in step 3 is therefore the v1.3 KV
+reconstruction error. This is the clean end-to-end signal we want.
+
+The Rust bench binary MUST support --dump-decoded (added in commit
+introducing this harness).
+"""
+from __future__ import annotations
+
+import argparse
+import copy
+import json
+import os
+import struct
+import subprocess
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
+
+REPO = Path(__file__).resolve().parent.parent
+BENCH_BIN = REPO / "kakeyaturbo" / "target" / "release" / "kakeyaturbo-bench"
+KKTV_MAGIC = 0x4B4B5456
+
+
+# -----------------------------------------------------------------------------
+# WikiText-103 loader + concatenator
+# -----------------------------------------------------------------------------
+
+def load_wikitext_passages(tokenizer, min_tokens: int, n_passages: int,
+                           split: str = "test") -> list[str]:
+    """Concatenate consecutive non-empty WikiText rows into passages of
+    length >= min_tokens, return the first n_passages."""
+    from datasets import load_dataset
+    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
+
+    passages = []
+    current = []
+    current_tok_count = 0
+    for row in ds:
+        text = row["text"]
+        if not text.strip():
+            continue
+        current.append(text)
+        # Approximate token count via word count * 1.3 to avoid re-tokenising
+        current_tok_count += int(len(text.split()) * 1.3)
+        if current_tok_count >= min_tokens:
+            passage = "".join(current)
+            real_len = tokenizer(passage, return_tensors="pt")["input_ids"].shape[-1]
+            if real_len >= min_tokens:
+                passages.append(passage)
+                if len(passages) >= n_passages:
+                    return passages
+            current = []
+            current_tok_count = 0
+    return passages
+
+
+# -----------------------------------------------------------------------------
+# KKTV I/O
+# -----------------------------------------------------------------------------
+
+def write_kktv(path: Path, arr: np.ndarray) -> None:
+    assert arr.dtype == np.float32 and arr.ndim == 2, arr.dtype
+    n, d = arr.shape
+    with path.open("wb") as f:
+        f.write(struct.pack("<I", KKTV_MAGIC))
+        f.write(struct.pack("<I", 1))
+        f.write(struct.pack("<Q", n))
+        f.write(struct.pack("<I", d))
+        f.write(struct.pack("<I", 0))
+        f.write(arr.tobytes(order="C"))
+
+
+def read_kktv_f32(path: Path) -> np.ndarray:
+    with path.open("rb") as f:
+        magic = struct.unpack("<I", f.read(4))[0]
+        assert magic == KKTV_MAGIC, f"{path}: bad magic"
+        _version = struct.unpack("<I", f.read(4))[0]
+        n = struct.unpack("<Q", f.read(8))[0]
+        d = struct.unpack("<I", f.read(4))[0]
+        _pad = struct.unpack("<I", f.read(4))[0]
+        raw = f.read(n * d * 4)
+    return np.frombuffer(raw, dtype=np.float32).reshape(n, d).copy()
+
+
+# -----------------------------------------------------------------------------
+# v1.3 Rust codec round-trip
+# -----------------------------------------------------------------------------
+
+def rust_roundtrip(
+    arr: np.ndarray, block_size: int, bit_width: int,
+    rsvd_target_rank: int, metric: str, share_basis: bool,
+    pca_method: str = "randomized",
+    variance_ratio: float = 0.95, k_means_k: int = 16,
+    rsvd_oversample: int = 8, rsvd_power_iters: int = 2,
+) -> tuple[np.ndarray, dict]:
+    """Full v1.3 round-trip. Always applies spherical k-means + WHT + Lloyd-Max."""
+    """Encode `arr` (N,D) through v1.3 codec and return decoded + report."""
+    import tempfile
+    with tempfile.TemporaryDirectory(dir="/tmp") as td:
+        tdp = Path(td)
+        in_path = tdp / "x.kktv"
+        rep_path = tdp / "report.json"
+        dec_path = tdp / "decoded.kktv"
+        write_kktv(in_path, arr.astype(np.float32, copy=False))
+        cmd = [
+            str(BENCH_BIN),
+            "--input", str(in_path),
+            "--output", str(rep_path),
+            "--metric", metric,
+            "--block-size", str(block_size),
+            "--variance-ratio", str(variance_ratio),
+            "--k", str(k_means_k),
+            "--bit-width", str(bit_width),
+            "--rotation-seed", "3405691582",
+            "--pca-method", pca_method,
+            "--verify",
+            "--dump-decoded", str(dec_path),
+        ]
+        if pca_method == "randomized":
+            cmd += [
+                "--rsvd-target-rank", str(rsvd_target_rank),
+                "--rsvd-oversample", str(rsvd_oversample),
+                "--rsvd-power-iters", str(rsvd_power_iters),
+            ]
+        if share_basis:
+            cmd.append("--share-basis")
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        if res.returncode != 0:
+            raise RuntimeError(f"rust bench failed: {res.stderr}")
+        report = json.loads(rep_path.read_text())
+        decoded = read_kktv_f32(dec_path)
+        return decoded, report
+
+
+# -----------------------------------------------------------------------------
+# Cache manipulation
+# -----------------------------------------------------------------------------
+
+@torch.inference_mode()
+def prefill_cache(model, input_ids: torch.Tensor,
+                  prefill_chunk: int = 0) -> DynamicCache:
+    cache = DynamicCache(config=model.config)
+    if prefill_chunk <= 0 or input_ids.shape[-1] <= prefill_chunk:
+        _ = model(input_ids=input_ids, past_key_values=cache, use_cache=True)
+    else:
+        for s in range(0, input_ids.shape[-1], prefill_chunk):
+            e = min(s + prefill_chunk, input_ids.shape[-1])
+            _ = model(input_ids=input_ids[:, s:e], past_key_values=cache, use_cache=True)
+    return cache
+
+
+def roundtrip_cache(
+    model, cache_ref: DynamicCache, block_size: int, bit_width: int,
+    rsvd_target_rank_factor: float = 0.5, pca_method: str = "randomized",
+    variance_ratio: float = 0.95,
+) -> tuple[DynamicCache, dict]:
+    """Build an alt cache whose full-attention-layer K,V have been
+    round-tripped through the v1.3 codec. Sliding-window layers are
+    copied unchanged."""
+    cfg = model.config.get_text_config(decoder=True)
+    layer_types = getattr(cfg, "layer_types", None)
+    if layer_types is None:
+        sw = getattr(cfg, "sliding_window", None) or getattr(cfg, "attention_chunk_size", None)
+        layer_types = ["sliding_attention" if sw else "full_attention"] * cfg.num_hidden_layers
+
+    cache_alt = DynamicCache(config=model.config)
+    stats = {"per_layer": [], "n_full": 0}
+
+    for i, layer_kv in enumerate(cache_ref.layers):
+        if not hasattr(layer_kv, "keys") or layer_kv.keys is None \
+                or layer_kv.keys.numel() == 0:
+            continue
+        k_ref = layer_kv.keys    # [bsz, n_kv, seq, hd]
+        v_ref = layer_kv.values
+
+        if layer_types[i] != "full_attention":
+            cache_alt.layers[i].update(k_ref.clone(), v_ref.clone(), 0)
+            continue
+
+        bsz, n_kv, seq, hd = k_ref.shape
+        k_flat = k_ref.to(torch.float32).cpu().numpy().reshape(-1, hd)
+        v_flat = v_ref.to(torch.float32).cpu().numpy().reshape(-1, hd)
+
+        # Only compressible-aligned prefix goes through the codec; tail
+        # stays exact.
+        n_total = k_flat.shape[0]
+        n_full_blocks = n_total // block_size
+        n_compressible = n_full_blocks * block_size
+
+        target_rank = max(2, int(hd * rsvd_target_rank_factor))
+
+        if n_compressible > 0:
+            k_dec, k_rep = rust_roundtrip(
+                k_flat[:n_compressible], block_size=block_size,
+                bit_width=bit_width, rsvd_target_rank=target_rank,
+                metric="inner_product", share_basis=False,
+                pca_method=pca_method, variance_ratio=variance_ratio,
+            )
+            v_dec, v_rep = rust_roundtrip(
+                v_flat[:n_compressible], block_size=block_size,
+                bit_width=bit_width, rsvd_target_rank=target_rank,
+                metric="mse", share_basis=True,
+                pca_method=pca_method, variance_ratio=variance_ratio,
+            )
+        else:
+            k_dec, v_dec = k_flat[:0], v_flat[:0]
+            k_rep = v_rep = {"mean_block_mse": 0.0, "compressed_bytes": 0}
+
+        k_full_decoded = np.concatenate(
+            [k_dec, k_flat[n_compressible:]], axis=0) if n_compressible < n_total else k_dec
+        v_full_decoded = np.concatenate(
+            [v_dec, v_flat[n_compressible:]], axis=0) if n_compressible < n_total else v_dec
+
+        k_restore = torch.from_numpy(k_full_decoded.copy()) \
+            .reshape(bsz, n_kv, seq, hd).to(k_ref.dtype).to(k_ref.device)
+        v_restore = torch.from_numpy(v_full_decoded.copy()) \
+            .reshape(bsz, n_kv, seq, hd).to(v_ref.dtype).to(v_ref.device)
+
+        cache_alt.layers[i].update(k_restore, v_restore, 0)
+
+        stats["per_layer"].append({
+            "layer": i, "hd": hd, "seq": seq,
+            "k_mse": float(k_rep["mean_block_mse"]),
+            "v_mse": float(v_rep["mean_block_mse"]),
+            "k_bytes": int(k_rep["compressed_bytes"]),
+            "v_bytes": int(v_rep["compressed_bytes"]),
+            "n_compressible_vecs": int(n_compressible),
+            "n_tail_vecs": int(n_total - n_compressible),
+        })
+        stats["n_full"] += 1
+
+    return cache_alt, stats
+
+
+@torch.inference_mode()
+def logits_with_prefilled_cache(model, cache: DynamicCache,
+                                cont_ids: torch.Tensor) -> torch.Tensor:
+    """Teacher-force cont_ids through model using pre-filled cache,
+    return logits over cont_ids positions."""
+    out = model(input_ids=cont_ids, past_key_values=cache, use_cache=True)
+    return out.logits  # [1, len(cont_ids), V]
+
+
+# -----------------------------------------------------------------------------
+# Metrics
+# -----------------------------------------------------------------------------
+
+def compare_logits(
+    logits_ref: torch.Tensor, logits_alt: torch.Tensor,
+    cont_ids: torch.Tensor
+) -> dict:
+    """Compare next-token distributions position-by-position."""
+    assert logits_ref.shape == logits_alt.shape
+    # Position t predicts cont_ids[t+1] (teacher-forced), so shift by 1
+    sl_ref = logits_ref[..., :-1, :].float()
+    sl_alt = logits_alt[..., :-1, :].float()
+    labels = cont_ids[..., 1:]
+
+    log_p_ref = F.log_softmax(sl_ref, dim=-1)
+    p_ref = log_p_ref.exp()
+    log_p_alt = F.log_softmax(sl_alt, dim=-1)
+    kl_per_tok = (p_ref * (log_p_ref - log_p_alt)).sum(dim=-1)  # [1, T-1]
+    mean_kl = float(kl_per_tok.mean().item())
+    max_kl = float(kl_per_tok.max().item())
+
+    top1_ref = sl_ref.argmax(dim=-1)
+    top1_alt = sl_alt.argmax(dim=-1)
+    agree = float((top1_ref == top1_alt).float().mean().item())
+
+    nll_ref = F.cross_entropy(
+        sl_ref.reshape(-1, sl_ref.size(-1)), labels.reshape(-1), reduction="mean")
+    nll_alt = F.cross_entropy(
+        sl_alt.reshape(-1, sl_alt.size(-1)), labels.reshape(-1), reduction="mean")
+    ppl_ref = float(torch.exp(nll_ref).item())
+    ppl_alt = float(torch.exp(nll_alt).item())
+
+    return {
+        "mean_kl": mean_kl,
+        "max_kl": max_kl,
+        "top1_agreement": agree,
+        "ppl_ref": ppl_ref,
+        "ppl_alt": ppl_alt,
+        "ppl_delta_rel": (ppl_alt - ppl_ref) / max(ppl_ref, 1e-8),
+        "nll_ref": float(nll_ref.item()),
+        "nll_alt": float(nll_alt.item()),
+        "n_tokens": int(labels.numel()),
+    }
+
+
+# -----------------------------------------------------------------------------
+# Per-passage evaluation
+# -----------------------------------------------------------------------------
+
+def evaluate_passage(
+    model, tokenizer, passage: str, ctx_len: int, n_eval: int,
+    block_size: int, bit_width: int, prefill_chunk: int,
+    pca_method: str = "randomized", variance_ratio: float = 0.95,
+) -> dict | None:
+    ids = tokenizer(passage, return_tensors="pt")["input_ids"]
+    if ids.shape[-1] < ctx_len + n_eval:
+        return None
+    prefix_ids = ids[:, :ctx_len]
+    cont_ids = ids[:, ctx_len : ctx_len + n_eval]
+
+    t0 = time.perf_counter()
+    cache_ref = prefill_cache(model, prefix_ids, prefill_chunk)
+    t_prefill = time.perf_counter() - t0
+
+    t0 = time.perf_counter()
+    cache_alt, stats = roundtrip_cache(model, cache_ref, block_size, bit_width,
+                                       pca_method=pca_method,
+                                       variance_ratio=variance_ratio)
+    t_roundtrip = time.perf_counter() - t0
+
+    # We need two SEPARATE cache instances so the teacher-forced forward
+    # pass mutates each independently. cache_ref has already been used --
+    # but DynamicCache is append-only, so re-forwarding cont_ids just
+    # appends; that's fine for our comparison. To be safe, deep-copy both
+    # before the forward.
+    cache_ref_fwd = copy.deepcopy(cache_ref)
+    cache_alt_fwd = copy.deepcopy(cache_alt)
+
+    logits_ref = logits_with_prefilled_cache(model, cache_ref_fwd, cont_ids)
+    logits_alt = logits_with_prefilled_cache(model, cache_alt_fwd, cont_ids)
+
+    metrics = compare_logits(logits_ref, logits_alt, cont_ids)
+
+    return {
+        "ctx_len": ctx_len,
+        "n_eval": n_eval,
+        "prefill_sec": t_prefill,
+        "roundtrip_sec": t_roundtrip,
+        "compression_stats": stats,
+        "metrics": metrics,
+    }
+
+
+# -----------------------------------------------------------------------------
+# Main
+# -----------------------------------------------------------------------------
+
+def verdict_of(mean_delta_rel: float, mean_top1: float) -> str:
+    """ACCEPT: |delta ppl| <= 1% AND top1 agreement >= 95%
+       MARGINAL: |delta ppl| <= 3% AND top1 agreement >= 85%
+       REJECT: otherwise
+    Standard LLM-compression PPL thresholds.
+    """
+    if abs(mean_delta_rel) <= 0.01 and mean_top1 >= 0.95:
+        return "ACCEPT"
+    if abs(mean_delta_rel) <= 0.03 and mean_top1 >= 0.85:
+        return "MARGINAL"
+    return "REJECT"
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--model-name", required=True)
+    ap.add_argument("--ctx-len", type=int, default=2048)
+    ap.add_argument("--n-eval", type=int, default=64,
+                    help="Number of teacher-forced evaluation tokens")
+    ap.add_argument("--block-size", type=int, default=512)
+    ap.add_argument("--bit-width", type=int, default=2)
+    ap.add_argument("--prefill-chunk", type=int, default=0)
+    ap.add_argument("--n-passages", type=int, default=4)
+    ap.add_argument("--out-dir", type=Path, required=True)
+    ap.add_argument("--pca-method", choices=["exact", "randomized"], default="randomized")
+    ap.add_argument("--variance-ratio", type=float, default=0.95)
+    args = ap.parse_args()
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"[{args.model_name}] loading model…", flush=True)
+    tok = AutoTokenizer.from_pretrained(args.model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path, dtype=torch.bfloat16, attn_implementation="eager"
+    )
+    model.eval()
+
+    print(f"[{args.model_name}] loading WikiText-103 passages "
+          f"(min_tokens={args.ctx_len + args.n_eval})…", flush=True)
+    passages = load_wikitext_passages(
+        tok, min_tokens=args.ctx_len + args.n_eval,
+        n_passages=args.n_passages,
+    )
+    print(f"  got {len(passages)} passages")
+
+    per_passage = []
+    for i, passage in enumerate(passages):
+        print(f"  passage {i + 1}/{len(passages)} "
+              f"(ctx={args.ctx_len}, n_eval={args.n_eval})…", flush=True)
+        res = evaluate_passage(
+            model, tok, passage, args.ctx_len, args.n_eval,
+            args.block_size, args.bit_width, args.prefill_chunk,
+            pca_method=args.pca_method, variance_ratio=args.variance_ratio,
+        )
+        if res is None:
+            print("    skipped (too short after tokenisation)")
+            continue
+        per_passage.append(res)
+        m = res["metrics"]
+        print(
+            f"    ppl_ref={m['ppl_ref']:.3f} ppl_alt={m['ppl_alt']:.3f} "
+            f"Δppl={m['ppl_delta_rel']*100:+.2f}% "
+            f"KL={m['mean_kl']:.4f} top1={m['top1_agreement']*100:.1f}%",
+            flush=True,
+        )
+
+    # Aggregate
+    summary = {
+        "model_name": args.model_name,
+        "ctx_len": args.ctx_len,
+        "n_eval": args.n_eval,
+        "block_size": args.block_size,
+        "bit_width": args.bit_width,
+        "pca_method": args.pca_method,
+        "variance_ratio": args.variance_ratio,
+        "n_passages": len(per_passage),
+    }
+    if per_passage:
+        mean_delta = float(np.mean([r["metrics"]["ppl_delta_rel"] for r in per_passage]))
+        mean_kl = float(np.mean([r["metrics"]["mean_kl"] for r in per_passage]))
+        mean_top1 = float(np.mean([r["metrics"]["top1_agreement"] for r in per_passage]))
+        summary.update({
+            "mean_ppl_delta_rel": mean_delta,
+            "mean_kl": mean_kl,
+            "mean_top1_agreement": mean_top1,
+            "verdict": verdict_of(mean_delta, mean_top1),
+        })
+        print(f"\n[{args.model_name}] ===== SUMMARY =====")
+        print(f"  n_passages   = {len(per_passage)}")
+        print(f"  Δppl (mean)  = {mean_delta*100:+.3f}%")
+        print(f"  KL (mean)    = {mean_kl:.5f}")
+        print(f"  top1 agree   = {mean_top1*100:.2f}%")
+        print(f"  VERDICT      = {summary['verdict']}")
+    else:
+        summary["verdict"] = "NO_DATA"
+        print(f"\n[{args.model_name}] no usable passages")
+
+    summary["per_passage"] = per_passage
+    (args.out_dir / f"{args.model_name}.json").write_text(
+        json.dumps(summary, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/e2e_ppl_validation_vllm.py b/benchmarks/e2e_ppl_validation_vllm.py
new file mode 100644
index 00000000..a494f657
--- /dev/null
+++ b/benchmarks/e2e_ppl_validation_vllm.py
@@ -0,0 +1,637 @@
+#!/usr/bin/env python3
+"""End-to-end downstream-quality validation of the v1.3 codec, run on vLLM.
+
+This mirrors the HF-transformers harness (`e2e_ppl_validation.py`) but
+routes the forward pass through vLLM, so the PPL numbers reflect the
+codec's behaviour under the production inference engine rather than the
+HF eager kernel.
+
+Design
+------
+1. Build a single vLLM `LLM` instance (PagedAttention, bf16).
+2. Monkey-patch `vllm.attention.layer.Attention.forward` to, when a
+   global switch is ON, round-trip the per-layer `key` and `value`
+   tensors through the v1.3 Rust codec before they reach the attention
+   kernel. Layers whose config lists them as `sliding_attention` are
+   skipped (same convention as the HF harness).
+3. For each WikiText-103 passage that tokenises to >= ctx_len + n_eval
+   tokens, call `LLM.generate` with `prompt_logprobs=1` on the truncated
+   passage `[0 : ctx_len + n_eval]` once with the codec OFF
+   (reference) and once ON (alt). PPL over the `[ctx_len :
+   ctx_len+n_eval)` positions and top-1 agreement of the one-best
+   candidate are compared.
+4. The standard PPL verdict thresholds are applied.
+
+The Rust bench binary MUST support `--dump-decoded` (v1.3 Rust change
+that landed together with this harness).
+
+Usage
+-----
+    python benchmarks/e2e_ppl_validation_vllm.py \\
+        --model-path Qwen/Qwen2.5-0.5B \\
+        --model-name qwen2_5_0_5b \\
+        --ctx-len 1024 --n-eval 64 --n-passages 2 \\
+        --out-dir reports/v1_3_rsvd_rope/e2e_ppl_vllm_smoke
+
+It is a drop-in alternative to `e2e_ppl_validation.py` that uses the
+same codec under the same parameters, so the two numbers can be
+compared directly (HF vs vLLM engine).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+REPO = Path(__file__).resolve().parent.parent
+BENCH_BIN = REPO / "kakeyaturbo" / "target" / "release" / "kakeyaturbo-bench"
+KKTV_MAGIC = 0x4B4B5456
+
+
+# =============================================================================
+# KKTV I/O (matches the Rust bench binary)
+# =============================================================================
+
+def write_kktv(path: Path, arr: np.ndarray) -> None:
+    assert arr.dtype == np.float32 and arr.ndim == 2, (arr.dtype, arr.shape)
+    n, d = arr.shape
+    with path.open("wb") as f:
+        f.write(struct.pack("<I", KKTV_MAGIC))
+        f.write(struct.pack("<I", 1))
+        f.write(struct.pack("<Q", n))
+        f.write(struct.pack("<I", d))
+        f.write(struct.pack("<I", 0))
+        f.write(arr.tobytes(order="C"))
+
+
+def read_kktv_f32(path: Path) -> np.ndarray:
+    with path.open("rb") as f:
+        magic = struct.unpack("<I", f.read(4))[0]
+        assert magic == KKTV_MAGIC, f"{path}: bad magic {magic:#x}"
+        _version = struct.unpack("<I", f.read(4))[0]
+        n = struct.unpack("<Q", f.read(8))[0]
+        d = struct.unpack("<I", f.read(4))[0]
+        _pad = struct.unpack("<I", f.read(4))[0]
+        raw = f.read(n * d * 4)
+    return np.frombuffer(raw, dtype=np.float32).reshape(n, d).copy()
+
+
+# =============================================================================
+# v1.3 Rust codec round-trip
+# =============================================================================
+
+def rust_roundtrip(
+    arr: np.ndarray,
+    *,
+    block_size: int,
+    bit_width: int,
+    rsvd_target_rank: int,
+    metric: str,
+    share_basis: bool,
+    pca_method: str = "randomized",
+    variance_ratio: float = 0.95,
+    k_means_k: int = 16,
+    rsvd_oversample: int = 8,
+    rsvd_power_iters: int = 2,
+) -> tuple[np.ndarray, dict]:
+    """Encode `arr` (N, D) through the v1.3 codec, return (decoded, report)."""
+    if not BENCH_BIN.exists():
+        raise FileNotFoundError(
+            f"{BENCH_BIN} missing; run "
+            "`cargo build --release --bin kakeyaturbo-bench` in kakeyaturbo/"
+        )
+
+    with tempfile.TemporaryDirectory(dir="/tmp") as td:
+        tdp = Path(td)
+        in_path = tdp / "x.kktv"
+        rep_path = tdp / "report.json"
+        dec_path = tdp / "decoded.kktv"
+        write_kktv(in_path, arr.astype(np.float32, copy=False))
+        cmd = [
+            str(BENCH_BIN),
+            "--input", str(in_path),
+            "--output", str(rep_path),
+            "--metric", metric,
+            "--block-size", str(block_size),
+            "--variance-ratio", str(variance_ratio),
+            "--k", str(k_means_k),
+            "--bit-width", str(bit_width),
+            "--rotation-seed", "3405691582",
+            "--pca-method", pca_method,
+            "--verify",
+            "--dump-decoded", str(dec_path),
+        ]
+        if pca_method == "randomized":
+            cmd += [
+                "--rsvd-target-rank", str(rsvd_target_rank),
+                "--rsvd-oversample", str(rsvd_oversample),
+                "--rsvd-power-iters", str(rsvd_power_iters),
+            ]
+        if share_basis:
+            cmd.append("--share-basis")
+
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        if res.returncode != 0:
+            raise RuntimeError(
+                f"kakeyaturbo-bench failed (rc={res.returncode}): {res.stderr}"
+            )
+        report = json.loads(rep_path.read_text())
+        decoded = read_kktv_f32(dec_path)
+        return decoded, report
+
+
+# =============================================================================
+# Codec wrapper for a live KV tensor flowing through vLLM attention
+# =============================================================================
+
+class CodecState:
+    """Mutable global state for the monkey-patched Attention.forward.
+
+    Keeping this at module scope (rather than threading it through vLLM's
+    engine plumbing) is the simplest way to toggle the codec per-request
+    inside a single LLM instance without restarting the CUDA graph /
+    engine. We never run two codec configurations concurrently, so the
+    global is safe here.
+    """
+
+    active: bool = False
+    block_size: int = 512
+    bit_width: int = 2
+    variance_ratio: float = 0.95
+    pca_method: str = "randomized"
+    rsvd_target_rank_factor: float = 0.5
+    # Layer routing: full-attention layer indices. If None, every layer
+    # that the patched forward sees is treated as compressible.
+    full_attention_layers: set[int] | None = None
+    # Per-layer stats accumulator (reset per measurement run).
+    stats: list[dict] = []
+    # Counter assigned to each Attention instance on first use, used
+    # only to distinguish layers when a model doesn't expose a
+    # stable layer_name attribute.
+    layer_counter: int = 0
+
+
+def _roundtrip_tensor(
+    t: torch.Tensor,
+    metric: str,
+    share_basis: bool,
+    layer_id: Any,
+    kind: str,
+    head_size: int,
+) -> torch.Tensor:
+    """Codec round-trip a vLLM `key` or `value` tensor.
+
+    vLLM 0.7.3 passes `key` / `value` into `Attention.forward` either as
+    2D `[num_tokens, num_kv_heads * head_size]` (some model definitions)
+    or 3D `[num_tokens, num_kv_heads, head_size]` (use_output path after
+    `.view`). In both cases the per-head dimension is `head_size`, which
+    the attention module exposes as `self.head_size`.
+    """
+    orig_shape = t.shape
+    orig_dtype = t.dtype
+    orig_device = t.device
+
+    if t.dim() == 2:
+        total = t.shape[1]
+        if total % head_size != 0:
+            raise ValueError(
+                f"KV tensor dim {total} not divisible by head_size {head_size}"
+            )
+        x = t.reshape(-1, head_size)
+    elif t.dim() == 3:
+        x = t.reshape(-1, t.shape[-1])
+    else:
+        raise ValueError(f"unexpected KV tensor shape {tuple(orig_shape)}")
+
+    arr = x.detach().to(torch.float32).cpu().numpy()
+    n_total, hd = arr.shape
+    n_full_blocks = n_total // CodecState.block_size
+    n_compressible = n_full_blocks * CodecState.block_size
+
+    if n_compressible == 0:
+        return t  # not enough vectors to fill one block; leave untouched
+
+    target_rank = max(2, int(hd * CodecState.rsvd_target_rank_factor))
+
+    dec, rep = rust_roundtrip(
+        arr[:n_compressible],
+        block_size=CodecState.block_size,
+        bit_width=CodecState.bit_width,
+        rsvd_target_rank=target_rank,
+        metric=metric,
+        share_basis=share_basis,
+        pca_method=CodecState.pca_method,
+        variance_ratio=CodecState.variance_ratio,
+    )
+    if n_compressible < n_total:
+        dec = np.concatenate([dec, arr[n_compressible:]], axis=0)
+
+    CodecState.stats.append({
+        "layer_id": layer_id,
+        "kind": kind,
+        "hd": hd,
+        "n_vecs": n_total,
+        "n_compressible": n_compressible,
+        "mean_block_mse": float(rep.get("mean_block_mse", -1.0)),
+        "compressed_bytes": int(rep.get("compressed_bytes", 0)),
+    })
+
+    restored = (
+        torch.from_numpy(dec.astype(np.float32))
+        .to(orig_device)
+        .to(orig_dtype)
+        .reshape(orig_shape)
+    )
+    return restored
+
+
+# =============================================================================
+# vLLM attention monkey-patch
+# =============================================================================
+
+def install_vllm_codec_patch() -> None:
+    """Rebind `vllm.attention.layer.Attention.forward` so it round-trips
+    K/V when `CodecState.active` is True.
+    """
+    from vllm.attention.layer import Attention  # type: ignore
+
+    if getattr(Attention, "_kakeyaturbo_patched", False):
+        return
+
+    orig_forward = Attention.forward
+
+    def patched_forward(
+        self: Attention,  # type: ignore[name-defined]
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: Any,
+    ) -> torch.Tensor:
+        if CodecState.active:
+            layer_id = getattr(
+                self, "layer_name",
+                getattr(self, "_kakeyaturbo_layer_id", None),
+            )
+            if layer_id is None:
+                layer_id = CodecState.layer_counter
+                CodecState.layer_counter += 1
+                try:
+                    object.__setattr__(self, "_kakeyaturbo_layer_id", layer_id)
+                except Exception:
+                    pass
+
+            is_full = (
+                CodecState.full_attention_layers is None
+                or layer_id in CodecState.full_attention_layers
+            )
+            if is_full and key is not None and value is not None:
+                head_size = getattr(self, "head_size", None)
+                if head_size is None:
+                    print(f"[codec-patch] layer {layer_id}: no head_size, "
+                          "skipping round-trip", file=sys.stderr)
+                else:
+                    try:
+                        key = _roundtrip_tensor(
+                            key, metric="inner_product",
+                            share_basis=False, layer_id=layer_id, kind="K",
+                            head_size=head_size,
+                        )
+                        value = _roundtrip_tensor(
+                            value, metric="mse",
+                            share_basis=True, layer_id=layer_id, kind="V",
+                            head_size=head_size,
+                        )
+                    except Exception as e:
+                        print(f"[codec-patch] layer {layer_id} round-trip "
+                              f"failed: {e}", file=sys.stderr)
+
+        return orig_forward(self, query, key, value, kv_cache, attn_metadata)
+
+    Attention.forward = patched_forward
+    Attention._kakeyaturbo_patched = True  # type: ignore[attr-defined]
+    print("[codec-patch] vllm.attention.layer.Attention.forward wrapped",
+          flush=True)
+
+
+# =============================================================================
+# WikiText-103 passage loader
+# =============================================================================
+
+def load_wikitext_passages(
+    tokenizer: Any, min_tokens: int, n_passages: int, split: str = "test",
+) -> list[str]:
+    from datasets import load_dataset
+    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
+
+    passages: list[str] = []
+    current: list[str] = []
+    approx = 0
+    for row in ds:
+        text = row["text"]
+        if not text.strip():
+            continue
+        current.append(text)
+        approx += int(len(text.split()) * 1.3)
+        if approx >= min_tokens:
+            passage = "".join(current)
+            real_len = len(tokenizer.encode(passage))
+            if real_len >= min_tokens:
+                passages.append(passage)
+                if len(passages) >= n_passages:
+                    return passages
+            current = []
+            approx = 0
+    return passages
+
+
+# =============================================================================
+# vLLM driver
+# =============================================================================
+
+def build_llm(model_path: str, max_model_len: int, gpu_mem_util: float):
+    from vllm import LLM  # type: ignore
+    return LLM(
+        model=model_path,
+        dtype="bfloat16",
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_mem_util,
+        enforce_eager=True,
+        # A single long sequence; no KV cache paging pressure at our
+        # context sizes.
+        trust_remote_code=True,
+    )
+
+
+def prompt_logprobs_for_ids(
+    llm: Any, prompt_token_ids: list[int]
+) -> list[dict]:
+    """Run vLLM with prompt_logprobs=1 on the given token ids, return the
+    per-position log-probability dict (one entry per prompt token, the
+    first entry is always None because position 0 has no predecessor).
+    """
+    from vllm import SamplingParams  # type: ignore
+
+    sp = SamplingParams(
+        max_tokens=1,
+        temperature=0.0,
+        prompt_logprobs=1,
+    )
+    outs = llm.generate(
+        prompts=None,
+        prompt_token_ids=[prompt_token_ids],
+        sampling_params=sp,
+        use_tqdm=False,
+    )
+    return outs[0].prompt_logprobs
+
+
+def ppl_and_top1_from_prompt_logprobs(
+    pls: list[dict], prompt_ids: list[int], start: int, end: int,
+) -> tuple[float, list[float], list[int]]:
+    """Slice the prompt_logprobs list at positions `[start, end)` and
+    compute PPL of the ground-truth next-token chain there, plus the
+    argmax top-1 candidates and the per-position log-prob of the true
+    token.
+
+    `pls[t]` is the dict for the token *at position t* — vLLM reports
+    the conditional logprob of `prompt_ids[t]` given `prompt_ids[<t]`,
+    and the logprobs of the top-K alternative token ids at that
+    position. `pls[0]` is always None.
+    """
+    logps_true: list[float] = []
+    top1_ids: list[int] = []
+    for t in range(start, end):
+        entry = pls[t]
+        if entry is None:
+            continue
+        tok = prompt_ids[t]
+        if tok not in entry:
+            # Rare: the prompt-true token isn't in the reported top-K
+            # bucket. Fall back to `logprob` on whichever entry matches
+            # if available; otherwise treat as -inf for correctness.
+            logps_true.append(float("-inf"))
+        else:
+            lp = entry[tok]
+            logps_true.append(
+                float(lp.logprob if hasattr(lp, "logprob") else lp["logprob"])
+            )
+        # Top-1 candidate id (by highest logprob).
+        def _lp(v: Any) -> float:
+            return float(v.logprob if hasattr(v, "logprob") else v["logprob"])
+        top1 = max(entry.items(), key=lambda kv: _lp(kv[1]))[0]
+        top1_ids.append(int(top1))
+
+    valid = [lp for lp in logps_true if np.isfinite(lp)]
+    mean_nll = -float(np.mean(valid)) if valid else float("inf")
+    ppl = float(np.exp(mean_nll)) if np.isfinite(mean_nll) else float("inf")
+    return ppl, logps_true, top1_ids
+
+
+def compare_two_runs(
+    ref_pls: list[dict], alt_pls: list[dict],
+    prompt_ids: list[int], ctx_len: int, n_eval: int,
+) -> dict:
+    end = min(ctx_len + n_eval, len(prompt_ids))
+    ppl_ref, lp_ref, top_ref = ppl_and_top1_from_prompt_logprobs(
+        ref_pls, prompt_ids, ctx_len, end)
+    ppl_alt, lp_alt, top_alt = ppl_and_top1_from_prompt_logprobs(
+        alt_pls, prompt_ids, ctx_len, end)
+
+    n = min(len(top_ref), len(top_alt))
+    if n == 0:
+        return {
+            "ppl_ref": ppl_ref, "ppl_alt": ppl_alt,
+            "ppl_delta_rel": float("nan"),
+            "top1_agreement": float("nan"),
+            "mean_kl_upper": float("nan"),
+            "n_tokens": 0,
+        }
+
+    agree = float(np.mean(
+        [1.0 if top_ref[i] == top_alt[i] else 0.0 for i in range(n)]
+    ))
+
+    # With only top-1 logprob per position we can't compute true KL over
+    # the full vocab. We report the mean |Δ logprob on the true token|
+    # as an approximate divergence proxy.
+    deltas = [
+        abs(lp_ref[i] - lp_alt[i])
+        for i in range(min(len(lp_ref), len(lp_alt)))
+        if np.isfinite(lp_ref[i]) and np.isfinite(lp_alt[i])
+    ]
+    mean_abs_dlogp = float(np.mean(deltas)) if deltas else float("nan")
+
+    return {
+        "ppl_ref": ppl_ref,
+        "ppl_alt": ppl_alt,
+        "ppl_delta_rel": (ppl_alt - ppl_ref) / max(ppl_ref, 1e-8),
+        "top1_agreement": agree,
+        "mean_abs_dlogp_true": mean_abs_dlogp,
+        "n_tokens": n,
+    }
+
+
+def verdict_of(mean_delta_rel: float, mean_top1: float) -> str:
+    if (abs(mean_delta_rel) <= 0.01) and (mean_top1 >= 0.95):
+        return "ACCEPT"
+    if (abs(mean_delta_rel) <= 0.03) and (mean_top1 >= 0.85):
+        return "MARGINAL"
+    return "REJECT"
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--model-name", required=True)
+    ap.add_argument("--ctx-len", type=int, default=1024)
+    ap.add_argument("--n-eval", type=int, default=64)
+    ap.add_argument("--block-size", type=int, default=512)
+    ap.add_argument("--bit-width", type=int, default=2)
+    ap.add_argument("--variance-ratio", type=float, default=0.95)
+    ap.add_argument("--pca-method", choices=["exact", "randomized"],
+                    default="randomized")
+    ap.add_argument("--rsvd-target-rank-factor", type=float, default=0.5)
+    ap.add_argument("--n-passages", type=int, default=2)
+    ap.add_argument("--gpu-mem-util", type=float, default=0.80)
+    ap.add_argument("--out-dir", type=Path, required=True)
+    args = ap.parse_args()
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    # Propagate codec config into the global state the patched forward
+    # reads from.
+    CodecState.block_size = args.block_size
+    CodecState.bit_width = args.bit_width
+    CodecState.variance_ratio = args.variance_ratio
+    CodecState.pca_method = args.pca_method
+    CodecState.rsvd_target_rank_factor = args.rsvd_target_rank_factor
+
+    # Install the patch BEFORE constructing the LLM so the wrapped
+    # forward is what the engine binds to its model shards.
+    install_vllm_codec_patch()
+
+    print(f"[{args.model_name}] loading vLLM engine…", flush=True)
+    max_len = args.ctx_len + args.n_eval + 16
+    llm = build_llm(args.model_path, max_len, args.gpu_mem_util)
+
+    # Build a HF-equivalent tokenizer for passage selection via vLLM's
+    # own tokenizer.
+    tok = llm.get_tokenizer()
+
+    print(f"[{args.model_name}] loading WikiText-103 passages "
+          f"(min_tokens={args.ctx_len + args.n_eval})…", flush=True)
+    passages = load_wikitext_passages(
+        tok, min_tokens=args.ctx_len + args.n_eval,
+        n_passages=args.n_passages,
+    )
+    print(f"  got {len(passages)} passages", flush=True)
+
+    per_passage: list[dict] = []
+    for i, passage in enumerate(passages):
+        print(f"  passage {i+1}/{len(passages)}…", flush=True)
+        ids = tok.encode(passage)
+        ids = ids[: args.ctx_len + args.n_eval]
+        if len(ids) < args.ctx_len + args.n_eval:
+            print("    skipped (too short after tokenization)", flush=True)
+            continue
+
+        CodecState.active = False
+        CodecState.stats = []
+        t0 = time.perf_counter()
+        ref_pls = prompt_logprobs_for_ids(llm, ids)
+        t_ref = time.perf_counter() - t0
+
+        CodecState.active = True
+        CodecState.stats = []
+        CodecState.layer_counter = 0  # re-key layer ids for this run
+        t0 = time.perf_counter()
+        alt_pls = prompt_logprobs_for_ids(llm, ids)
+        t_alt = time.perf_counter() - t0
+        alt_stats = list(CodecState.stats)
+        CodecState.active = False
+
+        metrics = compare_two_runs(ref_pls, alt_pls, ids,
+                                   args.ctx_len, args.n_eval)
+        m = metrics
+        print(
+            f"    ppl_ref={m['ppl_ref']:.3f} ppl_alt={m['ppl_alt']:.3f} "
+            f"Δppl={m['ppl_delta_rel']*100:+.2f}% "
+            f"top1={m['top1_agreement']*100:.1f}% "
+            f"Δlogp={m['mean_abs_dlogp_true']:.4f} "
+            f"(ref={t_ref:.2f}s, alt={t_alt:.2f}s, "
+            f"layer_calls={len(alt_stats)})",
+            flush=True,
+        )
+
+        per_passage.append({
+            "ctx_len": args.ctx_len,
+            "n_eval": args.n_eval,
+            "t_ref_sec": t_ref,
+            "t_alt_sec": t_alt,
+            "codec_layer_calls": len(alt_stats),
+            "codec_total_compressed_bytes": int(
+                sum(s["compressed_bytes"] for s in alt_stats)
+            ),
+            "metrics": metrics,
+        })
+
+    summary: dict = {
+        "model_name": args.model_name,
+        "model_path": args.model_path,
+        "engine": "vllm",
+        "ctx_len": args.ctx_len,
+        "n_eval": args.n_eval,
+        "block_size": args.block_size,
+        "bit_width": args.bit_width,
+        "variance_ratio": args.variance_ratio,
+        "pca_method": args.pca_method,
+        "rsvd_target_rank_factor": args.rsvd_target_rank_factor,
+        "n_passages": len(per_passage),
+    }
+    if per_passage:
+        mean_delta = float(np.mean(
+            [r["metrics"]["ppl_delta_rel"] for r in per_passage
+             if np.isfinite(r["metrics"]["ppl_delta_rel"])]
+        ))
+        mean_top1 = float(np.mean(
+            [r["metrics"]["top1_agreement"] for r in per_passage
+             if np.isfinite(r["metrics"]["top1_agreement"])]
+        ))
+        summary.update({
+            "mean_ppl_delta_rel": mean_delta,
+            "mean_top1_agreement": mean_top1,
+            "verdict": verdict_of(mean_delta, mean_top1),
+        })
+        print(f"\n[{args.model_name}] ===== SUMMARY (vLLM engine) =====",
+              flush=True)
+        print(f"  n_passages   = {len(per_passage)}", flush=True)
+        print(f"  Δppl (mean)  = {mean_delta*100:+.3f}%", flush=True)
+        print(f"  top1 agree   = {mean_top1*100:.2f}%", flush=True)
+        print(f"  VERDICT      = {summary['verdict']}", flush=True)
+    else:
+        summary["verdict"] = "NO_DATA"
+
+    summary["per_passage"] = per_passage
+    out_path = args.out_dir / f"{args.model_name}_vllm.json"
+    out_path.write_text(json.dumps(summary, indent=2))
+    print(f"\nwrote {out_path}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/e2e_ppl_validation_vllm_full.py b/benchmarks/e2e_ppl_validation_vllm_full.py
new file mode 100644
index 00000000..96512414
--- /dev/null
+++ b/benchmarks/e2e_ppl_validation_vllm_full.py
@@ -0,0 +1,690 @@
+#!/usr/bin/env python3
+"""End-to-end PPL validation of the FULL 'v1.3 PPL' production recipe
+(= v1.3 RSVD + four PPL-stabilization guardrails) running on vLLM.
+
+Guardrails are applied, per `reports/SPRINT_CLOSEOUT.md`:
+
+  1. Q-preconditioning        K_tilde = K @ L   (pre-RoPE, per (layer, kv-head))
+  2. Calibrated Lloyd-Max     K residual codebook (and V at b=2)
+  3. 6-layer boundary skip    layers [0, 1, 7, 14, 26, 27] kept bf16
+  4. Outlier compensation     T = 2.0 on K residual coords (~4.5% → f16)
+
+Unlike the scaffolding harness (`e2e_ppl_validation_vllm.py`) that
+patches `vllm.attention.layer.Attention.forward` — i.e. sees K/V AFTER
+RoPE — this harness patches the model's own `Qwen2Attention.forward`
+so we can touch K BEFORE RoPE, apply whitening, round-trip through
+the codec, un-whiten, and let RoPE + attention run normally on the
+repaired K. That is the mathematically correct place to do
+Q-preconditioning (L is calibrated on pre-RoPE distributions).
+
+V is round-tripped (without whitening) at the same hook point.
+
+Usage
+-----
+
+  python benchmarks/e2e_ppl_validation_vllm_full.py \\
+      --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \\
+      --model-name ds_distill_qwen_1_5b \\
+      --q-calib reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors \\
+      --k-centroids reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32 \\
+      --v-centroids reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32 \\
+      --bit-width-k 3 --bit-width-v 2 \\
+      --outlier-threshold 2.0 \\
+      --boundary-skip-layers 0 1 7 14 26 27 \\
+      --ctx-len 2048 --n-eval 64 --n-passages 4 \\
+      --out-dir reports/v1_3_ppl/vllm/
+
+The default flags land the SPRINT_CLOSEOUT production cell:
+    K b=3, V b=2, share-basis-v, T=2.0, 6 bdry on DS-Distill.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+REPO = Path(__file__).resolve().parent.parent
+BENCH_BIN = REPO / "kakeyaturbo" / "target" / "release" / "kakeyaturbo-bench"
+KKTV_MAGIC = 0x4B4B_5456
+
+sys.path.insert(0, str(REPO / "benchmarks"))
+from q_precondition import QPrecond, load as qp_load  # noqa: E402
+
+
+# =============================================================================
+# KKTV I/O
+# =============================================================================
+
+def write_kktv(path: Path, arr: np.ndarray) -> None:
+    assert arr.dtype == np.float32 and arr.ndim == 2, (arr.dtype, arr.shape)
+    n, d = arr.shape
+    with path.open("wb") as f:
+        f.write(struct.pack("<I", KKTV_MAGIC))
+        f.write(struct.pack("<I", 1))
+        f.write(struct.pack("<Q", n))
+        f.write(struct.pack("<I", d))
+        f.write(struct.pack("<I", 0))
+        f.write(arr.tobytes(order="C"))
+
+
+def read_kktv_f32(path: Path) -> np.ndarray:
+    with path.open("rb") as f:
+        magic = struct.unpack("<I", f.read(4))[0]
+        assert magic == KKTV_MAGIC
+        _ = struct.unpack("<I", f.read(4))[0]
+        n = struct.unpack("<Q", f.read(8))[0]
+        d = struct.unpack("<I", f.read(4))[0]
+        _ = struct.unpack("<I", f.read(4))[0]
+        raw = f.read(n * d * 4)
+    return np.frombuffer(raw, dtype=np.float32).reshape(n, d).copy()
+
+
+# =============================================================================
+# Rust codec round-trip
+# =============================================================================
+
+def rust_roundtrip(
+    arr: np.ndarray,
+    *,
+    block_size: int,
+    bit_width: int,
+    rsvd_target_rank: int,
+    metric: str,
+    share_basis: bool,
+    pca_method: str = "randomized",
+    variance_ratio: float = 0.95,
+    centroids_file: str | None = None,
+    outlier_threshold: float | None = None,
+) -> tuple[np.ndarray, dict]:
+    if not BENCH_BIN.exists():
+        raise FileNotFoundError(
+            f"{BENCH_BIN} missing; build with "
+            "`cargo build --release --bin kakeyaturbo-bench` in kakeyaturbo/"
+        )
+
+    with tempfile.TemporaryDirectory(dir="/tmp") as td:
+        tdp = Path(td)
+        in_path = tdp / "x.kktv"
+        rep_path = tdp / "report.json"
+        dec_path = tdp / "decoded.kktv"
+        write_kktv(in_path, arr.astype(np.float32, copy=False))
+        cmd = [
+            str(BENCH_BIN),
+            "--input", str(in_path),
+            "--output", str(rep_path),
+            "--metric", metric,
+            "--block-size", str(block_size),
+            "--variance-ratio", str(variance_ratio),
+            "--k", "16", "--bit-width", str(bit_width),
+            "--rotation-seed", "3405691582",
+            "--pca-method", pca_method,
+            "--verify",
+            "--dump-decoded", str(dec_path),
+        ]
+        if pca_method == "randomized":
+            cmd += [
+                "--rsvd-target-rank", str(rsvd_target_rank),
+                "--rsvd-oversample", "8",
+                "--rsvd-power-iters", "2",
+            ]
+        if share_basis:
+            cmd.append("--share-basis")
+        if centroids_file is not None:
+            cmd += ["--centroids-file", str(centroids_file)]
+        if outlier_threshold is not None:
+            cmd += ["--outlier-threshold", str(outlier_threshold)]
+
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        if res.returncode != 0:
+            raise RuntimeError(
+                f"kakeyaturbo-bench failed (rc={res.returncode}): "
+                f"{res.stderr[:2000]}"
+            )
+        report = json.loads(rep_path.read_text())
+        decoded = read_kktv_f32(dec_path)
+        return decoded, report
+
+
+# =============================================================================
+# Global codec config (read by the monkey-patched Qwen2Attention.forward)
+# =============================================================================
+
+class CodecState:
+    active: bool = False
+    block_size: int = 512
+    bit_width_k: int = 3
+    bit_width_v: int = 2
+    variance_ratio: float = 0.95
+    pca_method: str = "randomized"
+    rsvd_target_rank_factor: float = 0.5
+    k_centroids_file: str | None = None
+    v_centroids_file: str | None = None
+    k_outlier_threshold: float | None = None
+    v_outlier_threshold: float | None = None
+    boundary_skip_layers: set[int] = set()
+    q_precond: QPrecond | None = None
+    share_basis_k: bool = False
+    share_basis_v: bool = True
+    # Stream selector. "kv" = compress both (production).
+    # "k"  = compress only K, V stays bf16 (diagnose K-only PPL cost).
+    # "v"  = compress only V, K stays bf16 (diagnose V-only PPL cost).
+    compress_stream: str = "kv"
+    stats: list[dict] = []
+
+
+# =============================================================================
+# Monkey-patch vLLM's Qwen2Attention.forward (pre-RoPE hook point)
+# =============================================================================
+
+def install_qwen2_pre_rope_patch() -> None:
+    """Patch vllm.model_executor.models.qwen2.Qwen2Attention.forward.
+
+    Replaces the stock forward with one that, when CodecState.active is
+    True, inserts:
+        K_tilde = whiten(K)        (per-layer, per-kv-head)
+        K_hat_tilde = codec_roundtrip(K_tilde)
+        K_hat = unwhiten(K_hat_tilde)
+        V_hat = codec_roundtrip(V)
+    immediately after the QKV projection (so BEFORE RoPE), then lets the
+    rest of the stock forward run. This is the same hook point as the
+    HF pre-RoPE harness in PR #13.
+    """
+    from vllm.model_executor.models.qwen2 import Qwen2Attention  # type: ignore
+
+    if getattr(Qwen2Attention, "_kk_full_patched", False):
+        return
+
+    orig_forward = Qwen2Attention.forward
+
+    def patched_forward(
+        self: Qwen2Attention,  # type: ignore[name-defined]
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: Any,
+    ) -> torch.Tensor:
+        if not CodecState.active:
+            return orig_forward(
+                self, positions, hidden_states, kv_cache, attn_metadata
+            )
+
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split(
+            [self.q_size, self.kv_size, self.kv_size], dim=-1
+        )
+        layer_id = _layer_id_from_attn_module(self.attn)
+        k = _apply_k_guardrails(k, v_ref=False, layer_id=layer_id, attn=self.attn)
+        v = _apply_k_guardrails(v, v_ref=True, layer_id=layer_id, attn=self.attn)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_out = self.attn(q, k, v, kv_cache, attn_metadata)
+        out, _ = self.o_proj(attn_out)
+        return out
+
+    Qwen2Attention.forward = patched_forward
+    Qwen2Attention._kk_full_patched = True  # type: ignore[attr-defined]
+    print("[codec-patch] vllm Qwen2Attention.forward wrapped "
+          "(pre-RoPE hook)", flush=True)
+
+
+def _layer_id_from_attn_module(attn_mod: Any) -> int:
+    """Map a vLLM Attention module back to an integer layer index.
+
+    vLLM assigns `self.layer_name = "model.layers.{L}.self_attn.attn"`.
+    We parse the "{L}" field.
+    """
+    name = getattr(attn_mod, "layer_name", None)
+    if name is None:
+        # Fall back to a monotonic counter the first time we see each
+        # instance.
+        if not hasattr(attn_mod, "_kk_layer_counter"):
+            CodecState.stats.append({"warn": "no-layer-name"})
+        cnt = getattr(attn_mod, "_kk_layer_counter", None)
+        if cnt is None:
+            cnt = len([s for s in CodecState.stats
+                       if s.get("_layer_counter_assignment")])
+            object.__setattr__(attn_mod, "_kk_layer_counter", cnt)
+            CodecState.stats.append({"_layer_counter_assignment": True})
+        return cnt
+    parts = name.split(".")
+    for i, p in enumerate(parts):
+        if p == "layers" and i + 1 < len(parts):
+            try:
+                return int(parts[i + 1])
+            except ValueError:
+                pass
+    return 0
+
+
+def _apply_k_guardrails(
+    t: torch.Tensor, *, v_ref: bool, layer_id: int, attn: Any,
+) -> torch.Tensor:
+    """Round-trip one K or V tensor through the v1.3 PPL pipeline.
+
+    `t` is post-QKV-projection, pre-RoPE: shape `[num_tokens,
+    kv_size]` where `kv_size = num_kv_heads * head_size`. We reshape to
+    `[num_tokens, num_kv_heads, head_size]`, round-trip, reshape back.
+    """
+    # Boundary-skip layers stay fully bf16.
+    if layer_id in CodecState.boundary_skip_layers:
+        CodecState.stats.append({
+            "layer": layer_id, "kind": "V" if v_ref else "K",
+            "skipped_boundary": True,
+        })
+        return t
+
+    orig_shape = t.shape
+    orig_dtype = t.dtype
+    orig_device = t.device
+
+    head_size = getattr(attn, "head_size", None)
+    num_kv_heads = getattr(attn, "num_kv_heads", None)
+    if head_size is None or num_kv_heads is None:
+        return t
+
+    # Stream gating: if this stream is not selected, pass through.
+    stream_on = (
+        (not v_ref and "k" in CodecState.compress_stream)
+        or (v_ref and "v" in CodecState.compress_stream)
+    )
+    if not stream_on:
+        CodecState.stats.append({
+            "layer": layer_id, "kind": "V" if v_ref else "K",
+            "stream_off": True,
+        })
+        return t
+
+    x = t.reshape(-1, num_kv_heads, head_size)  # [tokens, n_kv, head_size]
+    arr = x.detach().to(torch.float32).cpu().numpy()
+    n_tokens, n_kv, hd = arr.shape
+
+    # Q-preconditioning: whiten only K (v_ref is False) and only when
+    # a calibrated Cholesky is present for this layer.
+    qp = CodecState.q_precond
+    use_whiten = (
+        (not v_ref) and qp is not None
+        and qp.n_kv == n_kv and qp.head_dim == hd
+        and qp.is_active(layer_id)
+    )
+    if use_whiten:
+        arr_enc = qp.whiten(arr, layer=layer_id)
+    else:
+        arr_enc = arr
+
+    flat = arr_enc.reshape(-1, hd).astype(np.float32, copy=False)
+    n_total = flat.shape[0]
+    bs = CodecState.block_size
+    n_comp = (n_total // bs) * bs
+    if n_comp == 0:
+        return t  # not enough vectors to fill one block
+
+    bit_width = CodecState.bit_width_v if v_ref else CodecState.bit_width_k
+    target_rank = max(2, int(hd * CodecState.rsvd_target_rank_factor))
+    if v_ref:
+        centroids = CodecState.v_centroids_file
+        outlier_thr = CodecState.v_outlier_threshold
+        share = CodecState.share_basis_v
+        metric = "mse"
+    else:
+        centroids = CodecState.k_centroids_file
+        outlier_thr = CodecState.k_outlier_threshold
+        share = CodecState.share_basis_k
+        # With Q-precond, the codec's MSE on K_tilde is the proper
+        # proxy for the true \Sigma_q-weighted K distortion. If we
+        # didn't whiten, we'd fall back to inner_product metric like
+        # the scaffolding harness does.
+        metric = "mse" if use_whiten else "inner_product"
+
+    dec, rep = rust_roundtrip(
+        flat[:n_comp],
+        block_size=bs, bit_width=bit_width,
+        rsvd_target_rank=target_rank,
+        metric=metric, share_basis=share,
+        pca_method=CodecState.pca_method,
+        variance_ratio=CodecState.variance_ratio,
+        centroids_file=centroids,
+        outlier_threshold=outlier_thr,
+    )
+
+    # Stitch tail (vectors past the last full block) back in.
+    if n_comp < n_total:
+        dec = np.concatenate([dec, flat[n_comp:]], axis=0)
+
+    dec = dec.reshape(n_tokens, n_kv, hd)
+    if use_whiten:
+        dec = qp.unwhiten(dec, layer=layer_id)
+
+    restored = (
+        torch.from_numpy(dec.astype(np.float32))
+        .to(orig_device).to(orig_dtype)
+        .reshape(orig_shape)
+    )
+    CodecState.stats.append({
+        "layer": layer_id, "kind": "V" if v_ref else "K",
+        "metric": metric, "whitened": bool(use_whiten),
+        "bit_width": bit_width,
+        "n_compressible": int(n_comp),
+        "n_tail": int(n_total - n_comp),
+        "mean_block_mse": float(rep.get("mean_block_mse", -1.0)),
+        "compressed_bytes": int(rep.get("compressed_bytes", 0)),
+        "outlier_threshold": outlier_thr,
+        "centroids_file": centroids,
+    })
+    return restored
+
+
+# =============================================================================
+# WikiText-103 loader
+# =============================================================================
+
+def load_wikitext_passages(
+    tokenizer: Any, min_tokens: int, n_passages: int, split: str = "test",
+) -> list[str]:
+    from datasets import load_dataset
+    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
+    passages: list[str] = []
+    current: list[str] = []
+    approx = 0
+    for row in ds:
+        text = row["text"]
+        if not text.strip():
+            continue
+        current.append(text)
+        approx += int(len(text.split()) * 1.3)
+        if approx >= min_tokens:
+            passage = "".join(current)
+            if len(tokenizer.encode(passage)) >= min_tokens:
+                passages.append(passage)
+                if len(passages) >= n_passages:
+                    return passages
+            current = []
+            approx = 0
+    return passages
+
+
+# =============================================================================
+# vLLM engine build + PPL measurement
+# =============================================================================
+
+def build_llm(model_path: str, max_model_len: int, gpu_mem_util: float):
+    from vllm import LLM  # type: ignore
+    return LLM(
+        model=model_path,
+        dtype="bfloat16",
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_mem_util,
+        enforce_eager=True,
+        trust_remote_code=True,
+    )
+
+
+def prompt_logprobs_for_ids(llm: Any, ids: list[int]) -> list[dict]:
+    from vllm import SamplingParams  # type: ignore
+    sp = SamplingParams(max_tokens=1, temperature=0.0, prompt_logprobs=1)
+    out = llm.generate(
+        prompts=None,
+        prompt_token_ids=[ids],
+        sampling_params=sp,
+        use_tqdm=False,
+    )
+    return out[0].prompt_logprobs
+
+
+def ppl_and_top1(
+    pls: list[dict], ids: list[int], start: int, end: int,
+) -> tuple[float, list[float], list[int]]:
+    lps: list[float] = []
+    top1: list[int] = []
+    for t in range(start, end):
+        entry = pls[t]
+        if entry is None:
+            continue
+        tok = ids[t]
+        if tok in entry:
+            lp = entry[tok]
+            lps.append(float(lp.logprob if hasattr(lp, "logprob") else lp["logprob"]))
+        else:
+            lps.append(float("-inf"))
+
+        def _lp(v: Any) -> float:
+            return float(v.logprob if hasattr(v, "logprob") else v["logprob"])
+
+        top1.append(int(max(entry.items(), key=lambda kv: _lp(kv[1]))[0]))
+    valid = [lp for lp in lps if np.isfinite(lp)]
+    mean_nll = -float(np.mean(valid)) if valid else float("inf")
+    ppl = float(np.exp(mean_nll)) if np.isfinite(mean_nll) else float("inf")
+    return ppl, lps, top1
+
+
+def compare(
+    ref_pls: list[dict], alt_pls: list[dict],
+    ids: list[int], ctx_len: int, n_eval: int,
+) -> dict:
+    end = min(ctx_len + n_eval, len(ids))
+    ppl_r, lp_r, t_r = ppl_and_top1(ref_pls, ids, ctx_len, end)
+    ppl_a, lp_a, t_a = ppl_and_top1(alt_pls, ids, ctx_len, end)
+    n = min(len(t_r), len(t_a))
+    agree = (
+        float(np.mean([1.0 if t_r[i] == t_a[i] else 0.0 for i in range(n)]))
+        if n else float("nan")
+    )
+    deltas = [
+        abs(lp_r[i] - lp_a[i])
+        for i in range(min(len(lp_r), len(lp_a)))
+        if np.isfinite(lp_r[i]) and np.isfinite(lp_a[i])
+    ]
+    return {
+        "ppl_ref": ppl_r,
+        "ppl_alt": ppl_a,
+        "ppl_delta_rel": (ppl_a - ppl_r) / max(ppl_r, 1e-8),
+        "top1_agreement": agree,
+        "mean_abs_dlogp_true": float(np.mean(deltas)) if deltas else float("nan"),
+        "n_tokens": n,
+    }
+
+
+def verdict_of(delta: float, top1: float) -> str:
+    if abs(delta) <= 0.01 and top1 >= 0.95:
+        return "ACCEPT"
+    if abs(delta) <= 0.03 and top1 >= 0.85:
+        return "MARGINAL"
+    return "REJECT"
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--model-name", required=True)
+    ap.add_argument("--ctx-len", type=int, default=2048)
+    ap.add_argument("--n-eval", type=int, default=64)
+    ap.add_argument("--block-size", type=int, default=512)
+    ap.add_argument("--bit-width-k", type=int, default=3)
+    ap.add_argument("--bit-width-v", type=int, default=2)
+    ap.add_argument("--variance-ratio", type=float, default=0.95)
+    ap.add_argument("--pca-method", choices=["exact", "randomized"],
+                    default="randomized")
+    ap.add_argument("--rsvd-target-rank-factor", type=float, default=0.5)
+    ap.add_argument("--q-calib", type=str, default=None,
+                    help="Path to Σ_q Cholesky safetensors "
+                         "(set None to disable Q-preconditioning)")
+    ap.add_argument("--k-centroids", type=str, default=None)
+    ap.add_argument("--v-centroids", type=str, default=None)
+    ap.add_argument("--outlier-threshold", type=float, default=None,
+                    help="K residual outlier T (e.g. 2.0 for v1.3 PPL)")
+    ap.add_argument("--v-outlier-threshold", type=float, default=None,
+                    help="V residual outlier T. Unset by default (V has no "
+                         "outlier compensation in SPRINT_CLOSEOUT v1.3 PPL); "
+                         "enables symmetric outlier compensation on V when "
+                         "set (e.g. 2.0).")
+    ap.add_argument("--boundary-skip-layers", type=int, nargs="*",
+                    default=[0, 1, 7, 14, 26, 27],
+                    help="Layer indices kept at full precision (bf16)")
+    ap.add_argument("--compress-stream", choices=["kv", "k", "v"],
+                    default="kv",
+                    help="Which streams go through the codec. 'kv' is the "
+                         "production config; 'k' / 'v' run one stream through "
+                         "the codec and leave the other pass-through (bf16) "
+                         "for per-channel PPL attribution.")
+    ap.add_argument("--share-basis-v", action="store_true", default=True)
+    ap.add_argument("--no-share-basis-v", dest="share_basis_v",
+                    action="store_false")
+    ap.add_argument("--n-passages", type=int, default=4)
+    ap.add_argument("--gpu-mem-util", type=float, default=0.80)
+    ap.add_argument("--out-dir", type=Path, required=True)
+    args = ap.parse_args()
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    # Populate global codec state.
+    CodecState.block_size = args.block_size
+    CodecState.bit_width_k = args.bit_width_k
+    CodecState.bit_width_v = args.bit_width_v
+    CodecState.variance_ratio = args.variance_ratio
+    CodecState.pca_method = args.pca_method
+    CodecState.rsvd_target_rank_factor = args.rsvd_target_rank_factor
+    CodecState.k_centroids_file = args.k_centroids
+    CodecState.v_centroids_file = args.v_centroids
+    CodecState.k_outlier_threshold = args.outlier_threshold
+    CodecState.v_outlier_threshold = args.v_outlier_threshold
+    CodecState.boundary_skip_layers = set(args.boundary_skip_layers or [])
+    CodecState.share_basis_v = args.share_basis_v
+    CodecState.compress_stream = args.compress_stream
+    if args.compress_stream != "kv":
+        print(f"[setup] compress_stream={args.compress_stream}: "
+              f"{'K' if args.compress_stream == 'k' else 'V'} through codec, "
+              f"{'V' if args.compress_stream == 'k' else 'K'} stays bf16",
+              flush=True)
+    if args.q_calib:
+        print(f"[setup] loading Q-preconditioner from {args.q_calib}",
+              flush=True)
+        CodecState.q_precond = qp_load(args.q_calib, skip_layers=[0])
+        print(f"        calibrated layers: "
+              f"{CodecState.q_precond.n_calibrated_layers}/"
+              f"{CodecState.q_precond.n_layers} "
+              f"(n_kv={CodecState.q_precond.n_kv}, D={CodecState.q_precond.head_dim})",
+              flush=True)
+
+    # Install patch BEFORE LLM is constructed.
+    install_qwen2_pre_rope_patch()
+
+    print(f"[{args.model_name}] loading vLLM engine…", flush=True)
+    max_len = args.ctx_len + args.n_eval + 16
+    llm = build_llm(args.model_path, max_len, args.gpu_mem_util)
+
+    tok = llm.get_tokenizer()
+    print(f"[{args.model_name}] loading WikiText-103 passages…", flush=True)
+    passages = load_wikitext_passages(
+        tok, min_tokens=args.ctx_len + args.n_eval, n_passages=args.n_passages,
+    )
+    print(f"  got {len(passages)} passages", flush=True)
+
+    per_passage: list[dict] = []
+    for i, p in enumerate(passages):
+        print(f"  passage {i + 1}/{len(passages)}…", flush=True)
+        ids = tok.encode(p)[: args.ctx_len + args.n_eval]
+        if len(ids) < args.ctx_len + args.n_eval:
+            print("    skipped (short)", flush=True)
+            continue
+
+        CodecState.active = False
+        t0 = time.perf_counter()
+        ref_pls = prompt_logprobs_for_ids(llm, ids)
+        t_ref = time.perf_counter() - t0
+
+        CodecState.active = True
+        CodecState.stats = []
+        t0 = time.perf_counter()
+        alt_pls = prompt_logprobs_for_ids(llm, ids)
+        t_alt = time.perf_counter() - t0
+        stats_this = list(CodecState.stats)
+        CodecState.active = False
+
+        m = compare(ref_pls, alt_pls, ids, args.ctx_len, args.n_eval)
+        print(
+            f"    ppl_ref={m['ppl_ref']:.3f} ppl_alt={m['ppl_alt']:.3f} "
+            f"Δppl={m['ppl_delta_rel']*100:+.3f}% "
+            f"top1={m['top1_agreement']*100:.2f}% "
+            f"Δlogp={m['mean_abs_dlogp_true']:.4f} "
+            f"(ref={t_ref:.2f}s alt={t_alt:.2f}s calls={len(stats_this)})",
+            flush=True,
+        )
+        per_passage.append({
+            "ctx_len": args.ctx_len,
+            "n_eval": args.n_eval,
+            "t_ref_sec": t_ref,
+            "t_alt_sec": t_alt,
+            "codec_layer_calls": len(stats_this),
+            "codec_total_compressed_bytes": int(
+                sum(s.get("compressed_bytes", 0) for s in stats_this)
+            ),
+            "boundary_skips": int(
+                sum(1 for s in stats_this if s.get("skipped_boundary"))
+            ),
+            "metrics": m,
+        })
+
+    summary: dict = {
+        "model_name": args.model_name,
+        "model_path": args.model_path,
+        "engine": "vllm",
+        "recipe": "v1.3 PPL full guardrails",
+        "ctx_len": args.ctx_len,
+        "n_eval": args.n_eval,
+        "block_size": args.block_size,
+        "bit_width_k": args.bit_width_k,
+        "bit_width_v": args.bit_width_v,
+        "variance_ratio": args.variance_ratio,
+        "pca_method": args.pca_method,
+        "rsvd_target_rank_factor": args.rsvd_target_rank_factor,
+        "q_calib": args.q_calib,
+        "k_centroids": args.k_centroids,
+        "v_centroids": args.v_centroids,
+        "outlier_threshold": args.outlier_threshold,
+        "v_outlier_threshold": args.v_outlier_threshold,
+        "boundary_skip_layers": sorted(CodecState.boundary_skip_layers),
+        "share_basis_v": args.share_basis_v,
+        "n_passages": len(per_passage),
+    }
+    if per_passage:
+        valid = [r for r in per_passage
+                 if np.isfinite(r["metrics"]["ppl_delta_rel"])]
+        mean_delta = (float(np.mean([r["metrics"]["ppl_delta_rel"] for r in valid]))
+                      if valid else float("nan"))
+        mean_top1 = (float(np.mean([r["metrics"]["top1_agreement"] for r in valid]))
+                     if valid else float("nan"))
+        summary.update({
+            "mean_ppl_delta_rel": mean_delta,
+            "mean_top1_agreement": mean_top1,
+            "verdict": verdict_of(mean_delta, mean_top1),
+        })
+        print(f"\n[{args.model_name}] ===== SUMMARY (vLLM v1.3 PPL) =====",
+              flush=True)
+        print(f"  n_passages   = {len(per_passage)}", flush=True)
+        print(f"  Δppl (mean)  = {mean_delta*100:+.3f}%", flush=True)
+        print(f"  top1 agree   = {mean_top1*100:.2f}%", flush=True)
+        print(f"  VERDICT      = {summary['verdict']}", flush=True)
+    else:
+        summary["verdict"] = "NO_DATA"
+
+    summary["per_passage"] = per_passage
+    out_path = args.out_dir / f"{args.model_name}_vllm_full.json"
+    out_path.write_text(json.dumps(summary, indent=2))
+    print(f"\nwrote {out_path}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/e2e_ppl_validation_vllm_snapshot.py b/benchmarks/e2e_ppl_validation_vllm_snapshot.py
new file mode 100644
index 00000000..a76caa9d
--- /dev/null
+++ b/benchmarks/e2e_ppl_validation_vllm_snapshot.py
@@ -0,0 +1,555 @@
+#!/usr/bin/env python3
+"""Snapshot-mode vLLM harness: HF two-pass semantics inside vLLM.
+
+Goal
+----
+Phase 6 (PR #16) localised the HF (+7.82 %) \u2194 vLLM (+35.33 %) \u0394ppl
+gap to a +39 pp "cross-layer non-linear compounding" term that came
+from the vLLM harness applying the codec INSIDE the forward graph at
+every layer \u2014 so layer l+1's K/V projection is computed from a
+residual already shifted by layer l's codec. HF's harness avoids
+that by running the codec on a CLEAN prefill snapshot in the
+DynamicCache and then teacher-forcing the eval tokens against that
+cache. This script reproduces HF's semantics in vLLM:
+
+  Step 1: clean forward through vLLM (codec OFF). Capture per-layer
+          pre-RoPE K / V snapshots for all positions.
+  Step 2: offline \u2014 run the production v1.3 codec on each snapshot
+          (Q-precond on K + Lloyd-Max + outlier + boundary skip;
+          Lloyd-Max + share_basis on V).
+  Step 3: second forward through vLLM. Hook the Qwen2Attention.forward
+          so that instead of projecting K/V from the current (maybe
+          codec-shifted) residual, we FORCE the layer to use the
+          PRE-COMPUTED codec'd snapshot from Step 2. This kills the
+          in-forward cross-layer pollution path. Q still comes from
+          the running residual, matching HF's teacher-force flow.
+
+If this run measures \u0394ppl \u2248 +8 %, the entire +39 pp compounding is
+harness-integration (snapshot-vs-inline), and "deploy codec as a
+post-prefill cache compressor" is the honest production number on
+vLLM too. If it's still materially > +8 %, there IS an intrinsic
+engine component left.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+REPO = Path(__file__).resolve().parent.parent
+BENCH_BIN = REPO / "kakeyaturbo" / "target" / "release" / "kakeyaturbo-bench"
+KKTV_MAGIC = 0x4B4B_5456
+
+sys.path.insert(0, str(REPO / "benchmarks"))
+from q_precondition import QPrecond, load as qp_load  # noqa: E402
+
+
+# =============================================================================
+# KKTV I/O + rust codec (reused)
+# =============================================================================
+
+def write_kktv(path: Path, arr: np.ndarray) -> None:
+    assert arr.dtype == np.float32 and arr.ndim == 2
+    n, d = arr.shape
+    with path.open("wb") as f:
+        f.write(struct.pack("<I", KKTV_MAGIC))
+        f.write(struct.pack("<I", 1))
+        f.write(struct.pack("<Q", n))
+        f.write(struct.pack("<I", d))
+        f.write(struct.pack("<I", 0))
+        f.write(arr.tobytes(order="C"))
+
+
+def read_kktv_f32(path: Path) -> np.ndarray:
+    with path.open("rb") as f:
+        magic = struct.unpack("<I", f.read(4))[0]
+        assert magic == KKTV_MAGIC
+        struct.unpack("<I", f.read(4))
+        n = struct.unpack("<Q", f.read(8))[0]
+        d = struct.unpack("<I", f.read(4))[0]
+        struct.unpack("<I", f.read(4))
+        raw = f.read(n * d * 4)
+    return np.frombuffer(raw, dtype=np.float32).reshape(n, d).copy()
+
+
+def rust_roundtrip(
+    arr: np.ndarray, *, block_size: int, bit_width: int,
+    rsvd_target_rank: int, metric: str, share_basis: bool,
+    pca_method: str = "randomized", variance_ratio: float = 0.95,
+    centroids_file: str | None = None,
+    outlier_threshold: float | None = None,
+) -> tuple[np.ndarray, dict]:
+    with tempfile.TemporaryDirectory(dir="/tmp") as td:
+        tdp = Path(td)
+        in_p, rep, dec = tdp/"x.kktv", tdp/"r.json", tdp/"d.kktv"
+        write_kktv(in_p, arr.astype(np.float32, copy=False))
+        cmd = [
+            str(BENCH_BIN), "--input", str(in_p), "--output", str(rep),
+            "--metric", metric, "--block-size", str(block_size),
+            "--variance-ratio", str(variance_ratio),
+            "--k", "16", "--bit-width", str(bit_width),
+            "--rotation-seed", "3405691582",
+            "--pca-method", pca_method, "--verify",
+            "--dump-decoded", str(dec),
+        ]
+        if pca_method == "randomized":
+            cmd += ["--rsvd-target-rank", str(rsvd_target_rank),
+                    "--rsvd-oversample", "8", "--rsvd-power-iters", "2"]
+        if share_basis:
+            cmd.append("--share-basis")
+        if centroids_file is not None:
+            cmd += ["--centroids-file", str(centroids_file)]
+        if outlier_threshold is not None:
+            cmd += ["--outlier-threshold", str(outlier_threshold)]
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        if res.returncode != 0:
+            raise RuntimeError(f"codec rc={res.returncode}: "
+                               f"{res.stderr[:2000]}")
+        return read_kktv_f32(dec), json.loads(rep.read_text())
+
+
+# =============================================================================
+# Offline codec of a per-layer K (or V) snapshot, returning the decoded
+# tensor of identical shape. Applies the same recipe as the production
+# cell does inside its forward hook.
+# =============================================================================
+
+def codec_layer(
+    K_or_V: np.ndarray, *, is_v: bool, layer_id: int,
+    q_precond: QPrecond | None,
+    block_size: int, bit_width_k: int, bit_width_v: int,
+    k_centroids: str | None, v_centroids: str | None,
+    k_outlier_threshold: float | None, v_outlier_threshold: float | None,
+    boundary_skip: set[int], rsvd_target_rank_factor: float = 0.5,
+    share_basis_v: bool = True, share_basis_k: bool = False,
+) -> tuple[np.ndarray, dict]:
+    """Same logic as e2e_ppl_validation_vllm_full._apply_k_guardrails but
+    pure offline on a fp32 numpy array of shape [n_tokens, num_kv_heads,
+    head_size]."""
+    n, n_kv, hd = K_or_V.shape
+    if layer_id in boundary_skip:
+        return K_or_V.astype(np.float32, copy=False), {
+            "layer": layer_id, "stream": "V" if is_v else "K",
+            "boundary_skip": True,
+        }
+    rank = max(2, int(hd * rsvd_target_rank_factor))
+
+    use_whiten = (
+        (not is_v) and q_precond is not None
+        and q_precond.n_kv == n_kv and q_precond.head_dim == hd
+        and q_precond.is_active(layer_id)
+    )
+    arr_enc = q_precond.whiten(K_or_V, layer=layer_id) if use_whiten else K_or_V
+    flat = arr_enc.reshape(-1, hd).astype(np.float32, copy=False)
+    n_total = flat.shape[0]
+    n_comp = (n_total // block_size) * block_size
+    if n_comp == 0:
+        return K_or_V.astype(np.float32, copy=False), {
+            "layer": layer_id, "stream": "V" if is_v else "K",
+            "skipped_short": True,
+        }
+
+    if is_v:
+        bit_width = bit_width_v
+        centroids = v_centroids
+        outlier_thr = v_outlier_threshold
+        share = share_basis_v
+        metric = "mse"
+    else:
+        bit_width = bit_width_k
+        centroids = k_centroids
+        outlier_thr = k_outlier_threshold
+        share = share_basis_k
+        metric = "mse" if use_whiten else "inner_product"
+
+    dec, rep = rust_roundtrip(
+        flat[:n_comp], block_size=block_size, bit_width=bit_width,
+        rsvd_target_rank=rank, metric=metric, share_basis=share,
+        centroids_file=centroids, outlier_threshold=outlier_thr,
+    )
+    if n_comp < n_total:
+        dec = np.concatenate([dec, flat[n_comp:]], axis=0)
+    dec = dec.reshape(n, n_kv, hd)
+    if use_whiten:
+        dec = q_precond.unwhiten(dec, layer=layer_id)
+
+    return dec.astype(np.float32, copy=False), {
+        "layer": layer_id, "stream": "V" if is_v else "K",
+        "n_tokens": int(n), "n_compressible": int(n_comp),
+        "mean_block_mse": float(rep.get("mean_block_mse", -1.0)),
+        "compressed_bytes": int(rep.get("compressed_bytes", 0)),
+        "whitened": bool(use_whiten),
+        "metric": metric, "bit_width": bit_width,
+    }
+
+
+# =============================================================================
+# vLLM hook — snapshot-mode replacement
+# =============================================================================
+
+class HookState:
+    """Module-level state for the Qwen2Attention hook.
+
+    Three phases that the hook distinguishes:
+
+      phase == "capture"   record per-layer pre-RoPE K, V for all
+                           num_tokens of the current prompt; call the
+                           original forward unchanged so the forward is
+                           a true clean pass.
+      phase == "replace"   ignore the live pre-RoPE K, V projections
+                           and use the pre-codec'd tensor from
+                           `replacements[layer_id]` instead (same n
+                           tokens, same shape).
+      phase == "off"       no-op: the hook is equivalent to the stock
+                           Qwen2Attention.forward.
+    """
+    phase: str = "off"
+    captured: dict[int, dict[str, np.ndarray]] = {}
+    replacements: dict[int, dict[str, torch.Tensor]] = {}  # fp32 GPU tensors
+    head_size: int = 0
+    num_kv_heads: int = 0
+    num_heads: int = 0
+
+
+def install_qwen2_snapshot_patch() -> None:
+    from vllm.model_executor.models.qwen2 import Qwen2Attention  # type: ignore
+    if getattr(Qwen2Attention, "_kk_snapshot_patched", False):
+        return
+    orig = Qwen2Attention.forward
+
+    def patched(self, positions, hidden_states, kv_cache, attn_metadata):  # type: ignore[no-untyped-def]
+        if HookState.phase == "off":
+            return orig(self, positions, hidden_states, kv_cache, attn_metadata)
+        # Reimplement the parts of Qwen2Attention.forward we need.
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split(
+            [self.q_size, self.kv_size, self.kv_size], dim=-1
+        )
+        # Parse layer id
+        layer_id = 0
+        name = getattr(self.attn, "layer_name", None)
+        if name:
+            parts = name.split(".")
+            for i, p in enumerate(parts):
+                if p == "layers" and i + 1 < len(parts):
+                    try:
+                        layer_id = int(parts[i + 1])
+                    except ValueError:
+                        pass
+        HookState.head_size = self.attn.head_size
+        HookState.num_kv_heads = self.attn.num_kv_heads
+        HookState.num_heads = self.attn.num_heads
+
+        nkv = self.attn.num_kv_heads
+        hd = self.attn.head_size
+
+        if HookState.phase == "capture":
+            # Record pre-RoPE K, V (and shape) to numpy fp32.
+            k_np = (k.detach().to(torch.float32).cpu().numpy()
+                     .reshape(-1, nkv, hd))
+            v_np = (v.detach().to(torch.float32).cpu().numpy()
+                     .reshape(-1, nkv, hd))
+            HookState.captured[layer_id] = {"K": k_np, "V": v_np}
+            # Fall through to the normal forward with untouched k, v.
+        elif HookState.phase == "replace":
+            if layer_id in HookState.replacements:
+                repl = HookState.replacements[layer_id]
+                k_new = repl["K"]  # fp32 GPU tensor [n_tokens, nkv, hd]
+                v_new = repl["V"]
+                # Make sure shapes match THIS forward's n_tokens
+                n_tokens = k.shape[0]
+                if k_new.shape[0] == n_tokens:
+                    # Reshape back to [n_tokens, nkv*hd] and cast.
+                    k = k_new.reshape(n_tokens, -1).to(k.dtype)
+                    v = v_new.reshape(n_tokens, -1).to(v.dtype)
+                else:
+                    # Token count mismatch \u2014 typically means this is a
+                    # second forward over a different prompt length.
+                    # Skip replacement (shouldn't happen if we stick to
+                    # the same prompt ids across capture/replace).
+                    pass
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_out = self.attn(q, k, v, kv_cache, attn_metadata)
+        out, _ = self.o_proj(attn_out)
+        return out
+
+    Qwen2Attention.forward = patched
+    Qwen2Attention._kk_snapshot_patched = True  # type: ignore[attr-defined]
+    print("[snap-patch] Qwen2Attention.forward wrapped "
+          "(capture / replace / off)", flush=True)
+
+
+# =============================================================================
+# WikiText loader + vLLM driver
+# =============================================================================
+
+def load_wikitext_passages(tok: Any, min_tokens: int, n_passages: int,
+                           split: str = "test") -> list[str]:
+    from datasets import load_dataset
+    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
+    passages, cur, approx = [], [], 0
+    for row in ds:
+        text = row["text"]
+        if not text.strip():
+            continue
+        cur.append(text)
+        approx += int(len(text.split()) * 1.3)
+        if approx >= min_tokens:
+            passage = "".join(cur)
+            if len(tok.encode(passage)) >= min_tokens:
+                passages.append(passage)
+                if len(passages) >= n_passages:
+                    return passages
+            cur, approx = [], 0
+    return passages
+
+
+def prompt_logprobs_for_ids(llm: Any, ids: list[int]) -> list[dict]:
+    from vllm import SamplingParams  # type: ignore
+    sp = SamplingParams(max_tokens=1, temperature=0.0, prompt_logprobs=1)
+    out = llm.generate(prompts=None, prompt_token_ids=[ids],
+                       sampling_params=sp, use_tqdm=False)
+    return out[0].prompt_logprobs
+
+
+def ppl_and_top1(pls: list[dict], ids: list[int],
+                 start: int, end: int) -> tuple[float, list[float], list[int]]:
+    lps, top1 = [], []
+    for t in range(start, end):
+        entry = pls[t]
+        if entry is None:
+            continue
+        tok = ids[t]
+        if tok in entry:
+            lp = entry[tok]
+            lps.append(float(lp.logprob if hasattr(lp, "logprob")
+                             else lp["logprob"]))
+        else:
+            lps.append(float("-inf"))
+
+        def _lp(v: Any) -> float:
+            return float(v.logprob if hasattr(v, "logprob") else v["logprob"])
+
+        top1.append(int(max(entry.items(), key=lambda kv: _lp(kv[1]))[0]))
+    valid = [lp for lp in lps if np.isfinite(lp)]
+    mean_nll = -float(np.mean(valid)) if valid else float("inf")
+    ppl = float(np.exp(mean_nll)) if np.isfinite(mean_nll) else float("inf")
+    return ppl, lps, top1
+
+
+def compare(ref_pls: list[dict], alt_pls: list[dict], ids: list[int],
+            ctx_len: int, n_eval: int) -> dict:
+    end = min(ctx_len + n_eval, len(ids))
+    ppl_r, lp_r, t_r = ppl_and_top1(ref_pls, ids, ctx_len, end)
+    ppl_a, lp_a, t_a = ppl_and_top1(alt_pls, ids, ctx_len, end)
+    n = min(len(t_r), len(t_a))
+    agree = (float(np.mean([1.0 if t_r[i] == t_a[i] else 0.0 for i in range(n)]))
+             if n else float("nan"))
+    deltas = [abs(lp_r[i] - lp_a[i])
+              for i in range(min(len(lp_r), len(lp_a)))
+              if np.isfinite(lp_r[i]) and np.isfinite(lp_a[i])]
+    return {
+        "ppl_ref": ppl_r, "ppl_alt": ppl_a,
+        "ppl_delta_rel": (ppl_a - ppl_r) / max(ppl_r, 1e-8),
+        "top1_agreement": agree,
+        "mean_abs_dlogp_true": (float(np.mean(deltas)) if deltas
+                                else float("nan")),
+        "n_tokens": n,
+    }
+
+
+def verdict_of(d: float, t: float) -> str:
+    if abs(d) <= 0.01 and t >= 0.95:
+        return "ACCEPT"
+    if abs(d) <= 0.03 and t >= 0.85:
+        return "MARGINAL"
+    return "REJECT"
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--model-name", required=True)
+    ap.add_argument("--ctx-len", type=int, default=2048)
+    ap.add_argument("--n-eval", type=int, default=64)
+    ap.add_argument("--n-passages", type=int, default=4)
+    ap.add_argument("--gpu-mem-util", type=float, default=0.40)
+    ap.add_argument("--block-size", type=int, default=512)
+    ap.add_argument("--bit-width-k", type=int, default=3)
+    ap.add_argument("--bit-width-v", type=int, default=2)
+    ap.add_argument("--q-calib", type=str,
+        default="reports/v1_4_q_pca/flagship/"
+                "deepseek_distill_q_calib.safetensors")
+    ap.add_argument("--k-centroids", type=str,
+        default="reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32")
+    ap.add_argument("--v-centroids", type=str,
+        default="reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32")
+    ap.add_argument("--outlier-threshold", type=float, default=2.0)
+    ap.add_argument("--boundary-skip-layers", type=int, nargs="*",
+                    default=[0, 1, 7, 14, 26, 27])
+    ap.add_argument("--out-dir", type=Path, required=True)
+    args = ap.parse_args()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"[setup] loading Q-precond from {args.q_calib}", flush=True)
+    qp = qp_load(args.q_calib, skip_layers=[0])
+    boundary_skip = set(args.boundary_skip_layers or [])
+
+    install_qwen2_snapshot_patch()
+    from vllm import LLM  # type: ignore
+    print(f"[{args.model_name}] loading vLLM engine\u2026", flush=True)
+    llm = LLM(model=args.model_path, dtype="bfloat16",
+              max_model_len=args.ctx_len + args.n_eval + 16,
+              gpu_memory_utilization=args.gpu_mem_util,
+              enforce_eager=True, trust_remote_code=True)
+    tok = llm.get_tokenizer()
+
+    print(f"[{args.model_name}] loading WikiText passages\u2026", flush=True)
+    passages = load_wikitext_passages(
+        tok, min_tokens=args.ctx_len + args.n_eval,
+        n_passages=args.n_passages,
+    )
+    passages_ids = [tok.encode(p)[: args.ctx_len + args.n_eval]
+                    for p in passages
+                    if len(tok.encode(p)) >= args.ctx_len + args.n_eval]
+    print(f"  usable: {len(passages_ids)}", flush=True)
+
+    per_passage = []
+    codec_stats_total: list[dict] = []
+
+    for pi, ids in enumerate(passages_ids):
+        print(f"\n  passage {pi + 1}/{len(passages_ids)}", flush=True)
+
+        # ---- Pass 1: clean, codec OFF, captures pre-RoPE K/V ----
+        HookState.phase = "capture"
+        HookState.captured = {}
+        t0 = time.perf_counter()
+        ref_pls = prompt_logprobs_for_ids(llm, ids)
+        t_ref = time.perf_counter() - t0
+        HookState.phase = "off"
+        # Sanity: captured layer count.
+        n_layers_captured = len(HookState.captured)
+        print(f"    [capture] {n_layers_captured} layers, "
+              f"{HookState.captured[0]['K'].shape[0]} tokens, "
+              f"{t_ref:.2f}s", flush=True)
+
+        # ---- Offline: codec every layer ----
+        t0 = time.perf_counter()
+        replacements: dict[int, dict[str, torch.Tensor]] = {}
+        stats_this = []
+        for lid, kv in HookState.captured.items():
+            k_hat, k_rep = codec_layer(
+                kv["K"], is_v=False, layer_id=lid, q_precond=qp,
+                block_size=args.block_size,
+                bit_width_k=args.bit_width_k, bit_width_v=args.bit_width_v,
+                k_centroids=args.k_centroids, v_centroids=args.v_centroids,
+                k_outlier_threshold=args.outlier_threshold,
+                v_outlier_threshold=None, boundary_skip=boundary_skip,
+            )
+            v_hat, v_rep = codec_layer(
+                kv["V"], is_v=True, layer_id=lid, q_precond=qp,
+                block_size=args.block_size,
+                bit_width_k=args.bit_width_k, bit_width_v=args.bit_width_v,
+                k_centroids=args.k_centroids, v_centroids=args.v_centroids,
+                k_outlier_threshold=args.outlier_threshold,
+                v_outlier_threshold=None, boundary_skip=boundary_skip,
+            )
+            replacements[lid] = {
+                "K": torch.from_numpy(k_hat).to("cuda").to(torch.float32),
+                "V": torch.from_numpy(v_hat).to("cuda").to(torch.float32),
+            }
+            stats_this.append(k_rep); stats_this.append(v_rep)
+        t_codec = time.perf_counter() - t0
+        n_boundary = sum(1 for s in stats_this if s.get("boundary_skip"))
+        print(f"    [codec] {len(stats_this)} layer-streams "
+              f"({n_boundary} boundary-skipped), {t_codec:.2f}s", flush=True)
+
+        # ---- Pass 2: codec'd K/V injected via replace hook ----
+        HookState.replacements = replacements
+        HookState.phase = "replace"
+        t0 = time.perf_counter()
+        alt_pls = prompt_logprobs_for_ids(llm, ids)
+        t_alt = time.perf_counter() - t0
+        HookState.phase = "off"
+        HookState.replacements = {}
+        # Free GPU memory held by replacements.
+        for r in replacements.values():
+            del r["K"]; del r["V"]
+        torch.cuda.empty_cache()
+
+        # ---- Compare ----
+        metrics = compare(ref_pls, alt_pls, ids, args.ctx_len, args.n_eval)
+        m = metrics
+        print(
+            f"    [result] ppl_ref={m['ppl_ref']:.3f} "
+            f"ppl_alt={m['ppl_alt']:.3f} "
+            f"Δppl={m['ppl_delta_rel']*100:+.3f}% "
+            f"top1={m['top1_agreement']*100:.2f}% "
+            f"Δlogp={m['mean_abs_dlogp_true']:.4f} "
+            f"(ref={t_ref:.2f}s codec={t_codec:.2f}s alt={t_alt:.2f}s)",
+            flush=True,
+        )
+        per_passage.append({
+            "passage": pi, "ctx_len": args.ctx_len, "n_eval": args.n_eval,
+            "t_ref_sec": t_ref, "t_codec_sec": t_codec, "t_alt_sec": t_alt,
+            "metrics": metrics,
+            "n_layers_captured": n_layers_captured,
+            "n_boundary_skipped": n_boundary,
+        })
+
+    summary = {
+        "model_name": args.model_name,
+        "model_path": args.model_path,
+        "engine": "vllm",
+        "recipe": "v1.3 PPL snapshot-mode",
+        "ctx_len": args.ctx_len, "n_eval": args.n_eval,
+        "bit_width_k": args.bit_width_k, "bit_width_v": args.bit_width_v,
+        "outlier_threshold": args.outlier_threshold,
+        "boundary_skip_layers": sorted(boundary_skip),
+        "q_calib": args.q_calib,
+        "k_centroids": args.k_centroids,
+        "v_centroids": args.v_centroids,
+        "n_passages": len(per_passage),
+    }
+    if per_passage:
+        valid = [r for r in per_passage
+                 if np.isfinite(r["metrics"]["ppl_delta_rel"])]
+        mean_delta = float(np.mean([r["metrics"]["ppl_delta_rel"] for r in valid]))
+        mean_top1 = float(np.mean([r["metrics"]["top1_agreement"] for r in valid]))
+        summary.update({
+            "mean_ppl_delta_rel": mean_delta,
+            "mean_top1_agreement": mean_top1,
+            "verdict": verdict_of(mean_delta, mean_top1),
+        })
+        print(f"\n[{args.model_name}] ===== SUMMARY (snapshot-mode) =====",
+              flush=True)
+        print(f"  n_passages  = {len(per_passage)}", flush=True)
+        print(f"  Δppl (mean) = {mean_delta*100:+.3f}%", flush=True)
+        print(f"  top1 agree  = {mean_top1*100:.2f}%", flush=True)
+        print(f"  VERDICT     = {summary['verdict']}", flush=True)
+    else:
+        summary["verdict"] = "NO_DATA"
+
+    summary["per_passage"] = per_passage
+    out_path = args.out_dir / f"{args.model_name}_vllm_snapshot.json"
+    out_path.write_text(json.dumps(summary, indent=2))
+    print(f"\nwrote {out_path}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/lloyd_max_calibration.py b/benchmarks/lloyd_max_calibration.py
new file mode 100644
index 00000000..d4337334
--- /dev/null
+++ b/benchmarks/lloyd_max_calibration.py
@@ -0,0 +1,355 @@
+#!/usr/bin/env python3
+"""Offline Lloyd-Max codebook calibration for KakeyaTurbo.
+
+Step 3 of the v1.4 Sprint 5-step plan.
+
+The current Rust codec uses Lloyd-Max centroids derived from the
+unit-variance Gaussian assumption.  But the actual residual distribution
+after PCA + WHT is only approximately Gaussian.  Mis-modelling shows up
+as ~+2-5pp Δppl inflation at b=2 (from KIVI / KVQuant literature).
+
+This tool collects real post-WHT residuals from a model, runs
+empirical Lloyd-Max iteration to compute MSE-optimal centroids for the
+actual distribution, and emits them as a .f32 binary file consumable
+by `kakeyaturbo-bench --centroids-file`.
+
+Pipeline:
+  1. Load model, run calibration prompts, get K/V tensors per layer
+  2. For each layer, apply: mean-centre → PCA basis → project → residual
+     after K-means cluster projection → pad to wht_len → WHT rotate →
+     normalise by residual norm
+  3. Collect all these scaled residuals across all layers into one
+     empirical distribution
+  4. Run Lloyd-Max iteration (initialize from Gaussian centroids,
+     iterate until convergence) to find optimal centroid positions
+  5. Write centroids to .f32 file (2^bits entries, sorted ascending,
+     little-endian)
+
+Assumption: residuals are globally stationary across the model (one
+calibrated codebook for all layers).  This is the same assumption the
+unit-Gaussian default makes, just with the actual empirical
+distribution substituted.
+"""
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO))
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import benchmarks.pre_rope_cache as prc
+from benchmarks.q_precondition import QPrecond, load as load_q_precond
+
+
+# ---------------------------------------------------------------------------
+# WHT and PCA utilities (copy of codec's flow, simplified)
+# ---------------------------------------------------------------------------
+
+def next_pow2(n: int) -> int:
+    p = 1
+    while p < n:
+        p <<= 1
+    return max(p, 1)
+
+
+def hadamard_matrix(n: int) -> np.ndarray:
+    """Standard Walsh-Hadamard ordered matrix (Sylvester construction)."""
+    assert (n & (n - 1)) == 0 and n > 0, f"n must be power of 2, got {n}"
+    h = np.array([[1.0]], dtype=np.float32)
+    while h.shape[0] < n:
+        h = np.block([[h, h], [h, -h]])
+    return h / np.sqrt(n)
+
+
+def sign_pattern(seed: int, n: int) -> np.ndarray:
+    """Reproduce the codec's Rademacher sign pattern from a seed."""
+    # This needs to match the Rust impl exactly for the calibration to be
+    # portable back.  For the calibration purposes, we don't need the same
+    # seed — as long as we're consistent.  The Rust side uses a fixed
+    # rotation_seed per codec config; we use the same one here.
+    rng = np.random.default_rng(seed)
+    return (rng.integers(0, 2, size=n) * 2 - 1).astype(np.float32)
+
+
+def wht_rotate(x: np.ndarray, seed: int) -> np.ndarray:
+    """Mirror the codec's `rotate` (D · H), rolled out in numpy.
+    x: [n, n_feat], returns same shape."""
+    n_feat = x.shape[-1]
+    assert (n_feat & (n_feat - 1)) == 0, f"wht requires power-of-2 length, got {n_feat}"
+    signs = sign_pattern(seed, n_feat)
+    xs = x * signs
+    h = hadamard_matrix(n_feat)
+    # unnormalised wht_inplace; apply matrix
+    return xs @ h.T * np.sqrt(n_feat)  # multiply by sqrt(N) to undo h's norm
+
+
+# Actually, the Rust code's `rotate` is documented as:
+#   buf = x * signs
+#   wht_inplace(&mut buf)  -- this is unnormalised WHT
+#   return buf
+# and we know after codec bugfix the `scale = 1.0 / res_norm` is applied
+# after rotate, so effectively the encoder does:
+#   scaled_residual = rotate(residual) / ||residual||
+# The DECODER undoes this (norm restored), so the quantiser sees
+# scaled_residual which should be ~unit-norm per vector.
+# So for our calibration, we simulate the codec's exact path.
+
+
+def fit_pca_simple(X: np.ndarray, vr: float = 1.0):
+    mean = X.mean(axis=0)
+    Xc = X - mean
+    cov = (Xc.T @ Xc) / X.shape[0]
+    evals, evecs = np.linalg.eigh(cov)
+    idx = np.argsort(evals)[::-1]
+    evals = evals[idx]
+    evecs = evecs[:, idx]
+    total = max(float(evals.sum()), 1e-20)
+    if vr >= 1.0:
+        d_eff = X.shape[1]
+    else:
+        cum = np.cumsum(np.maximum(evals, 0.0)) / total
+        d_eff = int(np.searchsorted(cum, vr) + 1)
+        d_eff = max(1, min(d_eff, X.shape[1]))
+    basis = evecs[:, :d_eff].T  # [d_eff, D]
+    return mean.astype(np.float32), basis.astype(np.float32), d_eff
+
+
+# ---------------------------------------------------------------------------
+# Residual collector
+# ---------------------------------------------------------------------------
+
+@torch.inference_mode()
+def collect_residuals(model_path: str, stream: str, n_passages: int, ctx_len: int,
+                       block_size: int, q_precond_path: str | None,
+                       skip_layers: list[int] | None, rotation_seed: int,
+                       vr: float = 1.0) -> np.ndarray:
+    """Collect scaled residuals (what the Lloyd-Max quantiser sees) across
+    all specified layers.  Returns a flat numpy array of all residual
+    coordinate values."""
+    print(f"loading {model_path}…", flush=True)
+    tok = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, dtype=torch.bfloat16, attn_implementation="eager"
+    )
+    model.eval()
+    prc.install(model)
+
+    cfg = model.config.get_text_config(decoder=True)
+    layer_types = getattr(cfg, "layer_types", None) or (
+        ["full_attention"] * cfg.num_hidden_layers
+    )
+    full_attn_layers = [
+        l for l in range(cfg.num_hidden_layers)
+        if layer_types[l] == "full_attention"
+    ]
+    if skip_layers:
+        skip_set = set(skip_layers)
+        full_attn_layers = [l for l in full_attn_layers if l not in skip_set]
+    print(f"  collecting {stream} stream residuals from {len(full_attn_layers)} full-attn layers",
+          flush=True)
+
+    qp = load_q_precond(q_precond_path, skip_layers=skip_layers) if q_precond_path else None
+
+    from benchmarks.e2e_ppl_pre_rope import load_wikitext_passages, prefill_cache
+    passages = load_wikitext_passages(tok, ctx_len, n_passages)
+    print(f"  got {len(passages)} passages", flush=True)
+
+    residual_pool: list[np.ndarray] = []
+    for passage_i, p in enumerate(passages):
+        ids = tok(p, return_tensors="pt")["input_ids"][:, :ctx_len]
+        cache = prefill_cache(model, ids, prefill_chunk=1024)
+        for l in full_attn_layers:
+            tensor = (cache.layers[l].keys if stream == "K"
+                      else cache.layers[l].values)
+            # [1, n_kv, seq, D] → [seq, n_kv, D]
+            t_np = tensor[0].to(torch.float32).permute(1, 0, 2).cpu().numpy()
+            if stream == "K" and qp is not None and qp.is_active(l):
+                t_np = qp.whiten(t_np, layer=l)
+            flat = t_np.reshape(-1, t_np.shape[-1]).astype(np.float32, copy=False)
+            n_total = flat.shape[0]
+            n_comp = (n_total // block_size) * block_size
+            if n_comp == 0:
+                continue
+            D = flat.shape[-1]
+            # For calibration we approximate with block-level PCA (no
+            # K-means inside for simplicity — K-means residual is a small
+            # perturbation on top of the PCA residual in the WHT'd space).
+            for block_start in range(0, n_comp, block_size):
+                block = flat[block_start:block_start + block_size]
+                mean, basis, d_eff = fit_pca_simple(block, vr=vr)
+                # Project to coefficient space
+                coeff = (block - mean) @ basis.T  # [bs, d_eff]
+                # Approximate the codec's residual: coeff minus K-means reconstruction.
+                # Since we're collecting POOLED statistics and want the
+                # "bulk" residual distribution, we use the coeff directly —
+                # after WHT + norm scaling this is within 1.2x of the true
+                # residual, which is good enough for codebook calibration.
+                wht_len = next_pow2(d_eff)
+                padded = np.zeros((coeff.shape[0], wht_len), dtype=np.float32)
+                padded[:, :d_eff] = coeff
+                # Rotate each row
+                rotated = wht_rotate(padded, rotation_seed)
+                # Per-vector norm scaling (matches codec's scale = 1/res_norm)
+                norms = np.linalg.norm(coeff, axis=1, keepdims=True).clip(min=1e-12)
+                # Rotated vector should be scaled by 1/||coeff||
+                scaled = rotated / norms
+                residual_pool.append(scaled.reshape(-1).astype(np.float32))
+        print(f"  passage {passage_i+1}: accumulated residuals ({sum(r.size for r in residual_pool):,} samples so far)",
+              flush=True)
+
+    all_residuals = np.concatenate(residual_pool)
+    print(f"  total scaled residual samples: {all_residuals.size:,}", flush=True)
+    print(f"  residual stats: mean={all_residuals.mean():.4f}, "
+          f"std={all_residuals.std():.4f}, "
+          f"p5={np.percentile(all_residuals, 5):.4f}, "
+          f"p95={np.percentile(all_residuals, 95):.4f}, "
+          f"min={all_residuals.min():.3f}, max={all_residuals.max():.3f}", flush=True)
+    return all_residuals
+
+
+# ---------------------------------------------------------------------------
+# Lloyd-Max iteration
+# ---------------------------------------------------------------------------
+
+def lloyd_max_iterate(samples: np.ndarray, bits: int,
+                       init_centroids: np.ndarray | None = None,
+                       max_iter: int = 200, tol: float = 1e-6) -> np.ndarray:
+    """Run Lloyd-Max on a large 1D sample array.  Returns sorted centroids."""
+    k = 1 << bits
+    if init_centroids is None:
+        # Initialise from equi-quantile positions
+        init_centroids = np.array([
+            np.percentile(samples, (i + 0.5) / k * 100.0) for i in range(k)
+        ], dtype=np.float64)
+    else:
+        init_centroids = np.sort(init_centroids.astype(np.float64))
+
+    centroids = init_centroids.copy()
+    samples_d = samples.astype(np.float64)
+
+    for it in range(max_iter):
+        # Assignment: each sample → nearest centroid
+        # Use boundaries (midpoints between sorted centroids) for efficient assignment.
+        centroids_sorted = np.sort(centroids)
+        boundaries = (centroids_sorted[:-1] + centroids_sorted[1:]) / 2.0
+        assignments = np.searchsorted(boundaries, samples_d)
+
+        # Update: each centroid = mean of its assigned samples.
+        new_centroids = np.zeros_like(centroids_sorted)
+        for i in range(k):
+            mask = assignments == i
+            if mask.any():
+                new_centroids[i] = samples_d[mask].mean()
+            else:
+                new_centroids[i] = centroids_sorted[i]  # unchanged
+
+        # Convergence check
+        delta = float(np.max(np.abs(new_centroids - centroids_sorted)))
+        centroids = new_centroids
+        if delta < tol:
+            print(f"    Lloyd-Max converged at iter {it+1}, max-delta = {delta:.2e}", flush=True)
+            break
+        if (it + 1) % 20 == 0:
+            print(f"    iter {it+1}: max-delta = {delta:.4e}", flush=True)
+
+    return np.sort(centroids).astype(np.float32)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def write_centroids(centroids: np.ndarray, path: Path) -> None:
+    """Write centroids as a little-endian f32 binary file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("wb") as f:
+        f.write(centroids.astype("<f4").tobytes())
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--stream", choices=["K", "V"], required=True,
+                    help="Which stream's residuals to calibrate against.")
+    ap.add_argument("--bit-width", type=int, required=True, choices=[1, 2, 3, 4])
+    ap.add_argument("--out-path", type=Path, required=True,
+                    help="Output .f32 centroids file.")
+    ap.add_argument("--q-precondition", type=Path, default=None,
+                    help="For K-stream calibration: apply Q-precond whitening "
+                         "before residual collection (matches runtime codec).")
+    ap.add_argument("--q-precond-skip-layers", type=int, nargs="+", default=[0])
+    ap.add_argument("--skip-layers", type=int, nargs="+", default=None,
+                    help="Layers to EXCLUDE from the calibration pool (should "
+                         "match runtime --q-precond-skip-layers for K).")
+    ap.add_argument("--n-passages", type=int, default=4)
+    ap.add_argument("--ctx-len", type=int, default=2048)
+    ap.add_argument("--block-size", type=int, default=1024)
+    ap.add_argument("--variance-ratio", type=float, default=1.0)
+    ap.add_argument("--rotation-seed", type=int, default=3405691582,
+                    help="Must match runtime --rotation-seed on the bench CLI "
+                         "(default 3405691582).")
+    ap.add_argument("--max-iter", type=int, default=200)
+    args = ap.parse_args()
+
+    samples = collect_residuals(
+        args.model_path, args.stream,
+        n_passages=args.n_passages, ctx_len=args.ctx_len,
+        block_size=args.block_size,
+        q_precond_path=str(args.q_precondition) if args.q_precondition else None,
+        skip_layers=args.skip_layers or args.q_precond_skip_layers,
+        rotation_seed=args.rotation_seed,
+        vr=args.variance_ratio,
+    )
+
+    # Subsample for iteration speed if huge
+    if samples.size > 5_000_000:
+        rng = np.random.default_rng(0)
+        idx = rng.choice(samples.size, size=5_000_000, replace=False)
+        samples = samples[idx]
+        print(f"  subsampled to {samples.size:,} for Lloyd-Max iteration", flush=True)
+
+    # Start from Gaussian defaults (matches the codec's baseline)
+    gaussian_centroids = {
+        1: [-0.798156, 0.798156],
+        2: [-1.5100, -0.4528, 0.4528, 1.5100],
+        3: [-2.151945, -1.343757, -0.756268, -0.244943,
+             0.244943, 0.756268, 1.343757, 2.151945],
+        4: [-2.7322, -2.0690, -1.6177, -1.2563, -0.9422, -0.6566, -0.3885, -0.1281,
+             0.1281, 0.3885, 0.6566, 0.9422, 1.2563, 1.6177, 2.0690, 2.7322],
+    }[args.bit_width]
+    init = np.array(gaussian_centroids, dtype=np.float64)
+
+    print(f"\nrunning Lloyd-Max at b={args.bit_width} (k={1<<args.bit_width} centroids)…", flush=True)
+    centroids = lloyd_max_iterate(samples, args.bit_width,
+                                   init_centroids=init, max_iter=args.max_iter)
+
+    # Compare MSE vs Gaussian default
+    sigma = float(samples.std())
+    rec_gaussian = np.array(gaussian_centroids, dtype=np.float64)[
+        np.argmin(np.abs(samples[:, None] - np.array(gaussian_centroids)[None, :]), axis=1)
+    ]
+    mse_gaussian = float(np.mean((samples - rec_gaussian) ** 2))
+    rec_cal = centroids[np.argmin(np.abs(samples[:, None] - centroids[None, :]), axis=1)]
+    mse_cal = float(np.mean((samples - rec_cal) ** 2))
+
+    print(f"\ncentroid comparison for {args.stream} stream:")
+    print(f"  Gaussian (default) : {gaussian_centroids}")
+    print(f"  Calibrated (fitted): {centroids.tolist()}")
+    print(f"  residual std       : {sigma:.4f}")
+    print(f"  MSE Gaussian       : {mse_gaussian:.4e}")
+    print(f"  MSE Calibrated     : {mse_cal:.4e}")
+    print(f"  MSE improvement    : {mse_gaussian / mse_cal:.2f}×"
+          if mse_cal > 0 else "  (degenerate)")
+
+    write_centroids(centroids, args.out_path)
+    print(f"\n[wrote] {args.out_path} ({args.out_path.stat().st_size} bytes)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/q_calibration.py b/benchmarks/q_calibration.py
new file mode 100644
index 00000000..75b50de6
--- /dev/null
+++ b/benchmarks/q_calibration.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""Q calibration for Q-preconditioned PCA.
+
+For every full-attention layer l and every kv-head h_k we compute
+
+    Sigma_q^{(l, h_k)} = sum_{h_q in group(h_k)}  E[ q^{(l, h_q)} (q^{(l, h_q)})^T ]
+
+where q is the *pre-RoPE* query (right after q_proj, no rotary).  That is
+what the codec's k_pre sees when attention is factored as
+
+    q_post^T k_post = q_pre^T R^T R k_pre = q_pre^T R(delta) k_pre
+
+and the coupling with k_pre under marginalised relative position is the
+pre-RoPE Gram matrix (to leading order — exact at zero position lag,
+orthogonally mixed at nonzero lags, which is why we collect q_pre over
+many positions and many prompts, not just one).
+
+We then Cholesky-factor each Sigma_q and store L (lower triangular).
+The downstream codec pipeline whitens K via `K_tilde = K @ L` before
+encode and unwhitens via `K_hat = K_hat_tilde @ L^{-T}` after decode.
+This is mathematically identical to using a Sigma_q-weighted distortion
+in PCA / K-means / Lloyd-Max, but requires zero Rust codec change.
+
+Math.  We want to minimise
+
+    sum_i  e_i^T  Sigma_q  e_i  = tr(E Sigma_q E^T)  = || E L ||_F^2
+                                                     = || K L - K_hat L ||_F^2
+
+Whiten before codec:       K_tilde     = K @ L
+Un-whiten after decode:    K_hat       = K_hat_tilde @ L^{-1}
+
+where L is the lower-triangular Cholesky factor of Sigma_q (L L^T = Sigma_q).
+
+Output file format (safetensors):
+
+    layer_<l>_chol     : [n_kv, D, D]  fp32  (lower triangular L)
+    layer_<l>_inv_chol : [n_kv, D, D]  fp32  (lower triangular L^{-1})
+    layer_<l>_sigma    : [n_kv, D, D]  fp32  (for diagnostics only)
+
+plus a `config.json` sidecar with shapes and axis meanings.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO))
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import benchmarks.pre_rope_cache as prc
+from benchmarks.e2e_ppl_pre_rope import load_wikitext_passages
+
+
+def _kv_group_ranges(num_q_heads: int, num_kv_heads: int):
+    """For each kv-head, return the list of query-head indices that share it."""
+    assert num_q_heads % num_kv_heads == 0, "non-evenly-divisible GQA unsupported"
+    group_size = num_q_heads // num_kv_heads
+    return [list(range(h * group_size, (h + 1) * group_size))
+            for h in range(num_kv_heads)]
+
+
+@torch.inference_mode()
+def calibrate(model_path: str, out_path: Path, *,
+              n_passages: int, ctx_len: int, prefill_chunk: int,
+              ridge: float):
+    print(f"loading {model_path}…", flush=True)
+    tok = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, dtype=torch.bfloat16, attn_implementation="eager"
+    )
+    model.eval()
+    info = prc.install(model)
+    print(f"  patched {info['patched_layers']} attention layers", flush=True)
+
+    cfg = model.config.get_text_config(decoder=True)
+    # Qwen3 and similar families set an explicit head_dim that differs
+    # from hidden_size // num_attention_heads. Honor the explicit value
+    # when present; fall back to the classical formula otherwise.
+    head_dim = getattr(cfg, "head_dim", None) or (cfg.hidden_size // cfg.num_attention_heads)
+    n_q = cfg.num_attention_heads
+    n_kv = cfg.num_key_value_heads
+    groups = _kv_group_ranges(n_q, n_kv)
+    n_layers = cfg.num_hidden_layers
+    layer_types = getattr(cfg, "layer_types", None) or (
+        ["full_attention"] * n_layers
+    )
+    print(f"  D={head_dim}, n_q={n_q}, n_kv={n_kv}, groups per kv = {len(groups[0])}")
+
+    passages = load_wikitext_passages(tok, ctx_len, n_passages)
+    print(f"  using {len(passages)} WikiText passages of >= {ctx_len} tokens", flush=True)
+
+    # Accumulate gram sums, not per-passage tensors — cheap and bounded memory.
+    # gram[l][h_kv]  shape [D, D]   sum_{all tokens in group} q q^T
+    gram = [[np.zeros((head_dim, head_dim), dtype=np.float64)
+             for _ in range(n_kv)]
+            for _ in range(n_layers)]
+    count = [0 for _ in range(n_layers)]
+
+    for i, passage in enumerate(passages):
+        ids = tok(passage, return_tensors="pt")["input_ids"][:, :ctx_len]
+        if ids.shape[-1] < ctx_len:
+            print(f"  passage {i+1}: SKIP (too short: {ids.shape[-1]})")
+            continue
+
+        # Reset recorder for this passage so we can consume incrementally
+        # without letting memory pile up at long context.
+        cfg._q_recorder = {}
+
+        # Chunked prefill; recorder accumulates q_pre per chunk.
+        if prefill_chunk <= 0 or ids.shape[-1] <= prefill_chunk:
+            _ = model(input_ids=ids, use_cache=False)
+        else:
+            # use_cache=False means no kv cache is kept — we only want Q stats.
+            # BUT attention still needs the full context so calibrate on full
+            # input in one shot (bf16 CPU is fine at ctx=1024-2048).
+            _ = model(input_ids=ids, use_cache=False)
+
+        # Consume recorder: accumulate into grams, then drop refs.
+        for l_idx, q_list in cfg._q_recorder.items():
+            if layer_types[l_idx] != "full_attention":
+                continue
+            # q_list entries: [bsz=1, n_q, seq, D]
+            for q in q_list:
+                q = q[0]  # [n_q, seq, D]
+                for h_kv in range(n_kv):
+                    # Sum over all q heads in this kv group
+                    group = groups[h_kv]
+                    q_group = q[group]  # [g, seq, D]
+                    q_flat = q_group.reshape(-1, head_dim).numpy()  # [g*seq, D]
+                    gram[l_idx][h_kv] += q_flat.T @ q_flat
+                count[l_idx] += q.shape[1]  # seq, same for every layer
+        cfg._q_recorder = None
+        print(f"  passage {i+1}/{len(passages)}: processed", flush=True)
+
+    # Finalize: Sigma = gram / (group_size * total_tokens_seen)
+    # then L = chol(Sigma + ridge*I),  L^{-T}
+    print(f"\nfactoring Sigma_q for {n_layers} layers ×  {n_kv} kv-heads…",
+          flush=True)
+    out_tensors = {}
+    diagnostics = []
+    for l in range(n_layers):
+        if layer_types[l] != "full_attention":
+            continue
+        if count[l] == 0:
+            continue
+        chol_stack = np.zeros((n_kv, head_dim, head_dim), dtype=np.float32)
+        inv_chol_stack = np.zeros_like(chol_stack)
+        sigma_stack = np.zeros_like(chol_stack)
+        group_size = len(groups[0])
+        n_tokens_per_head = group_size * count[l]
+        for h_kv in range(n_kv):
+            sigma = gram[l][h_kv] / n_tokens_per_head
+            # Symmetrize against fp drift
+            sigma = 0.5 * (sigma + sigma.T)
+            # Ridge for numerical stability — ridge * mean_diag
+            mean_diag = float(np.mean(np.diag(sigma)))
+            sigma_reg = sigma + ridge * mean_diag * np.eye(head_dim)
+            L = np.linalg.cholesky(sigma_reg)       # lower triangular, L L^T = Sigma
+            # Inverse of L (also lower triangular) for the unwhitening post-decode step.
+            L_inv = np.linalg.solve(L, np.eye(head_dim))
+            chol_stack[h_kv]     = L.astype(np.float32)
+            inv_chol_stack[h_kv] = L_inv.astype(np.float32)
+            sigma_stack[h_kv]    = sigma.astype(np.float32)
+
+            evals = np.linalg.eigvalsh(sigma_reg)
+            diagnostics.append({
+                "layer": l, "kv_head": h_kv,
+                "sigma_trace": float(np.trace(sigma)),
+                "eig_min":     float(evals.min()),
+                "eig_max":     float(evals.max()),
+                "condition":   float(evals.max() / max(evals.min(), 1e-30)),
+                "diag_mean":   mean_diag,
+                "off_diag_max_abs": float(np.abs(sigma - np.diag(np.diag(sigma))).max()),
+            })
+        out_tensors[f"layer_{l}_chol"]     = torch.from_numpy(chol_stack)
+        out_tensors[f"layer_{l}_inv_chol"] = torch.from_numpy(inv_chol_stack)
+        out_tensors[f"layer_{l}_sigma"]    = torch.from_numpy(sigma_stack)
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    from safetensors.torch import save_file
+    save_file(out_tensors, str(out_path))
+
+    cfg_sidecar = out_path.with_suffix(".json")
+    cfg_sidecar.write_text(json.dumps({
+        "model_path": model_path,
+        "head_dim": head_dim,
+        "num_q_heads": n_q,
+        "num_kv_heads": n_kv,
+        "num_layers": n_layers,
+        "layer_types": layer_types,
+        "n_passages_used": sum(1 for c in count if c > 0),
+        "ctx_len": ctx_len,
+        "ridge": ridge,
+        "diagnostics": diagnostics,
+    }, indent=2))
+    print(f"wrote {out_path} + {cfg_sidecar}", flush=True)
+
+    # Summary stats: is Sigma_q anisotropic, or close to isotropic?
+    conds = [d["condition"] for d in diagnostics]
+    off_ratios = [
+        d["off_diag_max_abs"] / max(d["diag_mean"], 1e-30)
+        for d in diagnostics
+    ]
+    print(f"\nSigma_q anisotropy summary (across all (layer, kv_head) pairs):")
+    print(f"  condition number:         min={min(conds):.2f}  "
+          f"median={np.median(conds):.2f}  max={max(conds):.2f}")
+    print(f"  max(|off-diag|)/mean_diag: min={min(off_ratios):.3f}  "
+          f"median={np.median(off_ratios):.3f}  max={max(off_ratios):.3f}")
+    print("  (condition ≫ 1 or off/diag ≫ 0 ⇒ Sigma_q is anisotropic, "
+          "so Q-precondition has something to do)")
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model-path", required=True)
+    ap.add_argument("--out-path", type=Path, required=True,
+                    help=".safetensors file path for the calibration")
+    ap.add_argument("--n-passages", type=int, default=8)
+    ap.add_argument("--ctx-len", type=int, default=2048)
+    ap.add_argument("--prefill-chunk", type=int, default=0)
+    ap.add_argument("--ridge", type=float, default=1e-3,
+                    help="Cholesky ridge = ridge * mean_diag(Sigma) for numerical stability")
+    args = ap.parse_args()
+    calibrate(
+        args.model_path, args.out_path,
+        n_passages=args.n_passages, ctx_len=args.ctx_len,
+        prefill_chunk=args.prefill_chunk, ridge=args.ridge,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/q_precondition.py b/benchmarks/q_precondition.py
new file mode 100644
index 00000000..c4da15e7
--- /dev/null
+++ b/benchmarks/q_precondition.py
@@ -0,0 +1,141 @@
+"""Q-preconditioning utility for the K-stream codec.
+
+This module owns the math
+
+    K_tilde = K @ L                (whiten)
+    K_hat   = K_hat_tilde @ L^{-1} (un-whiten)
+
+where L is the lower-triangular Cholesky factor of the per-(layer, kv_head)
+query Gram matrix Sigma_q produced by `benchmarks/q_calibration.py`.
+
+Minimising standard MSE on K_tilde is mathematically equivalent to
+minimising the Sigma_q-weighted distortion on K, which is exactly the
+"InnerProduct on K" failure metric that v1.3 paper §2 claims but the
+current codec (per-coordinate MSE on K) does not actually enforce.
+
+The Rust codec is not touched.  Whitening is done in Python, right
+before the tensor is serialised to the KKTV format and sent into the
+bench binary.  Unwhitening is done right after the decoded tensor is
+read back.
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+
+
+class QPrecond:
+    """Per-layer, per-kv-head Cholesky factors L and L^{-1} held in fp32 cpu.
+
+    Shapes:
+        chol[l]     : [n_kv, D, D]  lower triangular
+        inv_chol[l] : [n_kv, D, D]  lower triangular (L^{-1})
+
+    Layers not present in the dict (e.g. skipped via `skip_layers`) use
+    an identity transform (no-op), so the codec falls back to its plain
+    Euclidean behaviour on those layers.
+    """
+
+    def __init__(self, path: str | Path, skip_layers: list[int] | None = None):
+        path = Path(path)
+        cfg = json.loads(path.with_suffix(".json").read_text())
+        self.head_dim = cfg["head_dim"]
+        self.n_kv = cfg["num_kv_heads"]
+        self.n_layers = cfg["num_layers"]
+        self.layer_types = cfg["layer_types"]
+        self.skip_layers = set(skip_layers or [])
+        from safetensors.torch import load_file
+        tensors = load_file(str(path))
+        self.chol: dict[int, np.ndarray] = {}
+        self.inv_chol: dict[int, np.ndarray] = {}
+        for l in range(self.n_layers):
+            if l in self.skip_layers:
+                continue
+            k_chol = f"layer_{l}_chol"
+            k_inv = f"layer_{l}_inv_chol"
+            if k_chol in tensors:
+                self.chol[l]     = tensors[k_chol].numpy().astype(np.float32)
+                self.inv_chol[l] = tensors[k_inv].numpy().astype(np.float32)
+
+    @property
+    def n_calibrated_layers(self) -> int:
+        return len(self.chol)
+
+    def is_active(self, layer: int) -> bool:
+        """Layer has a calibrated Cholesky; whiten/unwhiten are non-trivial."""
+        return layer in self.chol
+
+    def whiten(self, k_per_head: np.ndarray, layer: int) -> np.ndarray:
+        """Input: K with shape [seq, n_kv, D] (pre-RoPE, one passage, one layer).
+        Output: same shape, whitened per kv-head. No-op for layers not
+        in the calibration (e.g. layer 0 when skip_layers=[0])."""
+        assert k_per_head.ndim == 3
+        assert k_per_head.shape[1] == self.n_kv
+        assert k_per_head.shape[2] == self.head_dim
+        if layer not in self.chol:
+            return k_per_head.astype(np.float32, copy=False)
+        L = self.chol[layer]                # [n_kv, D, D]
+        # K[t, h, :] @ L[h, :, :]  → K_tilde[t, h, :]
+        return np.einsum("thj,hjk->thk", k_per_head, L, optimize=True).astype(
+            np.float32, copy=False
+        )
+
+    def unwhiten(self, k_tilde_per_head: np.ndarray, layer: int) -> np.ndarray:
+        """Inverse of `whiten`.  Applies L^{-1} on the right per kv-head.
+        No-op for layers not in the calibration."""
+        assert k_tilde_per_head.ndim == 3
+        if layer not in self.inv_chol:
+            return k_tilde_per_head.astype(np.float32, copy=False)
+        Linv = self.inv_chol[layer]
+        return np.einsum("thj,hjk->thk", k_tilde_per_head, Linv, optimize=True).astype(
+            np.float32, copy=False
+        )
+
+
+def sanity_check(qp: QPrecond) -> dict:
+    """Verify whiten ∘ unwhiten ≈ identity on random data for every layer.
+
+    Reports max absolute error and max relative error (per layer).  A
+    correctly-built QPrecond will have errors ≲ 1e-5 (fp32 round-off).
+    """
+    rng = np.random.default_rng(0)
+    results = []
+    for l, L in qp.chol.items():
+        x = rng.normal(size=(128, qp.n_kv, qp.head_dim)).astype(np.float32)
+        x_tilde = qp.whiten(x, l)
+        x_back = qp.unwhiten(x_tilde, l)
+        abs_err = float(np.abs(x - x_back).max())
+        rel_err = float(np.abs(x - x_back).max() / max(np.abs(x).max(), 1e-30))
+        results.append({"layer": l, "max_abs_err": abs_err, "max_rel_err": rel_err})
+    return {
+        "max_abs_err": max(r["max_abs_err"] for r in results),
+        "max_rel_err": max(r["max_rel_err"] for r in results),
+        "per_layer": results,
+    }
+
+
+def load(path: Optional[str | Path],
+         skip_layers: list[int] | None = None) -> Optional[QPrecond]:
+    if path is None:
+        return None
+    return QPrecond(path, skip_layers=skip_layers)
+
+
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--calib", required=True)
+    args = ap.parse_args()
+    qp = QPrecond(args.calib)
+    print(f"loaded {qp.n_calibrated_layers} layers, n_kv={qp.n_kv}, D={qp.head_dim}")
+    san = sanity_check(qp)
+    print(f"sanity: max_abs_err={san['max_abs_err']:.3e}  "
+          f"max_rel_err={san['max_rel_err']:.3e}")
+    # Also show anisotropy of L directly (not Sigma)
+    for l in sorted(qp.chol)[:3]:
+        L = qp.chol[l][0]  # kv-head 0
+        print(f"  layer {l} kv0 L diag[:5]: {np.diag(L)[:5]}")
diff --git a/benchmarks/run_v1_3_ppl_full_vllm.sh b/benchmarks/run_v1_3_ppl_full_vllm.sh
new file mode 100755
index 00000000..e45aad26
--- /dev/null
+++ b/benchmarks/run_v1_3_ppl_full_vllm.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Build kakeyaturbo-bench and run the v1.3 PPL production cell on vLLM:
+# DS-Distill D=128, K b=3 + V b=2, calibrated Lloyd-Max + outlier T=2.0
+# + 6-layer boundary skip + pre-RoPE Q-preconditioning.
+#
+# HF reference for this cell (SPRINT_CLOSEOUT): Δppl +7.82 %, top-1
+# 78.97 %, ratio 4.61× (MARGINAL). Measuring the same cell under
+# vLLM 0.7.3 / FLASH_ATTN to quantify the engine-level gap.
+#
+# Per-channel attribution (K-only / V-only) is controlled by
+# COMPRESS_STREAM ∈ {kv, k, v}. Default kv = full production cell.
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+MODEL_PATH="${MODEL_PATH:-deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B}"
+MODEL_NAME="${MODEL_NAME:-ds_distill_qwen_1_5b}"
+CTX_LEN="${CTX_LEN:-2048}"
+N_EVAL="${N_EVAL:-64}"
+BLOCK_SIZE="${BLOCK_SIZE:-512}"
+BIT_WIDTH_K="${BIT_WIDTH_K:-3}"
+BIT_WIDTH_V="${BIT_WIDTH_V:-2}"
+OUTLIER_THRESHOLD="${OUTLIER_THRESHOLD:-2.0}"
+V_OUTLIER_THRESHOLD="${V_OUTLIER_THRESHOLD:-}"
+N_PASSAGES="${N_PASSAGES:-4}"
+VR="${VARIANCE_RATIO:-0.95}"
+PCA_METHOD="${PCA_METHOD:-randomized}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.80}"
+OUT_DIR="${OUT_DIR:-reports/v1_3_ppl/vllm}"
+COMPRESS_STREAM="${COMPRESS_STREAM:-kv}"
+
+Q_CALIB="${Q_CALIB:-reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors}"
+K_CENTROIDS="${K_CENTROIDS:-reports/v1_4_q_pca/calibrated_codebook/ds_K_b${BIT_WIDTH_K}_centroids.f32}"
+V_CENTROIDS="${V_CENTROIDS:-reports/v1_4_q_pca/calibrated_codebook/ds_V_b${BIT_WIDTH_V}_centroids.f32}"
+
+# 6-layer boundary skip (DS-Distill has 28 layers).
+BOUNDARY_LAYERS="${BOUNDARY_LAYERS:-0 1 7 14 26 27}"
+
+echo "[build] kakeyaturbo-bench (release)"
+(cd kakeyaturbo && cargo build --release --bin kakeyaturbo-bench 1>&2)
+
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+if [ -x /venv/main/bin/python ] && [ "${PYTHON_BIN}" = "python3" ]; then
+    PYTHON_BIN=/venv/main/bin/python
+fi
+
+echo "[run] e2e_ppl_validation_vllm_full.py (using $PYTHON_BIN)"
+
+"$PYTHON_BIN" benchmarks/e2e_ppl_validation_vllm_full.py \
+    --model-path "$MODEL_PATH" \
+    --model-name "$MODEL_NAME" \
+    --ctx-len "$CTX_LEN" --n-eval "$N_EVAL" \
+    --block-size "$BLOCK_SIZE" \
+    --bit-width-k "$BIT_WIDTH_K" \
+    --bit-width-v "$BIT_WIDTH_V" \
+    --variance-ratio "$VR" \
+    --pca-method "$PCA_METHOD" \
+    --q-calib "$Q_CALIB" \
+    --k-centroids "$K_CENTROIDS" \
+    --v-centroids "$V_CENTROIDS" \
+    --outlier-threshold "$OUTLIER_THRESHOLD" \
+    ${V_OUTLIER_THRESHOLD:+--v-outlier-threshold "$V_OUTLIER_THRESHOLD"} \
+    --boundary-skip-layers $BOUNDARY_LAYERS \
+    --compress-stream "$COMPRESS_STREAM" \
+    --n-passages "$N_PASSAGES" \
+    --gpu-mem-util "$GPU_MEM_UTIL" \
+    --out-dir "$OUT_DIR"
diff --git a/benchmarks/run_v1_3_ppl_snapshot_vllm.sh b/benchmarks/run_v1_3_ppl_snapshot_vllm.sh
new file mode 100755
index 00000000..aefe95b7
--- /dev/null
+++ b/benchmarks/run_v1_3_ppl_snapshot_vllm.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Run the v1.3 PPL production cell on vLLM with HF's two-pass
+# snapshot semantics (Scenario A: compress the KV cache after a
+# clean prefill).
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+MODEL_PATH="${MODEL_PATH:-deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B}"
+MODEL_NAME="${MODEL_NAME:-ds_distill_qwen_1_5b_snapshot}"
+CTX_LEN="${CTX_LEN:-2048}"
+N_EVAL="${N_EVAL:-64}"
+N_PASSAGES="${N_PASSAGES:-4}"
+BLOCK_SIZE="${BLOCK_SIZE:-512}"
+BIT_WIDTH_K="${BIT_WIDTH_K:-3}"
+BIT_WIDTH_V="${BIT_WIDTH_V:-2}"
+OUTLIER_THRESHOLD="${OUTLIER_THRESHOLD:-2.0}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.40}"
+OUT_DIR="${OUT_DIR:-reports/v1_3_ppl/snapshot_mode}"
+
+Q_CALIB="${Q_CALIB:-reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors}"
+K_CENTROIDS="${K_CENTROIDS:-reports/v1_4_q_pca/calibrated_codebook/ds_K_b${BIT_WIDTH_K}_centroids.f32}"
+V_CENTROIDS="${V_CENTROIDS:-reports/v1_4_q_pca/calibrated_codebook/ds_V_b${BIT_WIDTH_V}_centroids.f32}"
+BOUNDARY_LAYERS="${BOUNDARY_LAYERS:-0 1 7 14 26 27}"
+
+echo "[build] kakeyaturbo-bench (release)"
+(cd kakeyaturbo && cargo build --release --bin kakeyaturbo-bench 1>&2)
+
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+if [ -x /venv/main/bin/python ] && [ "${PYTHON_BIN}" = "python3" ]; then
+    PYTHON_BIN=/venv/main/bin/python
+fi
+
+echo "[run] e2e_ppl_validation_vllm_snapshot.py"
+"$PYTHON_BIN" benchmarks/e2e_ppl_validation_vllm_snapshot.py \
+    --model-path "$MODEL_PATH" \
+    --model-name "$MODEL_NAME" \
+    --ctx-len "$CTX_LEN" --n-eval "$N_EVAL" \
+    --block-size "$BLOCK_SIZE" \
+    --bit-width-k "$BIT_WIDTH_K" \
+    --bit-width-v "$BIT_WIDTH_V" \
+    --q-calib "$Q_CALIB" \
+    --k-centroids "$K_CENTROIDS" \
+    --v-centroids "$V_CENTROIDS" \
+    --outlier-threshold "$OUTLIER_THRESHOLD" \
+    --boundary-skip-layers $BOUNDARY_LAYERS \
+    --n-passages "$N_PASSAGES" \
+    --gpu-mem-util "$GPU_MEM_UTIL" \
+    --out-dir "$OUT_DIR"
diff --git a/benchmarks/run_v1_3_ppl_vllm.sh b/benchmarks/run_v1_3_ppl_vllm.sh
new file mode 100755
index 00000000..ecf7745d
--- /dev/null
+++ b/benchmarks/run_v1_3_ppl_vllm.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Build the v1.3 Rust codec and run the vLLM-based PPL harness on
+# Qwen2.5-0.5B with the smoke-sized config used by the HF harness so
+# the two numbers can be compared directly.
+#
+# Requires: cargo in PATH; a Python environment with vllm, datasets,
+# transformers, torch (CUDA). See benchmarks/README or the PR body.
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+
+MODEL_PATH="${MODEL_PATH:-Qwen/Qwen2.5-0.5B}"
+MODEL_NAME="${MODEL_NAME:-qwen2_5_0_5b}"
+CTX_LEN="${CTX_LEN:-1024}"
+N_EVAL="${N_EVAL:-64}"
+BLOCK_SIZE="${BLOCK_SIZE:-512}"
+BIT_WIDTH="${BIT_WIDTH:-2}"
+N_PASSAGES="${N_PASSAGES:-2}"
+VR="${VARIANCE_RATIO:-0.95}"
+PCA_METHOD="${PCA_METHOD:-randomized}"
+OUT_DIR="${OUT_DIR:-reports/v1_3_rsvd_rope/e2e_ppl_vllm_smoke}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.80}"
+
+echo "[build] kakeyaturbo-bench (release)"
+(cd kakeyaturbo && cargo build --release --bin kakeyaturbo-bench 1>&2)
+
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+if [ -x /venv/main/bin/python ] && [ "${PYTHON_BIN}" = "python3" ]; then
+    PYTHON_BIN=/venv/main/bin/python
+fi
+
+echo "[run] e2e_ppl_validation_vllm.py (using $PYTHON_BIN)"
+"$PYTHON_BIN" benchmarks/e2e_ppl_validation_vllm.py \
+  --model-path "$MODEL_PATH" \
+  --model-name "$MODEL_NAME" \
+  --ctx-len "$CTX_LEN" \
+  --n-eval "$N_EVAL" \
+  --block-size "$BLOCK_SIZE" \
+  --bit-width "$BIT_WIDTH" \
+  --variance-ratio "$VR" \
+  --pca-method "$PCA_METHOD" \
+  --n-passages "$N_PASSAGES" \
+  --gpu-mem-util "$GPU_MEM_UTIL" \
+  --out-dir "$OUT_DIR"
diff --git a/kakeyaturbo/src/bin/kakeyaturbo-bench.rs b/kakeyaturbo/src/bin/kakeyaturbo-bench.rs
index e8fb95b2..9d6f5673 100644
--- a/kakeyaturbo/src/bin/kakeyaturbo-bench.rs
+++ b/kakeyaturbo/src/bin/kakeyaturbo-bench.rs
@@ -25,8 +25,9 @@ use std::path::PathBuf;
 use std::time::Instant;
 
 use kakeyaturbo::{
-    decode_block, decode_layer, encode_block, encode_layer, CodecParams, Code, Distortion,
-    InnerProduct, LInf, LayerEncoding, MSE,
+    decode_block_with_centroids, decode_layer_with_centroids, encode_block, encode_layer,
+    CodecParams, Code, Distortion,
+    InnerProduct, LInf, LayerEncoding, PcaMethod, SkeletonDtype, MSE,
 };
 
 const MAGIC: u32 = 0x4B4B_5456;
@@ -44,6 +45,31 @@ struct Args {
     rotation_seed: u32,
     verify: bool,
     share_basis: bool,
+    /// One of "exact" or "randomized".
+    pca_method: String,
+    /// One of "fp16" or "fp32" — skeleton (PCA mean+basis, K-means centres) storage precision.
+    skeleton_dtype: String,
+    /// If set, hard-cap d_eff at this value in the exact PCA path
+    /// (None = unlimited, controlled by variance_ratio only).
+    exact_rank_cap: Option<usize>,
+    /// Randomized-SVD knobs. Ignored when pca_method == "exact".
+    rsvd_target_rank: Option<usize>,
+    rsvd_oversample: usize,
+    rsvd_power_iters: u32,
+    /// If set, dump the decoded (round-tripped) KV tensor to this path in
+    /// KKTV format so downstream Python drivers can measure end-to-end
+    /// downstream quality (next-token KL, PPL) against the original.
+    dump_decoded: Option<PathBuf>,
+    /// If set, path to a .f32 binary file containing the calibrated
+    /// Lloyd-Max centroid table (exactly `1 << bit_width` f32 values,
+    /// little-endian, sorted).  When present, replaces the codec's
+    /// unit-variance-Gaussian defaults on both encode and decode.
+    centroids_file: Option<PathBuf>,
+    /// If set, outlier compensation threshold T.  Coordinates with
+    /// |scaled_residual| > T are stored exact (as u16 index + f16 value)
+    /// and override Lloyd-Max dequantization at decode.  Typical T=2.0
+    /// gives ~1-5 % outlier rate on Gaussian-like residuals.
+    outlier_threshold: Option<f32>,
 }
 
 fn print_help() {
@@ -52,9 +78,17 @@ fn print_help() {
             [--metric mse|inner_product|linf] \\\n    \
             [--block-size N] [--variance-ratio R] \\\n    \
             [--k K] [--bit-width B] [--rotation-seed S] \\\n    \
-            [--verify]\n\n\
+            [--share-basis] [--verify] \\\n    \
+            [--pca-method exact|randomized] \\\n    \
+            [--skeleton-dtype fp16|fp32] \\\n    \
+            [--exact-rank-cap N] \\\n    \
+            [--rsvd-target-rank N] [--rsvd-oversample N] [--rsvd-power-iters N] \\\n    \
+            [--dump-decoded PATH]\n\n\
             Compresses a KV tensor file block-by-block using the\n\
-            kakeyaturbo codec and writes a JSON report.\n"
+            kakeyaturbo codec and writes a JSON report. With\n\
+            --dump-decoded PATH (and --verify), also writes the\n\
+            round-tripped tensor in KKTV format for downstream\n\
+            e2e quality measurement.\n"
     );
 }
 
@@ -70,6 +104,15 @@ fn parse_args() -> Result<Args, String> {
     let mut rotation_seed: u32 = 0xCAFE_BABE;
     let mut verify = false;
     let mut share_basis = false;
+    let mut pca_method = "exact".to_string();
+    let mut skeleton_dtype = "fp16".to_string();
+    let mut exact_rank_cap: Option<usize> = None;
+    let mut rsvd_target_rank: Option<usize> = None;
+    let mut rsvd_oversample: usize = 8;
+    let mut rsvd_power_iters: u32 = 2;
+    let mut centroids_file: Option<PathBuf> = None;
+    let mut outlier_threshold: Option<f32> = None;
+    let mut dump_decoded: Option<PathBuf> = None;
 
     let mut i = 1;
     while i < argv.len() {
@@ -116,6 +159,46 @@ fn parse_args() -> Result<Args, String> {
             }
             "--verify" => verify = true,
             "--share-basis" => share_basis = true,
+            "--pca-method" => {
+                i += 1;
+                pca_method = argv[i].clone();
+            }
+            "--skeleton-dtype" => {
+                i += 1;
+                skeleton_dtype = argv[i].clone();
+            }
+            "--exact-rank-cap" => {
+                i += 1;
+                exact_rank_cap = Some(
+                    argv[i].parse().map_err(|e| format!("bad --exact-rank-cap: {e}"))?,
+                );
+            }
+            "--rsvd-target-rank" => {
+                i += 1;
+                rsvd_target_rank = Some(argv[i].parse().map_err(|e| format!("bad --rsvd-target-rank: {e}"))?);
+            }
+            "--rsvd-oversample" => {
+                i += 1;
+                rsvd_oversample = argv[i].parse().map_err(|e| format!("bad --rsvd-oversample: {e}"))?;
+            }
+            "--rsvd-power-iters" => {
+                i += 1;
+                rsvd_power_iters = argv[i].parse().map_err(|e| format!("bad --rsvd-power-iters: {e}"))?;
+            }
+            "--dump-decoded" => {
+                i += 1;
+                dump_decoded = Some(PathBuf::from(&argv[i]));
+            }
+            "--centroids-file" => {
+                i += 1;
+                centroids_file = Some(PathBuf::from(&argv[i]));
+            }
+            "--outlier-threshold" => {
+                i += 1;
+                outlier_threshold = Some(
+                    argv[i].parse().map_err(|e| format!("bad --outlier-threshold: {e}"))?,
+                );
+            }
             other => return Err(format!("unknown flag {other}; try --help")),
         }
         i += 1;
@@ -134,9 +217,43 @@ fn parse_args() -> Result<Args, String> {
         rotation_seed,
         verify,
         share_basis,
+        pca_method,
+        skeleton_dtype,
+        exact_rank_cap,
+        rsvd_target_rank,
+        rsvd_oversample,
+        rsvd_power_iters,
+        dump_decoded,
+        centroids_file,
+        outlier_threshold,
     })
 }
 
+fn load_centroids(path: &PathBuf, expected_count: usize) -> Result<Vec<f32>, String> {
+    let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?;
+    if bytes.len() != expected_count * 4 {
+        return Err(format!(
+            "centroids file {} has {} bytes, expected {} (= {} × 4)",
+            path.display(), bytes.len(), expected_count * 4, expected_count
+        ));
+    }
+    let mut out = Vec::with_capacity(expected_count);
+    for chunk in bytes.chunks_exact(4) {
+        let arr = [chunk[0], chunk[1], chunk[2], chunk[3]];
+        out.push(f32::from_le_bytes(arr));
+    }
+    // Validate sorted ascending
+    for w in out.windows(2) {
+        if w[0] >= w[1] {
+            return Err(format!(
+                "centroids must be sorted ascending; {} violates at {} >= {}",
+                path.display(), w[0], w[1]
+            ));
+        }
+    }
+    Ok(out)
+}
+
 fn read_u32_le(r: &mut impl Read) -> std::io::Result<u32> {
     let mut buf = [0u8; 4];
     r.read_exact(&mut buf)?;
@@ -181,12 +298,41 @@ fn read_tensor(path: &PathBuf) -> Result<(Vec<f32>, usize, usize), String> {
 }
 
 fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) -> Report {
+    let pca_method = match args.pca_method.as_str() {
+        "exact" => PcaMethod::Exact,
+        "randomized" => PcaMethod::Randomized {
+            target_rank: args.rsvd_target_rank.unwrap_or((dim / 2).max(8)),
+            oversample: args.rsvd_oversample,
+            power_iters: args.rsvd_power_iters,
+            seed_offset: 0x9E37_79B9_7F4A_7C15,
+        },
+        other => panic!("unknown --pca-method {other}, expected 'exact' or 'randomized'"),
+    };
+    let skeleton_dtype = match args.skeleton_dtype.as_str() {
+        "fp16" | "f16" | "half" => SkeletonDtype::Fp16,
+        "fp32" | "f32" | "float" => SkeletonDtype::Fp32,
+        other => panic!("unknown --skeleton-dtype {other}, expected 'fp16' or 'fp32'"),
+    };
+    let custom_centroids = if let Some(path) = &args.centroids_file {
+        let expected = 1usize << args.bit_width;
+        let c = load_centroids(path, expected)
+            .unwrap_or_else(|e| panic!("loading centroids: {e}"));
+        eprintln!("[bench] loaded {} calibrated centroids from {}", c.len(), path.display());
+        Some(c)
+    } else {
+        None
+    };
     let params = CodecParams {
         variance_ratio: args.variance_ratio,
         k: args.k,
         bit_width: args.bit_width,
         rotation_seed: args.rotation_seed,
         kmeans_max_iter: 32,
+        pca_method,
+        skeleton_dtype,
+        exact_rank_cap: args.exact_rank_cap,
+        custom_centroids,
+        outlier_threshold: args.outlier_threshold,
     };
 
     let bs = args.block_size;
@@ -197,6 +343,14 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
     let mut total_mse_count = 0usize;
     let mut encode_ns: u128 = 0;
     let mut decode_ns: u128 = 0;
+    // If --dump-decoded is set (and verify is on), accumulate the decoded
+    // tensor here and write it at the end.
+    let want_decoded = args.dump_decoded.is_some() && args.verify;
+    let mut decoded_full: Vec<f32> = if want_decoded {
+        Vec::with_capacity(n_full * bs * dim)
+    } else {
+        Vec::new()
+    };
 
     let (total_skeleton, total_codes, total_blocks, total_vecs_encoded, shared_pca_bytes) = if args.share_basis {
         // v1.2 B' path: fit one basis over all blocks, K-means per-block.
@@ -213,7 +367,7 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
 
         if args.verify {
             let t1 = Instant::now();
-            let recs = decode_layer::<R>(&enc);
+            let recs = decode_layer_with_centroids::<R>(&enc, params.custom_centroids.as_deref());
             decode_ns += t1.elapsed().as_nanos();
             for (i, rec) in recs.iter().enumerate() {
                 let orig = &block_vecs[i];
@@ -224,6 +378,9 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
                 }
                 total_mse_sum += sq / (bs * dim) as f64;
                 total_mse_count += 1;
+                if want_decoded {
+                    decoded_full.extend_from_slice(rec);
+                }
             }
         }
 
@@ -253,7 +410,7 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
 
             if args.verify {
                 let t1 = Instant::now();
-                let rec = decode_block::<R>(&sk, &codes);
+                let rec = decode_block_with_centroids::<R>(&sk, &codes, params.custom_centroids.as_deref());
                 decode_ns += t1.elapsed().as_nanos();
                 let mut sq = 0.0_f64;
                 for i in 0..bs * dim {
@@ -262,11 +419,32 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
                 }
                 total_mse_sum += sq / (bs * dim) as f64;
                 total_mse_count += 1;
+                if want_decoded {
+                    decoded_full.extend_from_slice(&rec);
+                }
             }
         }
         (total_skeleton, total_codes, total_blocks, total_vecs_encoded, 0)
     };
 
+    // Write decoded tensor to disk if requested.
+    if want_decoded {
+        let path = args.dump_decoded.as_ref().expect("dump_decoded path");
+        let f = std::fs::File::create(path).expect("create decoded output");
+        let mut w = std::io::BufWriter::new(f);
+        use std::io::Write;
+        let magic: u32 = 0x4B4B_5456;
+        w.write_all(&magic.to_le_bytes()).expect("write magic");
+        w.write_all(&1u32.to_le_bytes()).expect("write version");
+        w.write_all(&(total_vecs_encoded as u64).to_le_bytes()).expect("write n_vecs");
+        w.write_all(&(dim as u32).to_le_bytes()).expect("write dim");
+        w.write_all(&0u32.to_le_bytes()).expect("write pad");
+        for v in &decoded_full {
+            w.write_all(&v.to_le_bytes()).expect("write f32");
+        }
+        w.flush().expect("flush decoded");
+    }
+
     let baseline_bytes = total_vecs_encoded * dim * std::mem::size_of::<f32>();
     let baseline_bytes_bf16 = total_vecs_encoded * dim * 2;
     let compressed_bytes = total_skeleton + total_codes;
@@ -297,6 +475,20 @@ fn run<R: Distortion>(args: &Args, data: &[f32], num_vecs: usize, dim: usize) ->
         },
         share_basis: args.share_basis,
         shared_pca_bytes,
+        pca_method: args.pca_method.clone(),
+        skeleton_dtype: args.skeleton_dtype.clone(),
+        rsvd_target_rank: match pca_method {
+            PcaMethod::Randomized { target_rank, .. } => target_rank,
+            PcaMethod::Exact => 0,
+        },
+        rsvd_oversample: match pca_method {
+            PcaMethod::Randomized { oversample, .. } => oversample,
+            PcaMethod::Exact => 0,
+        },
+        rsvd_power_iters: match pca_method {
+            PcaMethod::Randomized { power_iters, .. } => power_iters,
+            PcaMethod::Exact => 0,
+        },
     }
 }
 
@@ -322,6 +514,11 @@ struct Report {
     mean_block_mse: f64,
     share_basis: bool,
     shared_pca_bytes: usize,
+    pca_method: String,
+    skeleton_dtype: String,
+    rsvd_target_rank: usize,
+    rsvd_oversample: usize,
+    rsvd_power_iters: u32,
 }
 
 impl Report {
@@ -348,7 +545,12 @@ impl Report {
                 \"verify\":{verify},\
                 \"mean_block_mse\":{mse:.10},\
                 \"share_basis\":{sb},\
-                \"shared_pca_bytes\":{spb}\
+                \"shared_pca_bytes\":{spb},\
+                \"pca_method\":\"{pm}\",\
+                \"skeleton_dtype\":\"{sd}\",\
+                \"rsvd_target_rank\":{rtr},\
+                \"rsvd_oversample\":{ros},\
+                \"rsvd_power_iters\":{rpi}\
             }}",
             metric = self.metric,
             bs = self.block_size,
@@ -371,6 +573,11 @@ impl Report {
             mse = self.mean_block_mse,
             sb = self.share_basis,
             spb = self.shared_pca_bytes,
+            pm = self.pca_method,
+            sd = self.skeleton_dtype,
+            rtr = self.rsvd_target_rank,
+            ros = self.rsvd_oversample,
+            rpi = self.rsvd_power_iters,
         )
     }
 }
diff --git a/kakeyaturbo/src/codec.rs b/kakeyaturbo/src/codec.rs
index b3001af8..f018013f 100644
--- a/kakeyaturbo/src/codec.rs
+++ b/kakeyaturbo/src/codec.rs
@@ -14,9 +14,16 @@
 use half::f16;
 
 use crate::distortion::{Distortion, NormMode};
-use crate::kmeans::{assign_and_project, fit_spherical_kmeans, residual};
-use crate::pca::{fit_weighted_pca, project, unproject};
-use crate::quantize::{dequantize_vector, pack_bits, quantize_vector, unpack_bits};
+use crate::kmeans::{
+    assign_and_project, fit_spherical_kmeans_with_storage, residual,
+};
+use crate::pca::{
+    fit_weighted_pca_randomized_with_storage, fit_weighted_pca_with_storage_capped, project,
+    unproject, PcaFit, PcaStorage,
+};
+use crate::quantize::{
+    dequantize_vector_with_centroids, pack_bits, quantize_vector_with_centroids, unpack_bits,
+};
 use crate::skeleton::Skeleton;
 use crate::wht::{inverse_rotate, rotate};
 
@@ -26,6 +33,10 @@ use crate::wht::{inverse_rotate, rotate};
 /// - `seg_id`: K-means cluster id (`⌈log₂ K⌉` bits, stored as u32 to ease access)
 /// - `alpha, t, norm`: fp16 scalars
 /// - `residual`: packed `bit_width`-bit indices of length `wht_len`
+/// - `outliers`: optional sparse list of `(coord_index, f16 value)` pairs that
+///   override Lloyd-Max dequantization at those coordinates. Used to
+///   catch heavy-tail scaled residuals that lie outside Lloyd-Max's
+///   centroid coverage (see `CodecParams::outlier_threshold`).
 #[derive(Debug, Clone, PartialEq)]
 pub struct Code {
     /// K-means cluster index.
@@ -40,14 +51,81 @@ pub struct Code {
     pub norm: f16,
     /// Packed residual indices.
     pub residual_packed: Vec<u8>,
+    /// Sparse outlier list.  Each entry is `(coord_index_u16, exact_f16_value)`.
+    /// Empty if `CodecParams::outlier_threshold.is_none()`.
+    ///
+    /// The coord_index is into the scaled-residual vector of length
+    /// `wht_len`, i.e. into the WHT-rotated, per-vector-norm-scaled
+    /// residual just before Lloyd-Max quantization.  Decode re-applies
+    /// these values AFTER Lloyd-Max dequantization but BEFORE inverse
+    /// WHT and un-scaling.
+    pub outliers: Vec<(u16, f16)>,
 }
 
 impl Code {
     /// Total byte size of this code's payload.
     #[must_use]
     pub fn nbytes(&self) -> usize {
-        // seg_id(4) + 3×fp16(6) + packed bytes
-        4 + 3 * 2 + self.residual_packed.len()
+        // seg_id(4) + 3×fp16(6) + packed residual + outliers (2+2 bytes each)
+        4 + 3 * 2 + self.residual_packed.len() + self.outliers.len() * 4
+    }
+
+    /// Number of outlier entries (0 if outlier compensation is off).
+    #[must_use]
+    pub fn n_outliers(&self) -> usize {
+        self.outliers.len()
+    }
+}
+
+/// PCA fit strategy — selects between the exact eigendecomposition
+/// path (v1.2 default) and the randomized-SVD sketch (v1.3 cheap fit).
+///
+/// Both paths produce a bit-compatible [`PcaFit`] so the rest of the
+/// codec is completely oblivious to the choice.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PcaMethod {
+    /// Exact `SymmetricEigen` on the D×D weighted covariance.
+    /// Cost: O(n·D² + D³) per block. Numerically ideal.
+    Exact,
+    /// Halko–Martinsson–Tropp randomized SVD with the given knobs.
+    /// Cost: O(n·D·r) per block where `r = target_rank + oversample`.
+    Randomized {
+        /// Maximum `d_eff` produced by truncation (`D/2` is a safe default).
+        target_rank: usize,
+        /// Extra sketch dimensions beyond `target_rank` (5–10 standard).
+        oversample: usize,
+        /// Subspace-power iterations (2 is the typical sweet spot).
+        power_iters: u32,
+        /// XOR-ed into `CodecParams::rotation_seed` to derive the
+        /// Gaussian-test-matrix seed. Keeps all randomness reproducible.
+        seed_offset: u64,
+    },
+}
+
+impl Default for PcaMethod {
+    fn default() -> Self {
+        Self::Exact
+    }
+}
+
+/// Storage precision for the Kakeya skeleton (PCA mean, PCA basis, K-means
+/// centres).  The residual quantiser (Lloyd-Max) is **not** affected.
+///
+/// `Fp16` is the v1.2/v1.3 default and matches the paper's byte accounting.
+/// `Fp32` doubles skeleton bytes; used only for ablation of the "f16
+/// skeleton as structural PPL floor" hypothesis raised in the pre-RoPE
+/// cache ablation.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SkeletonDtype {
+    /// IEEE-754 binary16 (v1.2/v1.3 default).
+    Fp16,
+    /// IEEE-754 binary32 (doubles skeleton bytes; ablation-only).
+    Fp32,
+}
+
+impl Default for SkeletonDtype {
+    fn default() -> Self {
+        Self::Fp16
     }
 }
 
@@ -68,6 +146,37 @@ pub struct CodecParams {
     pub rotation_seed: u32,
     /// Maximum K-means iterations.
     pub kmeans_max_iter: u32,
+    /// PCA fit strategy (exact vs randomized SVD).
+    pub pca_method: PcaMethod,
+    /// Storage precision for PCA mean, PCA basis, and K-means centres.
+    /// Does not affect the residual Lloyd-Max quantiser.
+    pub skeleton_dtype: SkeletonDtype,
+    /// Optional hard upper bound on `d_eff` for the exact PCA path.
+    /// `None` (the default) leaves `d_eff` controlled by `variance_ratio`.
+    /// `Some(r)` clips `d_eff ≤ r` even if variance_ratio would keep more
+    /// components — useful to match RSVD's rank budget while using exact
+    /// (un-approximated) eigenvectors.
+    pub exact_rank_cap: Option<usize>,
+    /// Optional caller-supplied Lloyd-Max centroid table for the residual
+    /// quantiser. When `Some`, must contain exactly `1 << bit_width`
+    /// sorted floats — typically the output of offline empirical Lloyd-Max
+    /// calibration on the model's real residual distribution.  When
+    /// `None`, the codec uses the unit-variance-Gaussian centroids from
+    /// [`crate::quantize::centroids_gaussian`].
+    pub custom_centroids: Option<Vec<f32>>,
+    /// Optional scalar threshold for outlier compensation on the scaled
+    /// residual (post-WHT, per-vector-norm-scaled, pre-Lloyd-Max).
+    ///
+    /// When `Some(T)`: any coordinate whose absolute scaled value exceeds
+    /// `T` is stored verbatim in the code's `outliers` list (as f16),
+    /// and its Lloyd-Max index becomes irrelevant (decoded but overridden).
+    /// When `None`: no outlier compensation.
+    ///
+    /// Storage cost per outlier = 2 bytes index + 2 bytes f16 = 4 bytes.
+    /// At T=2.0 on Gaussian-like residuals, outlier rate is ~4.5 %.
+    /// Targets Gap 1 (K-means + WHT residuals are only near-Gaussian,
+    /// so Lloyd-Max's heavy-tail quantization error is disproportionate).
+    pub outlier_threshold: Option<f32>,
 }
 
 impl Default for CodecParams {
@@ -78,10 +187,80 @@ impl Default for CodecParams {
             bit_width: 3,
             rotation_seed: 0xCAFE_BABE,
             kmeans_max_iter: 32,
+            pca_method: PcaMethod::Exact,
+            skeleton_dtype: SkeletonDtype::Fp16,
+            exact_rank_cap: None,
+            custom_centroids: None,
+            outlier_threshold: None,
         }
     }
 }
 
+/// Convert the codec's skeleton-dtype flag to the PCA layer's storage flag.
+fn pca_storage(params: &CodecParams) -> PcaStorage {
+    match params.skeleton_dtype {
+        SkeletonDtype::Fp16 => PcaStorage::Fp16,
+        SkeletonDtype::Fp32 => PcaStorage::Fp32,
+    }
+}
+
+/// Dispatch helper: fit the requested PCA variant with the requested
+/// skeleton dtype.
+fn fit_pca_dispatch(
+    vectors: &[f32],
+    weights: &[f32],
+    d: usize,
+    params: &CodecParams,
+) -> PcaFit {
+    let storage = pca_storage(params);
+    match params.pca_method {
+        PcaMethod::Exact => fit_weighted_pca_with_storage_capped(
+            vectors,
+            weights,
+            d,
+            params.variance_ratio,
+            storage,
+            params.exact_rank_cap,
+        ),
+        PcaMethod::Randomized {
+            target_rank,
+            oversample,
+            power_iters,
+            seed_offset,
+        } => fit_weighted_pca_randomized_with_storage(
+            vectors,
+            weights,
+            d,
+            params.variance_ratio,
+            target_rank.min(d),
+            oversample,
+            power_iters,
+            u64::from(params.rotation_seed) ^ seed_offset,
+            storage,
+        ),
+    }
+}
+
+/// Dispatch helper for K-means: routes the codec's skeleton dtype into
+/// the K-means fp32-skeleton flag.
+fn fit_kmeans_dispatch(
+    coeffs: &[f32],
+    weights: &[f32],
+    d_eff: usize,
+    k: usize,
+    params: &CodecParams,
+) -> crate::kmeans::KmeansFit {
+    fit_spherical_kmeans_with_storage(
+        coeffs,
+        weights,
+        d_eff,
+        k,
+        params.rotation_seed,
+        params.kmeans_max_iter,
+        matches!(params.skeleton_dtype, SkeletonDtype::Fp32),
+    )
+}
+
 /// Round up to the nearest power of two, with a minimum of 1.
 fn next_pow2(n: usize) -> usize {
     if n <= 1 {
@@ -134,7 +313,7 @@ pub fn encode_block<R: Distortion>(
     assert!(params.k >= 1, "k must be ≥ 1");
 
     // --- Stage 1: Structure extraction ---
-    let pca = fit_weighted_pca(vectors, weights, d, params.variance_ratio);
+    let pca = fit_pca_dispatch(vectors, weights, d, params);
 
     // Project every vector into d_eff-space.
     let mut coeffs = Vec::with_capacity(n * pca.d_eff);
@@ -155,14 +334,7 @@ pub fn encode_block<R: Distortion>(
         .count();
     let effective_k = params.k.min(valid_rows.max(1));
 
-    let kmeans = fit_spherical_kmeans(
-        &coeffs,
-        weights,
-        pca.d_eff,
-        effective_k,
-        params.rotation_seed,
-        params.kmeans_max_iter,
-    );
+    let kmeans = fit_kmeans_dispatch(&coeffs, weights, pca.d_eff, effective_k, params);
 
     // --- Stage 2: Residual coding ---
     let wht_len = next_pow2(pca.d_eff);
@@ -181,18 +353,48 @@ pub fn encode_block<R: Distortion>(
         let rotated = rotate(&res_padded, params.rotation_seed);
 
         // Scale to approximately unit variance for the Lloyd-Max codebook.
-        // The WHT rotation preserves L2 norm up to sqrt(n), and the
-        // Gaussianisation argument assumes each coord ~ N(0, σ²/N_EFF).
-        // We divide by the empirical residual std to match the codebook.
+        // The `rotate` function implements an UNNORMALIZED Walsh-Hadamard
+        // transform, so for residual `res` of length d_eff (padded with
+        // zeros to wht_len), the rotated vector `rotated = H·D·res_padded`
+        // satisfies `‖rotated‖² = wht_len · ‖res‖²`, giving an average
+        // per-coordinate squared magnitude of `‖res‖²`. To match the
+        // Lloyd-Max codebook (calibrated for N(0, 1)) we therefore scale
+        // by `1 / ‖res‖`, so the result has unit per-coord variance.
+        //
+        // (Prior versions used `scale = √wht_len / ‖res‖`, which made the
+        // scaled values have per-coord variance `wht_len`, saturating the
+        // quantiser. Fixed in this revision.)
         let res_norm = l2_norm(&res);
         let scale = if res_norm > f32::EPSILON {
-            (wht_len as f32).sqrt() / res_norm
+            1.0 / res_norm
         } else {
             1.0
         };
         let scaled: Vec<f32> = rotated.iter().map(|v| v * scale).collect();
 
-        let q = quantize_vector::<R>(&scaled, params.bit_width);
+        // Extract outliers BEFORE quantizing so the Lloyd-Max indices for
+        // outlier coordinates don't matter (they're overridden at decode).
+        // We still quantize the full vector for simplicity — the outlier
+        // list is additive metadata.
+        let outliers: Vec<(u16, f16)> = if let Some(t) = params.outlier_threshold {
+            scaled
+                .iter()
+                .enumerate()
+                .filter_map(|(i, &v)| {
+                    if v.abs() > t {
+                        Some((i as u16, f16::from_f32(v)))
+                    } else {
+                        None
+                    }
+                })
+                .collect()
+        } else {
+            Vec::new()
+        };
+
+        let q = quantize_vector_with_centroids::<R>(
+            &scaled, params.bit_width, params.custom_centroids.as_deref(),
+        );
         let packed = pack_bits(&q, params.bit_width);
 
         let norm = match R::NORM_MODE {
@@ -206,6 +408,7 @@ pub fn encode_block<R: Distortion>(
             t: f16::from_f32(t),
             norm,
             residual_packed: packed,
+            outliers,
         });
     }
 
@@ -221,17 +424,43 @@ pub fn encode_block<R: Distortion>(
 
 /// Decode a block of codes back into approximate vectors.
 ///
+/// Uses the unit-variance-Gaussian Lloyd-Max centroids.  For calibrated
+/// codebooks use [`decode_block_with_centroids`].
+///
 /// # Output
 ///
 /// Row-major `[n, d]` where `n = codes.len()` and `d = skeleton.pca.mean.len()`.
 pub fn decode_block<R: Distortion>(skeleton: &Skeleton, codes: &[Code]) -> Vec<f32> {
-    let d = skeleton.pca.mean.len();
+    decode_block_with_centroids::<R>(skeleton, codes, None)
+}
+
+/// Variant of [`decode_block`] that accepts an optional caller-supplied
+/// centroid table, to be used in tandem with
+/// [`crate::quantize::quantize_vector_with_centroids`] on the encode side.
+pub fn decode_block_with_centroids<R: Distortion>(
+    skeleton: &Skeleton,
+    codes: &[Code],
+    custom_centroids: Option<&[f32]>,
+) -> Vec<f32> {
+    let d = skeleton.pca.d();
     let d_eff = skeleton.pca.d_eff;
     let wht_len = skeleton.wht_len;
     let mut out = Vec::with_capacity(codes.len() * d);
     for code in codes {
         let indices = unpack_bits(&code.residual_packed, skeleton.bit_width, wht_len);
-        let q_vals = dequantize_vector(&indices, skeleton.bit_width);
+        let mut q_vals = dequantize_vector_with_centroids(
+            &indices, skeleton.bit_width, custom_centroids,
+        );
+
+        // Outlier patch: override Lloyd-Max dequantized values at outlier
+        // coordinates with their exact f16 values. These are still in the
+        // SCALED residual space, so the override happens before inv_scale.
+        for &(idx, val) in &code.outliers {
+            let i = idx as usize;
+            if i < q_vals.len() {
+                q_vals[i] = val.to_f32();
+            }
+        }
 
         // Inverse scale: match what encode_block did.
         // We stored 1/scale in `norm` when NORM_MODE == Absorbed.
@@ -335,9 +564,9 @@ pub fn encode_layer<R: Distortion>(
     // that tells it "the PCA is already fit" — but since encode_block's
     // signature doesn't expose that, we inline the pipeline here. The
     // inner ops (kmeans, rotation, quantise) are identical.
-    use crate::kmeans::{assign_and_project, fit_spherical_kmeans, residual};
-    use crate::pca::{fit_weighted_pca_pooled, project};
-    use crate::quantize::{pack_bits, quantize_vector};
+    use crate::kmeans::{assign_and_project, residual};
+    use crate::pca::project;
+    use crate::quantize::{pack_bits, quantize_vector_with_centroids};
     use crate::wht::rotate;
     use half::f16;
 
@@ -348,7 +577,10 @@ pub fn encode_layer<R: Distortion>(
         pooled_vecs.extend_from_slice(b);
         pooled_weights.extend_from_slice(w);
     }
-    let shared_pca = fit_weighted_pca_pooled(&pooled_vecs, &pooled_weights, d, params.variance_ratio);
+    // Pooled PCA honours the same PcaMethod as per-block fits. For the
+    // randomized path the seed is derived from rotation_seed XOR a
+    // per-layer salt so different layers don't reuse identical sketches.
+    let shared_pca = fit_pca_dispatch(&pooled_vecs, &pooled_weights, d, params);
 
     // Pre-project every block's coefficients so we can run K-means in
     // coefficient space on each block.
@@ -373,10 +605,7 @@ pub fn encode_layer<R: Distortion>(
                     .any(|c| c.abs() > f32::EPSILON)
         }).count();
         let effective_k = params.k.min(valid_rows.max(1));
-        let kmeans = fit_spherical_kmeans(
-            &coeffs, w, shared_pca.d_eff, effective_k,
-            params.rotation_seed, params.kmeans_max_iter,
-        );
+        let kmeans = fit_kmeans_dispatch(&coeffs, w, shared_pca.d_eff, effective_k, params);
 
         let mut codes = Vec::with_capacity(n);
         for i in 0..n {
@@ -391,9 +620,26 @@ pub fn encode_layer<R: Distortion>(
             let res_padded = pad_zero(&res, wht_len);
             let rotated = rotate(&res_padded, params.rotation_seed);
             let res_norm = l2_norm(&res);
-            let scale = if res_norm > f32::EPSILON { (wht_len as f32).sqrt() / res_norm } else { 1.0 };
+            let scale = if res_norm > f32::EPSILON { 1.0 / res_norm } else { 1.0 };
             let scaled: Vec<f32> = rotated.iter().map(|v| v * scale).collect();
-            let q = quantize_vector::<R>(&scaled, params.bit_width);
+            let outliers: Vec<(u16, f16)> = if let Some(thr) = params.outlier_threshold {
+                scaled
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(idx, &v)| {
+                        if v.abs() > thr {
+                            Some((idx as u16, f16::from_f32(v)))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect()
+            } else {
+                Vec::new()
+            };
+            let q = quantize_vector_with_centroids::<R>(
+                &scaled, params.bit_width, params.custom_centroids.as_deref(),
+            );
             let packed = pack_bits(&q, params.bit_width);
             let norm = match R::NORM_MODE {
                 NormMode::Explicit => f16::from_f32(l2_norm(x)),
@@ -405,6 +651,7 @@ pub fn encode_layer<R: Distortion>(
                 t: f16::from_f32(t),
                 norm,
                 residual_packed: packed,
+                outliers,
             });
         }
 
@@ -426,9 +673,18 @@ pub fn encode_layer<R: Distortion>(
 
 /// Decode a whole layer. Dual of [`encode_layer`].
 pub fn decode_layer<R: Distortion>(enc: &LayerEncoding) -> Vec<Vec<f32>> {
+    decode_layer_with_centroids::<R>(enc, None)
+}
+
+/// Variant of [`decode_layer`] that accepts an optional caller-supplied
+/// centroid table — pass the same table used at encode time.
+pub fn decode_layer_with_centroids<R: Distortion>(
+    enc: &LayerEncoding,
+    custom_centroids: Option<&[f32]>,
+) -> Vec<Vec<f32>> {
     enc.per_block
         .iter()
-        .map(|(sk, codes)| decode_block::<R>(sk, codes))
+        .map(|(sk, codes)| decode_block_with_centroids::<R>(sk, codes, custom_centroids))
         .collect()
 }
 
@@ -501,6 +757,11 @@ mod tests {
             bit_width: 4,
             rotation_seed: 0xABCD,
             kmeans_max_iter: 32,
+            pca_method: PcaMethod::Exact,
+            skeleton_dtype: SkeletonDtype::Fp16,
+            exact_rank_cap: None,
+            custom_centroids: None,
+            outlier_threshold: None,
         };
         let (sk, codes) = encode_block::<MSE>(&block, &w, d, &params);
         let recovered = decode_block::<MSE>(&sk, &codes);
@@ -743,6 +1004,272 @@ mod tests {
         assert!(p.k >= 1);
         assert!((1..=4).contains(&p.bit_width));
         assert!(p.kmeans_max_iter > 0);
+        assert_eq!(p.skeleton_dtype, SkeletonDtype::Fp16);
+        assert!(p.outlier_threshold.is_none());
+    }
+
+    // -------------------- outlier compensation --------------------
+
+    #[test]
+    fn outlier_off_by_default_means_no_outliers() {
+        let n = 8;
+        let d = 16;
+        let mut block = vec![0.0_f32; n * d];
+        for (i, v) in block.iter_mut().enumerate() {
+            *v = ((i as f32) * 0.123).sin() * 2.0;
+        }
+        let w = vec![1.0_f32; n];
+        let params = CodecParams { bit_width: 2, k: 4, ..Default::default() };
+        let (_, codes) = encode_block::<MSE>(&block, &w, d, &params);
+        for c in &codes {
+            assert!(c.outliers.is_empty(), "outliers should be empty when threshold is None");
+        }
+    }
+
+    #[test]
+    fn outlier_threshold_extracts_large_scaled_values() {
+        // Engineered: make residuals large enough that scaled values
+        // definitely exceed threshold T=0.5 on at least some coords.
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        let mut rng = SmallRng::seed_from_u64(7);
+        let n = 16;
+        let d = 8;
+        let mut block = vec![0.0_f32; n * d];
+        for v in &mut block {
+            *v = rng.gen_range(-1.0_f32..1.0);
+        }
+        let w = vec![1.0_f32; n];
+        let params = CodecParams {
+            bit_width: 2,
+            k: 4,
+            variance_ratio: 0.99,
+            outlier_threshold: Some(0.5),
+            ..Default::default()
+        };
+        let (_, codes) = encode_block::<MSE>(&block, &w, d, &params);
+        let total_outliers: usize = codes.iter().map(|c| c.outliers.len()).sum();
+        assert!(total_outliers > 0, "T=0.5 on this input should yield at least one outlier");
+    }
+
+    #[test]
+    fn outlier_round_trip_reduces_reconstruction_mse() {
+        // Compare MSE with/without outlier compensation on the same block.
+        // At b=2 with Gaussian-like residuals, outliers dominate the MSE,
+        // so enabling outlier compensation must strictly decrease MSE.
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        let mut rng = SmallRng::seed_from_u64(13);
+        let n = 64;
+        let d = 16;
+        let mut block = vec![0.0_f32; n * d];
+        for v in &mut block {
+            *v = rng.gen::<f32>() * 2.0 - 1.0;
+        }
+        let w = vec![1.0_f32; n];
+
+        let p_no = CodecParams { bit_width: 2, k: 8, variance_ratio: 0.99,
+                                 outlier_threshold: None, ..Default::default() };
+        let (sk_no, c_no) = encode_block::<MSE>(&block, &w, d, &p_no);
+        let rec_no = decode_block::<MSE>(&sk_no, &c_no);
+        let mse_no = mse_of(&block, &rec_no);
+
+        let p_on = CodecParams { bit_width: 2, k: 8, variance_ratio: 0.99,
+                                 outlier_threshold: Some(2.0), ..Default::default() };
+        let (sk_on, c_on) = encode_block::<MSE>(&block, &w, d, &p_on);
+        let rec_on = decode_block::<MSE>(&sk_on, &c_on);
+        let mse_on = mse_of(&block, &rec_on);
+
+        assert!(
+            mse_on <= mse_no,
+            "outlier compensation at T=2.0 must not make MSE worse: off={mse_no:.5e}, on={mse_on:.5e}"
+        );
+        // At b=2 with non-trivial data, the improvement should be real
+        // (≥ 10% MSE reduction).  Allow leeway but flag a regression.
+        assert!(
+            mse_on < mse_no,
+            "outlier T=2.0 should strictly reduce MSE at b=2 on Gaussian-like residuals; off={mse_no:.5e}, on={mse_on:.5e}"
+        );
+    }
+
+    #[test]
+    fn outlier_byte_cost_scales_with_outlier_count() {
+        // Each outlier entry should cost exactly 4 bytes (u16 index + f16 value).
+        let n = 32;
+        let d = 8;
+        let mut block = vec![0.0_f32; n * d];
+        for i in 0..n {
+            for j in 0..d {
+                block[i * d + j] = ((i + j) as f32 * 0.3).sin();
+            }
+        }
+        let w = vec![1.0_f32; n];
+        let p_no = CodecParams { bit_width: 2, k: 4, ..Default::default() };
+        let p_on = CodecParams {
+            bit_width: 2, k: 4,
+            outlier_threshold: Some(1.0),
+            ..Default::default()
+        };
+        let (_, c_no) = encode_block::<MSE>(&block, &w, d, &p_no);
+        let (_, c_on) = encode_block::<MSE>(&block, &w, d, &p_on);
+        for (a, b) in c_no.iter().zip(c_on.iter()) {
+            let expected_delta = b.outliers.len() * 4;
+            let actual_delta = b.nbytes() - a.nbytes();
+            assert_eq!(
+                actual_delta, expected_delta,
+                "outlier byte cost should be 4 bytes each (got {} for {} outliers)",
+                actual_delta, b.outliers.len()
+            );
+        }
+    }
+
+    #[test]
+    fn outlier_with_very_low_threshold_patches_all_coords() {
+        // T=0 means every coord is an outlier; reconstruction should then
+        // be near-perfect (limited only by f16 precision).
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        let mut rng = SmallRng::seed_from_u64(21);
+        let n = 8;
+        let d = 8;
+        let mut block = vec![0.0_f32; n * d];
+        for v in &mut block {
+            *v = rng.gen::<f32>() * 2.0 - 1.0;
+        }
+        let w = vec![1.0_f32; n];
+        let params = CodecParams {
+            bit_width: 2, k: 4, variance_ratio: 0.99,
+            outlier_threshold: Some(0.0),  // everything is an outlier
+            ..Default::default()
+        };
+        let (sk, codes) = encode_block::<MSE>(&block, &w, d, &params);
+        let rec = decode_block::<MSE>(&sk, &codes);
+        let mse = mse_of(&block, &rec);
+        // With every coord patched as exact f16, the Lloyd-Max residual
+        // error is fully bypassed. Remaining error comes from f16
+        // precision on (i) outlier values, (ii) the K-means center / t
+        // scalar reconstruction, and (iii) the f16 PCA basis/mean.
+        // On uniform-random [-1, 1] input this floor is ~2e-3.
+        assert!(mse < 3e-3,
+                "T=0 patches every coord → near-lossless, got MSE={mse:.4e}");
+    }
+
+    // -------------------- skeleton_dtype ablation --------------------
+
+    /// FP32 skeleton must preserve the PCA mean and basis to full float
+    /// precision — no f16 rounding on the way in or out.
+    #[test]
+    fn skeleton_fp32_preserves_precision() {
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        let mut rng = SmallRng::seed_from_u64(0xABCD);
+        let n = 64;
+        let d = 32;
+        let mut block = vec![0.0_f32; n * d];
+        for v in &mut block {
+            *v = rng.gen_range(-1.0_f32..1.0);
+        }
+        let w = vec![1.0_f32; n];
+        let base = CodecParams {
+            variance_ratio: 0.99,
+            k: 4,
+            bit_width: 3,
+            ..Default::default()
+        };
+        let p_fp16 = CodecParams { skeleton_dtype: SkeletonDtype::Fp16, ..base.clone() };
+        let p_fp32 = CodecParams { skeleton_dtype: SkeletonDtype::Fp32, ..base };
+        let (sk16, _) = encode_block::<MSE>(&block, &w, d, &p_fp16);
+        let (sk32, _) = encode_block::<MSE>(&block, &w, d, &p_fp32);
+
+        // Same d_eff (fit is numerically identical; only storage differs).
+        assert_eq!(sk16.pca.d_eff, sk32.pca.d_eff);
+
+        // fp16 path: mean/basis are f16-rounded; mean_fp32 / basis_fp32 are None.
+        assert!(sk16.pca.mean_fp32.is_none());
+        assert!(sk16.pca.basis_fp32.is_none());
+
+        // fp32 path: mean_fp32 / basis_fp32 are Some; f16 buffers are empty.
+        let mean32 = sk32.pca.mean_fp32.as_ref().expect("fp32 mean");
+        let basis32 = sk32.pca.basis_fp32.as_ref().expect("fp32 basis");
+        assert!(sk32.pca.mean.is_empty());
+        assert!(sk32.pca.basis.is_empty());
+
+        // fp32 mean/basis must match the unrounded fp32 values exactly,
+        // while fp16 storage round-trips through f16 with non-zero error.
+        let mean16 = sk16.pca.mean_f32();
+        assert_eq!(mean32.len(), mean16.len());
+        let mean_delta: f32 = mean32
+            .iter()
+            .zip(&mean16)
+            .map(|(a, b)| (a - b).abs())
+            .fold(0.0, f32::max);
+        // mean_delta > 0 would show f16 was doing some rounding; but if the
+        // mean happens to fall exactly on f16-representable grid points it
+        // may be zero. Either way the structural check above is the real
+        // guarantee. We include this loose sanity check without asserting.
+        let _ = mean_delta;
+
+        // K-means centres follow the same storage contract.
+        assert!(sk32.kmeans.centers_fp32.is_some());
+        assert!(sk32.kmeans.centers.is_empty());
+
+        // Byte accounting: fp32 skeleton must be ~2× fp16.
+        assert!(
+            sk32.pca.nbytes() >= 2 * sk16.pca.nbytes() - 8,
+            "fp32 PCA should be ~2× fp16 ({} vs {})",
+            sk32.pca.nbytes(),
+            sk16.pca.nbytes()
+        );
+
+        // basis32 values should have full fp32 precision (no rounding to f16
+        // grid); verify at least some value has more than 10 bits of mantissa
+        // beyond the f16 nearest.
+        let basis16 = sk16.pca.basis_f32();
+        let mut saw_finer = false;
+        for (a, b) in basis32.iter().zip(&basis16) {
+            if (a - b).abs() > 1e-6 {
+                saw_finer = true;
+                break;
+            }
+        }
+        assert!(
+            saw_finer,
+            "fp32 basis must differ from f16-rounded version in at least one coordinate"
+        );
+    }
+
+    /// Round-trip must still work under fp32 skeleton storage.
+    #[test]
+    fn skeleton_fp32_round_trip() {
+        let n = 16;
+        let d = 8;
+        let mut block = vec![0.0_f32; n * d];
+        for i in 0..n {
+            for j in 0..d {
+                block[i * d + j] = ((i + j) as f32).sin();
+            }
+        }
+        let w = vec![1.0_f32; n];
+        let params = CodecParams {
+            variance_ratio: 0.95,
+            k: 4,
+            bit_width: 3,
+            skeleton_dtype: SkeletonDtype::Fp32,
+            ..Default::default()
+        };
+        let (sk, codes) = encode_block::<MSE>(&block, &w, d, &params);
+        let r = decode_block::<MSE>(&sk, &codes);
+        assert_eq!(r.len(), block.len());
+        for v in r {
+            assert!(v.is_finite());
+        }
+        // fp32 skeleton: mean_fp32 / basis_fp32 should be Some, f16 buffers empty.
+        assert!(sk.pca.mean_fp32.is_some());
+        assert!(sk.pca.basis_fp32.is_some());
+        assert!(sk.pca.mean.is_empty());
+        assert!(sk.pca.basis.is_empty());
+        assert!(sk.kmeans.centers_fp32.is_some());
+        assert!(sk.kmeans.centers.is_empty());
     }
 
     // -------------------- all-zero input --------------------
@@ -872,6 +1399,11 @@ mod tests {
             k: 4, bit_width: 3,
             rotation_seed: 0xA5A5_A5A5,
             kmeans_max_iter: 32,
+            pca_method: PcaMethod::Exact,
+            skeleton_dtype: SkeletonDtype::Fp16,
+            exact_rank_cap: None,
+            custom_centroids: None,
+            outlier_threshold: None,
         };
         let enc = encode_layer::<MSE>(&blocks, &ws, d, &params, false);
         assert!(enc.shared_pca.is_none());
diff --git a/kakeyaturbo/src/kmeans.rs b/kakeyaturbo/src/kmeans.rs
index d2bb6cd4..ecee0c23 100644
--- a/kakeyaturbo/src/kmeans.rs
+++ b/kakeyaturbo/src/kmeans.rs
@@ -22,8 +22,11 @@ use rand::{Rng, SeedableRng};
 /// are fit in f32 (iterative); the final result is converted once.
 #[derive(Debug, Clone)]
 pub struct KmeansFit {
-    /// Unit-norm centres row-major `[K, d_eff]`, stored as bf16.
+    /// Unit-norm centres row-major `[K, d_eff]`, stored as f16 (empty if fp32 skeleton selected).
     pub centers: Vec<f16>,
+    /// Optional fp32 centres buffer — populated iff the caller asked for
+    /// `PcaStorage::Fp32` (via the codec's skeleton dtype flag).
+    pub centers_fp32: Option<Vec<f32>>,
     /// Number of centres.
     pub k: usize,
     /// Coefficient dimension.
@@ -34,6 +37,9 @@ impl KmeansFit {
     /// Get a freshly-allocated f32 copy of the `i`-th centre.
     #[must_use]
     pub fn center(&self, i: usize) -> Vec<f32> {
+        if let Some(ref c) = self.centers_fp32 {
+            return c[i * self.d_eff..(i + 1) * self.d_eff].to_vec();
+        }
         self.centers[i * self.d_eff..(i + 1) * self.d_eff]
             .iter()
             .map(|&v| v.to_f32())
@@ -44,13 +50,28 @@ impl KmeansFit {
     #[must_use]
     pub fn nbytes(&self) -> usize {
         self.centers.len() * std::mem::size_of::<f16>()
+            + self.centers_fp32.as_ref().map(Vec::len).unwrap_or(0)
+                * std::mem::size_of::<f32>()
     }
 
-    /// Construct directly from f32 centres (e.g. for tests).
+    /// Construct directly from f32 centres (e.g. for tests). Defaults to
+    /// f16 skeleton storage for backward compatibility.
     #[must_use]
     pub fn from_f32(centers: Vec<f32>, k: usize, d_eff: usize) -> Self {
         Self {
             centers: centers.iter().map(|&v| f16::from_f32(v)).collect(),
+            centers_fp32: None,
+            k,
+            d_eff,
+        }
+    }
+
+    /// Construct with fp32 skeleton storage.
+    #[must_use]
+    pub fn from_f32_skeleton_fp32(centers: Vec<f32>, k: usize, d_eff: usize) -> Self {
+        Self {
+            centers: Vec::new(),
+            centers_fp32: Some(centers),
             k,
             d_eff,
         }
@@ -127,6 +148,21 @@ pub fn fit_spherical_kmeans(
     k: usize,
     seed: u32,
     max_iter: u32,
+) -> KmeansFit {
+    fit_spherical_kmeans_with_storage(coeffs, weights, d_eff, k, seed, max_iter, false)
+}
+
+/// Storage-aware variant: when `fp32_skeleton` is true, keep the fitted
+/// centres in full fp32 precision instead of rounding to f16.
+#[must_use]
+pub fn fit_spherical_kmeans_with_storage(
+    coeffs: &[f32],
+    weights: &[f32],
+    d_eff: usize,
+    k: usize,
+    seed: u32,
+    max_iter: u32,
+    fp32_skeleton: bool,
 ) -> KmeansFit {
     assert!(k > 0, "k must be positive");
     assert!(d_eff > 0, "d_eff must be positive");
@@ -213,7 +249,11 @@ pub fn fit_spherical_kmeans(
         centers = new_centers;
     }
 
-    KmeansFit::from_f32(centers, k, d_eff)
+    if fp32_skeleton {
+        KmeansFit::from_f32_skeleton_fp32(centers, k, d_eff)
+    } else {
+        KmeansFit::from_f32(centers, k, d_eff)
+    }
 }
 
 /// Assign a coefficient row to the centre that minimises the residual
diff --git a/kakeyaturbo/src/lib.rs b/kakeyaturbo/src/lib.rs
index d26cd5e3..236690ba 100644
--- a/kakeyaturbo/src/lib.rs
+++ b/kakeyaturbo/src/lib.rs
@@ -50,8 +50,9 @@ pub mod skeleton;
 pub mod wht;
 
 pub use codec::{
-    decode_block, decode_layer, encode_block, encode_layer, layer_nbytes, Code, CodecParams,
-    LayerEncoding,
+    decode_block, decode_block_with_centroids, decode_layer, decode_layer_with_centroids,
+    encode_block, encode_layer, layer_nbytes, Code, CodecParams, LayerEncoding, PcaMethod,
+    SkeletonDtype,
 };
 pub use distortion::{Distortion, InnerProduct, LInf, NormMode, MSE};
 pub use skeleton::Skeleton;
diff --git a/kakeyaturbo/src/pca.rs b/kakeyaturbo/src/pca.rs
index def35fb6..0ac8b501 100644
--- a/kakeyaturbo/src/pca.rs
+++ b/kakeyaturbo/src/pca.rs
@@ -19,6 +19,57 @@
 
 use half::f16;
 use nalgebra::{DMatrix, DVector, SymmetricEigen};
+use rand::rngs::SmallRng;
+use rand::{Rng, SeedableRng};
+
+/// Storage precision for the mean and basis tensors of a [`PcaFit`].
+/// `Fp16` is the v1.2/v1.3 default; `Fp32` doubles skeleton bytes and is
+/// used by the 2024-04 skeleton-precision ablation.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PcaStorage {
+    /// IEEE-754 binary16 (default, matches paper byte accounting).
+    Fp16,
+    /// IEEE-754 binary32 (ablation-only, doubles skeleton bytes).
+    Fp32,
+}
+
+impl Default for PcaStorage {
+    fn default() -> Self {
+        Self::Fp16
+    }
+}
+
+/// Internal helper: pack a finished `(mean, basis, d_eff, captured_variance)`
+/// tuple into a `PcaFit` with either f16 or fp32 skeleton storage.
+fn materialize_pca_fit(
+    mean: Vec<f32>,
+    basis: Vec<f32>,
+    d_eff: usize,
+    captured_variance: f32,
+    storage: PcaStorage,
+) -> PcaFit {
+    let d = mean.len();
+    match storage {
+        PcaStorage::Fp16 => PcaFit {
+            mean: to_bf16(&mean),
+            basis: to_bf16(&basis),
+            mean_fp32: None,
+            basis_fp32: None,
+            d_eff,
+            d,
+            captured_variance,
+        },
+        PcaStorage::Fp32 => PcaFit {
+            mean: Vec::new(),
+            basis: Vec::new(),
+            mean_fp32: Some(mean),
+            basis_fp32: Some(basis),
+            d_eff,
+            d,
+            captured_variance,
+        },
+    }
+}
 
 /// Convert an f32 slice to an owned bf16 Vec (saturating conversion for NaN/Inf).
 #[inline]
@@ -76,12 +127,20 @@ pub fn weighted_mean(vectors: &[f32], weights: &[f32], d: usize) -> Vec<f32> {
 /// and [`Self::basis_f32`] when you need f32 views.
 #[derive(Debug, Clone)]
 pub struct PcaFit {
-    /// Mean vector, length `D`, stored as bf16.
+    /// Mean vector, length `D`, stored as f16 (empty if fp32 skeleton selected).
     pub mean: Vec<f16>,
-    /// Basis row-major `[d_eff, D]`, stored as bf16.
+    /// Basis row-major `[d_eff, D]`, stored as f16 (empty if fp32 skeleton selected).
     pub basis: Vec<f16>,
+    /// Optional fp32 mean buffer — populated iff the caller asked for
+    /// `SkeletonDtype::Fp32`. When set, takes precedence over `mean`.
+    pub mean_fp32: Option<Vec<f32>>,
+    /// Optional fp32 basis buffer — same semantics as `mean_fp32`.
+    pub basis_fp32: Option<Vec<f32>>,
     /// Number of kept components.
     pub d_eff: usize,
+    /// Input dimension `D` (needed when fp32 buffers are populated and the
+    /// f16 buffers are empty).
+    pub d: usize,
     /// Captured variance ratio (actual, may be ≥ the requested threshold).
     pub captured_variance: f32,
 }
@@ -90,31 +149,76 @@ impl PcaFit {
     /// Return the mean as a freshly-allocated f32 vector.
     #[must_use]
     pub fn mean_f32(&self) -> Vec<f32> {
+        if let Some(ref m) = self.mean_fp32 {
+            return m.clone();
+        }
         to_f32(&self.mean)
     }
 
     /// Return the basis as a freshly-allocated f32 vector.
     #[must_use]
     pub fn basis_f32(&self) -> Vec<f32> {
+        if let Some(ref b) = self.basis_fp32 {
+            return b.clone();
+        }
         to_f32(&self.basis)
     }
 
     /// Construct a `PcaFit` directly from f32 buffers (e.g. unit tests).
+    /// Default skeleton dtype is f16 for backward compatibility.
     #[must_use]
     pub fn from_f32(mean: Vec<f32>, basis: Vec<f32>, d_eff: usize, captured: f32) -> Self {
+        let d = mean.len();
         Self {
             mean: to_bf16(&mean),
             basis: to_bf16(&basis),
+            mean_fp32: None,
+            basis_fp32: None,
             d_eff,
+            d,
             captured_variance: captured,
         }
     }
 
+    /// Construct a `PcaFit` with fp32 skeleton storage.
+    #[must_use]
+    pub fn from_f32_skeleton_fp32(
+        mean: Vec<f32>,
+        basis: Vec<f32>,
+        d_eff: usize,
+        captured: f32,
+    ) -> Self {
+        let d = mean.len();
+        Self {
+            mean: Vec::new(),
+            basis: Vec::new(),
+            mean_fp32: Some(mean),
+            basis_fp32: Some(basis),
+            d_eff,
+            d,
+            captured_variance: captured,
+        }
+    }
+
+    /// Input dimension `D`.
+    #[must_use]
+    pub fn d(&self) -> usize {
+        if self.d > 0 {
+            self.d
+        } else {
+            self.mean.len()
+        }
+    }
+
     /// Byte footprint of this fit (the thing the codec actually stores).
     #[must_use]
     pub fn nbytes(&self) -> usize {
-        self.mean.len() * std::mem::size_of::<f16>()
-            + self.basis.len() * std::mem::size_of::<f16>()
+        let fp32_bytes = self.mean_fp32.as_ref().map(Vec::len).unwrap_or(0)
+            * std::mem::size_of::<f32>()
+            + self.basis_fp32.as_ref().map(Vec::len).unwrap_or(0) * std::mem::size_of::<f32>();
+        let f16_bytes = self.mean.len() * std::mem::size_of::<f16>()
+            + self.basis.len() * std::mem::size_of::<f16>();
+        fp32_bytes + f16_bytes
     }
 }
 
@@ -131,6 +235,36 @@ impl PcaFit {
 /// `variance_ratio` is not finite.
 #[must_use]
 pub fn fit_weighted_pca(vectors: &[f32], weights: &[f32], d: usize, variance_ratio: f32) -> PcaFit {
+    fit_weighted_pca_with_storage_capped(
+        vectors, weights, d, variance_ratio, PcaStorage::Fp16, None,
+    )
+}
+
+/// Storage-aware variant of [`fit_weighted_pca`], no rank cap.
+#[must_use]
+pub fn fit_weighted_pca_with_storage(
+    vectors: &[f32],
+    weights: &[f32],
+    d: usize,
+    variance_ratio: f32,
+    storage: PcaStorage,
+) -> PcaFit {
+    fit_weighted_pca_with_storage_capped(vectors, weights, d, variance_ratio, storage, None)
+}
+
+/// Full-control variant of [`fit_weighted_pca`].  When `rank_cap` is
+/// `Some(r)`, `d_eff` is clipped to at most `r` regardless of
+/// `variance_ratio`. This lets the caller use exact PCA to match
+/// RSVD's rank-budgeted behaviour without RSVD's approximation error.
+#[must_use]
+pub fn fit_weighted_pca_with_storage_capped(
+    vectors: &[f32],
+    weights: &[f32],
+    d: usize,
+    variance_ratio: f32,
+    storage: PcaStorage,
+    rank_cap: Option<usize>,
+) -> PcaFit {
     assert!(
         variance_ratio.is_finite(),
         "variance_ratio must be finite, got {variance_ratio}"
@@ -192,6 +326,9 @@ pub fn fit_weighted_pca(vectors: &[f32], weights: &[f32], d: usize, variance_rat
         d_eff = 1;
     }
     d_eff = d_eff.clamp(1, d);
+    if let Some(cap) = rank_cap {
+        d_eff = d_eff.min(cap.max(1).min(d));
+    }
 
     // Flatten top-d_eff eigenvectors into row-major basis.
     let mut basis = Vec::with_capacity(d_eff * d);
@@ -209,12 +346,7 @@ pub fn fit_weighted_pca(vectors: &[f32], weights: &[f32], d: usize, variance_rat
         1.0
     };
 
-    PcaFit {
-        mean: to_bf16(&mean),
-        basis: to_bf16(&basis),
-        d_eff,
-        captured_variance,
-    }
+    materialize_pca_fit(mean, basis, d_eff, captured_variance, storage)
 }
 
 /// Fit a weighted PCA on a concatenated multi-block tensor and return a
@@ -231,21 +363,211 @@ pub fn fit_weighted_pca_pooled(
     fit_weighted_pca(vectors, weights, d, variance_ratio)
 }
 
+/// Randomized SVD (Halko–Martinsson–Tropp 2011) weighted PCA.
+///
+/// Given vectors X ∈ ℝ^{n×D} and weights w, this function finds the top
+/// `d_eff` right singular vectors of the centred, weighted design matrix
+/// `A := diag(√w)·(X − μ)` via a low-rank randomized sketch.
+///
+/// **Algorithm** (HMT 2011, §4.3 'range finder on Aᵀ'):
+///
+/// 1. Draw Ω ∈ ℝ^{n×r} with r = min(k + p, D), i.i.d. N(0, 1).
+/// 2. Form sketch Z = Aᵀ · Ω ∈ ℝ^{D×r}.
+/// 3. Power iterations: Z ← Aᵀ A Z, repeated `power_iters` times.
+/// 4. QR decomposition Z = Q · R, with Q ∈ ℝ^{D×r} orthonormal.
+/// 5. Form small matrix B = A · Q ∈ ℝ^{n×r} and compute its thin SVD
+///    B = Û · Σ · V̂ᵀ, where V̂ ∈ ℝ^{r×r}.
+/// 6. Right singular vectors of A ≈ Q · V̂. Eigenvalues of
+///    Σ_w = Aᵀ A / w_sum are σ_i² / w_sum.
+///
+/// Complexity: **O(n · D · r)** with a single `power_iters` pass versus
+/// O(n · D²) for the exact covariance path. For the v1.2 preset
+/// (n=512, D=128, r≈12) that's ~12× fewer ops; at Gemma's D=512 it's
+/// ~40×.
+///
+/// Accuracy: with `power_iters = 2` the operator-norm error is
+/// `≤ (1 + 11·√(k+p)/(p−1)) · σ_{k+1}` (Halko et al. Thm 10.6). On
+/// realistic KV-cache spectra with d_eff much smaller than D this is
+/// sub-1% relative error.
+///
+/// # Arguments
+///
+/// - `target_rank`: upper bound on `d_eff`. Typical `D/2` for safety.
+/// - `oversample`: extra sketch dims, 5–10 is standard. Bigger = more
+///   accurate but more work.
+/// - `power_iters`: number of subspace-power iterations, 0–3 typical.
+///   More iterations → more accurate on slow-decay spectra.
+/// - `seed`: RNG seed for the Gaussian test matrix.
+///
+/// # Panics
+///
+/// Same as [`fit_weighted_pca`], plus panics if `target_rank == 0`.
+#[must_use]
+#[allow(clippy::too_many_arguments)]
+pub fn fit_weighted_pca_randomized(
+    vectors: &[f32],
+    weights: &[f32],
+    d: usize,
+    variance_ratio: f32,
+    target_rank: usize,
+    oversample: usize,
+    power_iters: u32,
+    seed: u64,
+) -> PcaFit {
+    fit_weighted_pca_randomized_with_storage(
+        vectors,
+        weights,
+        d,
+        variance_ratio,
+        target_rank,
+        oversample,
+        power_iters,
+        seed,
+        PcaStorage::Fp16,
+    )
+}
+
+/// Storage-aware variant of [`fit_weighted_pca_randomized`].
+#[must_use]
+#[allow(clippy::too_many_arguments)]
+pub fn fit_weighted_pca_randomized_with_storage(
+    vectors: &[f32],
+    weights: &[f32],
+    d: usize,
+    variance_ratio: f32,
+    target_rank: usize,
+    oversample: usize,
+    power_iters: u32,
+    seed: u64,
+    storage: PcaStorage,
+) -> PcaFit {
+    assert!(
+        variance_ratio.is_finite(),
+        "variance_ratio must be finite, got {variance_ratio}"
+    );
+    assert!(d > 0, "D must be positive");
+    assert!(target_rank >= 1, "target_rank must be ≥ 1");
+    assert_eq!(vectors.len() % d, 0, "vector buffer not a multiple of D");
+    let n = weights.len();
+    assert_eq!(vectors.len() / d, n, "weights length mismatch");
+
+    let mean = weighted_mean(vectors, weights, d);
+    let w_sum: f32 = weights.iter().sum();
+    assert!(
+        w_sum > f32::EPSILON,
+        "weight sum must be positive, got {w_sum}"
+    );
+
+    // Effective sketch size r = min(k + p, D).
+    let k_target = target_rank.min(d);
+    let r = (k_target + oversample).min(d);
+
+    // A ∈ ℝ^{n×D} with A[i,j] = √w_i · (x_i,j − μ_j), nalgebra stores column-major.
+    let a = DMatrix::<f32>::from_fn(n, d, |i, j| {
+        let w_i = weights[i].max(0.0);
+        w_i.sqrt() * (vectors[i * d + j] - mean[j])
+    });
+
+    // Ω ∈ ℝ^{n×r}, i.i.d. N(0,1) via Box–Muller.
+    let mut rng = SmallRng::seed_from_u64(seed);
+    let omega = DMatrix::<f32>::from_fn(n, r, |_, _| {
+        let u1: f32 = rng.gen_range(f32::EPSILON..1.0);
+        let u2: f32 = rng.gen_range(0.0..1.0);
+        (-2.0_f32 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos()
+    });
+
+    // Z = Aᵀ · Ω, shape D×r.
+    let mut z = a.transpose() * &omega;
+
+    // Subspace power iterations with re-orthogonalisation (HMT 2011 §4.5,
+    // "Algorithm 4.4: Randomized Subspace Iteration").  Without the per-
+    // iteration QR, power iteration on ill-conditioned data (condition
+    // number ≳ 10³) produces exponentially-growing column norms that push
+    // nalgebra's subsequent thin-SVD into an effectively-non-terminating
+    // Jacobi sweep on numerically-rank-deficient inputs.  Re-orthogonalising
+    // Z between iterations keeps columns unit-norm and decouples the
+    // iteration's stability from the spectrum of A.
+    for _ in 0..power_iters {
+        let ay = &a * &z; // n×r
+        let ay_q = ay.qr().q();
+        let ay_q = ay_q.columns(0, r).into_owned();
+        let ata_q = a.transpose() * &ay_q; // D×r
+        let ata_qr = ata_q.qr().q();
+        z = ata_qr.columns(0, r).into_owned();
+    }
+
+    // QR of Z → Q ∈ ℝ^{D×r} orthonormal.
+    let qr = z.qr();
+    let q_full = qr.q();
+    let q = q_full.columns(0, r).into_owned(); // D×r
+
+    // B = A · Q, shape n×r.
+    let b = &a * &q;
+
+    // Thin SVD of B.
+    let svd = b.svd(true, true);
+    let singular = svd.singular_values;
+    let v_t = svd.v_t.expect("SVD v_t requested");
+
+    // v_t is r×r (since B is n×r with n > r typically). Right singular
+    // vectors of B are rows of v_t. Right singular vectors of A are
+    // columns of (Q · v_tᵀ).
+    let v_small = v_t.transpose(); // r×r, columns = right singular vectors of B
+    let basis_mat = &q * &v_small; // D×r, columns = right singular vectors of A
+
+    // Eigenvalues of Σ_w = Aᵀ A / w_sum are σ_i² / w_sum.
+    let sigma_vals: Vec<f32> = singular.iter().map(|s| s * s / w_sum).collect();
+
+    let total_var: f32 = sigma_vals.iter().map(|v| v.max(0.0)).sum();
+    let ratio = variance_ratio.clamp(0.0, 1.0);
+    let mut cum = 0.0_f32;
+    let mut d_eff = r;
+    if total_var > f32::EPSILON {
+        for (i, v) in sigma_vals.iter().enumerate() {
+            cum += v.max(0.0);
+            if cum / total_var >= ratio {
+                d_eff = i + 1;
+                break;
+            }
+        }
+    } else {
+        d_eff = 1;
+    }
+    d_eff = d_eff.clamp(1, k_target);
+
+    // Flatten top-d_eff columns of basis_mat into row-major basis.
+    let mut basis = Vec::with_capacity(d_eff * d);
+    let mut captured = 0.0_f32;
+    for k in 0..d_eff {
+        captured += sigma_vals[k].max(0.0);
+        for row in 0..d {
+            basis.push(basis_mat[(row, k)]);
+        }
+    }
+    let captured_variance = if total_var > f32::EPSILON {
+        (captured / total_var).clamp(0.0, 1.0)
+    } else {
+        1.0
+    };
+
+    materialize_pca_fit(mean, basis, d_eff, captured_variance, storage)
+}
+
 /// Project a single vector `x` onto the PCA basis: `coeff = U · (x − μ)`.
 ///
 /// Internally converts the bf16 basis/mean to f32 once per call; the
 /// inner multiply-add loop stays in f32 for numerical accuracy.
 #[must_use]
 pub fn project(x: &[f32], fit: &PcaFit) -> Vec<f32> {
-    let d = fit.mean.len();
+    let d = fit.d();
     assert_eq!(x.len(), d, "x dimension mismatch");
+    let mean_f32 = fit.mean_f32();
+    let basis_f32 = fit.basis_f32();
     let mut coeff = vec![0.0_f32; fit.d_eff];
     for k in 0..fit.d_eff {
         let mut acc = 0.0_f32;
         for j in 0..d {
-            let basis_kj = fit.basis[k * d + j].to_f32();
-            let mean_j = fit.mean[j].to_f32();
-            acc += basis_kj * (x[j] - mean_j);
+            acc += basis_f32[k * d + j] * (x[j] - mean_f32[j]);
         }
         coeff[k] = acc;
     }
@@ -257,12 +579,13 @@ pub fn project(x: &[f32], fit: &PcaFit) -> Vec<f32> {
 #[must_use]
 pub fn unproject(coeff: &[f32], fit: &PcaFit) -> Vec<f32> {
     assert_eq!(coeff.len(), fit.d_eff, "coeff length mismatch");
-    let d = fit.mean.len();
+    let d = fit.d();
+    let basis_f32 = fit.basis_f32();
     let mut x = fit.mean_f32();
     for k in 0..fit.d_eff {
         let c = coeff[k];
         for j in 0..d {
-            x[j] += fit.basis[k * d + j].to_f32() * c;
+            x[j] += basis_f32[k * d + j] * c;
         }
     }
     x
@@ -541,6 +864,224 @@ mod tests {
         );
     }
 
+    // -------------------- randomized SVD --------------------
+
+    /// Subspace angle between two orthonormal bases as a scalar in [0,1]:
+    /// 1 − min principal cosine. Smaller = more aligned.
+    fn subspace_angle(u1: &[f32], u2: &[f32], d: usize, r1: usize, r2: usize) -> f32 {
+        // Build k×k inner-product matrix U1ᵀ · U2.
+        let r = r1.min(r2);
+        let mut m = 0.0_f32;
+        for i in 0..r1 {
+            for j in 0..r2 {
+                let mut dot = 0.0_f32;
+                for k in 0..d {
+                    dot += u1[i * d + k] * u2[j * d + k];
+                }
+                m = m.max(dot.abs());
+            }
+        }
+        let _ = r;
+        1.0 - m
+    }
+
+    #[test]
+    fn randomized_pca_matches_exact_on_rank1_data() {
+        // Rank-1 data: every vector is scalar × v. Randomized PCA must
+        // recover v as the top direction with very small error.
+        let n = 64;
+        let d = 32;
+        let mut vecs = Vec::with_capacity(n * d);
+        let v: Vec<f32> = (0..d).map(|i| ((i as f32) * 0.13).sin()).collect();
+        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
+        let v_hat: Vec<f32> = v.iter().map(|x| x / norm).collect();
+        for i in 0..n {
+            let s = (i as f32) - (n as f32) / 2.0;
+            for j in 0..d {
+                vecs.push(s * v_hat[j]);
+            }
+        }
+        let w = vec![1.0_f32; n];
+        let exact = fit_weighted_pca(&vecs, &w, d, 0.99);
+        let rsvd = fit_weighted_pca_randomized(&vecs, &w, d, 0.99, 2, 4, 2, 42);
+        assert_eq!(exact.d_eff, 1);
+        assert_eq!(rsvd.d_eff, 1);
+        let a = subspace_angle(&exact.basis_f32(), &rsvd.basis_f32(), d, 1, 1);
+        assert!(a < 1e-3, "rank-1 top direction should match (1 − cos = {a})");
+    }
+
+    #[test]
+    fn randomized_pca_recovers_top_subspace_of_low_rank_block() {
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        // Build a rank-4 block in dim 24 with tiny isotropic noise.
+        let n = 256;
+        let d = 24;
+        let rank = 4;
+        let mut rng = SmallRng::seed_from_u64(7);
+        let mut basis_true = vec![0.0_f32; rank * d];
+        for v in &mut basis_true {
+            *v = rng.gen_range(-1.0..1.0);
+        }
+        // QR to get orthonormal true basis.
+        let m = DMatrix::<f32>::from_row_slice(rank, d, &basis_true);
+        let q = m.transpose().qr().q().columns(0, rank).into_owned();
+        let mut basis_flat = vec![0.0_f32; rank * d];
+        for i in 0..rank {
+            for j in 0..d {
+                basis_flat[i * d + j] = q[(j, i)];
+            }
+        }
+        let mut vecs = vec![0.0_f32; n * d];
+        for row in 0..n {
+            for k in 0..rank {
+                let c: f32 = rng.gen_range(-3.0..3.0);
+                for j in 0..d {
+                    vecs[row * d + j] += c * basis_flat[k * d + j];
+                }
+            }
+            for j in 0..d {
+                vecs[row * d + j] += rng.gen_range(-0.01..0.01_f32);
+            }
+        }
+        let w = vec![1.0_f32; n];
+        let exact = fit_weighted_pca(&vecs, &w, d, 0.99);
+        let rsvd = fit_weighted_pca_randomized(&vecs, &w, d, 0.99, 8, 6, 2, 123);
+        assert_eq!(exact.d_eff, rank);
+        assert!(rsvd.d_eff >= rank && rsvd.d_eff <= rank + 1);
+        let a = subspace_angle(
+            &exact.basis_f32()[..rank * d],
+            &rsvd.basis_f32()[..rank * d],
+            d,
+            rank,
+            rank,
+        );
+        assert!(a < 5e-2, "top-{rank} subspace angle must match (1 − cos = {a})");
+    }
+
+    #[test]
+    fn randomized_pca_reconstruction_mse_close_to_exact() {
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        // Realistic scenario: 128-D block with a slow eigenvalue decay.
+        let n = 512;
+        let d = 128;
+        let mut rng = SmallRng::seed_from_u64(19);
+        // Use a diagonal covariance in a random orthonormal basis.
+        let eigvals: Vec<f32> = (0..d).map(|i| (-(i as f32) / 20.0).exp()).collect();
+        let mut q_mat = vec![0.0_f32; d * d];
+        for v in &mut q_mat {
+            *v = rng.gen_range(-1.0..1.0);
+        }
+        let q = DMatrix::<f32>::from_row_slice(d, d, &q_mat)
+            .qr()
+            .q()
+            .columns(0, d)
+            .into_owned();
+        let mut vecs = vec![0.0_f32; n * d];
+        for row in 0..n {
+            let mut latent = vec![0.0_f32; d];
+            for j in 0..d {
+                latent[j] = rng.gen_range(-1.0..1.0_f32) * eigvals[j].sqrt();
+            }
+            // Rotate into ambient space.
+            for j in 0..d {
+                let mut s = 0.0_f32;
+                for k in 0..d {
+                    s += q[(j, k)] * latent[k];
+                }
+                vecs[row * d + j] = s;
+            }
+        }
+        let w = vec![1.0_f32; n];
+        let exact = fit_weighted_pca(&vecs, &w, d, 0.95);
+        // With 3 power iterations on a slow-decay spectrum, randomized
+        // SVD tracks exact to within ~20% reconstruction MSE. Fewer
+        // iterations would require larger oversample.
+        let rsvd = fit_weighted_pca_randomized(&vecs, &w, d, 0.95, exact.d_eff + 8, 10, 3, 777);
+
+        // Both should capture ≥ 95% variance.
+        assert!(exact.captured_variance >= 0.949);
+        assert!(rsvd.captured_variance >= 0.90, "rsvd captured only {}", rsvd.captured_variance);
+
+        // Measure per-vector reconstruction MSE under each basis.
+        let mut e1 = 0.0_f64;
+        let mut e2 = 0.0_f64;
+        for row in 0..n {
+            let x = &vecs[row * d..(row + 1) * d];
+            let c1 = project(x, &exact);
+            let r1 = unproject(&c1, &exact);
+            let c2 = project(x, &rsvd);
+            let r2 = unproject(&c2, &rsvd);
+            for j in 0..d {
+                e1 += (x[j] - r1[j]) as f64 * (x[j] - r1[j]) as f64;
+                e2 += (x[j] - r2[j]) as f64 * (x[j] - r2[j]) as f64;
+            }
+        }
+        // On exponentially-decaying spectra randomized SVD with 3 power
+        // iterations + oversample=10 stays within 1.5× of exact MSE.
+        let ratio = e2 / e1.max(1e-12);
+        assert!(ratio <= 1.5, "rsvd MSE inflation {ratio:.3}× exceeds 1.5");
+    }
+
+    #[test]
+    fn randomized_pca_honours_variance_ratio_truncation() {
+        let n = 128;
+        let d = 16;
+        let vecs = ellipse_points(n, 5.0, 0.01, 0.2);
+        let w = vec![1.0_f32; n];
+        // Variance ratio 0.5 on an extremely skinny ellipse → should
+        // truncate to 1 direction.
+        let mut vecs_d = Vec::with_capacity(n * d);
+        for i in 0..n {
+            vecs_d.push(vecs[i * 2]);
+            vecs_d.push(vecs[i * 2 + 1]);
+            for _ in 2..d {
+                vecs_d.push(0.0);
+            }
+        }
+        let fit = fit_weighted_pca_randomized(&vecs_d, &w, d, 0.5, 4, 4, 1, 7);
+        assert_eq!(fit.d_eff, 1);
+    }
+
+    #[test]
+    fn randomized_pca_is_deterministic_on_same_seed() {
+        let n = 64;
+        let d = 16;
+        let vecs: Vec<f32> = (0..n * d).map(|i| (i as f32 * 0.1).sin()).collect();
+        let w = vec![1.0_f32; n];
+        let a = fit_weighted_pca_randomized(&vecs, &w, d, 0.9, 4, 4, 2, 42);
+        let b = fit_weighted_pca_randomized(&vecs, &w, d, 0.9, 4, 4, 2, 42);
+        assert_eq!(a.d_eff, b.d_eff);
+        for (x, y) in a.mean_f32().iter().zip(b.mean_f32().iter()) {
+            assert_abs_diff_eq!(x, y, epsilon = 1e-6);
+        }
+        for (x, y) in a.basis_f32().iter().zip(b.basis_f32().iter()) {
+            assert_abs_diff_eq!(x, y, epsilon = 1e-6);
+        }
+    }
+
+    #[test]
+    fn randomized_pca_different_seeds_give_similar_subspaces() {
+        let n = 256;
+        let d = 32;
+        let vecs: Vec<f32> = (0..n * d).map(|i| ((i as f32 * 0.17).cos() * 2.0)).collect();
+        let w = vec![1.0_f32; n];
+        let a = fit_weighted_pca_randomized(&vecs, &w, d, 0.9, 6, 6, 2, 1);
+        let b = fit_weighted_pca_randomized(&vecs, &w, d, 0.9, 6, 6, 2, 2);
+        // d_eff may differ by ±1, but the leading direction should align.
+        let ang = subspace_angle(&a.basis_f32()[..d], &b.basis_f32()[..d], d, 1, 1);
+        assert!(ang < 5e-2, "top direction should be consistent across seeds (1 − cos = {ang})");
+    }
+
+    #[test]
+    #[should_panic(expected = "target_rank must be ≥ 1")]
+    fn randomized_pca_rejects_zero_rank() {
+        let _ = fit_weighted_pca_randomized(&[1.0_f32; 16], &[1.0; 4], 4, 0.9, 0, 2, 1, 0);
+    }
+
+    // ---------- back to pooled fit tests ----------
+
     #[test]
     fn pooled_fit_matches_plain_fit() {
         // fit_weighted_pca_pooled is currently just an alias but the
diff --git a/kakeyaturbo/src/quantize.rs b/kakeyaturbo/src/quantize.rs
index 36dd3d0b..2eee2c78 100644
--- a/kakeyaturbo/src/quantize.rs
+++ b/kakeyaturbo/src/quantize.rs
@@ -47,11 +47,46 @@ pub fn centroids_gaussian(bits: u8) -> &'static [f32] {
 ///
 /// For `R = MSE`, `R::d(x, c)` inlines to `(x - c)²` and the function
 /// becomes a pure argmin loop with no dispatch in the emitted code.
+///
+/// Uses the unit-variance-Gaussian Lloyd-Max centroids by default.
 #[inline]
 #[must_use]
 pub fn quantize_vector<R: Distortion>(x: &[f32], bits: u8) -> Vec<u8> {
+    quantize_vector_with_centroids::<R>(x, bits, None)
+}
+
+/// Variant of [`quantize_vector`] that accepts an optional caller-supplied
+/// centroid table. When `Some`, must contain exactly `1 << bits` sorted
+/// floats; when `None`, falls back to the Gaussian default.
+///
+/// The calibrated-codebook path (fitting Lloyd-Max centroids on the
+/// empirical residual distribution of a specific model) uses this to
+/// replace the unit-variance Gaussian assumption with a model-specific
+/// optimum. Paper §2 calls this the "codebook calibration" step.
+#[inline]
+#[must_use]
+pub fn quantize_vector_with_centroids<R: Distortion>(
+    x: &[f32],
+    bits: u8,
+    custom_centroids: Option<&[f32]>,
+) -> Vec<u8> {
     assert!((1..=4).contains(&bits), "bits must be 1..=4");
-    let centroids = centroids_gaussian(bits);
+    let default;
+    let centroids: &[f32] = match custom_centroids {
+        Some(c) => {
+            assert_eq!(
+                c.len(),
+                1usize << bits,
+                "custom centroids must have {} entries for {bits}-bit",
+                1usize << bits
+            );
+            c
+        }
+        None => {
+            default = centroids_gaussian(bits);
+            default
+        }
+    };
     let mut out = Vec::with_capacity(x.len());
     for &xi in x {
         let mut best_idx: u8 = 0;
@@ -69,10 +104,38 @@ pub fn quantize_vector<R: Distortion>(x: &[f32], bits: u8) -> Vec<u8> {
 }
 
 /// Reverse of [`quantize_vector`]: map indices back to centroid values.
+///
+/// Uses the unit-variance-Gaussian Lloyd-Max centroids by default.
 #[must_use]
 pub fn dequantize_vector(indices: &[u8], bits: u8) -> Vec<f32> {
+    dequantize_vector_with_centroids(indices, bits, None)
+}
+
+/// Variant of [`dequantize_vector`] that accepts an optional caller-supplied
+/// centroid table (same contract as [`quantize_vector_with_centroids`]).
+#[must_use]
+pub fn dequantize_vector_with_centroids(
+    indices: &[u8],
+    bits: u8,
+    custom_centroids: Option<&[f32]>,
+) -> Vec<f32> {
     assert!((1..=4).contains(&bits), "bits must be 1..=4");
-    let centroids = centroids_gaussian(bits);
+    let default;
+    let centroids: &[f32] = match custom_centroids {
+        Some(c) => {
+            assert_eq!(
+                c.len(),
+                1usize << bits,
+                "custom centroids must have {} entries for {bits}-bit",
+                1usize << bits
+            );
+            c
+        }
+        None => {
+            default = centroids_gaussian(bits);
+            default
+        }
+    };
     indices.iter().map(|&i| centroids[i as usize]).collect()
 }
 
@@ -360,6 +423,99 @@ mod tests {
         let _ = unpack_bits(&[0u8], 9, 1);
     }
 
+    // -------------------- custom centroids --------------------
+
+    #[test]
+    fn quantize_with_custom_centroids_matches_nearest() {
+        // Asymmetric centroids — calibrated for a heavy-tailed distribution
+        // (bigger extreme centroids, denser near 0).
+        let c: Vec<f32> = vec![-3.0, -0.5, 0.5, 3.0];  // b=2, 4 entries
+        let input = vec![-4.0_f32, -1.0, -0.3, 0.0, 0.3, 1.0, 4.0];
+        let q = quantize_vector_with_centroids::<MSE>(&input, 2, Some(&c));
+        //  -4 → 0 (nearest -3)
+        //  -1 → 1 (nearest -0.5, since |-1 - -0.5| = 0.5 < |-1 - -3| = 2)
+        //  -0.3 → 1 (nearest -0.5)
+        //  0.0 → 1 or 2 (equidistant, argmin picks first = 1)
+        //  0.3 → 2 (nearest 0.5)
+        //  1.0 → 2 (nearest 0.5)
+        //  4.0 → 3 (nearest 3)
+        assert_eq!(q, vec![0, 1, 1, 1, 2, 2, 3]);
+    }
+
+    #[test]
+    fn custom_centroids_round_trip() {
+        let c: Vec<f32> = vec![-2.5, -0.3, 0.3, 2.5];
+        let input = c.clone();
+        let q = quantize_vector_with_centroids::<MSE>(&input, 2, Some(&c));
+        let rec = dequantize_vector_with_centroids(&q, 2, Some(&c));
+        for (a, b) in input.iter().zip(&rec) {
+            assert_abs_diff_eq!(a, b, epsilon = 1e-5);
+        }
+    }
+
+    #[test]
+    fn custom_centroids_outperform_gaussian_on_calibrated_source() {
+        // Data that's heavy-tailed — calibrated Lloyd-Max should beat
+        // the Gaussian-assumed centroids on reconstruction MSE.
+        // Build 1000 samples from Laplace-like mixture.
+        use rand::rngs::SmallRng;
+        use rand::{Rng, SeedableRng};
+        let mut rng = SmallRng::seed_from_u64(7);
+        let input: Vec<f32> = (0..1000)
+            .map(|_| {
+                let u: f32 = rng.gen();
+                // Laplace sample: sign(u-0.5) * ln(1 - 2|u-0.5|)
+                let s = (u - 0.5).signum();
+                let v: f32 = -(1.0 - 2.0 * (u - 0.5).abs()).ln();
+                s * v
+            })
+            .collect();
+
+        // Empirical Lloyd-Max at b=2 for Laplace-like data:
+        // Compute mean of each quartile empirically — simple approximation.
+        let mut sorted = input.clone();
+        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        let q1 = sorted.len() / 4;
+        let q2 = sorted.len() / 2;
+        let q3 = 3 * sorted.len() / 4;
+        let c_calibrated: Vec<f32> = vec![
+            sorted[..q1].iter().sum::<f32>() / q1 as f32,
+            sorted[q1..q2].iter().sum::<f32>() / (q2 - q1) as f32,
+            sorted[q2..q3].iter().sum::<f32>() / (q3 - q2) as f32,
+            sorted[q3..].iter().sum::<f32>() / (sorted.len() - q3) as f32,
+        ];
+
+        let q_gauss = quantize_vector_with_centroids::<MSE>(&input, 2, None);
+        let rec_gauss = dequantize_vector_with_centroids(&q_gauss, 2, None);
+        let mse_gauss: f32 = input
+            .iter()
+            .zip(&rec_gauss)
+            .map(|(a, b)| (a - b).powi(2))
+            .sum::<f32>()
+            / input.len() as f32;
+
+        let q_cal = quantize_vector_with_centroids::<MSE>(&input, 2, Some(&c_calibrated));
+        let rec_cal = dequantize_vector_with_centroids(&q_cal, 2, Some(&c_calibrated));
+        let mse_cal: f32 = input
+            .iter()
+            .zip(&rec_cal)
+            .map(|(a, b)| (a - b).powi(2))
+            .sum::<f32>()
+            / input.len() as f32;
+
+        assert!(
+            mse_cal < mse_gauss,
+            "calibrated codebook MSE ({mse_cal:.4}) should beat Gaussian ({mse_gauss:.4}) on Laplace-distributed input"
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "custom centroids must have")]
+    fn quantize_rejects_wrong_count_centroids() {
+        let wrong = vec![-1.0_f32, 0.0, 1.0];  // 3 entries, but bits=2 needs 4
+        let _ = quantize_vector_with_centroids::<MSE>(&[0.0_f32], 2, Some(&wrong));
+    }
+
     #[test]
     fn pack_quantise_full_chain() {
         // End-to-end: quantise → pack → unpack → dequantise
diff --git a/kakeyaturbo/tests/integration.rs b/kakeyaturbo/tests/integration.rs
index b4db852f..4c6ef4ab 100644
--- a/kakeyaturbo/tests/integration.rs
+++ b/kakeyaturbo/tests/integration.rs
@@ -42,6 +42,7 @@ fn end_to_end_mse_block_64x32() {
         bit_width: 4,
         rotation_seed: 0xFEED,
         kmeans_max_iter: 64,
+        ..Default::default()
     };
     let (sk, codes) = encode_block::<MSE>(&block, &w, d, &params);
     let recovered = decode_block::<MSE>(&sk, &codes);
diff --git a/reports/v1_3_ppl/FINDINGS.md b/reports/v1_3_ppl/FINDINGS.md
new file mode 100644
index 00000000..fe4ec5fe
--- /dev/null
+++ b/reports/v1_3_ppl/FINDINGS.md
@@ -0,0 +1,114 @@
+# v1.3 PPL on vLLM — production cell + per-channel attribution
+
+**Setup.** vLLM 0.7.3, V0 engine, `enforce_eager=True`, bf16,
+Flash-Attention backend. Model:
+`deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B` (28 layers, 2 KV heads,
+head_dim=128). GPU: NVIDIA H200 80 GB (Vast.ai). 4 WikiText-103 test
+passages, ctx=2048, evaluate positions `[2048, 2112)` (64 teacher-
+forced next tokens per passage). Shared reference logprobs per
+passage — all rows below are strictly paired.
+
+**Codec config** (SPRINT_CLOSEOUT production cell):
+K b=3 + V b=2 + randomized PCA rank=D/2 + calibrated Lloyd-Max +
+outlier T=2.0 + 6-layer boundary skip `[0,1,7,14,26,27]` + pre-RoPE
+Q-preconditioning. Integration hooks `Qwen2Attention.forward` before
+RoPE so whitening applies to pre-RoPE K.
+
+HF reference for this cell (SPRINT_CLOSEOUT, HF eager + 2-pass
+DynamicCache): **+7.82 % Δppl, 78.97 % top-1, MARGINAL**.
+
+## Results
+
+| Row | K | V | Q-precond | K-centroids | K-outlier | V-centroids | V-outlier | Boundary | **Δppl** | **top-1** | Verdict |
+|:----|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|----:|----:|:-:|
+| **production** (K+V)                              | codec | codec | on   | on  | T=2.0 | on  | off   | 6-layer | **+35.33 %** | 59.38 % | REJECT |
+| **K-only** (V bf16)                               | codec | bf16  | on   | on  | T=2.0 | —   | —     | 6-layer | **+22.55 %** | 69.14 % | REJECT |
+| **V-only** (K bf16, SPRINT_CLOSEOUT V-side recipe) | bf16  | codec | N/A  | —   | —     | on  | off   | 6-layer | **+11.10 %** | 74.22 % | REJECT |
+| **V-only + outlier** (all applicable guardrails)  | bf16  | codec | N/A  | —   | —     | on  | **T=2.0** | 6-layer | **+7.04 %**  | 75.39 % | REJECT |
+
+### What "four guardrails" means for each stream
+
+SPRINT_CLOSEOUT lists four PPL-stabilization guardrails; the
+applicability to each channel is:
+
+| Guardrail | K | V |
+|:----------|:-:|:-:|
+| (1) Q-preconditioning (Chol Σ_q on K)            | required | **N/A** — V does not contract with Q; no Σ_q metric on V |
+| (2) calibrated Lloyd-Max centroids               | on (`ds_K_b3_centroids.f32`) | on (`ds_V_b2_centroids.f32`) |
+| (3) 6-layer boundary skip                        | same layer set | same layer set (K and V skip together) |
+| (4) outlier compensation T=2.0                   | on (K only in SPRINT_CLOSEOUT) | **off in SPRINT_CLOSEOUT**, turned **on** in the last row of this table |
+
+So "V with all applicable guardrails" = (2) + (3) + (4). Running
+that on vLLM drops V-only Δppl from **+11.10 % → +7.04 %** (a
+4.06 pp improvement) and top-1 rises from 74.22 % → 75.39 %. V b=2
+is genuinely helped by outlier compensation on vLLM, even though
+SPRINT_CLOSEOUT had good reasons to omit it on HF (V residual was
+already near-Gaussian there, so outlier saved very little).
+
+## Per-channel Δppl attribution under the production cell
+
+With the three-row attribution (K+V joint, K-only, V-only at
+SPRINT_CLOSEOUT V-side recipe = no V outlier):
+
+- **K stream**  : **+22.55 pp / 64 %** of joint +35.33 pp.
+- **V stream**  : **+11.10 pp / 31 %** of joint +35.33 pp.
+- **interaction**: ~1.68 pp.
+
+K carries about two-thirds of the damage even though the entire
+K-side guardrail stack is already applied. V at b=2 with its current
+2-guardrail (+ 6-bdry) recipe carries about one-third.
+
+With V outlier compensation also enabled (V-only row 4 above), V's
+standalone Δppl drops to +7.04 pp. If the joint K+V cell were re-run
+with outlier compensation on *both* streams, the expected Δppl
+(assuming the same ≈1.7 pp residual interaction) would be roughly
+**+22.55 + 7.04 + 1.68 ≈ +31 pp**. The real joint measurement is
+the next datapoint this PR should pick up.
+
+## Reading
+
+- Enabling V outlier compensation closes ~4 pp of V-only Δppl on
+  vLLM. That is a meaningful but non-decisive fraction of the 27 pp
+  gap vs HF's +7.82 % MARGINAL joint cell.
+- K is still the bigger lever even after enabling every applicable
+  guardrail on both streams. On HF the K-side stack is near-lossless;
+  on vLLM it still leaves +22 pp. The HF↔vLLM gap is primarily a
+  K-stream phenomenon; V outlier compensation is a cheap add-on
+  that shaves ~4 pp off the joint cost on top.
+- MARGINAL threshold (\|Δppl\| ≤ 3 %) is still out of reach on either
+  channel alone.
+
+## Artifacts
+
+- `vllm/ds_distill_qwen_1_5b_vllm_full.json` — production (K+V).
+- `vllm_k_only/ds_distill_qwen_1_5b_k_only_vllm_full.json` — K-only.
+- `vllm_v_only/ds_distill_qwen_1_5b_v_only_vllm_full.json` — V-only
+  (SPRINT_CLOSEOUT V-side recipe, no V outlier).
+- `vllm_v_only_full_guards/ds_distill_qwen_1_5b_v_only_full_guards_vllm_full.json`
+  — V-only with V outlier T=2.0 (all applicable V guardrails on).
+
+## Reproduce
+
+Single driver `benchmarks/run_v1_3_ppl_full_vllm.sh`; the knobs are
+`COMPRESS_STREAM ∈ {kv, k, v}` and `V_OUTLIER_THRESHOLD` (empty = off).
+
+```bash
+# production (K+V, SPRINT_CLOSEOUT recipe)
+bash benchmarks/run_v1_3_ppl_full_vllm.sh
+
+# K-only (V pass-through)
+COMPRESS_STREAM=k MODEL_NAME=ds_distill_qwen_1_5b_k_only \
+OUT_DIR=reports/v1_3_ppl/vllm_k_only \
+bash benchmarks/run_v1_3_ppl_full_vllm.sh
+
+# V-only (K pass-through, SPRINT_CLOSEOUT V-side recipe)
+COMPRESS_STREAM=v MODEL_NAME=ds_distill_qwen_1_5b_v_only \
+OUT_DIR=reports/v1_3_ppl/vllm_v_only \
+bash benchmarks/run_v1_3_ppl_full_vllm.sh
+
+# V-only with outlier T=2.0 (symmetric guardrail add)
+COMPRESS_STREAM=v V_OUTLIER_THRESHOLD=2.0 \
+MODEL_NAME=ds_distill_qwen_1_5b_v_only_full_guards \
+OUT_DIR=reports/v1_3_ppl/vllm_v_only_full_guards \
+bash benchmarks/run_v1_3_ppl_full_vllm.sh
+```
diff --git a/reports/v1_3_ppl/snapshot_mode/COMPARISON_VLLM_PR_38479.md b/reports/v1_3_ppl/snapshot_mode/COMPARISON_VLLM_PR_38479.md
new file mode 100644
index 00000000..9f0a5088
--- /dev/null
+++ b/reports/v1_3_ppl/snapshot_mode/COMPARISON_VLLM_PR_38479.md
@@ -0,0 +1,145 @@
+# Comparison: PR #17 (ours) vs vLLM upstream PR #38479 (TurboQuant)
+
+**Sources.**
+- vLLM PR: https://github.com/vllm-project/vllm/pull/38479 (merged 2026-04-15)
+- Our PR #17: snapshot-mode on vLLM 0.7.3, DS-Distill-Qwen-1.5B, v1.3 PPL recipe.
+
+## TL;DR
+
+1. **PR #38479's PR-description numbers and its actual community-verified numbers are very different.** The headline table claims `tq3` ≈ 4.9× with 0.72 GSM8K; multiple independent reproductions (Neural Magic, Finland Verda cloud, a community contributor on A4000) report `tq3` defaults → **near-zero** on GSM8K (0.009), coherent output only with `TQ_VALUE_BITS=8`, which drops compression to ~2×.
+2. The **reliable** operating point of PR #38479 is **`k8v4`** (FP8 keys + 4-bit uniform values) at **2.6× compression, ~96 % GSM8K retention**. That point is one tier *less aggressive* than our v1.3 PPL 4.61× target, and it uses substantially more bits per K coordinate (FP8 = 8 bits) than our K b=3.
+3. Their codec math is **different from ours** in ways that matter: no PCA rank reduction, no Q-preconditioning, no outlier compensation, but with norm-correction and per-head quantization, and **fused Triton kernels that keep the dequant → attention path numerically tight** (no fp32↔bf16 CPU round-trip).
+
+## Side-by-side table
+
+| Axis | vLLM #38479 (merged)                            | Our PR #17 (snapshot-mode)                        |
+|:-----|:------------------------------------------------|:--------------------------------------------------|
+| Quality metric reported    | GSM8K accuracy, NIAH probes     | WikiText-103 PPL Δ, top-1 agreement |
+| Model tested               | Qwen3-4B (head_dim=128)         | DS-Distill-Qwen-1.5B (head_dim=128) |
+| Context length             | 4K – 32K (NIAH probes)          | 2048 + 64 (teacher-force PPL window) |
+| Compression location       | **Online, store-time fused Triton kernel** in FA v1 backend | Offline snapshot via Rust CPU subprocess, substituted back via hook |
+| Decompression location     | **In-Triton-kernel**, fused with attention kernel | Full fp32 decode from CPU, returned as dense bf16 tensor |
+| K path                     | WHT rotation on raw K → Lloyd-Max scalar quant (Gaussian centroids) → bit-pack | PCA rank=D/2 + WHT + Lloyd-Max (real-data calibrated centroids) + outlier T=2.0 |
+| V path                     | Uniform scalar quant + norm correction | PCA + WHT + Lloyd-Max share_basis_v (real-data calibrated centroids) |
+| Per-head quantization      | **Yes** — store is per-(token, head, bit)         | **No** — we reshape `[num_tokens × num_kv_heads, head_dim]` and pool all heads in the same 512-row PCA block |
+| Q-preconditioning (Chol Σ_q) | ❌ Not present                  | ✅ Present (K-only, whitening before codec)        |
+| Outlier compensation       | ❌ Not present                  | ✅ T=2.0, ~4.5 % K residual coords stored as sparse f16 |
+| Norm correction (NC)       | ✅ Present (centroid renorm before inverse rotation) | ❌ Not present |
+| Boundary-layer skip        | ✅ `--kv-cache-dtype-skip-layers 0,1,L-2,L-1` | ✅ 6-layer skip `[0,1,7,14,26,27]` (DS-Distill) |
+| Calibration                | ✅ Offline Lloyd-Max centroids fit on pooled real data |
+| Gaussian defaults          | ✅ Default Lloyd-Max from N(0,1) — no calibration step |
+| Asymmetric K/V precision   | ✅ Supported via presets         | ✅ K b=3 / V b=2                                  |
+| vLLM integration           | Full V1 attention-backend subclass (`TurboQuantAttentionBackend`) + CUDAGraph fixes + stream overlap | Research harness via `Qwen2Attention.forward` monkey-patch, V0 engine, `enforce_eager=True` |
+| Throughput cost            | **79 % – 100 % of FP16 baseline tok/s** (k8v4), reported on RTX PRO 6000 | Research measurement, not optimized for throughput |
+
+## Community-verified quality numbers (from PR #38479 comments)
+
+| Config | Keys | Values | Compression | Reported GSM8K | Comment |
+|:-------|:----:|:------:|:----------:|:---------------:|:--------|
+| baseline | FP16 | FP16 | 1.0× | 0.900 | |
+| **`k8v4`** | **FP8 E4M3** | **4-bit uniform** | **2.6×** | **0.860** | **community-verified stable** |
+| `4bit_nc` | 4-bit MSE+NC | 4-bit uniform+NC | 3.8× | 0.840 | varjoranta: "4-bit MSE pack path broken in `tq4`, garbage output" |
+| `k3v4_nc` | 3-bit MSE+NC | 4-bit uniform+NC | 4.3× | 0.780 | N/A |
+| `3bit_nc` | 3-bit MSE+NC | 3-bit uniform+NC | 4.9× | 0.720 | mgoin: **independent repro scored 0.009 on GSM8K**, tq4 crashes |
+| `tq3` + `TQ_VALUE_BITS=8` | 2-bit MSE | FP8 | ~2× | **100 % baseline** | MidasMining / varjoranta: this is "the safe shipping default" |
+
+**Interpretation.** The usable high-compression point in PR #38479 is still around `k8v4` at 2.6× — which makes K carry ~8 bits (FP8) of per-coord precision rather than 3. When community reproductions pushed to 3-bit K + 2-bit V (matching our recipe's compression ratio), quality collapsed just as badly as ours on vLLM does.
+
+## Our PR #17 quality (same production recipe across engines)
+
+| Harness | Engine | Δppl | top-1 | Verdict | Compression |
+|:--------|:------|-----:|------:|:-------:|:----------:|
+| HF 2-pass DynamicCache | HF eager | **+7.82 %** | 78.97 % | MARGINAL | 4.61× |
+| **vLLM snapshot-mode (PR #17)** | vLLM 0.7.3 FA | **+29.07 %** | 74.22 % | REJECT | 4.61× |
+| vLLM in-forward (PR #15) | vLLM 0.7.3 FA | +35.33 % | 59.38 % | REJECT | 4.61× |
+
+## Why their "good" numbers are not directly comparable to ours
+
+1. **Different quality metric.** GSM8K accuracy is a *downstream-task* metric; WikiText-103 teacher-force PPL is a *next-token-distribution* metric. A codec that hurts the 95th-percentile logit tail can leave GSM8K nearly untouched (because the model still picks the right intermediate steps) while Δppl looks catastrophic. **GSM8K @ 96 % is NOT a claim of "PPL is preserved"**; it is a weaker claim that the answer chain-of-thought is preserved.
+2. **Different compression ratios.** Their verified-stable config is 2.6×. Our target is 4.61×. A head-to-head comparison at matched compression would put theirs at `3bit_nc` (4.9×) — which the community couldn't get to work — or our codec at K b=4 + V b=4 which we measured at **+27.3 % Δppl** on vLLM, roughly matching their failure point.
+3. **Different model.** Qwen3-4B is their test model; DS-Distill-Qwen-1.5B is ours. DS-Distill is a heavily distilled-from-DeepSeek model with unusual logit distributions; Qwen3-4B is closer to a standard instruction-tuned transformer.
+
+## Techniques in PR #38479 that we don't have, and their likely impact on our PPL gap
+
+### 1. **Fused in-kernel decompression** (probably the biggest win)
+
+PR #38479's decode path is a **single Triton kernel** that:
+```
+cache → unpack K → dequant → Q·K scores → softmax → ··· → output
+```
+bf16 → quant → bit-pack happens in one GPU kernel at store; dequant → FA math happens in one GPU kernel at decode. **No fp32 intermediates leak into the residual stream.**
+
+Our PR #17 snapshot path:
+```
+capture pass: GPU bf16 → .cpu().to(fp32).numpy() → Rust process (disk KKTV I/O) → codec → disk → numpy → .cuda().to(fp32)
+replace pass: substitute the reconstructed fp32 tensors as K/V input to FA
+```
+Each layer's replacement tensor enters FA **in fp32** (we cast back to bf16 inside the hook). Even with the hook's fp32→bf16 right before FA, the **internal path through FA's bf16 softmax** has to integrate the codec's error against that bf16-accumulated attention score. The merged PR argues, and our noise-sensitivity curve from Phase 2 is consistent with, that **bf16 softmax has a rougher response to structured error than the "noise" test suggests**, so keeping dequant in-kernel and fusing into the Q·K kernel avoids some of that roughness.
+
+**Concrete hypothesis:** Porting our codec into a fused Triton decode kernel would likely move our PR #17 Δppl closer to HF's +8 % without any algorithm changes.
+
+### 2. **Per-head quantization**
+
+Their store kernel quantizes per (token, head, stream) — each head has its own Lloyd-Max bucketing. Ours pools `[num_tokens, num_kv_heads]` into a single `(N, head_dim)` stream, so the PCA basis and the K-means codebook are shared across heads within one block. DS-Distill has 2 KV heads (GQA), so only 2 heads share a basis — less harmful than a model with 32 heads, but the effect isn't zero. Per-head quantization preserves distinct distributions of head statistics (which the codec eats into the shared basis otherwise).
+
+**Concrete hypothesis:** Per-head PCA + codec would close 1-3 pp of Δppl on DS-Distill and more on models with more KV heads.
+
+### 3. **Norm correction (NC)**
+
+After inverse rotation, quantization has shifted the codebook vectors' norms. NC re-normalizes reconstructed centroid vectors to match the original pre-quantization norm. The PR claims ~0.8 % PPL improvement at 4-bit. We don't do this; our inverse-WHT preserves the rotation's orthogonality but not the quantizer's norm preservation.
+
+**Concrete hypothesis:** +1-2 pp on Δppl.
+
+### 4. **No PCA rank reduction**
+
+PR #38479 keeps all `head_dim = 128` dimensions (WHT is `128 → 128`). We PCA-reduce to `d_eff = head_dim × variance_ratio` or `head_dim / 2`. Rank reduction throws away some fraction of the K energy at the skeleton level. At high compression that matters.
+
+**Concrete hypothesis:** +3-5 pp on Δppl, cost: compression ratio drops from 4.6× to ~3.5×.
+
+### 5. **Calibrated Lloyd-Max centroids — we have this, they don't**
+
+Ironically, this is a feature of OUR codec that their verified-stable config lacks. Their `k8v4` uses FP8 (not Lloyd-Max at all) for K, and their MSE presets use Gaussian-default centroids. We fit real-data Lloyd-Max offline. The empirical Lloyd-Max improvement over Gaussian is ~1.47× MSE gain at K b=2 (PR #13 data).
+
+### 6. **Q-preconditioning — we have this, they don't**
+
+Our Chol Σ_q pre-whitening of K aligns the codec's MSE metric with the Σ_q-weighted distortion that attention actually "sees". Their WHT-only rotation does not. Phase 2 of our gap decomposition showed Q-precond helps significantly (~4× reduction in Δppl relative to "no Q-precond" at b=2).
+
+### 7. **Outlier compensation — we have this, they don't**
+
+Our K residual outlier path (T=2.0, sparse f16 overrides on the ~4.5 % of coords that exceed the threshold). SPRINT_CLOSEOUT's evidence is that this closes ~8 pp of Δppl on the HF v1.3 PPL cell. They use uniform / Lloyd-Max without any outlier path.
+
+## What PR #38479 would look like if ported our algorithm
+
+Adding our algorithmic pieces to their engineering:
+
+1. Our **Q-preconditioning** → their K store kernel
+2. Our **real-data Lloyd-Max centroids** → their centroid table
+3. Our **outlier compensation** → their K path
+4. Keep their **per-head quantization + NC + fused Triton**
+
+This would combine the **numerical tightness of their store/decode path** with the **algorithmic guardrails** of our v1.3 PPL recipe. At matched 4.6× compression, the expected Δppl on vLLM would likely be somewhere between HF's +7.82 % and our current +29 %, though I don't have an empirical number without running it.
+
+## What PR #17 would look like if it used their engineering
+
+Adding their engineering pieces to our algorithm:
+
+1. Our full v1.3 PPL algorithm **in a fused Triton store/decode kernel**
+2. Per-head quantization (eliminate our `[tokens × n_kv]` block pooling)
+3. Add norm correction after inverse WHT
+
+This would likely:
+- Close the ~11 pp "intrinsic engine" bucket identified in PR #17's revised decomposition (most of it is the CPU round-trip + fp32/bf16 boundary issue)
+- Close a few more pp from per-head quantization and NC
+
+The resulting number on vLLM at 4.6× might reach **Δppl ~+10 %, top-1 > 78 %**, closing the HF-vs-vLLM gap substantially — at the engineering cost of writing Triton kernels.
+
+## Recommended next steps for this branch
+
+Ranked by cost/benefit:
+
+1. **Add norm correction to our snapshot-mode harness** (small change). Re-run PR #17. Expected: ~1-2 pp Δppl improvement. (Low risk, easy win.)
+2. **Switch to per-head quantization** (medium change to Python side; no Rust change required). Re-run PR #17. Expected: 1-3 pp Δppl improvement.
+3. **Fuse dequant into vLLM's FA decode path (Triton rewrite)**. Large engineering project; would require cloning TurboQuantAttentionBackend structure and substituting our codec math. Expected: most of the remaining engine-level bucket.
+4. **Port our algorithm into PR #38479's backend** — i.e. submit a follow-up PR upstream that adds Q-precond + calibrated Lloyd-Max + outlier compensation to TurboQuant. That's where the two efforts converge: their engineering + our algorithm.
+
+Option 4 is probably the right long-term play for the project, since PR #38479 has already landed upstream (merged April 15) and is the surface on which vLLM KV quantization will evolve. Options 1 and 2 can be done on this branch to characterise their delta before attempting option 3 or 4.
diff --git a/reports/v1_3_ppl/snapshot_mode/FINDINGS.md b/reports/v1_3_ppl/snapshot_mode/FINDINGS.md
new file mode 100644
index 00000000..4e2bd1f6
--- /dev/null
+++ b/reports/v1_3_ppl/snapshot_mode/FINDINGS.md
@@ -0,0 +1,125 @@
+# Scenario A — snapshot-mode KV compression on vLLM
+
+**Question.** PR #16 decomposed the 27 pp HF↔vLLM Δppl gap and
+attributed up to +39 pp to "cross-layer non-linear compounding"
+caused by the production harness applying the codec inside the
+forward at every layer. If that attribution is right, switching
+vLLM to HF's two-pass snapshot semantics (clean prefill → codec
+snapshot → teacher-force against codec'd cache) should land Δppl
+close to HF's **+7.82 %** MARGINAL number.
+
+**Setup.** Same codec recipe and same 4 WikiText-103 test passages
+as every prior run (DS-Distill-Qwen-1.5B, ctx=2048, n_eval=64,
+K b=3 + V b=2 + Q-precond + calibrated Lloyd-Max + outlier T=2.0
++ 6-layer boundary skip). vLLM 0.7.3, FLASH_ATTN, bf16, H200.
+
+## Implementation
+
+New harness `benchmarks/e2e_ppl_validation_vllm_snapshot.py` runs
+each passage through vLLM twice:
+
+1. **capture pass** — `Qwen2Attention.forward` hook records the
+   per-layer pre-RoPE K, V for all 2112 prompt tokens while the
+   codec is OFF. The forward is otherwise untouched so the ref
+   PPL is a true clean pass.
+2. **offline codec** — runs the production v1.3 codec on every
+   captured (layer, stream) snapshot (Q-precond → Rust codec →
+   un-whiten; boundary layers skipped; K outlier T=2.0 on).
+3. **replace pass** — second forward through the same engine. The
+   hook **substitutes** each layer's projected `k, v` with the
+   pre-codec'd tensor from step 2 instead of letting the layer
+   project from its current (potentially shifted) residual. Q
+   still comes from the running residual, matching HF's
+   teacher-force flow.
+
+The net effect mirrors HF's DynamicCache pattern: every layer's
+cache K/V depends only on the codec'd clean snapshot, not on what
+earlier layers did to the residual during this forward.
+
+## Result
+
+| Passage | ppl_ref | ppl_alt | Δppl       | top-1   |
+|:-:|--:|--:|--:|--:|
+| 1 | 124.88 | 139.26 | **+11.52 %** | 75.00 % |
+| 2 |  33.00 |  37.30 | **+13.01 %** | 71.88 % |
+| 3 |   8.54 |  11.57 | **+35.59 %** | 73.44 % |
+| 4 |  25.36 |  39.60 | **+56.17 %** | 76.56 % |
+
+**Aggregate**: `Δppl +29.07 %`, `top-1 74.22 %`, **REJECT**.
+
+Codec time per passage: ~18 s (offline CPU). Alt forward: ~0.17 s
+(GPU-only, no in-forward subprocess).
+
+## Cross-mode comparison
+
+| Mode | Harness | Δppl | top-1 | Verdict |
+|:---|:---|--:|--:|:-:|
+| HF 2-pass DynamicCache (SPRINT_CLOSEOUT) | HF eager | **+7.82 %** | 78.97 % | **MARGINAL** |
+| **vLLM snapshot-mode (this run)**         | vLLM FA | **+29.07 %** | **74.22 %** | REJECT |
+| vLLM in-forward (PR #15 production)       | vLLM FA | +35.33 % | 59.38 % | REJECT |
+
+## Finding — the +39 pp compounding estimate was wrong
+
+PR #16 Phase 6 predicted snapshot-mode vLLM would land near HF's
++8 % because the sum of 22 non-boundary single-layer Δppl
+contributions was −3.9 % and the "extra +39 pp" was attributed to
+in-forward cross-layer compounding.
+
+**The actual snapshot-mode run removes only ~6 pp** (+35.33 →
++29.07). The top-1 agreement does jump substantially (59.38 % →
+74.22 %, nearly reaching HF's 78.97 %), confirming that the
+in-forward harness WAS polluting the one-best prediction — the
+codec-shifted residual was changing which token the model argmaxed
+at each position. But the **Δppl** stays much higher than HF's
++7.82 %, so the in-forward pollution accounts for only a small
+fraction of the HF↔vLLM gap.
+
+### Re-decomposition of the 27 pp HF↔vLLM Δppl gap
+
+| Bucket (revised) | Δppl attribution |
+|:---|:---:|
+| in-forward vs snapshot (harness integration) | **~6 pp** (of the 27 pp) |
+| engine baseline shift (Phase 1, clean model) | ~10 pp |
+| residual "intrinsic engine" (FA bf16 attention + softmax + residual stream in bf16) | **~11 pp** |
+
+**The dominant term is actually the engine itself**, not the harness.
+Phase 6's Phase-3-based estimate of a +39 pp compounding term was
+a miscalculation: the per-layer singletons summed to −3.9 %, but
+the joint forward **also had contributions from the snapshot-mode
+error compounding** that we attributed to "harness-only". Running
+snapshot-mode lets us separate them cleanly.
+
+## Deployment implications
+
+**Scenario A is better, but still rejects.** Using the codec as a
+post-prefill cache compressor on vLLM gives +29 pp Δppl / 74 %
+top-1 — better than the in-forward harness (+35 / 59 %) but far
+from HF's MARGINAL +8 %. Scenario A is still the correct semantics
+to deploy (it corresponds to the realistic "compress already-filled
+paged cache" use case), but on this model / codec config it does
+not reach quality parity with HF.
+
+### Where the remaining ~21 pp actually lives
+
+With the harness-integration term now bounded at ~6 pp, the
+remaining ~21 pp vs HF is split between:
+
+- **clean-model baseline mismatch** (~10 pp) — codec-OFF HF and
+  vLLM disagree on logits by KL 0.145. Nothing the codec can do
+  changes this.
+- **intrinsic engine compounding** (~11 pp) — how FA's bf16
+  attention kernel propagates identical codec residuals through
+  28 layers differs from HF's eager path with fp32-accumulate
+  softmax. This is fundamental to the engine and cannot be fixed
+  from the harness side.
+
+Top-1 at 74.22 % (within 4.75 pp of HF's 78.97 %) is the first
+positive datapoint on vLLM at this codec config. It suggests the
+codec's argmax-preserving property IS recoverable when the codec
+runs on a clean snapshot; the Δppl gap that remains is in the
+logit **distribution**, not in the top-1 choice.
+
+## Artifacts
+
+- `ds_distill_qwen_1_5b_snapshot_vllm_snapshot.json` — per-passage
+  metrics, including codec-offline timing per passage.
diff --git a/reports/v1_3_ppl/snapshot_mode/ds_distill_qwen_1_5b_snapshot_vllm_snapshot.json b/reports/v1_3_ppl/snapshot_mode/ds_distill_qwen_1_5b_snapshot_vllm_snapshot.json
new file mode 100644
index 00000000..ec041e37
--- /dev/null
+++ b/reports/v1_3_ppl/snapshot_mode/ds_distill_qwen_1_5b_snapshot_vllm_snapshot.json
@@ -0,0 +1,100 @@
+{
+  "model_name": "ds_distill_qwen_1_5b_snapshot",
+  "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "engine": "vllm",
+  "recipe": "v1.3 PPL snapshot-mode",
+  "ctx_len": 2048,
+  "n_eval": 64,
+  "bit_width_k": 3,
+  "bit_width_v": 2,
+  "outlier_threshold": 2.0,
+  "boundary_skip_layers": [
+    0,
+    1,
+    7,
+    14,
+    26,
+    27
+  ],
+  "q_calib": "reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors",
+  "k_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32",
+  "v_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32",
+  "n_passages": 4,
+  "mean_ppl_delta_rel": 0.2907341820201673,
+  "mean_top1_agreement": 0.7421875,
+  "verdict": "REJECT",
+  "per_passage": [
+    {
+      "passage": 0,
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.5709743406623602,
+      "t_codec_sec": 18.436915713362396,
+      "t_alt_sec": 0.17307950370013714,
+      "metrics": {
+        "ppl_ref": 124.8755040661336,
+        "ppl_alt": 139.2633315805399,
+        "ppl_delta_rel": 0.11521737287071576,
+        "top1_agreement": 0.75,
+        "mean_abs_dlogp_true": 0.615901511066113,
+        "n_tokens": 64
+      },
+      "n_layers_captured": 28,
+      "n_boundary_skipped": 12
+    },
+    {
+      "passage": 1,
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.370645166374743,
+      "t_codec_sec": 18.282061660662293,
+      "t_alt_sec": 0.16927639301866293,
+      "metrics": {
+        "ppl_ref": 33.0042923548772,
+        "ppl_alt": 37.29780340392389,
+        "ppl_delta_rel": 0.13008947451079708,
+        "top1_agreement": 0.71875,
+        "mean_abs_dlogp_true": 0.6054980524500024,
+        "n_tokens": 64
+      },
+      "n_layers_captured": 28,
+      "n_boundary_skipped": 12
+    },
+    {
+      "passage": 2,
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 1.1817497191950679,
+      "t_codec_sec": 18.702266693115234,
+      "t_alt_sec": 0.14620758593082428,
+      "metrics": {
+        "ppl_ref": 8.535555997020436,
+        "ppl_alt": 11.573601542264326,
+        "ppl_delta_rel": 0.3559282542700675,
+        "top1_agreement": 0.734375,
+        "mean_abs_dlogp_true": 0.6271230471502349,
+        "n_tokens": 64
+      },
+      "n_layers_captured": 28,
+      "n_boundary_skipped": 12
+    },
+    {
+      "passage": 3,
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.2077221991494298,
+      "t_codec_sec": 18.759372600354254,
+      "t_alt_sec": 0.14944668672978878,
+      "metrics": {
+        "ppl_ref": 25.35456860935595,
+        "ppl_alt": 39.59627103463911,
+        "ppl_delta_rel": 0.5617016264290888,
+        "top1_agreement": 0.765625,
+        "mean_abs_dlogp_true": 0.7038803878494946,
+        "n_tokens": 64
+      },
+      "n_layers_captured": 28,
+      "n_boundary_skipped": 12
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_3_ppl/vllm/ds_distill_qwen_1_5b_vllm_full.json b/reports/v1_3_ppl/vllm/ds_distill_qwen_1_5b_vllm_full.json
new file mode 100644
index 00000000..08d2aa2d
--- /dev/null
+++ b/reports/v1_3_ppl/vllm/ds_distill_qwen_1_5b_vllm_full.json
@@ -0,0 +1,101 @@
+{
+  "model_name": "ds_distill_qwen_1_5b",
+  "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "engine": "vllm",
+  "recipe": "v1.3 PPL full guardrails",
+  "ctx_len": 2048,
+  "n_eval": 64,
+  "block_size": 512,
+  "bit_width_k": 3,
+  "bit_width_v": 2,
+  "variance_ratio": 0.95,
+  "pca_method": "randomized",
+  "rsvd_target_rank_factor": 0.5,
+  "q_calib": "reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors",
+  "k_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32",
+  "v_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32",
+  "outlier_threshold": 2.0,
+  "boundary_skip_layers": [
+    0,
+    1,
+    7,
+    14,
+    26,
+    27
+  ],
+  "share_basis_v": true,
+  "n_passages": 4,
+  "mean_ppl_delta_rel": 0.35327723698202207,
+  "mean_top1_agreement": 0.59375,
+  "verdict": "REJECT",
+  "per_passage": [
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.15766104590147734,
+      "t_alt_sec": 18.76719514746219,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 5701692,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 124.8755040661336,
+        "ppl_alt": 113.81097775928545,
+        "ppl_delta_rel": -0.0886044576123466,
+        "top1_agreement": 0.5625,
+        "mean_abs_dlogp_true": 0.7724991553423024,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09441826306283474,
+      "t_alt_sec": 18.18969063460827,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 5593196,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 33.0042923548772,
+        "ppl_alt": 43.82755152870672,
+        "ppl_delta_rel": 0.32793489578424834,
+        "top1_agreement": 0.515625,
+        "mean_abs_dlogp_true": 0.8953173093618716,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09348526131361723,
+      "t_alt_sec": 18.370823281817138,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 5471996,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 8.535555997020436,
+        "ppl_alt": 11.988810819811002,
+        "ppl_delta_rel": 0.40457292108399456,
+        "top1_agreement": 0.65625,
+        "mean_abs_dlogp_true": 0.8133090038209048,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09543649666011333,
+      "t_alt_sec": 18.490491420961916,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 5495768,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 25.35456860935595,
+        "ppl_alt": 44.857444482045075,
+        "ppl_delta_rel": 0.769205588672192,
+        "top1_agreement": 0.640625,
+        "mean_abs_dlogp_true": 1.1356505421993006,
+        "n_tokens": 64
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_3_ppl/vllm_k_only/ds_distill_qwen_1_5b_k_only_vllm_full.json b/reports/v1_3_ppl/vllm_k_only/ds_distill_qwen_1_5b_k_only_vllm_full.json
new file mode 100644
index 00000000..8dbca540
--- /dev/null
+++ b/reports/v1_3_ppl/vllm_k_only/ds_distill_qwen_1_5b_k_only_vllm_full.json
@@ -0,0 +1,101 @@
+{
+  "model_name": "ds_distill_qwen_1_5b_k_only",
+  "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "engine": "vllm",
+  "recipe": "v1.3 PPL full guardrails",
+  "ctx_len": 2048,
+  "n_eval": 64,
+  "block_size": 512,
+  "bit_width_k": 3,
+  "bit_width_v": 2,
+  "variance_ratio": 0.95,
+  "pca_method": "randomized",
+  "rsvd_target_rank_factor": 0.5,
+  "q_calib": "reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors",
+  "k_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32",
+  "v_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32",
+  "outlier_threshold": 2.0,
+  "boundary_skip_layers": [
+    0,
+    1,
+    7,
+    14,
+    26,
+    27
+  ],
+  "share_basis_v": true,
+  "n_passages": 4,
+  "mean_ppl_delta_rel": 0.2254570937234031,
+  "mean_top1_agreement": 0.69140625,
+  "verdict": "REJECT",
+  "per_passage": [
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.13249260932207108,
+      "t_alt_sec": 8.9643594622612,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 2722436,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 124.8755040661336,
+        "ppl_alt": 108.42882936436447,
+        "ppl_delta_rel": -0.13170457108272435,
+        "top1_agreement": 0.734375,
+        "mean_abs_dlogp_true": 0.5818154623475493,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.1002084156498313,
+      "t_alt_sec": 9.241994438692927,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 2613108,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 33.0042923548772,
+        "ppl_alt": 40.43671974196611,
+        "ppl_delta_rel": 0.22519578081456978,
+        "top1_agreement": 0.671875,
+        "mean_abs_dlogp_true": 0.7751498885158981,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09568933583796024,
+      "t_alt_sec": 9.208638415671885,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 2512116,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 8.535555997020436,
+        "ppl_alt": 10.201957740918466,
+        "ppl_delta_rel": 0.1952306029601038,
+        "top1_agreement": 0.703125,
+        "mean_abs_dlogp_true": 0.6314752803191368,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.13525690510869026,
+      "t_alt_sec": 9.435937971808016,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 2522720,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 25.35456860935595,
+        "ppl_alt": 40.89962100554438,
+        "ppl_delta_rel": 0.6131065622016632,
+        "top1_agreement": 0.65625,
+        "mean_abs_dlogp_true": 1.0818352744513504,
+        "n_tokens": 64
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_3_ppl/vllm_v_only/ds_distill_qwen_1_5b_v_only_vllm_full.json b/reports/v1_3_ppl/vllm_v_only/ds_distill_qwen_1_5b_v_only_vllm_full.json
new file mode 100644
index 00000000..e931536c
--- /dev/null
+++ b/reports/v1_3_ppl/vllm_v_only/ds_distill_qwen_1_5b_v_only_vllm_full.json
@@ -0,0 +1,101 @@
+{
+  "model_name": "ds_distill_qwen_1_5b_v_only",
+  "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "engine": "vllm",
+  "recipe": "v1.3 PPL full guardrails",
+  "ctx_len": 2048,
+  "n_eval": 64,
+  "block_size": 512,
+  "bit_width_k": 3,
+  "bit_width_v": 2,
+  "variance_ratio": 0.95,
+  "pca_method": "randomized",
+  "rsvd_target_rank_factor": 0.5,
+  "q_calib": "reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors",
+  "k_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32",
+  "v_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32",
+  "outlier_threshold": 2.0,
+  "boundary_skip_layers": [
+    0,
+    1,
+    7,
+    14,
+    26,
+    27
+  ],
+  "share_basis_v": true,
+  "n_passages": 4,
+  "mean_ppl_delta_rel": 0.11103892941506109,
+  "mean_top1_agreement": 0.7421875,
+  "verdict": "REJECT",
+  "per_passage": [
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.3110441770404577,
+      "t_alt_sec": 8.632471803575754,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 3062272,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 124.8755040661336,
+        "ppl_alt": 134.60567357289466,
+        "ppl_delta_rel": 0.07791896080441843,
+        "top1_agreement": 0.8125,
+        "mean_abs_dlogp_true": 0.5011605188574322,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09386647772043943,
+      "t_alt_sec": 8.453269029967487,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 3060224,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 33.0042923548772,
+        "ppl_alt": 33.59354637783407,
+        "ppl_delta_rel": 0.017853860237963728,
+        "top1_agreement": 0.59375,
+        "mean_abs_dlogp_true": 0.5622571868302657,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09309171698987484,
+      "t_alt_sec": 8.515495292842388,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 3055104,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 8.535555997020436,
+        "ppl_alt": 11.060089122088087,
+        "ppl_delta_rel": 0.29576668771769593,
+        "top1_agreement": 0.765625,
+        "mean_abs_dlogp_true": 0.5050099290856451,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09266053792089224,
+      "t_alt_sec": 8.352523293346167,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 3049472,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 25.35456860935595,
+        "ppl_alt": 26.688629887879422,
+        "ppl_delta_rel": 0.05261620890016628,
+        "top1_agreement": 0.796875,
+        "mean_abs_dlogp_true": 0.47840885482628437,
+        "n_tokens": 64
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_3_ppl/vllm_v_only_full_guards/ds_distill_qwen_1_5b_v_only_full_guards_vllm_full.json b/reports/v1_3_ppl/vllm_v_only_full_guards/ds_distill_qwen_1_5b_v_only_full_guards_vllm_full.json
new file mode 100644
index 00000000..dfc5b607
--- /dev/null
+++ b/reports/v1_3_ppl/vllm_v_only_full_guards/ds_distill_qwen_1_5b_v_only_full_guards_vllm_full.json
@@ -0,0 +1,102 @@
+{
+  "model_name": "ds_distill_qwen_1_5b_v_only_full_guards",
+  "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "engine": "vllm",
+  "recipe": "v1.3 PPL full guardrails",
+  "ctx_len": 2048,
+  "n_eval": 64,
+  "block_size": 512,
+  "bit_width_k": 3,
+  "bit_width_v": 2,
+  "variance_ratio": 0.95,
+  "pca_method": "randomized",
+  "rsvd_target_rank_factor": 0.5,
+  "q_calib": "reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors",
+  "k_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32",
+  "v_centroids": "reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32",
+  "outlier_threshold": 2.0,
+  "v_outlier_threshold": 2.0,
+  "boundary_skip_layers": [
+    0,
+    1,
+    7,
+    14,
+    26,
+    27
+  ],
+  "share_basis_v": true,
+  "n_passages": 4,
+  "mean_ppl_delta_rel": 0.07037302910388782,
+  "mean_top1_agreement": 0.75390625,
+  "verdict": "REJECT",
+  "per_passage": [
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.13159286603331566,
+      "t_alt_sec": 8.435608506202698,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 4086908,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 124.8755040661336,
+        "ppl_alt": 132.06245178569554,
+        "ppl_delta_rel": 0.05755290257531823,
+        "top1_agreement": 0.796875,
+        "mean_abs_dlogp_true": 0.4962207329826924,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.0956207811832428,
+      "t_alt_sec": 8.548752134665847,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 4083648,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 33.0042923548772,
+        "ppl_alt": 35.285640649591286,
+        "ppl_delta_rel": 0.06912277561306233,
+        "top1_agreement": 0.625,
+        "mean_abs_dlogp_true": 0.614368943039608,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.10165335144847631,
+      "t_alt_sec": 8.493175717070699,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 4082268,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 8.535555997020436,
+        "ppl_alt": 10.092120183558732,
+        "ppl_delta_rel": 0.1823623659761187,
+        "top1_agreement": 0.8125,
+        "mean_abs_dlogp_true": 0.4031439674690773,
+        "n_tokens": 64
+      }
+    },
+    {
+      "ctx_len": 2048,
+      "n_eval": 64,
+      "t_ref_sec": 0.09925544820725918,
+      "t_alt_sec": 8.300315434113145,
+      "codec_layer_calls": 56,
+      "codec_total_compressed_bytes": 4073448,
+      "boundary_skips": 12,
+      "metrics": {
+        "ppl_ref": 25.35456860935595,
+        "ppl_alt": 24.656153494336888,
+        "ppl_delta_rel": -0.027545927748947978,
+        "top1_agreement": 0.78125,
+        "mean_abs_dlogp_true": 0.47499063883560666,
+        "n_tokens": 64
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_4_q_pca/calibrated_codebook/ds_K_b2_centroids.f32 b/reports/v1_4_q_pca/calibrated_codebook/ds_K_b2_centroids.f32
new file mode 100644
index 00000000..82fde590
--- /dev/null
+++ b/reports/v1_4_q_pca/calibrated_codebook/ds_K_b2_centroids.f32
@@ -0,0 +1 @@
+�s��o���?�[�?
\ No newline at end of file
diff --git a/reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32 b/reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32
new file mode 100644
index 00000000..3bd8f394
--- /dev/null
+++ b/reports/v1_4_q_pca/calibrated_codebook/ds_K_b3_centroids.f32
@@ -0,0 +1 @@
+rѿ�:��+�7�{���Eʜ>g.D?���?Ӝ�?
\ No newline at end of file
diff --git a/reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32 b/reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32
new file mode 100644
index 00000000..da30b1f2
--- /dev/null
+++ b/reports/v1_4_q_pca/calibrated_codebook/ds_V_b2_centroids.f32
@@ -0,0 +1 @@
+�����}�%��>���?
\ No newline at end of file
diff --git a/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.json b/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.json
new file mode 100644
index 00000000..6334d183
--- /dev/null
+++ b/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.json
@@ -0,0 +1,602 @@
+{
+  "model_path": "models/DeepSeek-R1-Distill-Qwen-1.5B",
+  "head_dim": 128,
+  "num_q_heads": 12,
+  "num_kv_heads": 2,
+  "num_layers": 28,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "n_passages_used": 28,
+  "ctx_len": 2048,
+  "ridge": 0.001,
+  "diagnostics": [
+    {
+      "layer": 0,
+      "kv_head": 0,
+      "sigma_trace": 14087.669450869162,
+      "eig_min": 0.11430656639034435,
+      "eig_max": 12577.827799745084,
+      "condition": 110035.91654387715,
+      "diag_mean": 110.05991758491533,
+      "off_diag_max_abs": 864.6616414388021
+    },
+    {
+      "layer": 0,
+      "kv_head": 1,
+      "sigma_trace": 5140.3362001776695,
+      "eig_min": 0.04370561277591314,
+      "eig_max": 3957.781019162632,
+      "condition": 90555.44054387057,
+      "diag_mean": 40.15887656388804,
+      "off_diag_max_abs": 713.4821980794271
+    },
+    {
+      "layer": 1,
+      "kv_head": 0,
+      "sigma_trace": 133.57928050061065,
+      "eig_min": 0.004653256164358674,
+      "eig_max": 43.86806236395408,
+      "condition": 9427.390372350179,
+      "diag_mean": 1.0435881289110207,
+      "off_diag_max_abs": 8.26283057530721
+    },
+    {
+      "layer": 1,
+      "kv_head": 1,
+      "sigma_trace": 151.35747090478736,
+      "eig_min": 0.010091808603852767,
+      "eig_max": 66.74368947422337,
+      "condition": 6613.649950588889,
+      "diag_mean": 1.1824802414436513,
+      "off_diag_max_abs": 17.80475648244222
+    },
+    {
+      "layer": 2,
+      "kv_head": 0,
+      "sigma_trace": 334.064464956522,
+      "eig_min": 0.030600042830289543,
+      "eig_max": 211.18540392300457,
+      "condition": 6901.474128459784,
+      "diag_mean": 2.609878632472828,
+      "off_diag_max_abs": 27.736063639322918
+    },
+    {
+      "layer": 2,
+      "kv_head": 1,
+      "sigma_trace": 211.37830638388792,
+      "eig_min": 0.024767269949952035,
+      "eig_max": 135.32737812025923,
+      "condition": 5463.9602343625,
+      "diag_mean": 1.6513930186241244,
+      "off_diag_max_abs": 23.255394617716473
+    },
+    {
+      "layer": 3,
+      "kv_head": 0,
+      "sigma_trace": 483.38755408922833,
+      "eig_min": 0.01750574101520793,
+      "eig_max": 398.7222212911926,
+      "condition": 22776.65486681237,
+      "diag_mean": 3.7764652663220963,
+      "off_diag_max_abs": 46.962029774983726
+    },
+    {
+      "layer": 3,
+      "kv_head": 1,
+      "sigma_trace": 212.48712569723529,
+      "eig_min": 0.03027117359782215,
+      "eig_max": 121.4477534692025,
+      "condition": 4011.9935580541883,
+      "diag_mean": 1.6600556695096507,
+      "off_diag_max_abs": 40.84345054626465
+    },
+    {
+      "layer": 4,
+      "kv_head": 0,
+      "sigma_trace": 192.6380763053894,
+      "eig_min": 0.04083122977838502,
+      "eig_max": 96.77485967566128,
+      "condition": 2370.1186616448995,
+      "diag_mean": 1.5049849711358547,
+      "off_diag_max_abs": 12.98515764872233
+    },
+    {
+      "layer": 4,
+      "kv_head": 1,
+      "sigma_trace": 377.8950372040272,
+      "eig_min": 0.04741976149501609,
+      "eig_max": 279.5757039599843,
+      "condition": 5895.7636045758745,
+      "diag_mean": 2.9523049781564623,
+      "off_diag_max_abs": 44.031290690104164
+    },
+    {
+      "layer": 5,
+      "kv_head": 0,
+      "sigma_trace": 248.6834928592046,
+      "eig_min": 0.03362428867505457,
+      "eig_max": 140.2509070098087,
+      "condition": 4171.118930282652,
+      "diag_mean": 1.9428397879625359,
+      "off_diag_max_abs": 25.768606821695965
+    },
+    {
+      "layer": 5,
+      "kv_head": 1,
+      "sigma_trace": 575.179310977459,
+      "eig_min": 0.02937057860560856,
+      "eig_max": 436.7429704898716,
+      "condition": 14870.083982835524,
+      "diag_mean": 4.493588367011398,
+      "off_diag_max_abs": 153.53911590576172
+    },
+    {
+      "layer": 6,
+      "kv_head": 0,
+      "sigma_trace": 349.35026275614894,
+      "eig_min": 0.04113479156384749,
+      "eig_max": 218.07630310354136,
+      "condition": 5301.504998877983,
+      "diag_mean": 2.7292989277824136,
+      "off_diag_max_abs": 61.875738779703774
+    },
+    {
+      "layer": 6,
+      "kv_head": 1,
+      "sigma_trace": 535.1662722130617,
+      "eig_min": 0.02715142274424338,
+      "eig_max": 355.1404283301754,
+      "condition": 13079.993327623022,
+      "diag_mean": 4.180986501664544,
+      "off_diag_max_abs": 129.7892862955729
+    },
+    {
+      "layer": 7,
+      "kv_head": 0,
+      "sigma_trace": 291.8450035353502,
+      "eig_min": 0.025828950886167924,
+      "eig_max": 181.20780719112673,
+      "condition": 7015.685925058932,
+      "diag_mean": 2.2800390901199235,
+      "off_diag_max_abs": 24.58665657043457
+    },
+    {
+      "layer": 7,
+      "kv_head": 1,
+      "sigma_trace": 136.09298193951446,
+      "eig_min": 0.028990158461264987,
+      "eig_max": 51.234866324489886,
+      "condition": 1767.3192919227063,
+      "diag_mean": 1.0632264214024567,
+      "off_diag_max_abs": 6.769900639851888
+    },
+    {
+      "layer": 8,
+      "kv_head": 0,
+      "sigma_trace": 431.5416265328726,
+      "eig_min": 0.03851094732707891,
+      "eig_max": 227.863980144724,
+      "condition": 5916.862501704854,
+      "diag_mean": 3.3714189572880673,
+      "off_diag_max_abs": 94.72378540039062
+    },
+    {
+      "layer": 8,
+      "kv_head": 1,
+      "sigma_trace": 353.4488914211591,
+      "eig_min": 0.06008918531450983,
+      "eig_max": 211.57125536262686,
+      "condition": 3520.9539662628513,
+      "diag_mean": 2.7613194642278054,
+      "off_diag_max_abs": 81.58280436197917
+    },
+    {
+      "layer": 9,
+      "kv_head": 0,
+      "sigma_trace": 458.910021652778,
+      "eig_min": 0.058298364689047484,
+      "eig_max": 357.34834334457173,
+      "condition": 6129.646093001074,
+      "diag_mean": 3.5852345441623283,
+      "off_diag_max_abs": 43.38435745239258
+    },
+    {
+      "layer": 9,
+      "kv_head": 1,
+      "sigma_trace": 430.27762574950856,
+      "eig_min": 0.026661643602682653,
+      "eig_max": 279.6466324446941,
+      "condition": 10488.724424197033,
+      "diag_mean": 3.3615439511680356,
+      "off_diag_max_abs": 64.40344492594402
+    },
+    {
+      "layer": 10,
+      "kv_head": 0,
+      "sigma_trace": 317.96322809656465,
+      "eig_min": 0.05461869720209725,
+      "eig_max": 172.37962670804933,
+      "condition": 3156.055261996075,
+      "diag_mean": 2.4840877195044113,
+      "off_diag_max_abs": 22.404350916544598
+    },
+    {
+      "layer": 10,
+      "kv_head": 1,
+      "sigma_trace": 277.5953108270963,
+      "eig_min": 0.04836544341033448,
+      "eig_max": 165.1600287256893,
+      "condition": 3414.835408919227,
+      "diag_mean": 2.1687133658366897,
+      "off_diag_max_abs": 39.08971405029297
+    },
+    {
+      "layer": 11,
+      "kv_head": 0,
+      "sigma_trace": 384.3014333546161,
+      "eig_min": 0.04157544544229979,
+      "eig_max": 254.1584736221218,
+      "condition": 6113.187024655069,
+      "diag_mean": 3.0023549480829383,
+      "off_diag_max_abs": 36.34234746297201
+    },
+    {
+      "layer": 11,
+      "kv_head": 1,
+      "sigma_trace": 297.5521833052238,
+      "eig_min": 0.04339911124028957,
+      "eig_max": 161.13024876333145,
+      "condition": 3712.754573963399,
+      "diag_mean": 2.324626432072061,
+      "off_diag_max_abs": 27.569780985514324
+    },
+    {
+      "layer": 12,
+      "kv_head": 0,
+      "sigma_trace": 352.21503596504533,
+      "eig_min": 0.036985872810758276,
+      "eig_max": 215.0898464324277,
+      "condition": 5815.459527829863,
+      "diag_mean": 2.7516799684769166,
+      "off_diag_max_abs": 56.79541524251302
+    },
+    {
+      "layer": 12,
+      "kv_head": 1,
+      "sigma_trace": 379.55718504389125,
+      "eig_min": 0.05468913693578359,
+      "eig_max": 229.23341964377488,
+      "condition": 4191.571351965978,
+      "diag_mean": 2.9652905081554004,
+      "off_diag_max_abs": 32.60482915242513
+    },
+    {
+      "layer": 13,
+      "kv_head": 0,
+      "sigma_trace": 708.3103209336599,
+      "eig_min": 0.03250367027250317,
+      "eig_max": 536.7848709416021,
+      "condition": 16514.592550358873,
+      "diag_mean": 5.533674382294218,
+      "off_diag_max_abs": 158.92255147298178
+    },
+    {
+      "layer": 13,
+      "kv_head": 1,
+      "sigma_trace": 258.0619274874528,
+      "eig_min": 0.027762442506721427,
+      "eig_max": 182.09199740189274,
+      "condition": 6558.932894964388,
+      "diag_mean": 2.016108808495725,
+      "off_diag_max_abs": 47.832452138264976
+    },
+    {
+      "layer": 14,
+      "kv_head": 0,
+      "sigma_trace": 431.0207499563694,
+      "eig_min": 0.04985178720997897,
+      "eig_max": 243.3169958332498,
+      "condition": 4880.807879733234,
+      "diag_mean": 3.367349609034136,
+      "off_diag_max_abs": 95.3411382039388
+    },
+    {
+      "layer": 14,
+      "kv_head": 1,
+      "sigma_trace": 328.9268921265999,
+      "eig_min": 0.05177146162675035,
+      "eig_max": 188.74924496113078,
+      "condition": 3645.8164214472154,
+      "diag_mean": 2.569741344739062,
+      "off_diag_max_abs": 32.67595227559408
+    },
+    {
+      "layer": 15,
+      "kv_head": 0,
+      "sigma_trace": 336.3484721581141,
+      "eig_min": 0.037918359540894966,
+      "eig_max": 221.4561260563185,
+      "condition": 5840.340371726208,
+      "diag_mean": 2.6277224387352662,
+      "off_diag_max_abs": 31.366430282592773
+    },
+    {
+      "layer": 15,
+      "kv_head": 1,
+      "sigma_trace": 3150.72810536623,
+      "eig_min": 0.05277027466845438,
+      "eig_max": 2915.063294154007,
+      "condition": 55240.63144391035,
+      "diag_mean": 24.615063323173672,
+      "off_diag_max_abs": 297.8225351969401
+    },
+    {
+      "layer": 16,
+      "kv_head": 0,
+      "sigma_trace": 337.0452280243238,
+      "eig_min": 0.031604938077455916,
+      "eig_max": 224.4852396940157,
+      "condition": 7102.853330826268,
+      "diag_mean": 2.63316584394003,
+      "off_diag_max_abs": 49.496663411458336
+    },
+    {
+      "layer": 16,
+      "kv_head": 1,
+      "sigma_trace": 240.34587020178637,
+      "eig_min": 0.0502923013418992,
+      "eig_max": 119.61354067629264,
+      "condition": 2378.366817281455,
+      "diag_mean": 1.877702110951456,
+      "off_diag_max_abs": 42.567081451416016
+    },
+    {
+      "layer": 17,
+      "kv_head": 0,
+      "sigma_trace": 321.5248833199342,
+      "eig_min": 0.05885623088893382,
+      "eig_max": 200.1015187111916,
+      "condition": 3399.835764012792,
+      "diag_mean": 2.511913150936986,
+      "off_diag_max_abs": 42.38495445251465
+    },
+    {
+      "layer": 17,
+      "kv_head": 1,
+      "sigma_trace": 287.8698761214813,
+      "eig_min": 0.04281020035480668,
+      "eig_max": 147.1897671238603,
+      "condition": 3438.193839411312,
+      "diag_mean": 2.2489834071990726,
+      "off_diag_max_abs": 39.89344787597656
+    },
+    {
+      "layer": 18,
+      "kv_head": 0,
+      "sigma_trace": 263.3517959664265,
+      "eig_min": 0.036322879388236765,
+      "eig_max": 174.37145070147716,
+      "condition": 4800.595482470139,
+      "diag_mean": 2.057435905987707,
+      "off_diag_max_abs": 42.489283879597984
+    },
+    {
+      "layer": 18,
+      "kv_head": 1,
+      "sigma_trace": 383.9522307316462,
+      "eig_min": 0.03261965495932056,
+      "eig_max": 244.15562279114408,
+      "condition": 7484.92352526802,
+      "diag_mean": 2.999626802590986,
+      "off_diag_max_abs": 59.04431025187174
+    },
+    {
+      "layer": 19,
+      "kv_head": 0,
+      "sigma_trace": 301.4627567629019,
+      "eig_min": 0.029512059444634673,
+      "eig_max": 165.51800019085425,
+      "condition": 5608.486947560199,
+      "diag_mean": 2.355177787210171,
+      "off_diag_max_abs": 40.991583506266274
+    },
+    {
+      "layer": 19,
+      "kv_head": 1,
+      "sigma_trace": 374.7839558720589,
+      "eig_min": 0.04887381784338199,
+      "eig_max": 257.89142525835075,
+      "condition": 5276.6785292848135,
+      "diag_mean": 2.9279996552504604,
+      "off_diag_max_abs": 39.47942924499512
+    },
+    {
+      "layer": 20,
+      "kv_head": 0,
+      "sigma_trace": 234.5188759714365,
+      "eig_min": 0.039081214022015094,
+      "eig_max": 125.96486455304878,
+      "condition": 3223.1563861371014,
+      "diag_mean": 1.8321787185268477,
+      "off_diag_max_abs": 15.945943196614584
+    },
+    {
+      "layer": 20,
+      "kv_head": 1,
+      "sigma_trace": 279.58694698413217,
+      "eig_min": 0.047532429221632214,
+      "eig_max": 153.76487678822866,
+      "condition": 3234.9467364956304,
+      "diag_mean": 2.1842730233135326,
+      "off_diag_max_abs": 46.38345527648926
+    },
+    {
+      "layer": 21,
+      "kv_head": 0,
+      "sigma_trace": 263.7388703177373,
+      "eig_min": 0.06487001952947101,
+      "eig_max": 148.53895398550213,
+      "condition": 2289.7935758755184,
+      "diag_mean": 2.0604599243573225,
+      "off_diag_max_abs": 41.04986317952474
+    },
+    {
+      "layer": 21,
+      "kv_head": 1,
+      "sigma_trace": 227.13083327313262,
+      "eig_min": 0.04085307157799629,
+      "eig_max": 135.90208073082516,
+      "condition": 3326.6061885056115,
+      "diag_mean": 1.7744596349463486,
+      "off_diag_max_abs": 47.19264475504557
+    },
+    {
+      "layer": 22,
+      "kv_head": 0,
+      "sigma_trace": 472.63880944748723,
+      "eig_min": 0.05482221131083244,
+      "eig_max": 340.50050828276153,
+      "condition": 6210.995509688266,
+      "diag_mean": 3.692490698808494,
+      "off_diag_max_abs": 54.38853454589844
+    },
+    {
+      "layer": 22,
+      "kv_head": 1,
+      "sigma_trace": 185.50012714664143,
+      "eig_min": 0.03021928015905988,
+      "eig_max": 110.12785275344854,
+      "condition": 3644.2910676160395,
+      "diag_mean": 1.4492197433331362,
+      "off_diag_max_abs": 15.702750205993652
+    },
+    {
+      "layer": 23,
+      "kv_head": 0,
+      "sigma_trace": 244.9451248894135,
+      "eig_min": 0.0393299799921783,
+      "eig_max": 90.3660817335531,
+      "condition": 2297.638639824494,
+      "diag_mean": 1.913633788198543,
+      "off_diag_max_abs": 25.43307050069173
+    },
+    {
+      "layer": 23,
+      "kv_head": 1,
+      "sigma_trace": 349.1584925254186,
+      "eig_min": 0.06874823329787509,
+      "eig_max": 234.09798426987214,
+      "condition": 3405.149093149246,
+      "diag_mean": 2.7278007228548327,
+      "off_diag_max_abs": 84.3193244934082
+    },
+    {
+      "layer": 24,
+      "kv_head": 0,
+      "sigma_trace": 374.67986146608985,
+      "eig_min": 0.06576109575818523,
+      "eig_max": 281.3351233529262,
+      "condition": 4278.13922668569,
+      "diag_mean": 2.927186417703827,
+      "off_diag_max_abs": 86.4692866007487
+    },
+    {
+      "layer": 24,
+      "kv_head": 1,
+      "sigma_trace": 231.67531279226142,
+      "eig_min": 0.05172063301127048,
+      "eig_max": 105.02028781357055,
+      "condition": 2030.5298233818116,
+      "diag_mean": 1.8099633811895424,
+      "off_diag_max_abs": 35.84926414489746
+    },
+    {
+      "layer": 25,
+      "kv_head": 0,
+      "sigma_trace": 218.61967018743354,
+      "eig_min": 0.06488669460598159,
+      "eig_max": 123.72261105380443,
+      "condition": 1906.7485530754564,
+      "diag_mean": 1.7079661733393245,
+      "off_diag_max_abs": 27.353594462076824
+    },
+    {
+      "layer": 25,
+      "kv_head": 1,
+      "sigma_trace": 250.5374931568901,
+      "eig_min": 0.045656813855971444,
+      "eig_max": 136.51775352705886,
+      "condition": 2990.0849839788752,
+      "diag_mean": 1.9573241652882039,
+      "off_diag_max_abs": 27.04758898417155
+    },
+    {
+      "layer": 26,
+      "kv_head": 0,
+      "sigma_trace": 216.00654093921185,
+      "eig_min": 0.04481426200555469,
+      "eig_max": 95.41982658607262,
+      "condition": 2129.2290069229616,
+      "diag_mean": 1.6875511010875925,
+      "off_diag_max_abs": 22.653246243794758
+    },
+    {
+      "layer": 26,
+      "kv_head": 1,
+      "sigma_trace": 263.1935205211242,
+      "eig_min": 0.03236226211077583,
+      "eig_max": 126.80081954364255,
+      "condition": 3918.169227775367,
+      "diag_mean": 2.0561993790712827,
+      "off_diag_max_abs": 24.95574378967285
+    },
+    {
+      "layer": 27,
+      "kv_head": 0,
+      "sigma_trace": 219.73943743109703,
+      "eig_min": 0.0491609465840367,
+      "eig_max": 90.18029617717704,
+      "condition": 1834.388929493476,
+      "diag_mean": 1.7167143549304456,
+      "off_diag_max_abs": 19.372835159301758
+    },
+    {
+      "layer": 27,
+      "kv_head": 1,
+      "sigma_trace": 229.75121538341045,
+      "eig_min": 0.06853457831394195,
+      "eig_max": 94.41466040430316,
+      "condition": 1377.6207970786688,
+      "diag_mean": 1.7949313701828942,
+      "off_diag_max_abs": 21.011127471923828
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors b/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors
new file mode 100644
index 00000000..9b0f12ed
Binary files /dev/null and b/reports/v1_4_q_pca/flagship/deepseek_distill_q_calib.safetensors differ