diff --git a/crates/thinking-engine/examples/tts_full_inference.rs b/crates/thinking-engine/examples/tts_full_inference.rs index 6a5fc016..f852b5f1 100644 --- a/crates/thinking-engine/examples/tts_full_inference.rs +++ b/crates/thinking-engine/examples/tts_full_inference.rs @@ -266,6 +266,9 @@ fn main() { // Save tokens let tok_path = "/home/user/models/qwen3-tts-0.6b/codebooks/real_codec_tokens.bin"; + if let Some(parent) = std::path::Path::new(tok_path).parent() { + std::fs::create_dir_all(parent).expect("create codebooks dir"); + } std::fs::write(tok_path, &codec_tokens).expect("write tokens"); // Decode via speech tokenizer diff --git a/crates/thinking-engine/examples/tts_rvq_e2e.rs b/crates/thinking-engine/examples/tts_rvq_e2e.rs index a2fae195..c1e5fa32 100644 --- a/crates/thinking-engine/examples/tts_rvq_e2e.rs +++ b/crates/thinking-engine/examples/tts_rvq_e2e.rs @@ -363,31 +363,28 @@ fn load_weights( .map(|r| f32_data[r * n_cols..(r + 1) * n_cols].to_vec()) .collect(); - // Shape-dispatch: vocab-sized tensors (n_rows > 8192) go through - // hierarchical CLAM 256×256 — progressive residual RVQ at k=4096 - // can't reach cos ≈ 1 when k_final < n_rows / 4. + // Shape-dispatch: vocab-sized tensors (n_rows > 8192) are passed + // through unchanged (BF16 → f32, no compression). Progressive + // residual RVQ fails (k_final < n_rows / 4 → cos collapse) and + // hierarchical CLAM 256×256 ALSO fails for this shape because + // vocab rows are near-orthogonal in 2048-d — single-centroid + // reconstruction picks a random-direction row. See PR #177 comment. + // + // Proper remediation is bgz-tensor::HhtlDTensor + shared palettes + // (343:1 lookup-grade), which requires switching inference from + // f32 GEMM to HHTL cascade. Out of scope for this pipeline. 
let short = tensor.name.rsplit('.').take(3).collect::>().into_iter().rev().collect::>().join("."); use std::io::Write as _; let (reconstructed, cos, tag): (Vec>, f64, String) = if n_rows > 8192 { - // Hierarchical CLAM 256×256 path - let t0 = Instant::now(); - let (l1_centroids, l2_codebooks, indices) = build_hclam_256x256(&rows); - let rec = reconstruct_hclam(&l2_codebooks, &indices, n_cols); - let el = t0.elapsed(); - - // Storage: L1 + sum of L2 + indices - codebook_bytes += l1_centroids.len() * n_cols * 4; - for cb in &l2_codebooks { - codebook_bytes += cb.len() * n_cols * 4; - } - index_bytes += indices.len() * 2; // (u8, u8) per row - - let c = cosine_f32(&rows[0], &rec[0]); - println!(" [{:>3}] {:<60} [{}x{}] cos={:.4} hclam=256x256 {:?}", - weights.len() + 1, short, n_rows, n_cols, c, el); + // Passthrough: keep BF16-precision f32 rows as-is, no codebook. + // Cos = 1 trivially. Ship cost is BF16 (2 bytes per element). + codebook_bytes += n_rows * n_cols * 2; // BF16 shipping footprint + println!(" [{:>3}] {:<60} [{}x{}] cos=1.0000 passthrough (n_rows>8192, BF16 {:.1}MB)", + weights.len() + 1, short, n_rows, n_cols, + (n_rows * n_cols * 2) as f64 / 1e6); std::io::stdout().flush().ok(); - (rec, c, "hclam".into()) + (rows.clone(), 1.0, "passthrough".into()) } else { // K levels based on role let role = tensor.name.to_lowercase(); diff --git a/docs/LANCE_UPGRADE_ROADMAP.md b/docs/LANCE_UPGRADE_ROADMAP.md new file mode 100644 index 00000000..ddf9e72e --- /dev/null +++ b/docs/LANCE_UPGRADE_ROADMAP.md @@ -0,0 +1,161 @@ +# Lance 2 → 4/5 Upgrade Roadmap + +> **Status**: planning doc, no migration work started. +> **Current pins** (`crates/lance-graph/Cargo.toml`): +> `lance = "2"`, `lance-linalg = "2"`, `lance-namespace = "2"`, `lance-arrow = "2"`, `lance-index = "2"`, `datafusion = "51"`, `datafusion-common/expr/sql/functions-aggregate = "51"`. +> **Target**: Lance 4.0 (stable) or 5.0-rc.1 (RC). 
+ +## Why upgrade at all + +Lance 4.0 and 5.0-rc.1 ship features that directly overlap with compression work we've been doing custom in `crates/thinking-engine/examples/tts_rvq_e2e.rs` and `crates/bgz-tensor/`: + +- **IVF_RQ index** (first-class) — same algorithm family as our `build_rvq` +- **IVF partitions multi-split** (5.0-rc.1, PR #6423) — adaptive partitioning for skewed distributions (candidate fix for the `text_embedding` cos=0.054 failure; see PR `AdaWorldAPI/lance-graph#177` comment) +- **HNSW-accelerated partition assignment for fp16 vectors** (4.0) — ~100-500× speedup on large-N assignment +- **BF16 support from PyTorch datasets** (5.0-rc.1) — first-class ingest +- **CacheBackend trait + CacheCodec** (5.0-rc.1) — plug slot for `bgz-tensor::HhtlDTensor` as an index cache codec +- **Distributed IVF_RQ segment builds** (5.0-rc.1) — horizontal scale for Qwen3-235B-size models +- **Index segment commit API** (4.0) — atomic multi-segment commits +- **Pre-transposed PQ codebook for SIMD L2** (4.0) — same pattern as our `l2_dist_sq` F32x16 FMA +- **File format 2.3** added in 4.0, 2.1 becomes default in 5.0-rc.1 +- **Hamming distance in HNSW** (5.0-rc.1) — consumer for our `bgz17` Hamming semirings + +None of these are strict *requirements* for our current stack. They become attractive when we graduate from "f32 GEMM on reconstructed weights" (our current RVQ path) to "HHTL cascade lookup" (our `bgz-tensor::HhtlDTensor` path) or "IVF_RQ storage" (Lance native). + +## Blockers + +### Primary: DataFusion 51 → 52.1 bump + +Lance 4.0 bumps its DataFusion dependency to `52.1.0`. 
Our `lance-graph-planner` (10,326 LOC, 16 strategies) is tied to DataFusion 51 APIs: + +| Area | Files depending on DF 51 | +|---|---| +| Cypher → DataFusion SQL planner | `crates/lance-graph/src/datafusion_planner/` (~6K LOC) | +| CAM-PQ operator | `crates/lance-graph-planner/src/physical/cam_pq_scan.rs` | +| 16 planner strategies | `crates/lance-graph-planner/src/strategy/` | +| TruthPropagating semiring execution | `crates/lance-graph-planner/src/physical/truth_semiring.rs` | +| Rule optimizer / histogram cost / DP join enum | `crates/lance-graph-planner/src/strategy/{rule,histogram,dp_join}_*.rs` | +| MUL assessment + 36 thinking styles | `crates/lance-graph-planner/src/thinking/` (indirect via DF expr types) | + +A direct 51 → 52.1 bump will surface breakage in: +- `datafusion::logical_plan` API changes +- `datafusion::physical_plan` operator trait signatures +- `datafusion::sql::unparser` (we use it for Cypher→SQL) +- `datafusion_expr::Expr` variants (52.x dropped several deprecated variants) +- `datafusion-functions-aggregate` signature changes + +### Secondary: file format version default + +Lance 5.0-rc.1 makes 2.1 the default file format. Any baked dataset we read from Releases (`v0.1.0-bgz-data`, 41 bgz7 files) needs either: +- Pin the reader to 2.0 explicitly +- Re-bake on 2.1 + +The 41-shard bgz7 archive is well under 1 GB total; re-bake is acceptable cost. + +### Tertiary: Java / namespace API cleanup + +Lance 4.0 + 5.0-rc.1 both touched namespace APIs. We don't use Java; we use `lance-namespace = "2"` sparingly. Worth auditing but likely a 2-file fix. + +## Phased migration plan + +### Phase 0 — No-op baseline (this session) + +Finish the RVQ reality-check on Qwen3-TTS-0.6B via the passthrough fix in PR #177. Publish codec-token-match ≥ 99% number. Lance version irrelevant. 
+ +### Phase 1 — Algorithm evaluation probe (next session, ≤ 1 day) + +Deliverable: `crates/thinking-engine/examples/lance_ivf_rq_probe.rs` + +- Use Lance 4.0 **as a library dependency ONLY** in one example, while the main workspace stays pinned to Lance 2 / DF 51. +- Build an IVF_RQ index on one tensor (e.g. `model.text_embedding.weight [151936, 2048]`), read it back, measure cos per row and storage size. +- Compare against the hierarchical CLAM and passthrough baselines. +- If the IVF_RQ + multi-split result is ≥ cos 0.95 at < 1:2 storage on that tensor, migration is worth pursuing. Note that multi-split ships in 5.0-rc.1 (PR #6423), not 4.0, so measuring it requires pinning the probe to the RC. +- If not, HHTL-D via `bgz-tensor` stays the forward path. + +Risk: Lance 4.0 may transitively pull in DF 52.1 even through an example. Workaround: put the example in its own crate outside the workspace (`crates/lance-graph-ivf-rq-probe/` with explicit `workspace = { resolver = "2" }` override). + +### Phase 2 — Peripheral crates (~1 week) + +Upgrade the crates that don't touch DataFusion-51 planner APIs: + +- `lance-graph-contract` — zero deps, no change needed +- `lance-graph-catalog` — catalog providers, lance-only deps → upgradeable first +- `lance-graph-benches` — benchmarks, no planner coupling +- `crates/bgz-tensor` — 0 deps of its own, only needs `lance-arrow` indirectly + +At end of phase 2, contract + catalog + benches + bgz-tensor run on Lance 4.x, but core `lance-graph` + `lance-graph-planner` remain on Lance 2 / DF 51. Workspace compiles via dual-version resolution. + +### Phase 3 — DataFusion 51 → 52.1 (~2-4 weeks) + +This is the hard part. + +1. Bump DataFusion version in `lance-graph` and `lance-graph-planner` simultaneously +2. Walk through and fix the compile errors: + - `datafusion_planner/`: expression unparsing, predicate pushdown, UDF registration + - `lance-graph-planner/src/strategy/*`: Strategy trait signatures + - `lance-graph-planner/src/physical/*`: ExecutionPlan trait signatures + - `lance-graph-planner/src/thinking/*`: Expr type migrations (minimal, mostly pattern matches) +3. 
Run `cargo test -p lance-graph -p lance-graph-planner` — expect 150+ failures initially; triage them into + - "syntactic rename" (fast) + - "semantic API change" (needs understanding) + - "truly broken" (needs redesign) +4. Gate merge on green `cargo test --workspace` + +### Phase 4 — Adopt new features (~1-2 weeks) + +Once everything compiles on Lance 4.0 / DF 52.1: + +- Replace our custom `build_rvq` with Lance IVF_RQ index where benchmarks justify it (the Phase 1 probe decides which tensors) +- Wire `CacheBackend` in `bgz-tensor::HhtlDTensor` so HHTL-D encodings plug in as Lance cache codecs (5.0-rc.1 only — defer to phase 5 unless RC is stable) +- Enable multi-split for the `text_embedding` path (5.0-rc.1 only — defer to phase 5 unless RC is stable) +- Switch BF16 ingest from our custom `bf16_to_f32_batch` loading to Lance's first-class BF16 dataset type (5.0-rc.1 only — defer to phase 5 unless RC is stable) + +### Phase 5 — Lance 5.0 stable (when released) + +- Bump 4.0 → 5.0 (expected to be a small step; Lance has historically been stable across minor bumps, but this is a major-version bump, so check the 5.0 changelog first) +- Adopt BF16 ingest, io_uring file reader, distributed IVF_RQ segment builds +- File format default → 2.1 (re-bake the 41 bgz7 shards in v0.1.0-bgz-data release) + +## Feature priority vs migration cost + +| Lance feature | Our problem it solves | Portable without full migration? 
| +|---|---|---| +| IVF partitions multi-split (5.0) | `text_embedding` cos=0.05 failure | **Yes** — vendor PR #6423 algorithm (~200 LOC target) | +| HNSW fp16 partition assignment (4.0) | Encoder build time at scale | Yes — vendor the kernel | +| IVF_RQ index (4.0) | Replace our custom RVQ | **No** — tightly Lance-coupled | +| CacheBackend + CacheCodec (5.0) | `bgz-tensor::HhtlDTensor` integration | No — needs Lance core | +| Distributed IVF_RQ (5.0) | 235B MoE scale | No — needs Lance core | +| BF16 PyTorch ingest (5.0) | Drop custom `bf16_to_f32_batch` | No — needs Lance core | +| Pre-transposed PQ codebook SIMD (4.0) | Already done in our `l2_dist_sq` | N/A — we did it independently | +| Hamming distance in HNSW (5.0) | Consumer for `bgz17` semirings | No — needs Lance core | +| File format 2.3 (4.0) | Shipping compressed weights to Releases | No — needs Lance core | + +## Recommended forward path + +**Do not migrate Lance in this codebase yet.** Two cheaper paths capture 80% of the value: + +1. **Vendor the algorithms we want** — port PR #6423 (IVF multi-split) and the HNSW fp16 partition assignment kernel into `crates/bgz-tensor/src/` or `crates/thinking-engine/src/`. Pure algorithm, no Lance/DF coupling. + +2. **Use Lance 4.x as an out-of-tree library for specific experiments** — isolated probe crates outside the main workspace, to evaluate features at low cost before paying the full migration tax. + +A full Lance 2 → 4/5 migration is a ~3-6 week project mostly gated on DataFusion 51 → 52.1. It's the right eventual move but not this session, not next session, probably not this month. Revisit when Lance 5.0 ships stable and we have a concrete feature in phase 4 that demands it. + +## Open questions (for next session) + +1. **PR #6423 source** — what are the exact multi-split criteria? Density threshold? Fixed fan-out? Read the PR, log the algorithm, decide portability. +2. **Lance 4.0 vs 5.0-rc.1** — if we migrate, which target? 
5.0 RC may stabilize before we finish phase 3. Pin-on-signal strategy: watch `v5.0.0` stable release, only go 5.0 stable. +3. **DataFusion 52.1 breakage scope** — estimate by running `cargo check` with DF bumped (dry run on a branch, count errors). Decides phase 3 week estimate. +4. **Lance 5.0 "non-shared centroid vector index builds"** — does this conflict with our `bgz-tensor::SharedPaletteGroup` (26 groups for Qwen3-TTS-1.7B, 5.4 MB overhead)? Needs clarification of semantics. +5. **io_uring file reader (5.0)** — requires Linux ≥ 5.6. Our teleport VM is 4.4.0. Works on real Railway / CI hosts. Cost: none, behind feature flag. + +## Cross-references + +- `AdaWorldAPI/lance-graph#176` (merged) — AVX-512 F32x16 FMA encoder + AMX TDPBF16PS polyfill baseline +- `AdaWorldAPI/lance-graph#177` (merged) — hierarchical CLAM dispatch (REFUTED for vocab) + F32x16 rms_norm + passthrough fix +- `docs/RVQ_ENCODER_REPLICATION.md` — runnable pipeline for any BF16 safetensors model +- `docs/RVQ_K_LADDER_TUNING.md` — shape→k decision rule (Section 3 claim REFUTED for vocab tensors) +- `docs/RVQ_ALTERNATIVES.md` — codec-family comparison +- `crates/bgz-tensor/BGZ_HHTL_D.md` — 343:1 lookup-grade encoding (the forward path we're aligning to) +- `.claude/prompts/fisher-z-wiring/` — 12-step HhtlDTensor integration plan + +https://claude.ai/code/session_01NYGrxVopyszZYgLBxe4hgj