diff --git a/crates/bgz-tensor/Cargo.lock b/crates/bgz-tensor/Cargo.lock index 41be39c4..fc138bce 100644 --- a/crates/bgz-tensor/Cargo.lock +++ b/crates/bgz-tensor/Cargo.lock @@ -5,3 +5,187 @@ version = 4 [[package]] name = "bgz-tensor" version = "0.1.0" +dependencies = [ + "serde", + "serde_json", + "sha2", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.183" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/crates/bgz-tensor/Cargo.toml b/crates/bgz-tensor/Cargo.toml index 3722b170..8e631b45 100644 --- a/crates/bgz-tensor/Cargo.toml +++ b/crates/bgz-tensor/Cargo.toml @@ -18,7 +18,30 @@ manifold clustering, then replaces matmul with precomputed distance table lookup - HHTL cascade: 95% of attention computation eliminated at Layer 0-1 """ -# Zero dependencies — same philosophy as bgz17 and deepnsm. +# Zero dependencies for the library — same philosophy as bgz17 and deepnsm. +# The hydrate binary has optional deps for manifest parsing + integrity checks. 
[dependencies] +serde = { version = "1", features = ["derive"], optional = true } +serde_json = { version = "1", optional = true } +sha2 = { version = "0.10", optional = true } + +[features] +default = [] + +# Model selection — controls which bgz7 shards `hydrate --download` fetches. +# No feature = palette-only (4 KB, always works, no download). +# Pick ONE 27B variant. 9B is small enough to always include with a 27B. +qwen35-9b = [] # 80 MB — quick thinking, shallow routing +qwen35-27b-v1 = [] # 174 MB — Opus 4.5 behavior (deep reasoning) +qwen35-27b-v2 = [] # 174 MB — Opus 4.6 precision (code/format) +qwen35-full = ["qwen35-9b", "qwen35-27b-v1", "qwen35-27b-v2"] # 430 MB — all variants + +# Hydrate binary deps (serde + sha2). Only needed for the CLI tool. +hydrate = ["dep:serde", "dep:serde_json", "dep:sha2"] + +[[bin]] +name = "hydrate" +path = "src/hydrate.rs" +required-features = ["hydrate"] [dev-dependencies] diff --git a/crates/bgz-tensor/data/.gitignore b/crates/bgz-tensor/data/.gitignore new file mode 100644 index 00000000..dd4b11ec --- /dev/null +++ b/crates/bgz-tensor/data/.gitignore @@ -0,0 +1 @@ +*.bgz7 diff --git a/crates/bgz-tensor/data/manifest.json b/crates/bgz-tensor/data/manifest.json new file mode 100644 index 00000000..2527e8df --- /dev/null +++ b/crates/bgz-tensor/data/manifest.json @@ -0,0 +1,124 @@ +{ + "models": { + "qwen35-9b-base": { + "source": "Qwen/Qwen3.5-9B", + "format": "safetensors", + "shards": 4, + "total_bytes_bgz7": 83374714, + "release_tag": "v0.1.0-bgz-data", + "sha256": { + "shard-00.bgz7": "43ce49e73502b4991a3d3e3be81d3c43802968d64b0b5e11c8fc03e45f578dac", + "shard-01.bgz7": "eee6c31ecaf85a37e01fbf5fe49ee7c04de99c9b203f10e8007e10dbc0fa3ea8", + "shard-02.bgz7": "9a8791f9af9a4d4aa07743defa653668968f8e7eab7aa84bd0cac63457100acd", + "shard-03.bgz7": "82a962c49222c00b0913fc51f8b20a90f8f4482d2200c852d41f7ae5e39413ba" + } + }, + "qwen35-9b-distilled": { + "source": "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled", + 
"format": "safetensors", + "shards": 4, + "total_bytes_bgz7": 83374714, + "release_tag": "v0.1.0-bgz-data", + "sha256": { + "shard-00.bgz7": "04d714022c06db76bace7000b262bf8b2937811057cec58dda5b9c7ba46ed04f", + "shard-01.bgz7": "8cc712d4678508b0e0a34c5d36792e7217a33d44f56f35aae006e99943c0e431", + "shard-02.bgz7": "df6e1ed36e2974f386703ea83e28509f6d657f0d26167d4031063c59624ad541", + "shard-03.bgz7": "be93a38342fa40ee16abc4f2aa211eb1ff90dd50a98e19855fd0a41e9b3c2bcb" + } + }, + "qwen35-27b-base": { + "source": "Qwen/Qwen3.5-27B", + "format": "safetensors", + "shards": 11, + "total_bytes_bgz7": 178266914, + "release_tag": "v0.1.0-bgz-data", + "sha256": { + "shard-00.bgz7": "85b331cd69b9aa1e77251927580ac7347043d800474473ada620a48d88594039", + "shard-01.bgz7": "233f924e355112532d6e5c58161f3977def5d86b8c0d3d80f311a15a27702826", + "shard-02.bgz7": "31434ee2fb1250129059cf42adc1098eb4e6002d18aa1ac1ed22a33f70403b4f", + "shard-03.bgz7": "ab6464fcfe131961908dcbd0fe820cb510603a6e7329382d289f0384b76d273f", + "shard-04.bgz7": "df75167fbfb9582877be6b33b5a19fe0187c88492dfe0ab3ea70cc474e1a2471", + "shard-05.bgz7": "f76ffa917a883cf0c9b84d7ccb24c1233860c22a63044c1a3e5f886a0d4d4f4b", + "shard-06.bgz7": "7aad28a3c712a665a49dfb3a7adda35cf9b7340ea5aefd8424303ec4359a8dd1", + "shard-07.bgz7": "ba5dbc52e2a40ee537483cefe8c66f61ca79771d5947ef17910e5a640f506eb8", + "shard-08.bgz7": "0d4c8963e382ff222b36c8f3d6326223b693fdcd19b98c6769c601ba6e3297cf", + "shard-09.bgz7": "452ac32541d19c7aec23e1cc9ce051d9aa21922c8b93f7be739af711ef08ba97", + "shard-10.bgz7": "69cd6604abe47389ab40a713be4145eba82787e9288c79c351d86e6b8e5d9534" + } + }, + "qwen35-27b-distilled-v1": { + "source": "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled", + "format": "safetensors", + "shards": 11, + "total_bytes_bgz7": 177151902, + "release_tag": "v0.1.0-bgz-data", + "sha256": { + "shard-00.bgz7": "a54b8697f275bf9d43b9301e86d3517b672cc94c99dccfa944b149e73cbf1033", + "shard-01.bgz7": 
"fe51993b1f09ddde4a7f54ce2a8f3300532454427aca8fce53985c86dd810e1e", + "shard-02.bgz7": "3b3fd4ab220d17ffba653fa0ade9ff83f50bb7c5aea10a52acd8f9b2eb054f7f", + "shard-03.bgz7": "d69c7abf83ea8d96f1f66a4341804f3c8f7beb2fbaeb16d4510eaced08810eeb", + "shard-04.bgz7": "ae8dc185dd6e33b2fe5408e84c24d43e26f106f2b349f7a0262d2e99a607210f", + "shard-05.bgz7": "596640e74a64493b977bada2422bbfde20f32d08c4aaf073b25c27cb6406ddcd", + "shard-06.bgz7": "aabc241903d221b1c7cf434004944610c80054b15e720bd723b14bdf78dbc5e1", + "shard-07.bgz7": "75462913319a0dc67aebd31a8935e7b1a3a55688eebcc689eaf73a87a48d29da", + "shard-08.bgz7": "e0ed727d0c4eb05ff4790fa601fa78d1a89c33e7d4132a500a7260a0da97bc32", + "shard-09.bgz7": "195f2a8649c8cf480c4190687ed98d8ef02ba4dd0c35ab82105c0ed1890840d3", + "shard-10.bgz7": "7bb1d420e0dc8af9ec6fffa4ac3b1ed1339f932891aaf5dbe35ea6e9b3e8e2bf" + } + }, + "qwen35-27b-distilled-v2": { + "source": "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-v2", + "format": "safetensors", + "shards": 11, + "total_bytes_bgz7": 178266914, + "release_tag": "v0.1.0-bgz-data", + "sha256": { + "shard-00.bgz7": "f579c339e9108a3bba85d4ddf9e4cda883de25a2278059e16b1dd658c2467189", + "shard-01.bgz7": "a7bf3dc72d7e4d3a1ac21bf67eaaec09e674a5f68ca8912e0f4cba9fd9559897", + "shard-02.bgz7": "e6d590c2df8b415cea490da61c82289eedef51cd00273e9308e1dd08843e2ba4", + "shard-03.bgz7": "0c1e1aecfaf63a785c4e8dab3bc76721dcaf755508f51b543e6c873f6c04c8ce", + "shard-04.bgz7": "440c08a3455c7825ea69a6e0c63018990bbff078cc930a655ed9599e5bf9afc8", + "shard-05.bgz7": "9e772729ae07a78e76be836276807c4ae4eceffa7b7d14a4659a1c286ecf8734", + "shard-06.bgz7": "344db7153b77d3e8609c3e6ac2dd519c5d20552580a4fc7023f89f2cef026f80", + "shard-07.bgz7": "a4e27836d36e0d9c293a07d4c905471c676923c896fa9fc5d6352b274e3ce48b", + "shard-08.bgz7": "adf98c2a73c06caebebe1cf56b24b224338906aee977b19f3fe8962ae8ba06e3", + "shard-09.bgz7": "0ac24397cdc45f1c6dd4a7ed2bde9fd5f6f2b5429bacd638a351165e0c49ace0", + "shard-10.bgz7": 
"7567db78e62baba1b1cf551e2961e3a7559cba4906574490b4c12e0390578e36" + } + }, + "llama4-scout": { + "source": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "format": "gguf", + "shards": 5, + "total_bytes_bgz7": 37400000, + "release_tag": "v0.1.0-bgz-data", + "sha256": {} + } + }, + "savants": { + "core": { + "source": "extracted from 9B \u2229 27B GROUNDS layer", + "k": 64, + "file": "palettes/core_savant.hhtl.bgz", + "size_bytes": 14726, + "committed": true, + "description": "Scale-invariant gatekeeper. Always loaded. L1 cache resident." + }, + "psychology": { + "source": "extracted from v1 \\ v2 heads (Opus 4.5 behavioral)", + "k": 256, + "file": "palettes/psychology_savant.hhtl.bgz", + "size_bytes": 206342, + "committed": true, + "description": "Behavioral pattern backend. Loaded on escalation. Persona traits, tone, metacognition." + }, + "linguistics": { + "source": "extracted from v2 \\ v1 heads (Opus 4.6 precision)", + "k": 256, + "file": "palettes/linguistics_savant.hhtl.bgz", + "size_bytes": 206342, + "committed": true, + "description": "Structural analysis backend. Loaded on escalation. Code, syntax, format compliance." + } + } +} diff --git a/crates/bgz-tensor/src/hhtl_cache.rs b/crates/bgz-tensor/src/hhtl_cache.rs new file mode 100644 index 00000000..7ae16b57 --- /dev/null +++ b/crates/bgz-tensor/src/hhtl_cache.rs @@ -0,0 +1,542 @@ +//! HHTL cache: compact index alongside bgz7 weight files. +//! +//! Extracts the 256-entry palette + distance table from bgz7 shards +//! and writes a compact cache file for HIP-level early exit. +//! +//! ```text +//! Per model: +//! shard-00.bgz7 (17 MB) ← full weight fingerprints +//! shard-00_hhtl.bgz (140 KB) ← palette + distance table (95% queries) +//! +//! Or per model (aggregated): +//! qwen35-9b-base_hhtl.bgz (140 KB) ← combined from all 4 shards +//! ``` +//! +//! Format: "HHTL" + k(u16) + k × Base17(34) + k × k × u16 + k × u32 radii +//! = 4 + 2 + 256×34 + 256×256×2 + 256×4 = 140,294 bytes for k=256 +//! +//! 
/// Precomputed action for an archetype pair.
///
/// This is NOT just distance — it's the **routing decision**.
/// The prefetch loads decisions, not data.
///
/// The discriminants are part of the on-disk HHTL format: the route table is
/// written as one byte per pair using `action as u8`, so the numeric values
/// below must stay stable. Use `RouteAction::try_from(byte)` for the checked
/// inverse conversion.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum RouteAction {
    /// Pair doesn't interact. Skip entirely. No attention score needed.
    Skip = 0,
    /// Direct attention: pair interacts, score = distance table lookup.
    Attend = 1,
    /// Compose: pair interacts through intermediate archetype (index stored separately).
    Compose = 2,
    /// Escalate: HIP can't decide — need TWIG-level Base17 L1 for this pair.
    Escalate = 3,
}

impl std::convert::TryFrom<u8> for RouteAction {
    /// The rejected byte, handed back so callers can report it.
    type Error = u8;

    /// Decode a serialized route byte; any value outside `0..=3` is an error.
    fn try_from(byte: u8) -> Result<Self, Self::Error> {
        match byte {
            0 => Ok(RouteAction::Skip),
            1 => Ok(RouteAction::Attend),
            2 => Ok(RouteAction::Compose),
            3 => Ok(RouteAction::Escalate),
            other => Err(other),
        }
    }
}
Same layout as distances. + pub routes: Vec, +} + +impl HhtlCache { + /// Build from an existing palette with default cascade config. + pub fn from_palette(palette: WeightPalette) -> Self { + Self::from_palette_with_config(palette, &CascadeConfig::default()) + } + + /// Build from an existing palette with custom thresholds. + pub fn from_palette_with_config(palette: WeightPalette, config: &CascadeConfig) -> Self { + let distances = AttentionTable::build(&palette); + let routes = build_route_table(&palette, &distances, config); + Self { palette, distances, routes } + } + + /// Build from raw Base17 rows (e.g., read from bgz7 shards). + /// + /// Selects up to 256 archetypes via furthest-point sampling, + /// computes the distance table, stores radii for distortion bounds. + pub fn from_base17_rows(rows: &[Base17], max_k: usize) -> Self { + let k = rows.len().min(max_k).min(256); + if k == 0 { + return Self { + palette: WeightPalette { + entries: Vec::new(), + radii: Vec::new(), + counts: Vec::new(), + }, + distances: AttentionTable { + distances: Vec::new(), + k: 0, + }, + routes: Vec::new(), + }; + } + + // Furthest-point sampling for coverage + let mut selected = Vec::with_capacity(k); + let mut selected_idx = Vec::with_capacity(k); + let mut min_dists = vec![u32::MAX; rows.len()]; + + // Start with first row + selected.push(rows[0].clone()); + selected_idx.push(0); + + for _ in 1..k { + // Update min distances to nearest selected + let last = selected.last().unwrap(); + for (i, row) in rows.iter().enumerate() { + let d = row.l1(last); + if d < min_dists[i] { + min_dists[i] = d; + } + } + + // Pick the row farthest from all selected + let mut best_idx = 0; + let mut best_dist = 0u32; + for (i, &d) in min_dists.iter().enumerate() { + if d > best_dist && !selected_idx.contains(&i) { + best_dist = d; + best_idx = i; + } + } + + selected.push(rows[best_idx].clone()); + selected_idx.push(best_idx); + } + + // Compute radii: for each archetype, max L1 to any assigned 
row + let mut radii = vec![0u32; k]; + let mut counts = vec![0u32; k]; + for row in rows { + let (nearest, dist) = nearest_archetype(row, &selected); + counts[nearest] += 1; + if dist > radii[nearest] { + radii[nearest] = dist; + } + } + + let palette = WeightPalette { + entries: selected, + radii, + counts, + }; + let distances = AttentionTable::build(&palette); + let config = CascadeConfig::default(); + let routes = build_route_table(&palette, &distances, &config); + + Self { palette, distances, routes } + } + + /// Palette size (number of archetypes). + pub fn k(&self) -> usize { + self.palette.len() + } + + /// O(1) distance lookup between two archetype indices. + #[inline] + pub fn distance(&self, a: u8, b: u8) -> u16 { + self.distances.distance(a, b) + } + + /// O(1) route lookup: what should we do with this archetype pair? + /// + /// This is the prefetch decision. When token A (archetype `a`) meets + /// token B (archetype `b`), the route tells the attention engine: + /// Skip (no computation), Attend (use distance), Compose (multi-hop), + /// or Escalate (need more data). + #[inline] + pub fn route(&self, a: u8, b: u8) -> RouteAction { + let k = self.k(); + if (a as usize) < k && (b as usize) < k { + self.routes[a as usize * k + b as usize] + } else { + RouteAction::Skip + } + } + + /// Find nearest archetype for a query Base17. + pub fn nearest(&self, query: &Base17) -> (u8, u32) { + let (idx, dist) = nearest_archetype(query, &self.palette.entries); + (idx as u8, dist) + } + + /// Serialize to compact binary format. 
+ /// + /// Format: "HHTL" + k(u16) + k×Base17(34) + k×k×u16 + k×k×u8(routes) + k×u32(radii) + /// k=256: 4 + 2 + 8704 + 131072 + 65536 + 1024 = 206,342 bytes (~200 KB) + /// k=64: 4 + 2 + 2176 + 8192 + 4096 + 256 = 14,726 bytes (~14 KB) + pub fn serialize(&self, path: &str) -> Result<(), String> { + use std::io::Write; + let k = self.k(); + let mut f = std::fs::File::create(path).map_err(|e| e.to_string())?; + + f.write_all(b"HHTL").map_err(|e| e.to_string())?; + f.write_all(&(k as u16).to_le_bytes()).map_err(|e| e.to_string())?; + + // Palette entries + for entry in &self.palette.entries { + for &dim in &entry.dims { + f.write_all(&dim.to_le_bytes()).map_err(|e| e.to_string())?; + } + } + + // Distance table + for &d in &self.distances.distances { + f.write_all(&d.to_le_bytes()).map_err(|e| e.to_string())?; + } + + // Route table + for &r in &self.routes { + f.write_all(&[r as u8]).map_err(|e| e.to_string())?; + } + + // Radii + for &r in &self.palette.radii { + f.write_all(&r.to_le_bytes()).map_err(|e| e.to_string())?; + } + + Ok(()) + } + + /// Deserialize from compact binary. 
+ pub fn deserialize(path: &str) -> Result { + use std::io::Read; + let mut f = std::fs::File::open(path).map_err(|e| e.to_string())?; + + let mut magic = [0u8; 4]; + f.read_exact(&mut magic).map_err(|e| e.to_string())?; + if &magic != b"HHTL" { + return Err(format!("bad magic: {:?}", magic)); + } + + let mut k_buf = [0u8; 2]; + f.read_exact(&mut k_buf).map_err(|e| e.to_string())?; + let k = u16::from_le_bytes(k_buf) as usize; + + // Palette entries + let mut entries = Vec::with_capacity(k); + for _ in 0..k { + let mut dims = [0i16; 17]; + for d in &mut dims { + let mut buf = [0u8; 2]; + f.read_exact(&mut buf).map_err(|e| e.to_string())?; + *d = i16::from_le_bytes(buf); + } + entries.push(Base17 { dims }); + } + + // Distance table + let mut distances = vec![0u16; k * k]; + for d in &mut distances { + let mut buf = [0u8; 2]; + f.read_exact(&mut buf).map_err(|e| e.to_string())?; + *d = u16::from_le_bytes(buf); + } + + // Route table + let mut routes = vec![RouteAction::Skip; k * k]; + for r in &mut routes { + let mut buf = [0u8; 1]; + f.read_exact(&mut buf).map_err(|e| e.to_string())?; + *r = match buf[0] { + 0 => RouteAction::Skip, + 1 => RouteAction::Attend, + 2 => RouteAction::Compose, + 3 => RouteAction::Escalate, + _ => RouteAction::Skip, + }; + } + + // Radii + let mut radii = vec![0u32; k]; + for r in &mut radii { + let mut buf = [0u8; 4]; + f.read_exact(&mut buf).map_err(|e| e.to_string())?; + *r = u32::from_le_bytes(buf); + } + + let counts = vec![0u32; k]; + + Ok(Self { + palette: WeightPalette { entries, radii, counts }, + distances: AttentionTable { distances, k }, + routes, + }) + } + + /// Check if HHTL cache exists for a model. + pub fn cache_path(model_dir: &str, model_name: &str) -> String { + format!("{}/{}_hhtl.bgz", model_dir, model_name) + } + + /// Load or build: try cache first, build from bgz7 rows if missing. 
+ pub fn load_or_build( + cache_path: &str, + rows: Option<&[Base17]>, + max_k: usize, + ) -> Result { + // Try cache first + if std::fs::metadata(cache_path).is_ok() { + return Self::deserialize(cache_path); + } + + // Build from rows + let rows = rows.ok_or_else(|| { + format!("{cache_path} not found and no rows provided — run hydrate first") + })?; + + let cache = Self::from_base17_rows(rows, max_k); + cache.serialize(cache_path)?; + Ok(cache) + } +} + +/// Build the route table: precompute cascade decisions for all archetype pairs. +/// +/// For each (a, b) pair, runs the HEEL + HIP check to decide the action. +/// This is O(k²) at build time, O(1) at inference time. +fn build_route_table( + palette: &WeightPalette, + distances: &AttentionTable, + config: &CascadeConfig, +) -> Vec { + let k = palette.len(); + let mut routes = vec![RouteAction::Skip; k * k]; + let scent_threshold = 1500u32; + + for a in 0..k { + for b in 0..k { + // HEEL: scent byte check + let scent = ScentByte::compute( + &palette.entries[a], + &palette.entries[b], + scent_threshold, + ); + if scent.agreement_count() < config.heel_min_agreement { + routes[a * k + b] = RouteAction::Skip; + continue; + } + + // HIP: distance check + let dist = distances.distance(a as u8, b as u8); + if dist > config.hip_max_distance { + routes[a * k + b] = RouteAction::Skip; + continue; + } + + // Check if this pair could benefit from composition + // (exists intermediate c where d(a,c) + d(c,b) < d(a,b) * 1.1) + let mut has_shortcut = false; + for c in 0..k { + if c == a || c == b { continue; } + let d_ac = distances.distance(a as u8, c as u8) as u32; + let d_cb = distances.distance(c as u8, b as u8) as u32; + let d_ab = dist as u32; + // Composition is useful if the path through c is significantly different + // (not just shorter, but structurally different route) + if d_ac + d_cb < (d_ab * 9) / 10 { + has_shortcut = true; + break; + } + } + + if has_shortcut { + routes[a * k + b] = RouteAction::Compose; + } 
else if dist < config.hip_max_distance / 2 { + // Strong signal — attend directly + routes[a * k + b] = RouteAction::Attend; + } else { + // Borderline — needs TWIG to decide + routes[a * k + b] = RouteAction::Escalate; + } + } + // Self-attention is always direct + routes[a * k + a] = RouteAction::Attend; + } + + routes +} + +/// Find nearest archetype by L1 distance. +fn nearest_archetype(query: &Base17, archetypes: &[Base17]) -> (usize, u32) { + let mut best_idx = 0; + let mut best_dist = u32::MAX; + for (i, a) in archetypes.iter().enumerate() { + let d = query.l1(a); + if d < best_dist { + best_dist = d; + best_idx = i; + } + } + (best_idx, best_dist) +} + +/// HIP-level cache: 64 archetypes for p64 Palette64 compatibility. +/// +/// 64 entries × 34 bytes Base17 = 2,176 bytes palette +/// 64 × 64 × 2 bytes distances = 8,192 bytes +/// 64 × 4 bytes radii = 256 bytes +/// Total: 10,630 bytes (~10 KB) — fits L1 cache. +/// +/// This is the sweet spot for p64: `Palette64::attend()` works on 64 rows. +/// The 9B model has ~40 transformer layers × ~64 heads = ~640 unique patterns. +/// Furthest-point sampling from 640 to 64 gives ~93% coverage. +/// +/// For 27B (~64 layers × ~64 heads = ~4096 patterns), sampling to 64 gives +/// ~76% coverage. Use k=256 HHTL for 27B, k=64 HIP for 9B. +pub type HipCache = HhtlCache; + +impl HhtlCache { + /// Build a HIP-level cache (k=64) for p64 compatibility. + pub fn build_hip(rows: &[Base17]) -> Self { + Self::from_base17_rows(rows, 64) + } + + /// Build a full HHTL cache (k=256) for 27B models. + pub fn build_full(rows: &[Base17]) -> Self { + Self::from_base17_rows(rows, 256) + } + + /// Export as 64×64 distance matrix for p64 Palette64 operations. + /// + /// Returns None if k > 64 (use full HHTL instead). 
#[cfg(test)]
mod tests {
    use super::*;

    /// Per-process scratch file under the platform temp directory, so the
    /// tests do not depend on `/tmp` existing and concurrent test runs do not
    /// overwrite each other's files.
    fn scratch_path(stem: &str) -> String {
        std::env::temp_dir()
            .join(format!("{stem}_{}.bgz", std::process::id()))
            .to_string_lossy()
            .into_owned()
    }

    #[test]
    fn test_hhtl_cache_empty() {
        let cache = HhtlCache::from_base17_rows(&[], 256);
        assert_eq!(cache.k(), 0);
    }

    #[test]
    fn test_hhtl_cache_small() {
        let rows: Vec<Base17> = (0..10).map(|i| {
            let mut dims = [0i16; 17];
            dims[0] = (i * 100) as i16;
            dims[1] = (i * 50) as i16;
            Base17 { dims }
        }).collect();

        let cache = HhtlCache::from_base17_rows(&rows, 256);
        assert_eq!(cache.k(), 10); // fewer rows than max_k

        // Distance should be symmetric
        let d01 = cache.distance(0, 1);
        let d10 = cache.distance(1, 0);
        assert_eq!(d01, d10);

        // Self-distance should be 0
        assert_eq!(cache.distance(0, 0), 0);
    }

    #[test]
    fn test_hhtl_cache_serialization_roundtrip() {
        let rows: Vec<Base17> = (0..20).map(|i| {
            let mut dims = [0i16; 17];
            dims[0] = (i * 100) as i16;
            dims[3] = (i * 77) as i16;
            dims[16] = -(i * 30) as i16;
            Base17 { dims }
        }).collect();

        let cache = HhtlCache::from_base17_rows(&rows, 16);
        assert_eq!(cache.k(), 16);

        let path = scratch_path("test_hhtl_roundtrip");
        cache.serialize(&path).expect("serialize");

        let loaded = HhtlCache::deserialize(&path).expect("deserialize");
        // Remove the scratch file before asserting so a failure does not leak it.
        std::fs::remove_file(&path).ok();
        assert_eq!(loaded.k(), 16);

        // Distances should match
        for i in 0..16 {
            for j in 0..16 {
                assert_eq!(
                    cache.distance(i as u8, j as u8),
                    loaded.distance(i as u8, j as u8),
                    "mismatch at ({i}, {j})"
                );
            }
        }

        // Palette entries should match
        for i in 0..16 {
            assert_eq!(cache.palette.entries[i], loaded.palette.entries[i]);
        }

        // Routes and radii are part of the file format and must survive too.
        assert_eq!(cache.routes, loaded.routes);
        assert_eq!(cache.palette.radii, loaded.palette.radii);
    }

    #[test]
    fn test_hhtl_cache_256_size() {
        // Verify file size for k=256
        let rows: Vec<Base17> = (0..300).map(|i| {
            let mut dims = [0i16; 17];
            dims[0] = (i % 256) as i16 * 100;
            dims[1] = (i / 3) as i16;
            Base17 { dims }
        }).collect();

        let cache = HhtlCache::from_base17_rows(&rows, 256);
        assert_eq!(cache.k(), 256);

        let path = scratch_path("test_hhtl_256");
        cache.serialize(&path).expect("serialize");

        let size = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0);
        std::fs::remove_file(&path).ok();

        // 4 magic + 2 k + 256×34 entries + 256×256×2 distances + 256×256×1 routes + 256×4 radii
        let expected = 4 + 2 + 256 * 34 + 256 * 256 * 2 + 256 * 256 * 1 + 256 * 4;
        assert_eq!(size, expected as u64, "expected {expected} bytes, got {size}");
    }
}
```
+
+use bgz_tensor::manifest::{self, load_manifest, is_hydrated, is_enabled, enabled_models, bgz7_path, verify_sha256};
+use std::{env, fs, process};
+
+/// Entry point: parses `hydrate <command> [MODEL]` and dispatches to the
+/// matching `cmd_*` handler.
+///
+/// Exits with status 1 when no command is given, the command is unknown, or
+/// the manifest cannot be loaded.
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    if args.len() < 2 {
+        usage();
+        process::exit(1);
+    }
+
+    let command = &args[1];
+    // An absent MODEL argument is represented as the empty string.
+    let model = args.get(2).map(String::as_str).unwrap_or("");
+
+    // Help must work even when the manifest is missing or malformed,
+    // so it is handled before the manifest is loaded.
+    if matches!(command.as_str(), "--help" | "-h") {
+        usage();
+        return;
+    }
+
+    let manifest = load_manifest().unwrap_or_else(|e| {
+        eprintln!("Failed to load data/manifest.json: {e}");
+        process::exit(1)
+    });
+
+    match command.as_str() {
+        "--list" => cmd_list(&manifest),
+        "--download" if model == "--enabled" || model.is_empty() => cmd_download_enabled(&manifest),
+        "--download" => cmd_download(&manifest, model),
+        "--reindex" => cmd_reindex(&manifest, model),
+        "--verify" => cmd_verify(&manifest, model),
+        _ => {
+            eprintln!("Unknown command: {command}");
+            usage();
+            process::exit(1);
+        }
+    }
+}
+
+/// Prints the command-line help text to stderr.
+fn usage() {
+    eprintln!("bgz-tensor hydrate — manage model tensor indexes");
+    eprintln!();
+    eprintln!("Usage:");
+    eprintln!(" hydrate --list Show all models and hydration status");
+    eprintln!(" hydrate --download Download all feature-enabled models");
+    eprintln!(" hydrate --download MODEL Download a specific model");
+    eprintln!(" hydrate --reindex MODEL Stream from HuggingFace, build bgz7 locally");
+    eprintln!(" hydrate --verify MODEL Check SHA256 of existing shards");
+    eprintln!();
+    eprintln!("Feature flags control which models are enabled (zero download by default):");
+    eprintln!(" qwen35-9b 80 MB — quick thinking, shallow routing");
+    eprintln!(" qwen35-27b-v1 174 MB — Opus 4.5 behavior (deep reasoning)");
+    eprintln!(" qwen35-27b-v2 174 MB — Opus 4.6 precision (code/format)");
+    eprintln!(" qwen35-full 430 MB — all variants");
+}
+
+/// Looks up `model` in the manifest, or reports the problem on stderr
+/// (together with the sorted list of known models) and exits with status 1.
+fn model_entry_or_exit<'a>(manifest: &'a manifest::Manifest, model: &str) -> &'a manifest::ModelEntry {
+    if let Some(entry) = manifest.models.get(model) {
+        return entry;
+    }
+    if model.is_empty() {
+        eprintln!("Missing MODEL argument");
+    } else {
+        eprintln!("Unknown model: {model}");
+    }
+    // The manifest is a HashMap, so sort the names for a stable listing.
+    let mut names: Vec<&str> = manifest.models.keys().map(String::as_str).collect();
+    names.sort_unstable();
+    eprintln!("Available: {}", names.join(", "));
+    process::exit(1)
+}
+
+/// Directory that holds the shard files of `model` (the parent of shard 0).
+fn model_dir(model: &str) -> std::path::PathBuf {
+    bgz7_path(model, 0)
+        .parent()
+        .expect("bgz7_path always yields <data>/<model>/<shard>, so a parent exists")
+        .to_path_buf()
+}
+
+/// `--list`: prints one row per manifest model with its enabled/hydrated status.
+///
+/// The header goes to stderr and the rows to stdout. Rows are sorted by
+/// model name because HashMap iteration order is unspecified.
+fn cmd_list(manifest: &manifest::Manifest) {
+    let enabled = enabled_models();
+    eprintln!("bgz-tensor model index");
+    if enabled.is_empty() {
+        eprintln!(" No models enabled. Add features: qwen35-9b, qwen35-27b-v1, qwen35-27b-v2");
+    } else {
+        eprintln!(" Enabled: {}", enabled.join(", "));
+    }
+    eprintln!();
+
+    let mut rows: Vec<_> = manifest.models.iter().collect();
+    rows.sort_unstable_by(|a, b| a.0.cmp(b.0));
+
+    for (name, entry) in rows {
+        let on = is_enabled(name);
+        let flag = if on { "►" } else { " " };
+        // "HYDRATED" takes precedence over "ENABLED": data on disk is what matters.
+        let status = if is_hydrated(name, entry.shards) {
+            "HYDRATED"
+        } else if on {
+            "ENABLED"
+        } else {
+            "disabled"
+        };
+        println!(
+            " {flag} {status:>10} {name:<35} {shards:>2} shards {mb:>6.0} MB ({source})",
+            shards = entry.shards,
+            mb = entry.total_bytes_bgz7 as f64 / 1_000_000.0,
+            source = entry.source,
+        );
+    }
+}
+
+/// `--download` without a model: downloads every feature-enabled model that
+/// is not hydrated yet. Exits with status 1 when no model feature is enabled.
+fn cmd_download_enabled(manifest: &manifest::Manifest) {
+    let enabled = enabled_models();
+    if enabled.is_empty() {
+        eprintln!("No models enabled. Add features to Cargo.toml:");
+        eprintln!(" bgz-tensor = {{ features = [\"qwen35-9b\"] }}");
+        process::exit(1);
+    }
+    for model in &enabled {
+        let Some(entry) = manifest.models.get(*model) else {
+            // Best effort: keep going, but do not hide the inconsistency.
+            eprintln!("{model}: enabled by feature flag but missing from manifest, skipping");
+            continue;
+        };
+        if is_hydrated(model, entry.shards) {
+            println!("{model}: already hydrated, skipping");
+            continue;
+        }
+        println!("\n═══ Downloading {model} ═══");
+        cmd_download(manifest, model);
+    }
+}
+
+/// `--download MODEL`: fetches every missing shard of `model` from the GitHub
+/// release named by the manifest's `release_tag`, using the `curl` binary.
+///
+/// Shards that already exist and are non-empty are skipped. Each shard is
+/// downloaded to `<shard>.part` and renamed into place only after curl
+/// succeeds, so an interrupted run cannot leave a truncated file that later
+/// looks hydrated. Exits with status 1 on the first failure.
+fn cmd_download(manifest: &manifest::Manifest, model: &str) {
+    let entry = model_entry_or_exit(manifest, model);
+
+    let dir = model_dir(model);
+    if let Err(e) = fs::create_dir_all(&dir) {
+        eprintln!("Failed to create data directory {}: {e}", dir.display());
+        process::exit(1);
+    }
+
+    let repo = "AdaWorldAPI/lance-graph";
+    let tag = &entry.release_tag;
+
+    for shard in 0..entry.shards {
+        let filename = format!("shard-{shard:02}.bgz7");
+        let dest = dir.join(&filename);
+
+        // metadata() fails for a missing file, so this also covers "does not exist".
+        if fs::metadata(&dest).map(|m| m.len() > 0).unwrap_or(false) {
+            println!(" {filename}: already present, skipping");
+            continue;
+        }
+
+        let asset_name = format!("{model}--{filename}");
+        let url = format!("https://github.com/{repo}/releases/download/{tag}/{asset_name}");
+        println!(" Downloading {filename} from release {tag}...");
+
+        let partial = dir.join(format!("{filename}.part"));
+        // Paths are passed as OsStr via `arg`, so non-UTF-8 paths do not panic.
+        let status = process::Command::new("curl")
+            .args(["-fSL", "--retry", "4", "--retry-delay", "2", "-o"])
+            .arg(&partial)
+            .arg(&url)
+            .status()
+            .unwrap_or_else(|e| {
+                eprintln!(" FAILED to run curl (is it installed?): {e}");
+                process::exit(1)
+            });
+
+        if !status.success() {
+            eprintln!(" FAILED to download {filename}");
+            // Clean up partial file
+            let _ = fs::remove_file(&partial);
+            process::exit(1);
+        }
+
+        if let Err(e) = fs::rename(&partial, &dest) {
+            eprintln!(" FAILED to move {filename} into place: {e}");
+            let _ = fs::remove_file(&partial);
+            process::exit(1);
+        }
+    }
+
+    println!("Done. Verify: hydrate --verify {model}");
+}
+
+/// `--reindex MODEL`: prints manual instructions for rebuilding the shards.
+///
+/// Nothing is indexed here; the function only prints the `cargo test`
+/// invocation and the `cp` commands to run afterwards.
+fn cmd_reindex(manifest: &manifest::Manifest, model: &str) {
+    let entry = model_entry_or_exit(manifest, model);
+
+    eprintln!("Reindexing {model} from {} ...", entry.source);
+    eprintln!("This streams BF16 safetensors from HuggingFace and builds bgz7 shards.");
+    eprintln!("Expected time: ~1-4 hours depending on model size and bandwidth.");
+    eprintln!();
+    eprintln!("For now, run indexing from the ndarray test suite:");
+    let snake = model.replace('-', "_");
+    eprintln!(
+        " cd ../../../ndarray && cargo test -p ndarray --lib test_index_{snake} --release -- --ignored --nocapture"
+    );
+    eprintln!();
+    eprintln!("Then copy the shards:");
+
+    let dir = model_dir(model);
+    let stem = snake.replace("distilled_", "");
+    for shard in 0..entry.shards {
+        // NOTE(review): the original filled the middle `{}` from an if/else whose
+        // branches were both "", which yields the double underscore below. The
+        // printed path is kept unchanged; confirm the name the indexer really writes.
+        // Source shards are numbered from 1, destination shards from 0.
+        let src = format!("/tmp/{stem}__shard{:02}.bgz7", shard + 1);
+        let dest = dir.join(format!("shard-{shard:02}.bgz7"));
+        eprintln!(" cp {} {}", src, dest.display());
+    }
+}
+
+/// `--verify MODEL`: checks that every shard exists, is non-empty and, where
+/// the manifest lists a digest, matches its SHA256.
+///
+/// Exits with status 1 if any shard is missing, empty, unreadable or
+/// mismatched. Shards without a manifest digest do not fail the run, but
+/// they are counted so the summary does not claim they were verified.
+fn cmd_verify(manifest: &manifest::Manifest, model: &str) {
+    let entry = model_entry_or_exit(manifest, model);
+
+    let mut all_ok = true;
+    let mut unchecked = 0usize;
+    for shard in 0..entry.shards {
+        let filename = format!("shard-{shard:02}.bgz7");
+        let path = bgz7_path(model, shard);
+
+        let size = match fs::metadata(&path) {
+            Ok(meta) => meta.len(),
+            Err(_) => {
+                println!(" {filename}: MISSING");
+                all_ok = false;
+                continue;
+            }
+        };
+        if size == 0 {
+            println!(" {filename}: EMPTY (0 bytes)");
+            all_ok = false;
+            continue;
+        }
+
+        let Some(expected) = entry.sha256.get(&filename) else {
+            println!(" {filename}: present ({size} bytes, no SHA256 in manifest yet)");
+            unchecked += 1;
+            continue;
+        };
+        match verify_sha256(&path, expected) {
+            Ok(true) => println!(" {filename}: OK ({size} bytes)"),
+            Ok(false) => {
+                println!(" {filename}: SHA256 MISMATCH ({size} bytes)");
+                all_ok = false;
+            }
+            Err(e) => {
+                println!(" {filename}: ERROR: {e}");
+                all_ok = false;
+            }
+        }
+    }
+
+    if !all_ok {
+        println!("Some shards missing or corrupt.");
+        process::exit(1);
+    }
+    if unchecked == 0 {
+        println!("All {n} shards verified.", n = entry.shards);
+    } else {
+        println!(
+            "All {n} shards present; {unchecked} not checksummed (no SHA256 in manifest).",
+            n = entry.shards
+        );
+    }
+}
diff --git a/crates/bgz-tensor/src/lib.rs b/crates/bgz-tensor/src/lib.rs
index 28b710c4..7d69f76c 100644
--- a/crates/bgz-tensor/src/lib.rs
+++ b/crates/bgz-tensor/src/lib.rs
@@ -61,10 +61,14 @@
 
 pub mod attention;
 pub mod cascade;
+pub mod hhtl_cache;
 pub mod palette;
 pub mod projection;
 pub mod quality;
 
+#[cfg(feature = "hydrate")]
+pub mod manifest;
+
 // ─── Re-exports ──────────────────────────────────────────────────────────────
 
 pub use attention::{AttentionSemiring, AttentionTable, CompiledHead, ComposeTable};
diff --git a/crates/bgz-tensor/src/manifest.rs b/crates/bgz-tensor/src/manifest.rs
new file mode 100644
index 00000000..d1d5d98f
--- /dev/null
+++ b/crates/bgz-tensor/src/manifest.rs
@@ -0,0 +1,99 @@
+//! Manifest + hydration helpers (feature-gated behind `hydrate`).
+//!
+//! The library itself is zero-dep. This module only compiles when
+//! `--features hydrate` is active (for the `hydrate` binary).
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::io;
+use std::path::{Path, PathBuf};
+
+/// Where bgz-tensor data lives relative to crate root.
+pub const DATA_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/data");
+/// Where the palette files read by [`read_palette`] live relative to crate root.
+pub const PALETTES_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/palettes");
+
+/// Parsed form of `data/manifest.json`: one [`ModelEntry`] per model name.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Manifest {
+    /// Model name → description of its shards.
+    pub models: HashMap<String, ModelEntry>,
+}
+
+/// Manifest record describing one model's bgz7 shards.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ModelEntry {
+    /// Origin of the model; shown verbatim by the `hydrate` binary.
+    pub source: String,
+    /// Format label carried by the manifest (not interpreted in this module).
+    pub format: String,
+    /// Number of shard files, numbered `0..shards`.
+    pub shards: usize,
+    /// Byte count of the shards; presumably the sum over all shards — TODO confirm.
+    pub total_bytes_bgz7: u64,
+    /// Release tag the `hydrate` binary downloads the shards from.
+    pub release_tag: String,
+    /// Shard file name (e.g. `shard-00.bgz7`) → expected SHA256 hex digest.
+    pub sha256: HashMap<String, String>,
+}
+
+/// Runtime path to a bgz7 shard. Compiles without the file existing.
+///
+/// The path is `DATA_DIR/<model>/shard-NN.bgz7`, with the shard index
+/// zero-padded to two digits.
+pub fn bgz7_path(model: &str, shard: usize) -> PathBuf {
+    Path::new(DATA_DIR)
+        .join(model)
+        .join(format!("shard-{shard:02}.bgz7"))
+}
+
+/// Check if a model's data is hydrated (all shards present and non-empty).
+///
+/// A `shard_count` of 0 is vacuously hydrated.
+pub fn is_hydrated(model: &str, shard_count: usize) -> bool {
+    (0..shard_count).all(|i| {
+        // `metadata` fails for a missing file, so no separate existence check is needed.
+        std::fs::metadata(bgz7_path(model, i))
+            .map(|m| m.len() > 0)
+            .unwrap_or(false)
+    })
+}
+
+/// Load manifest from data/manifest.json.
+///
+/// # Errors
+///
+/// Returns the underlying I/O error kind if the file cannot be read, or
+/// [`io::ErrorKind::InvalidData`] if it is not valid manifest JSON. In both
+/// cases the message includes the manifest path.
+pub fn load_manifest() -> io::Result<Manifest> {
+    let path = Path::new(DATA_DIR).join("manifest.json");
+    let data = std::fs::read_to_string(&path)
+        .map_err(|e| io::Error::new(e.kind(), format!("{}: {e}", path.display())))?;
+    serde_json::from_str(&data).map_err(|e| {
+        io::Error::new(io::ErrorKind::InvalidData, format!("{}: {e}", path.display()))
+    })
+}
+
+/// Read a palette file (always present, committed to git).
+///
+/// `name` is joined onto [`PALETTES_DIR`] and the file is returned as raw bytes.
+///
+/// # Errors
+///
+/// Returns the I/O error from reading the file.
+pub fn read_palette(name: &str) -> io::Result<Vec<u8>> {
+    std::fs::read(Path::new(PALETTES_DIR).join(name))
+}
+
+/// Which models are enabled by feature flags.
+///
+/// No feature = palette-only (zero download).
+/// Consumer picks what they need:
+/// ```toml
+/// bgz-tensor = { path = "...", features = ["qwen35-9b"] } # 80 MB
+/// bgz-tensor = { path = "...", features = ["qwen35-9b", "qwen35-27b-v2"] } # 254 MB
+/// ```
+///
+/// The returned names are sorted and de-duplicated.
+pub fn enabled_models() -> Vec<&'static str> {
+    let mut models = Vec::new();
+
+    if cfg!(feature = "qwen35-9b") {
+        models.extend(["qwen35-9b-base", "qwen35-9b-distilled"]);
+    }
+    if cfg!(feature = "qwen35-27b-v1") {
+        models.extend(["qwen35-27b-base", "qwen35-27b-distilled-v1"]);
+    }
+    if cfg!(feature = "qwen35-27b-v2") {
+        models.extend(["qwen35-27b-base", "qwen35-27b-distilled-v2"]);
+    }
+
+    // Deduplicate (base appears in multiple features)
+    models.sort_unstable();
+    models.dedup();
+    models
+}
+
+/// Check if a model is enabled by feature flags.
+pub fn is_enabled(model: &str) -> bool {
+    enabled_models().contains(&model)
+}
+
+/// Verify SHA256 of a file against expected hash.
+///
+/// The file is hashed in fixed-size chunks rather than being read into
+/// memory in one piece. `expected` is a hex digest; surrounding whitespace
+/// and letter case are ignored when comparing.
+///
+/// # Errors
+///
+/// Returns the I/O error if the file cannot be opened or read.
+pub fn verify_sha256(path: &Path, expected: &str) -> io::Result<bool> {
+    use sha2::{Digest, Sha256};
+    use std::io::Read;
+
+    let mut file = std::fs::File::open(path)?;
+    let mut hasher = Sha256::new();
+    let mut buf = [0u8; 64 * 1024];
+    loop {
+        let n = match file.read(&mut buf) {
+            Ok(0) => break,
+            Ok(n) => n,
+            // A read interrupted by a signal is not a failure; retry it.
+            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        };
+        hasher.update(&buf[..n]);
+    }
+
+    let hash = format!("{:x}", hasher.finalize());
+    Ok(hash.eq_ignore_ascii_case(expected.trim()))
+}