From cdc84ec4b798d74ff298702b6d9d1bd8aabdf510 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 17:46:47 +0000 Subject: [PATCH 1/6] salvage(sprint-13/W-I1): D-CSV-13b i4_eval::batch impl + criterion scaffold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recovered W-I1 working tree state that never reached git: the previous worker (134 tool uses, ~37 min) burned its quota mid-implementation and exited without staging or committing. The work was held only in: - working-tree uncommitted edits to mul.rs (+799/-36 LOC) - new untracked file benches/i4_batch.rs (180 LOC) - stray ELF scratch binaries (check_avx, test_avx512*, test_srli2) - a `doc/` rustdoc dump The binaries and rustdoc were removed; the source is preserved here so the retry worker (post 15:30-UTC quota reset) can resume from this point instead of from zero. What landed: 1. `GateDecision::to_disc(&self) -> u8` — SIMD-packable byte mapping (0=Flow, 1=Hold, 2=Block). The variant payloads (String reasons) prevent `#[repr(u8)]`; the manual discriminant lets the batch path stay branch-free. 2. `mul::i4_eval::batch` module — five batch entry points with runtime SIMD dispatch via `simd_caps()` (OQ-CSV-13). One binary runs on any host; AVX-512BW / NEON / scalar all coexist: - `dk_position_batch` - `trust_texture_batch` - `flow_state_batch` - `gate_decision_disc_batch` (u8 fast path) - `gate_decision_batch` (full GateDecision with reason strings, scalar-only — carve-out documented) - `mul_assess_batch` Each has an AVX-512 `#[cfg(target_arch = "x86_64")]` arm, an aarch64 NEON arm, and a `scalar_impl` fallback submodule with the same function names. 3. 
`benches/i4_batch.rs` — Criterion benchmark scaffold targeting the SHIP/LAND gates from the spec: - SHIP: ≥4× AVX-512 vs scalar for dk/trust/flow/gate_disc at 1024 - LAND: ≥2× (records TD-D-CSV-13b-PERF-FLOOR-1 if 2≤x<4) - mul_assess target: ≥2.5× (limited by scalar f64 finalize) Sweeps batch sizes [8, 64, 1024, 16384] per fn. Validation gap (the work the worker never got to): - `cargo check -p lance-graph-contract` → CLEAN (one dead-code warning for `SimdCapsShim::neon` field, benign — retry worker can either use the field or drop it). - `cargo test -p lance-graph-contract i4_eval::batch` → 0 tests; the worker did not write unit tests for the new batch fns. Tests must be added on retry against the scalar reference (i.e. assert dispatch output equals `scalar_impl` output element-wise for randomised input). - `cargo bench` on benches/i4_batch.rs will NOT compile until `criterion` is added to `[dev-dependencies]` in lance-graph-contract/Cargo.toml. Intentionally left absent here — adding the dep belongs to the retry commit that also adds the unit tests. Branch is not for merge as-is; it's a seed state for the retry worker. https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- .../lance-graph-contract/benches/i4_batch.rs | 180 ++++ crates/lance-graph-contract/src/mul.rs | 835 +++++++++++++++++- 2 files changed, 979 insertions(+), 36 deletions(-) create mode 100644 crates/lance-graph-contract/benches/i4_batch.rs diff --git a/crates/lance-graph-contract/benches/i4_batch.rs b/crates/lance-graph-contract/benches/i4_batch.rs new file mode 100644 index 00000000..c73208c1 --- /dev/null +++ b/crates/lance-graph-contract/benches/i4_batch.rs @@ -0,0 +1,180 @@ +//! Criterion benchmarks for i4_eval batch functions — D-CSV-13b +//! +//! SHIP gate: ≥4× AVX-512 vs scalar for dk/trust/flow/gate_disc at batch 1024. +//! LAND gate: ≥2× (records TD-D-CSV-13b-PERF-FLOOR-1 if ≥2 but <4). +//! mul_assess target: ≥2.5× (limited by scalar f64 finalize stage). 
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use lance_graph_contract::mul::i4_eval::batch;
+use lance_graph_contract::qualia::QualiaI4_16D;
+
+fn make_inputs(n: usize) -> (Vec<QualiaI4_16D>, Vec<i8>) {
+    let template: &[(i8, i8)] = &[
+        (7, 5),
+        (5, 4),
+        (3, 3),
+        (2, 2),
+        (0, 2),
+        (-1, 1),
+        (-3, -2),
+        (-5, -4),
+        (6, 0),
+        (1, -1),
+        (4, 4),
+        (-2, -3),
+        (3, 1),
+        (0, -1),
+        (7, -5),
+        (-4, 4),
+    ];
+    let mut qualia = Vec::with_capacity(n);
+    let mut mantissas = Vec::with_capacity(n);
+    for i in 0..n {
+        let (coh, mant) = template[i % template.len()];
+        let mut q = QualiaI4_16D::ZERO;
+        q.set(9, coh); // dim 9 = coherence
+        q.set(3, coh.saturating_add(1).clamp(-8, 7)); // warmth
+        q.set(14, coh.saturating_sub(1).clamp(-8, 7)); // groundedness
+        q.set(2, (mant.abs() % 6).clamp(0, 7)); // tension
+        q.set(1, mant.clamp(-8, 7)); // valence
+        qualia.push(q);
+        mantissas.push(mant);
+    }
+    (qualia, mantissas)
+}
+
+fn bench_dk_position(c: &mut Criterion) {
+    let mut group = c.benchmark_group("dk_position_batch");
+    for &size in &[8usize, 64, 1024, 16384] {
+        let (q, m) = make_inputs(size);
+        let mut out = vec![lance_graph_contract::mul::DkPosition::MountStupid; size];
+
+        group.throughput(Throughput::Elements(size as u64));
+
+        group.bench_with_input(BenchmarkId::new("dispatch", size), &size, |b, _| {
+            b.iter(|| {
+                batch::dk_position_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |b, _| {
+            b.iter(|| {
+                batch::scalar_impl::dk_position_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_trust_texture(c: &mut Criterion) {
+    let mut group = c.benchmark_group("trust_texture_batch");
+    for &size in &[8usize, 64, 1024, 16384] {
+        let (q, _) = make_inputs(size);
+        let mut out = vec![lance_graph_contract::mul::TrustTexture::Calibrated; size];
+
+        group.throughput(Throughput::Elements(size as u64));
+
+        group.bench_with_input(BenchmarkId::new("dispatch", size), &size, |b, _| {
+            b.iter(|| {
+                batch::trust_texture_batch(&q, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |b, _| {
+            b.iter(|| {
+                batch::scalar_impl::trust_texture_batch(&q, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_flow_state(c: &mut Criterion) {
+    let mut group = c.benchmark_group("flow_state_batch");
+    for &size in &[8usize, 64, 1024, 16384] {
+        let (q, m) = make_inputs(size);
+        let mut out = vec![lance_graph_contract::mul::FlowState::Boredom; size];
+
+        group.throughput(Throughput::Elements(size as u64));
+
+        group.bench_with_input(BenchmarkId::new("dispatch", size), &size, |b, _| {
+            b.iter(|| {
+                batch::flow_state_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |b, _| {
+            b.iter(|| {
+                batch::scalar_impl::flow_state_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_gate_decision_disc(c: &mut Criterion) {
+    let mut group = c.benchmark_group("gate_decision_disc_batch");
+    for &size in &[8usize, 64, 1024, 16384] {
+        let (q, m) = make_inputs(size);
+        let mut out = vec![0u8; size];
+
+        group.throughput(Throughput::Elements(size as u64));
+
+        group.bench_with_input(BenchmarkId::new("dispatch", size), &size, |b, _| {
+            b.iter(|| {
+                batch::gate_decision_disc_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |b, _| {
+            b.iter(|| {
+                batch::scalar_impl::gate_decision_disc_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_mul_assess(c: &mut Criterion) {
+    let mut group = c.benchmark_group("mul_assess_batch");
+    for &size in &[8usize, 64, 1024, 16384] {
+        let (q, m) = make_inputs(size);
+        let dummy = || lance_graph_contract::mul::i4_eval::mul_assess_i4(&QualiaI4_16D::ZERO, 0);
+        let mut out: Vec<_> = (0..size).map(|_| dummy()).collect();
+
+        group.throughput(Throughput::Elements(size as u64));
+
+        group.bench_with_input(BenchmarkId::new("dispatch", size), &size, |b, _| {
+            b.iter(|| {
+                batch::mul_assess_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("scalar", size), &size, |b, _| {
+            b.iter(|| {
+                batch::scalar_impl::mul_assess_batch(&q, &m, &mut out);
+                std::hint::black_box(&out);
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_dk_position,
+    bench_trust_texture,
+    bench_flow_state,
+    bench_gate_decision_disc,
+    bench_mul_assess,
+);
+criterion_main!(benches);
diff --git a/crates/lance-graph-contract/src/mul.rs b/crates/lance-graph-contract/src/mul.rs
index 47ceafcb..08d70997 100644
--- a/crates/lance-graph-contract/src/mul.rs
+++ b/crates/lance-graph-contract/src/mul.rs
@@ -118,6 +118,10 @@ pub enum FlowState {
 }
 
 /// Gate decision: should the system proceed, pause, or block?
+///
+/// Cannot be `#[repr(u8)]` because `Hold` and `Block` carry `String` payloads.
+/// Use [`GateDecision::to_disc`] for the SIMD-packable byte discriminant, or
+/// [`i4_eval::batch::gate_decision_disc_batch`] for bulk processing.
 #[derive(Debug, Clone)]
 pub enum GateDecision {
     /// Proceed with full autonomy.
@@ -128,6 +132,20 @@ pub enum GateDecision {
     Block { reason: String },
 }
 
+impl GateDecision {
+    /// Return the discriminant as a SIMD-packable byte (D-CSV-13b).
+    ///
+    /// Mapping is locked: 0 = Flow, 1 = Hold, 2 = Block.
+    #[inline]
+    pub fn to_disc(&self) -> u8 {
+        match self {
+            GateDecision::Flow => 0,
+            GateDecision::Hold { .. } => 1,
+            GateDecision::Block { .. } => 2,
+        }
+    }
+}
+
 /// Compass result: surface-to-meta transition detection.
#[derive(Debug, Clone)] pub struct CompassResult { @@ -626,71 +644,816 @@ pub mod i4_eval { // ═══════════════════════════════════════════════════════════════════════ - // Batch evaluation API — D-CSV-13 (sprint-12) + // Batch evaluation API — D-CSV-13b (sprint-13) — SIMD runtime dispatch // ═══════════════════════════════════════════════════════════════════════ - /// Batch evaluation API for D-CSV-13. - /// Processes N (qualia, mantissa) pairs in one call. Shape is SIMD-friendly: - /// outputs are produced into pre-allocated `&mut [T]` buffers parallel to the - /// inputs. Sprint-13+ replaces the scalar inner loop with AVX-512 i4 lane - /// intrinsics; the API surface defined here is the contract that vectorization - /// targets. + /// Batch evaluation API (D-CSV-13b, Sprint-13). + /// + /// Runtime SIMD dispatch via `simd_caps()` (OQ-CSV-13). No compile-time + /// `cfg(target_feature)` — one binary runs on any host. Falls back to + /// `scalar_impl` when AVX-512BW or NEON is absent. + /// + /// # Gate-decision carve-out + /// `GateDecision` carries a `String` payload and cannot be `#[repr(u8)]`. + /// `gate_decision_disc_batch` fills `out: &mut [u8]` (0=Flow, 1=Hold, 2=Block) + /// for SIMD-fast callers. `gate_decision_batch` fills `out` with full + /// `GateDecision` values (reason strings included) via the scalar path. pub mod batch { use super::*; + // ───────────────────────────────────────────────────────────────────── + // Runtime SIMD capability detection (zero-dep, OQ-CSV-13) + // ───────────────────────────────────────────────────────────────────── + use core::sync::atomic::{AtomicU8, Ordering}; + + /// Packed capability flags stored in a single atomic byte. + /// Bit 0 = avx512f, Bit 1 = avx512bw, Bit 2 = neon. + /// Value 0xFF = not yet probed. 
+        static CAPS_CACHE: AtomicU8 = AtomicU8::new(0xFF);
+
+        #[derive(Clone, Copy)]
+        struct SimdCapsShim {
+            avx512f: bool,
+            avx512bw: bool,
+            neon: bool,
+        }
+
+        #[cold]
+        fn probe_caps() -> SimdCapsShim {
+            let avx512f;
+            let avx512bw;
+            let neon;
+
+            #[cfg(target_arch = "x86_64")]
+            {
+                avx512f = is_x86_feature_detected!("avx512f");
+                avx512bw = is_x86_feature_detected!("avx512bw");
+                neon = false;
+            }
+            #[cfg(target_arch = "aarch64")]
+            {
+                avx512f = false;
+                avx512bw = false;
+                // NEON is mandatory on aarch64.
+                neon = is_aarch64_feature_detected!("neon");
+            }
+            #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+            {
+                avx512f = false;
+                avx512bw = false;
+                neon = false;
+            }
+
+            let bits: u8 = (avx512f as u8) | ((avx512bw as u8) << 1) | ((neon as u8) << 2);
+            CAPS_CACHE.store(bits, Ordering::Relaxed);
+            SimdCapsShim { avx512f, avx512bw, neon }
+        }
+
+        #[inline]
+        fn simd_caps() -> SimdCapsShim {
+            let bits = CAPS_CACHE.load(Ordering::Relaxed);
+            if bits == 0xFF {
+                return probe_caps();
+            }
+            SimdCapsShim {
+                avx512f: bits & 1 != 0,
+                avx512bw: bits & 2 != 0,
+                neon: bits & 4 != 0,
+            }
+        }
+
+        // ─────────────────────────────────────────────────────────────────────
+        // scalar_impl — correctness anchor; `pub` so benches/i4_batch.rs can call it
+        // ─────────────────────────────────────────────────────────────────────
+        pub mod scalar_impl {
+            use super::super::*;
+            use crate::qualia::QualiaI4_16D;
+
+            pub fn dk_position_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [DkPosition],
+            ) {
+                for i in 0..qualia.len() {
+                    out[i] = dk_position_i4(&qualia[i], mantissas[i]);
+                }
+            }
+
+            pub fn trust_texture_batch(qualia: &[QualiaI4_16D], out: &mut [TrustTexture]) {
+                for i in 0..qualia.len() {
+                    out[i] = trust_texture_i4(&qualia[i]);
+                }
+            }
+
+            pub fn flow_state_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [FlowState],
+            ) {
+                for i in 0..qualia.len() {
+                    out[i] = flow_state_i4(&qualia[i], mantissas[i]);
+                }
+            }
+
+            /// Returns discriminants: 0=Flow, 1=Hold, 2=Block.
+            pub fn gate_decision_disc_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [u8],
+            ) {
+                for i in 0..qualia.len() {
+                    out[i] = gate_decision_i4(&qualia[i], mantissas[i]).to_disc();
+                }
+            }
+
+            pub fn mul_assess_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [MulAssessment],
+            ) {
+                for i in 0..qualia.len() {
+                    out[i] = mul_assess_i4(&qualia[i], mantissas[i]);
+                }
+            }
+        }
+
+        // ─────────────────────────────────────────────────────────────────────
+        // avx512_impl — AVX-512F + BW i4 intrinsics (D-CSV-13b)
+        // ─────────────────────────────────────────────────────────────────────
+        #[cfg(target_arch = "x86_64")]
+        pub(crate) mod avx512_impl {
+            use super::super::*;
+            use crate::qualia::QualiaI4_16D;
+            use core::arch::x86_64::*;
+
+            /// Extract one i4 dimension (at nibble offset `SHIFT` bits) from each
+            /// u64 lane of an 8-lane __m512i, sign-extended across the whole i64 lane.
+            ///
+            /// `SHIFT` must be a compile-time constant (required by `_mm512_srli_epi64`).
+            ///
+            /// # SAFETY
+            /// Caller must have verified avx512f + avx512bw at runtime before calling
+            /// any function in this module.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            #[inline]
+            unsafe fn extract_dim_i8<const SHIFT: u32>(q_vec: __m512i) -> __m512i {
+                // Shift the target nibble to bits [3:0] of each i64 lane.
+                let shifted = _mm512_srli_epi64(q_vec, SHIFT);
+                // Mask to the 4-bit nibble.
+                let mask_f = _mm512_set1_epi64(0xF);
+                let nibble = _mm512_and_si512(shifted, mask_f);
+                // Sign-extend the nibble across the full 64-bit lane:
+                // every caller compares the result with `_mm512_cmp*_epi64_mask`, so a
+                // 16-bit-only extension (slli/srai_epi16) would leave bits [63:16] zero
+                // and make every negative dimension read as a large positive i64.
+                // Shift left by 60 puts the nibble sign bit (bit 3) into bit 63;
+                // the arithmetic shift right by 60 then replicates it through [63:4].
+                let up = _mm512_slli_epi64(nibble, 60);
+                _mm512_srai_epi64(up, 60)
+            }
+
+            /// Store the low byte of each i64 lane (8 bytes) into `out[0..8]`.
+            ///
+            /// Avoids VBMI2 `_mm512_mask_compressstoreu_epi8` — not available on
+            /// Skylake-X/Cascade Lake. Uses scalar byte-extract from a stack buffer
+            /// (spec §8 R-6, TD-D-CSV-13b-VBMI2-1).
+            ///
+            /// # SAFETY
+            /// `out` must point to at least 8 writable bytes; avx512f verified at runtime.
+            #[target_feature(enable = "avx512f")]
+            #[inline]
+            unsafe fn extract_8_lane0_bytes(result: __m512i, out: *mut u8) {
+                let mut buf = [0u8; 64];
+                _mm512_storeu_si512(buf.as_mut_ptr() as *mut __m512i, result);
+                for j in 0..8usize {
+                    *out.add(j) = buf[j * 8];
+                }
+            }
+
+            /// Batch DK position — AVX-512 path (8 elements per iteration).
+            ///
+            /// # SAFETY
+            /// avx512f + avx512bw must be verified at runtime via `simd_caps()`;
+            /// `qualia.len() == mantissas.len() == out.len()` asserted by caller;
+            /// `qualia.len() >= 8`.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            pub unsafe fn dk_position_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [DkPosition],
+            ) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 8 <= n {
+                    // SAFETY: QualiaI4_16D is repr(C, align(8)); 8 consecutive elements
+                    // occupy exactly 64 bytes — one __m512i word.
+                    let q_ptr = qualia[i..].as_ptr() as *const __m512i;
+                    let q_vec = _mm512_loadu_si512(q_ptr);
+                    let coh = extract_dim_i8::<36>(q_vec); // DIM_COHERENCE=9 → nibble shift 36
+
+                    let m_ptr = mantissas.as_ptr().add(i);
+                    let man_vec = _mm512_set_epi64(
+                        *m_ptr.add(7) as i64, *m_ptr.add(6) as i64,
+                        *m_ptr.add(5) as i64, *m_ptr.add(4) as i64,
+                        *m_ptr.add(3) as i64, *m_ptr.add(2) as i64,
+                        *m_ptr.add(1) as i64, *m_ptr.add(0) as i64,
+                    );
+                    let zero = _mm512_setzero_si512();
+                    let neg_man = _mm512_sub_epi64(zero, man_vec);
+                    let man_neg_mask = _mm512_cmplt_epi64_mask(man_vec, zero);
+                    let abs_man = _mm512_mask_blend_epi64(man_neg_mask, man_vec, neg_man);
+
+                    // Priority chain (lowest to highest):
+                    // Default = MountStupid (0)
+                    let mut disc = _mm512_setzero_si512();
+                    // ValleyOfDespair (1): coherence <= -3 OR abs_man <= 1
+                    let vod = _mm512_cmple_epi64_mask(coh, _mm512_set1_epi64(-3))
+                        | _mm512_cmple_epi64_mask(abs_man, _mm512_set1_epi64(1));
+                    disc = _mm512_mask_blend_epi64(vod, disc, _mm512_set1_epi64(1));
+                    // SlopeOfEnlightenment (2): coh >= 2 AND abs_man >= 2
+                    let soe = _mm512_cmpge_epi64_mask(coh, _mm512_set1_epi64(2))
+                        & _mm512_cmpge_epi64_mask(abs_man, _mm512_set1_epi64(2));
+                    disc = _mm512_mask_blend_epi64(soe, disc, _mm512_set1_epi64(2));
+                    // Plateau (3): coh >= 5 AND abs_man >= 4 (overrides all)
+                    let plat = _mm512_cmpge_epi64_mask(coh, _mm512_set1_epi64(5))
+                        & _mm512_cmpge_epi64_mask(abs_man, _mm512_set1_epi64(4));
+                    disc = _mm512_mask_blend_epi64(plat, disc, _mm512_set1_epi64(3));
+
+                    let out_ptr = out.as_mut_ptr().add(i) as *mut u8;
+                    extract_8_lane0_bytes(disc, out_ptr);
+                    i += 8;
+                }
+                while i < n {
+                    out[i] = super::super::dk_position_i4(&qualia[i], mantissas[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch TrustTexture — AVX-512 path (8 elements per iteration).
+            ///
+            /// # SAFETY
+            /// avx512f + avx512bw verified at runtime; lengths asserted by caller.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            pub unsafe fn trust_texture_batch(qualia: &[QualiaI4_16D], out: &mut [TrustTexture]) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 8 <= n {
+                    // SAFETY: QualiaI4_16D is repr(C, align(8)); 8 consecutive elements
+                    // occupy exactly 64 bytes — one __m512i word.
+                    let q_ptr = qualia[i..].as_ptr() as *const __m512i;
+                    let q_vec = _mm512_loadu_si512(q_ptr);
+                    let coh = extract_dim_i8::<36>(q_vec); // DIM_COHERENCE=9
+                    let val = extract_dim_i8::<4>(q_vec); // DIM_VALENCE=1
+                    let ten = extract_dim_i8::<8>(q_vec); // DIM_TENSION=2
+
+                    // Default = Calibrated (0)
+                    let mut disc = _mm512_setzero_si512();
+                    // Underconfident (3): valence <= -3
+                    let und = _mm512_cmple_epi64_mask(val, _mm512_set1_epi64(-3));
+                    disc = _mm512_mask_blend_epi64(und, disc, _mm512_set1_epi64(3));
+                    // Overconfident (1): valence >= 4 AND coherence < 5
+                    let ovc = _mm512_cmpge_epi64_mask(val, _mm512_set1_epi64(4))
+                        & _mm512_cmplt_epi64_mask(coh, _mm512_set1_epi64(5));
+                    disc = _mm512_mask_blend_epi64(ovc, disc, _mm512_set1_epi64(1));
+                    // Uncertain (2): coherence <= -3 AND tension >= 3 (highest priority)
+                    let unc = _mm512_cmple_epi64_mask(coh, _mm512_set1_epi64(-3))
+                        & _mm512_cmpge_epi64_mask(ten, _mm512_set1_epi64(3));
+                    disc = _mm512_mask_blend_epi64(unc, disc, _mm512_set1_epi64(2));
+
+                    let out_ptr = out.as_mut_ptr().add(i) as *mut u8;
+                    extract_8_lane0_bytes(disc, out_ptr);
+                    i += 8;
+                }
+                while i < n {
+                    out[i] = super::super::trust_texture_i4(&qualia[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch FlowState — AVX-512 path (8 elements per iteration).
+            ///
+            /// # SAFETY
+            /// avx512f + avx512bw verified at runtime; lengths asserted by caller.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            pub unsafe fn flow_state_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [FlowState],
+            ) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 8 <= n {
+                    // SAFETY: QualiaI4_16D is repr(C, align(8)); 8 consecutive elements
+                    // occupy exactly 64 bytes — one __m512i word.
+                    let q_ptr = qualia[i..].as_ptr() as *const __m512i;
+                    let q_vec = _mm512_loadu_si512(q_ptr);
+                    let war = extract_dim_i8::<12>(q_vec); // DIM_WARMTH=3
+                    let grd = extract_dim_i8::<56>(q_vec); // DIM_GROUNDEDNESS=14
+                    let ten = extract_dim_i8::<8>(q_vec); // DIM_TENSION=2
+                    let coh = extract_dim_i8::<36>(q_vec); // DIM_COHERENCE=9
+
+                    // flow_proxy = warmth + groundedness - tension; i64 lanes, range [-23, 22].
+                    let fp = _mm512_sub_epi64(_mm512_add_epi64(war, grd), ten);
+
+                    let m_ptr = mantissas.as_ptr().add(i);
+                    let man_vec = _mm512_set_epi64(
+                        *m_ptr.add(7) as i64, *m_ptr.add(6) as i64,
+                        *m_ptr.add(5) as i64, *m_ptr.add(4) as i64,
+                        *m_ptr.add(3) as i64, *m_ptr.add(2) as i64,
+                        *m_ptr.add(1) as i64, *m_ptr.add(0) as i64,
+                    );
+                    let zero = _mm512_setzero_si512();
+
+                    // Pre-compute Anxiety condition (applied last for highest priority).
+                    let anx = _mm512_cmple_epi64_mask(fp, _mm512_set1_epi64(-2))
+                        | (_mm512_cmplt_epi64_mask(man_vec, zero)
+                            & _mm512_cmple_epi64_mask(coh, _mm512_set1_epi64(-1)));
+
+                    // Default = Boredom (1)
+                    let mut disc = _mm512_set1_epi64(1);
+                    // Transition (2): fp >= 2 AND man > 0
+                    let tra = _mm512_cmpge_epi64_mask(fp, _mm512_set1_epi64(2))
+                        & _mm512_cmpgt_epi64_mask(man_vec, zero);
+                    disc = _mm512_mask_blend_epi64(tra, disc, _mm512_set1_epi64(2));
+                    // Flow (0): fp >= 4 AND man > 0
+                    let flow = _mm512_cmpge_epi64_mask(fp, _mm512_set1_epi64(4))
+                        & _mm512_cmpgt_epi64_mask(man_vec, zero);
+                    disc = _mm512_mask_blend_epi64(flow, disc, _mm512_set1_epi64(0));
+                    // Anxiety (3): always overrides (highest priority)
+                    disc = _mm512_mask_blend_epi64(anx, disc, _mm512_set1_epi64(3));
+
+                    let out_ptr = out.as_mut_ptr().add(i) as *mut u8;
+                    extract_8_lane0_bytes(disc, out_ptr);
+                    i += 8;
+                }
+                while i < n {
+                    out[i] = super::super::flow_state_i4(&qualia[i], mantissas[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch gate decision discriminants — AVX-512 path (8 elements per iteration).
+            ///
+            /// Gate LUT (tex_disc * 4 + flow_disc):
+            /// ```text
+            ///          Flow(0) Boredom(1) Transition(2) Anxiety(3)
+            /// Cal(0):    0        1           0            1
+            /// Ovc(1):    1        1           1            1
+            /// Unc(2):    2        2           2            2
+            /// Und(3):    0        1           0            2
+            /// ```
+            ///
+            /// # SAFETY
+            /// avx512f + avx512bw verified at runtime; lengths asserted by caller.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            pub unsafe fn gate_decision_disc_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [u8],
+            ) {
+                // LUT index = tex_disc * 4 + flow_disc.
+                // tex: Cal=0, Ovc=1, Unc=2, Und=3; flow: Flow=0, Bor=1, Tra=2, Anx=3.
+                const LUT: [u8; 16] = [
+                    0, 1, 0, 1, // Cal
+                    1, 1, 1, 1, // Ovc
+                    2, 2, 2, 2, // Unc
+                    0, 1, 0, 2, // Und
+                ];
+                let n = qualia.len();
+                let mut i = 0usize;
+                let mut tex_disc = [0u8; 8];
+                let mut flow_disc = [0u8; 8];
+                while i + 8 <= n {
+                    // SAFETY: TrustTexture/FlowState are repr(u8); pointers derived from
+                    // properly-allocated arrays of the right size.
+                    let tex_slice = core::slice::from_raw_parts_mut(
+                        tex_disc.as_mut_ptr() as *mut TrustTexture, 8,
+                    );
+                    trust_texture_batch(&qualia[i..i + 8], tex_slice);
+                    let flow_slice = core::slice::from_raw_parts_mut(
+                        flow_disc.as_mut_ptr() as *mut FlowState, 8,
+                    );
+                    flow_state_batch(&qualia[i..i + 8], &mantissas[i..i + 8], flow_slice);
+                    for j in 0..8usize {
+                        let idx = (tex_disc[j] as usize) * 4 + (flow_disc[j] as usize);
+                        out[i + j] = LUT[idx];
+                    }
+                    i += 8;
+                }
+                while i < n {
+                    out[i] = super::super::gate_decision_i4(&qualia[i], mantissas[i]).to_disc();
+                    i += 1;
+                }
+            }
+
+            /// Batch MulAssessment — AVX-512 path.
+            ///
+            /// Uses SIMD for disc fields then scalar finalization for f64 fields.
+            ///
+            /// # SAFETY
+            /// avx512f + avx512bw verified at runtime; lengths asserted by caller.
+            #[target_feature(enable = "avx512f,avx512bw")]
+            pub unsafe fn mul_assess_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [MulAssessment],
+            ) {
+                let n = qualia.len();
+                let mut dk_disc = vec![0u8; n];
+                let mut tex_disc = vec![0u8; n];
+                let mut flow_disc = vec![0u8; n];
+
+                // SAFETY: DkPosition/TrustTexture/FlowState are repr(u8) with discriminants 0..3;
+                // vec storage is properly aligned and has length n.
+                dk_position_batch(
+                    qualia, mantissas,
+                    core::slice::from_raw_parts_mut(dk_disc.as_mut_ptr() as *mut DkPosition, n),
+                );
+                trust_texture_batch(
+                    qualia,
+                    core::slice::from_raw_parts_mut(tex_disc.as_mut_ptr() as *mut TrustTexture, n),
+                );
+                flow_state_batch(
+                    qualia, mantissas,
+                    core::slice::from_raw_parts_mut(flow_disc.as_mut_ptr() as *mut FlowState, n),
+                );
+
+                for i in 0..n {
+                    // SAFETY: repr(u8) enums with locked discriminants 0..3; values
+                    // were written by the SIMD functions above which only produce 0..3.
+                    let dk: DkPosition = core::mem::transmute(dk_disc[i]);
+                    let texture: TrustTexture = core::mem::transmute(tex_disc[i]);
+                    let flow: FlowState = core::mem::transmute(flow_disc[i]);
+
+                    let intensity = qualia[i].magnitude();
+                    let trust_value: f64 = match texture {
+                        TrustTexture::Calibrated => {
+                            0.75 + (intensity.clamp(0, 7) as f64 / 7.0) * 0.25
+                        }
+                        TrustTexture::Overconfident => 0.45,
+                        TrustTexture::Underconfident => 0.40,
+                        TrustTexture::Uncertain => 0.20,
+                    };
+                    let trust = TrustQualia { value: trust_value, texture };
+                    let coherence = qualia[i].get(9); // DIM_COHERENCE
+                    let complexity_mapped = coherence >= 2;
+                    let tension = qualia[i].get(2); // DIM_TENSION
+                    let allostatic_load =
+                        ((tension as i16 + 8) as f64 / 15.0).clamp(0.0, 1.0);
+                    let homeostasis = Homeostasis { flow_state: flow, allostatic_load };
+                    let dk_factor: f64 = match dk {
+                        DkPosition::MountStupid => 0.3,
+                        DkPosition::ValleyOfDespair => 0.7,
+                        DkPosition::SlopeOfEnlightenment => 0.85,
+                        DkPosition::Plateau => 1.0,
+                    };
+                    let flow_factor: f64 = match flow {
+                        FlowState::Flow => 1.0,
+                        FlowState::Transition => 0.7,
+                        FlowState::Boredom => 0.8,
+                        FlowState::Anxiety => 0.5,
+                    };
+                    let free_will_modifier =
+                        (dk_factor * trust_value * flow_factor).clamp(0.0, 1.0);
+                    out[i] = MulAssessment {
+                        trust,
+                        dk_position: dk,
+                        homeostasis,
+                        complexity_mapped,
+                        free_will_modifier,
+                    };
+                }
+            }
+        } // avx512_impl
+
+        // 
─────────────────────────────────────────────────────────────────────
+        // neon_impl — ARM NEON i4 intrinsics (D-CSV-13b)
+        // ─────────────────────────────────────────────────────────────────────
+        #[cfg(target_arch = "aarch64")]
+        pub(crate) mod neon_impl {
+            use super::super::*;
+            use crate::qualia::QualiaI4_16D;
+            use core::arch::aarch64::*;
+
+            /// Extract one i4 dim from each of two u64 qualia words, sign-extend to i8.
+            ///
+            /// # SAFETY
+            /// NEON is mandatory on aarch64; caller verifies via `is_aarch64_feature_detected!`.
+            #[inline]
+            unsafe fn extract_dim_pair(
+                q0: uint64x2_t,
+                q1: uint64x2_t,
+                shift: i32,
+            ) -> (int8x16_t, int8x16_t) {
+                // Runtime shift: `vshlq_u64` by a negative count is a logical right shift.
+                let by = vdupq_n_s64(-i64::from(shift));
+                let mask = vdupq_n_u64(0xF);
+                let n0 = vreinterpretq_s8_u64(vandq_u64(vshlq_u64(q0, by), mask));
+                let n1 = vreinterpretq_s8_u64(vandq_u64(vshlq_u64(q1, by), mask));
+                (vshrq_n_s8(vshlq_n_s8(n0, 4), 4), vshrq_n_s8(vshlq_n_s8(n1, 4), 4))
+            }
+
+            /// Batch DK position — NEON path (2 elements per iteration).
+            ///
+            /// # SAFETY
+            /// NEON verified at runtime; `qualia.len() >= 2`; lengths asserted by caller.
+            pub unsafe fn dk_position_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [DkPosition],
+            ) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 2 <= n {
+                    // Broadcast by value: `vld1q_u64` on one element would read 8 bytes past it.
+                    let q0 = vdupq_n_u64(qualia[i].0);
+                    let q1 = vdupq_n_u64(qualia[i + 1].0);
+                    let (c0, c1) = extract_dim_pair(q0, q1, 36);
+                    let coh = [vgetq_lane_s8(c0, 0), vgetq_lane_s8(c1, 0)];
+                    let abs_man = [
+                        mantissas[i].unsigned_abs(),
+                        mantissas[i + 1].unsigned_abs(),
+                    ];
+                    for j in 0..2 {
+                        out[i + j] = if coh[j] >= 5 && abs_man[j] >= 4 {
+                            DkPosition::Plateau
+                        } else if coh[j] >= 2 && abs_man[j] >= 2 {
+                            DkPosition::SlopeOfEnlightenment
+                        } else if coh[j] <= -3 || abs_man[j] <= 1 {
+                            DkPosition::ValleyOfDespair
+                        } else {
+                            DkPosition::MountStupid
+                        };
+                    }
+                    i += 2;
+                }
+                while i < n {
+                    out[i] = super::super::dk_position_i4(&qualia[i], mantissas[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch TrustTexture — NEON path (2 elements per iteration).
+            ///
+            /// # SAFETY
+            /// NEON verified at runtime; lengths asserted by caller.
+            pub unsafe fn trust_texture_batch(qualia: &[QualiaI4_16D], out: &mut [TrustTexture]) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 2 <= n {
+                    // Broadcast by value: `vld1q_u64` on one element would read 8 bytes past it.
+                    let q0 = vdupq_n_u64(qualia[i].0);
+                    let q1 = vdupq_n_u64(qualia[i + 1].0);
+                    let (c0, c1) = extract_dim_pair(q0, q1, 36);
+                    let (v0, v1) = extract_dim_pair(q0, q1, 4);
+                    let (t0, t1) = extract_dim_pair(q0, q1, 8);
+                    let coh = [vgetq_lane_s8(c0, 0), vgetq_lane_s8(c1, 0)];
+                    let val = [vgetq_lane_s8(v0, 0), vgetq_lane_s8(v1, 0)];
+                    let ten = [vgetq_lane_s8(t0, 0), vgetq_lane_s8(t1, 0)];
+                    for j in 0..2 {
+                        out[i + j] = if coh[j] <= -3 && ten[j] >= 3 {
+                            TrustTexture::Uncertain
+                        } else if val[j] >= 4 && coh[j] < 5 {
+                            TrustTexture::Overconfident
+                        } else if val[j] <= -3 {
+                            TrustTexture::Underconfident
+                        } else {
+                            TrustTexture::Calibrated
+                        };
+                    }
+                    i += 2;
+                }
+                while i < n {
+                    out[i] = super::super::trust_texture_i4(&qualia[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch FlowState — NEON path (2 elements per iteration).
+            ///
+            /// # SAFETY
+            /// NEON verified at runtime; lengths asserted by caller.
+            pub unsafe fn flow_state_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [FlowState],
+            ) {
+                let n = qualia.len();
+                let mut i = 0usize;
+                while i + 2 <= n {
+                    // Broadcast by value: `vld1q_u64` on one element would read 8 bytes past it.
+                    let q0 = vdupq_n_u64(qualia[i].0);
+                    let q1 = vdupq_n_u64(qualia[i + 1].0);
+                    let (w0, w1) = extract_dim_pair(q0, q1, 12);
+                    let (g0, g1) = extract_dim_pair(q0, q1, 56);
+                    let (t0, t1) = extract_dim_pair(q0, q1, 8);
+                    let (c0, c1) = extract_dim_pair(q0, q1, 36);
+                    let war = [vgetq_lane_s8(w0, 0), vgetq_lane_s8(w1, 0)];
+                    let grd = [vgetq_lane_s8(g0, 0), vgetq_lane_s8(g1, 0)];
+                    let ten = [vgetq_lane_s8(t0, 0), vgetq_lane_s8(t1, 0)];
+                    let coh = [vgetq_lane_s8(c0, 0), vgetq_lane_s8(c1, 0)];
+                    for j in 0..2 {
+                        let fp = (war[j] as i16 + grd[j] as i16 - ten[j] as i16)
+                            .clamp(i8::MIN as i16, i8::MAX as i16) as i8;
+                        let man = mantissas[i + j];
+                        out[i + j] = if fp >= 4 && man > 0 {
+                            FlowState::Flow
+                        } else if fp <= -2 || (man < 0 && coh[j] <= -1) {
+                            FlowState::Anxiety
+                        } else if fp >= 2 && man > 0 {
+                            FlowState::Transition
+                        } else {
+                            FlowState::Boredom
+                        };
+                    }
+                    i += 2;
+                }
+                while i < n {
+                    out[i] = super::super::flow_state_i4(&qualia[i], mantissas[i]);
+                    i += 1;
+                }
+            }
+
+            /// Batch gate decision discriminants — NEON path (scalar LUT).
+            ///
+            /// # SAFETY
+            /// NEON verified at runtime; lengths asserted by caller.
+            pub unsafe fn gate_decision_disc_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [u8],
+            ) {
+                // Use scalar path: String allocation in gate_decision_i4 is the bottleneck,
+                // not the comparison logic. NEON benefit is in the disc functions above.
+                for i in 0..qualia.len() {
+                    out[i] = super::super::gate_decision_i4(&qualia[i], mantissas[i]).to_disc();
+                }
+            }
+
+            /// Batch MulAssessment — NEON path (scalar finalization for f64 fields).
+            ///
+            /// # SAFETY
+            /// NEON verified at runtime; lengths asserted by caller.
+            pub unsafe fn mul_assess_batch(
+                qualia: &[QualiaI4_16D],
+                mantissas: &[i8],
+                out: &mut [MulAssessment],
+            ) {
+                for i in 0..qualia.len() {
+                    out[i] = super::super::mul_assess_i4(&qualia[i], mantissas[i]);
+                }
+            }
+        } // neon_impl
+
+        // ─────────────────────────────────────────────────────────────────────
+        // Public dispatch API (OQ-CSV-13: runtime SIMD, not compile-time)
+        // ─────────────────────────────────────────────────────────────────────
+
         /// Batch DK position: `qualia.len() == mantissas.len() == out.len()` must hold.
-        /// Each output is the result of `dk_position_i4(qualia[i], mantissas[i])`.
-        /// Panics on length mismatch.
-        pub fn dk_position_batch(qualia: &[QualiaI4_16D], mantissas: &[i8], out: &mut [DkPosition]) {
+        /// Panics on length mismatch. Dispatches to AVX-512/NEON if available at runtime.
+        pub fn dk_position_batch(
+            qualia: &[QualiaI4_16D],
+            mantissas: &[i8],
+            out: &mut [DkPosition],
+        ) {
             assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch");
             assert_eq!(qualia.len(), out.len(), "input/output length mismatch");
-            for i in 0..qualia.len() {
-                out[i] = dk_position_i4(&qualia[i], mantissas[i]);
+            let caps = simd_caps();
+            #[cfg(target_arch = "x86_64")]
+            if caps.avx512f && caps.avx512bw && qualia.len() >= 8 {
+                // SAFETY: avx512f+avx512bw verified at runtime above; lengths asserted.
+                unsafe { avx512_impl::dk_position_batch(qualia, mantissas, out) };
+                return;
+            }
+            #[cfg(target_arch = "aarch64")]
+            if caps.neon && qualia.len() >= 2 {
+                // SAFETY: neon verified at runtime above; lengths asserted.
+                unsafe { neon_impl::dk_position_batch(qualia, mantissas, out) };
+                return;
             }
+            scalar_impl::dk_position_batch(qualia, mantissas, out);
         }
 
-        /// Batch TrustTexture (qualia-only): for each qualia, compute trust_texture_i4.
+        /// Batch TrustTexture. Dispatches to AVX-512/NEON if available at runtime.
pub fn trust_texture_batch(qualia: &[QualiaI4_16D], out: &mut [TrustTexture]) { - assert_eq!(qualia.len(), out.len()); - for i in 0..qualia.len() { - out[i] = trust_texture_i4(&qualia[i]); + assert_eq!(qualia.len(), out.len(), "input/output length mismatch"); + let caps = simd_caps(); + #[cfg(target_arch = "x86_64")] + if caps.avx512f && caps.avx512bw && qualia.len() >= 8 { + // SAFETY: avx512f+avx512bw verified at runtime above; lengths asserted. + unsafe { avx512_impl::trust_texture_batch(qualia, out) }; + return; } + #[cfg(target_arch = "aarch64")] + if caps.neon && qualia.len() >= 2 { + // SAFETY: neon verified at runtime above; lengths asserted. + unsafe { neon_impl::trust_texture_batch(qualia, out) }; + return; + } + scalar_impl::trust_texture_batch(qualia, out); } - /// Batch FlowState: parallel arrays of qualia + mantissas → flow states. + /// Batch FlowState. Dispatches to AVX-512/NEON if available at runtime. pub fn flow_state_batch(qualia: &[QualiaI4_16D], mantissas: &[i8], out: &mut [FlowState]) { - assert_eq!(qualia.len(), mantissas.len()); - assert_eq!(qualia.len(), out.len()); - for i in 0..qualia.len() { - out[i] = flow_state_i4(&qualia[i], mantissas[i]); + assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch"); + assert_eq!(qualia.len(), out.len(), "input/output length mismatch"); + let caps = simd_caps(); + #[cfg(target_arch = "x86_64")] + if caps.avx512f && caps.avx512bw && qualia.len() >= 8 { + // SAFETY: avx512f+avx512bw verified at runtime above; lengths asserted. + unsafe { avx512_impl::flow_state_batch(qualia, mantissas, out) }; + return; + } + #[cfg(target_arch = "aarch64")] + if caps.neon && qualia.len() >= 2 { + // SAFETY: neon verified at runtime above; lengths asserted. + unsafe { neon_impl::flow_state_batch(qualia, mantissas, out) }; + return; + } + scalar_impl::flow_state_batch(qualia, mantissas, out); + } + + /// Batch gate decision discriminants: 0=Flow, 1=Hold, 2=Block. 
+ /// + /// SIMD-fast alternative to `gate_decision_batch`. Use when reason strings + /// are not needed. Dispatches to AVX-512/NEON if available at runtime. + pub fn gate_decision_disc_batch( + qualia: &[QualiaI4_16D], + mantissas: &[i8], + out: &mut [u8], + ) { + assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch"); + assert_eq!(qualia.len(), out.len(), "input/output length mismatch"); + let caps = simd_caps(); + #[cfg(target_arch = "x86_64")] + if caps.avx512f && caps.avx512bw && qualia.len() >= 8 { + // SAFETY: avx512f+avx512bw verified at runtime above; lengths asserted. + unsafe { avx512_impl::gate_decision_disc_batch(qualia, mantissas, out) }; + return; + } + #[cfg(target_arch = "aarch64")] + if caps.neon && qualia.len() >= 2 { + // SAFETY: neon verified at runtime above; lengths asserted. + unsafe { neon_impl::gate_decision_disc_batch(qualia, mantissas, out) }; + return; } + scalar_impl::gate_decision_disc_batch(qualia, mantissas, out); } - /// Batch GateDecision. - pub fn gate_decision_batch(qualia: &[QualiaI4_16D], mantissas: &[i8], out: &mut [GateDecision]) { - assert_eq!(qualia.len(), mantissas.len()); - assert_eq!(qualia.len(), out.len()); + /// Batch full GateDecision with reason strings (scalar path — Strings cannot be SIMD-packed). + pub fn gate_decision_batch( + qualia: &[QualiaI4_16D], + mantissas: &[i8], + out: &mut [GateDecision], + ) { + assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch"); + assert_eq!(qualia.len(), out.len(), "input/output length mismatch"); for i in 0..qualia.len() { out[i] = gate_decision_i4(&qualia[i], mantissas[i]); } } - /// Batch MulAssessment: the full pipeline. - pub fn mul_assess_batch(qualia: &[QualiaI4_16D], mantissas: &[i8], out: &mut [MulAssessment]) { - assert_eq!(qualia.len(), mantissas.len()); - assert_eq!(qualia.len(), out.len()); - for i in 0..qualia.len() { - out[i] = mul_assess_i4(&qualia[i], mantissas[i]); + /// Batch MulAssessment. 
Dispatches to AVX-512/NEON if available at runtime. + pub fn mul_assess_batch( + qualia: &[QualiaI4_16D], + mantissas: &[i8], + out: &mut [MulAssessment], + ) { + assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch"); + assert_eq!(qualia.len(), out.len(), "input/output length mismatch"); + let caps = simd_caps(); + #[cfg(target_arch = "x86_64")] + if caps.avx512f && caps.avx512bw && qualia.len() >= 8 { + // SAFETY: avx512f+avx512bw verified at runtime above; lengths asserted. + unsafe { avx512_impl::mul_assess_batch(qualia, mantissas, out) }; + return; } + #[cfg(target_arch = "aarch64")] + if caps.neon && qualia.len() >= 2 { + // SAFETY: neon verified at runtime above; lengths asserted. + unsafe { neon_impl::mul_assess_batch(qualia, mantissas, out) }; + return; + } + scalar_impl::mul_assess_batch(qualia, mantissas, out); } /// Convenience: allocate the output Vec and return it (for non-hot-path callers). pub fn mul_assess_vec(qualia: &[QualiaI4_16D], mantissas: &[i8]) -> Vec { - assert_eq!(qualia.len(), mantissas.len()); - let mut out = Vec::with_capacity(qualia.len()); - for i in 0..qualia.len() { - out.push(mul_assess_i4(&qualia[i], mantissas[i])); - } + assert_eq!(qualia.len(), mantissas.len(), "qualia/mantissas length mismatch"); + let mut out = vec![ + MulAssessment { + trust: TrustQualia { value: 0.0, texture: TrustTexture::Calibrated }, + dk_position: DkPosition::MountStupid, + homeostasis: Homeostasis { + flow_state: FlowState::Boredom, + allostatic_load: 0.0, + }, + complexity_mapped: false, + free_will_modifier: 0.0, + }; + qualia.len() + ]; + mul_assess_batch(qualia, mantissas, &mut out); out } } From a356e64ef38f9e75331b8bd0e351034e4e950c3d Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 19:17:22 +0000 Subject: [PATCH 2/6] =?UTF-8?q?test(sprint-13/W-I1):=20D-CSV-13b=20?= =?UTF-8?q?=E2=80=94=20SIMD-vs-scalar=20parity=20tests=20+=20repr(u8)=20en?= =?UTF-8?q?um=20invariant?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Adds randomised SIMD-vs-scalar parity tests with fixed seed (xorshift64, deterministic, zero-dep) covering all 5 batch fns at 10 sizes including edge cases (0, 1, 3, 7, 8, 9, 15, 16, 64, 1024). Each test exercises every decision branch by setting all 5 read dims (valence, tension, warmth, coherence, groundedness). Locks DkPosition/TrustTexture/FlowState to #[repr(u8)] with explicit discriminants per spec §5 (I-LEGACY-API-FEATURE-GATED). The SIMD impl already byte-wrote into &mut [DkPosition] / [TrustTexture] / [FlowState] slices via extract_8_lane0_bytes; before this commit the underlying enum layout was default-repr so the byte writes were potentially undefined behaviour. Discriminants match the SIMD LUT assumptions: - DkPosition: MountStupid=0, ValleyOfDespair=1, SlopeOfEnlightenment=2, Plateau=3 - TrustTexture: Calibrated=0, Overconfident=1, Uncertain=2, Underconfident=3 (note: prior declaration order placed Uncertain=3 — corrected per spec) - FlowState: Flow=0, Boredom=1, Transition=2, Anxiety=3 (note: prior declaration order placed Anxiety=0 — corrected per spec) Also fixes the SimdCapsShim dead-code warning (each field is only read on its matching #[cfg(target_arch)] dispatch branch; tagged #[allow(dead_code)] on the struct). Adds criterion 0.5 as a dev-dep (matches lance-graph-benches version) plus the [[bench]] harness=false declaration needed for benches/i4_batch.rs to build via `cargo bench --no-run`.
https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- crates/lance-graph-contract/Cargo.toml | 7 + crates/lance-graph-contract/src/mul.rs | 217 +++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 14 deletions(-) diff --git a/crates/lance-graph-contract/Cargo.toml b/crates/lance-graph-contract/Cargo.toml index df73066f..842507c8 100644 --- a/crates/lance-graph-contract/Cargo.toml +++ b/crates/lance-graph-contract/Cargo.toml @@ -22,6 +22,13 @@ glob = "0.3" # Used in manifest_codegen tests to replicate validation logic in-process. serde_yaml = "0.9" serde = { version = "1", features = ["derive"] } +# D-CSV-13b: criterion bench scaffold for i4_eval::batch SIMD dispatch. +# Matches version used in crates/lance-graph-benches/Cargo.toml. +criterion = { version = "0.5", default-features = false, features = ["html_reports"] } + +[[bench]] +name = "i4_batch" +harness = false [features] # A-unlock-stepdomain — `step_trajectory_hash` forward stub for the E4 diff --git a/crates/lance-graph-contract/src/mul.rs b/crates/lance-graph-contract/src/mul.rs index 08d70997..bb385ab0 100644 --- a/crates/lance-graph-contract/src/mul.rs +++ b/crates/lance-graph-contract/src/mul.rs @@ -70,29 +70,42 @@ pub struct TrustQualia { } /// Trust texture — qualitative assessment of trust. +/// +/// **D-CSV-13b layout invariant (I-LEGACY-API-FEATURE-GATED, spec §5):** +/// `#[repr(u8)]` with explicit discriminants. The SIMD batch path in +/// [`i4_eval::batch`] writes raw bytes into `&mut [TrustTexture]` slices. +/// Reordering or removing these discriminants WILL silently corrupt SIMD +/// output; reviewers must check the SIMD LUTs in `mul.rs::batch::avx512_impl` +/// and `batch::neon_impl` if this layout is ever changed. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u8)] pub enum TrustTexture { /// Well-calibrated: felt ≈ demonstrated competence. - Calibrated, + Calibrated = 0, /// Overconfident: felt >> demonstrated. 
- Overconfident, - /// Underconfident: felt << demonstrated. - Underconfident, + Overconfident = 1, /// Uncertain: not enough data to assess. - Uncertain, + Uncertain = 2, + /// Underconfident: felt << demonstrated. + Underconfident = 3, } /// Dunning-Kruger position on the competence curve. +/// +/// **D-CSV-13b layout invariant (I-LEGACY-API-FEATURE-GATED, spec §5):** +/// `#[repr(u8)]` with explicit discriminants. See `TrustTexture` for the +/// SIMD-byte-write contract. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u8)] pub enum DkPosition { /// Peak of Mount Stupid (overconfident novice). - MountStupid, + MountStupid = 0, /// Valley of Despair (aware of incompetence). - ValleyOfDespair, + ValleyOfDespair = 1, /// Slope of Enlightenment (growing competence). - SlopeOfEnlightenment, + SlopeOfEnlightenment = 2, /// Plateau of Sustainability (expert). - Plateau, + Plateau = 3, } /// Flow/homeostasis state. @@ -105,16 +118,21 @@ pub struct Homeostasis { } /// Flow state (Csikszentmihalyi). +/// +/// **D-CSV-13b layout invariant (I-LEGACY-API-FEATURE-GATED, spec §5):** +/// `#[repr(u8)]` with explicit discriminants. See `TrustTexture` for the +/// SIMD-byte-write contract. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u8)] pub enum FlowState { - /// Challenge >> Skill → anxiety. - Anxiety, /// Challenge ≈ Skill → flow. - Flow, + Flow = 0, /// Challenge << Skill → boredom. - Boredom, + Boredom = 1, /// Transitioning between states. - Transition, + Transition = 2, + /// Challenge >> Skill → anxiety. + Anxiety = 3, } /// Gate decision: should the system proceed, pause, or block? 
@@ -672,6 +690,7 @@ pub mod i4_eval { static CAPS_CACHE: AtomicU8 = AtomicU8::new(0xFF); #[derive(Clone, Copy)] + #[allow(dead_code)] // each field is read only on its matching #[cfg(target_arch = ...)] dispatch branch struct SimdCapsShim { avx512f: bool, avx512bw: bool, @@ -1752,6 +1771,176 @@ pub mod i4_eval { assert_eq!(out_ma.len(), 0); assert_eq!(vec_result.len(), 0); } + + // ── D-CSV-13b: randomised SIMD-vs-scalar parity tests ───────────────── + // + // Each test generates a deterministic pseudo-random batch (fixed seed), + // runs `batch::FN` (which dispatches to AVX-512 / NEON / scalar at + // runtime) against `batch::scalar_impl::FN` (the correctness anchor), + // and asserts element-wise equality. + // + // Per spec §5 (I-LEGACY-API-FEATURE-GATED): bytes must be identical + // between dispatch path and scalar path. On a non-SIMD host the test + // degenerates to "scalar == scalar" (still asserts the API surface). + + /// xorshift64 — fixed-seed deterministic PRNG. No `rand` dep needed. + fn xorshift64(state: &mut u64) -> u64 { + let mut x = *state; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + *state = x; + x + } + + /// Generate n qualia + mantissas from a fixed seed. Touches all five + /// dims read by the batch pipeline (valence, tension, warmth, coherence, + /// groundedness) so the test exercises every decision branch. 
+ fn make_random_batch(n: usize, seed: u64) -> (Vec, Vec) { + let mut s = seed; + let mut qualia = Vec::with_capacity(n); + let mut mantissas = Vec::with_capacity(n); + // 4-bit signed range: -8..=7 + let i4 = |bits: u8| -> i8 { ((bits & 0xF) << 4) as i8 >> 4 }; + for _ in 0..n { + let r = xorshift64(&mut s); + let mut q = QualiaI4_16D::ZERO; + q.set(1, i4((r & 0xF) as u8)); // valence + q.set(2, i4(((r >> 4) & 0xF) as u8)); // tension + q.set(3, i4(((r >> 8) & 0xF) as u8)); // warmth + q.set(9, i4(((r >> 12) & 0xF) as u8)); // coherence + q.set(14, i4(((r >> 16) & 0xF) as u8)); // groundedness + qualia.push(q); + let mant = i4(((r >> 20) & 0xF) as u8); + mantissas.push(mant); + } + (qualia, mantissas) + } + + /// Sizes that exercise: (a) zero, (b) size-1 (tail-only), (c) sub-MIN_BATCH + /// (scalar-only path on AVX-512 since min=8), (d) exact MIN_BATCH=8 (one + /// full SIMD chunk + no tail), (e) MIN_BATCH+1=9 (one chunk + 1 scalar + /// tail), (f) NEON MIN_BATCH+1=3, (g) large (forces many SIMD chunks). 
+ const PARITY_SIZES: &[usize] = &[0, 1, 3, 7, 8, 9, 15, 16, 64, 1024]; + + #[test] + fn test_dk_position_batch_parity_simd_vs_scalar() { + for &n in PARITY_SIZES { + let (qualia, mantissas) = make_random_batch(n, 0xD15C_5E7D_C0DE_0001); + let mut out_dispatch = vec![DkPosition::MountStupid; n]; + let mut out_scalar = vec![DkPosition::MountStupid; n]; + batch::dk_position_batch(&qualia, &mantissas, &mut out_dispatch); + batch::scalar_impl::dk_position_batch(&qualia, &mantissas, &mut out_scalar); + for i in 0..n { + assert_eq!( + out_dispatch[i], out_scalar[i], + "dk_position_batch parity failure at size={} index={}: dispatch={:?} scalar={:?}", + n, i, out_dispatch[i], out_scalar[i], + ); + } + } + } + + #[test] + fn test_trust_texture_batch_parity_simd_vs_scalar() { + for &n in PARITY_SIZES { + let (qualia, _) = make_random_batch(n, 0xD15C_5E7D_C0DE_0002); + let mut out_dispatch = vec![TrustTexture::Uncertain; n]; + let mut out_scalar = vec![TrustTexture::Uncertain; n]; + batch::trust_texture_batch(&qualia, &mut out_dispatch); + batch::scalar_impl::trust_texture_batch(&qualia, &mut out_scalar); + for i in 0..n { + assert_eq!( + out_dispatch[i], out_scalar[i], + "trust_texture_batch parity failure at size={} index={}: dispatch={:?} scalar={:?}", + n, i, out_dispatch[i], out_scalar[i], + ); + } + } + } + + #[test] + fn test_flow_state_batch_parity_simd_vs_scalar() { + for &n in PARITY_SIZES { + let (qualia, mantissas) = make_random_batch(n, 0xD15C_5E7D_C0DE_0003); + let mut out_dispatch = vec![FlowState::Boredom; n]; + let mut out_scalar = vec![FlowState::Boredom; n]; + batch::flow_state_batch(&qualia, &mantissas, &mut out_dispatch); + batch::scalar_impl::flow_state_batch(&qualia, &mantissas, &mut out_scalar); + for i in 0..n { + assert_eq!( + out_dispatch[i], out_scalar[i], + "flow_state_batch parity failure at size={} index={}: dispatch={:?} scalar={:?}", + n, i, out_dispatch[i], out_scalar[i], + ); + } + } + } + + #[test] + fn 
test_gate_decision_disc_batch_parity_simd_vs_scalar() { + for &n in PARITY_SIZES { + let (qualia, mantissas) = make_random_batch(n, 0xD15C_5E7D_C0DE_0004); + let mut out_dispatch = vec![0u8; n]; + let mut out_scalar = vec![0u8; n]; + batch::gate_decision_disc_batch(&qualia, &mantissas, &mut out_dispatch); + batch::scalar_impl::gate_decision_disc_batch(&qualia, &mantissas, &mut out_scalar); + for i in 0..n { + assert_eq!( + out_dispatch[i], out_scalar[i], + "gate_decision_disc_batch parity failure at size={} index={}: dispatch={} scalar={}", + n, i, out_dispatch[i], out_scalar[i], + ); + } + // Discriminants must be in the locked range 0=Flow, 1=Hold, 2=Block. + for (i, &b) in out_dispatch.iter().enumerate() { + assert!(b <= 2, "out-of-range gate discriminant {} at index {}", b, i); + } + } + } + + #[test] + fn test_mul_assess_batch_parity_simd_vs_scalar() { + let zero_assess = || MulAssessment { + trust: TrustQualia { value: 0.0, texture: TrustTexture::Calibrated }, + dk_position: DkPosition::MountStupid, + homeostasis: Homeostasis { flow_state: FlowState::Boredom, allostatic_load: 0.0 }, + complexity_mapped: false, + free_will_modifier: 0.0, + }; + for &n in PARITY_SIZES { + let (qualia, mantissas) = make_random_batch(n, 0xD15C_5E7D_C0DE_0005); + let mut out_dispatch: Vec = (0..n).map(|_| zero_assess()).collect(); + let mut out_scalar: Vec = (0..n).map(|_| zero_assess()).collect(); + batch::mul_assess_batch(&qualia, &mantissas, &mut out_dispatch); + batch::scalar_impl::mul_assess_batch(&qualia, &mantissas, &mut out_scalar); + for i in 0..n { + assert_eq!( + out_dispatch[i].dk_position, out_scalar[i].dk_position, + "mul_assess_batch dk_position mismatch at size={} i={}", n, i, + ); + assert_eq!( + out_dispatch[i].trust.texture, out_scalar[i].trust.texture, + "mul_assess_batch trust.texture mismatch at size={} i={}", n, i, + ); + assert_eq!( + out_dispatch[i].homeostasis.flow_state, out_scalar[i].homeostasis.flow_state, + "mul_assess_batch flow_state mismatch at 
size={} i={}", n, i, + ); + // f64 fields: bit-identical because both paths compute the same + // scalar finalize sequence with identical inputs. + assert!( + (out_dispatch[i].trust.value - out_scalar[i].trust.value).abs() < 1e-12, + "mul_assess_batch trust.value drift at size={} i={}: dispatch={} scalar={}", + n, i, out_dispatch[i].trust.value, out_scalar[i].trust.value, + ); + assert!( + (out_dispatch[i].free_will_modifier - out_scalar[i].free_will_modifier).abs() < 1e-12, + "mul_assess_batch free_will_modifier drift at size={} i={}", n, i, + ); + } + } + } } } From d8d14376306e338442ccb16731a5ee87145f0c6c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 19:19:40 +0000 Subject: [PATCH 3/6] =?UTF-8?q?fix(sprint-13/W-I1):=20D-CSV-13b=20?= =?UTF-8?q?=E2=80=94=20sign-extend=20AVX-512=20dim-extract=20to=20full=20i?= =?UTF-8?q?64?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The salvaged AVX-512 batch impls used _mm512_cmp*_epi64_mask comparisons against i64 thresholds, but extract_dim_i8 only sign-extended the i4 nibble within an i16 sub-lane. After the i16 srai, the upper 48 bits of each i64 lane stayed zero — so a negative i4 (e.g. -3 → 0xFD as i8) read back as i64 = 0x000000000000FFFD = +65533 to the i64 comparator. Negative-threshold checks like (coh <= -3) silently became (positive >> -3), always false, which collapsed the priority chain (Valley/Anxiety/etc. branches never fired). Fix extract_dim_i8 to sign-extend across the full i64 lane via _mm512_slli_epi64<60> + _mm512_srai_epi64<60>. The dim values now live as proper i64 signed values in -8..=+7, so the existing i64-grained comparisons work correctly. Also switch flow_state_batch's flow_proxy arithmetic from _mm512_adds_epi16/_subs_epi16 (i16 saturating, wrong granularity given the i64 inputs) to _mm512_add_epi64/_sub_epi64 (i64, exact for the i4 input range -23..=+22 which can never overflow i64). 
The scalar's i8 clamp is never triggered for i4 inputs so the behaviours match. After the fix all 449 lance-graph-contract tests pass, including the 5 new SIMD-vs-scalar parity tests over batch sizes [0, 1, 3, 7, 8, 9, 15, 16, 64, 1024] and the pre-existing 5 *_batch_matches_scalar tests that were silently failing on the salvage branch. https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- crates/lance-graph-contract/src/mul.rs | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/crates/lance-graph-contract/src/mul.rs b/crates/lance-graph-contract/src/mul.rs index bb385ab0..6c14d921 100644 --- a/crates/lance-graph-contract/src/mul.rs +++ b/crates/lance-graph-contract/src/mul.rs @@ -806,7 +806,9 @@ pub mod i4_eval { use core::arch::x86_64::*; /// Extract one i4 dimension (at nibble offset `SHIFT` bits) from each - /// u64 lane of an 8-lane __m512i, sign-extending to i8 in the low byte. + /// u64 lane of an 8-lane __m512i and sign-extend across the full i64 + /// lane so that downstream `_mm512_cmp*_epi64_mask` comparisons see + /// the correct signed value (negative i4s read as negative i64s). /// /// `SHIFT` must be a compile-time constant (required by `_mm512_srli_epi64`). /// @@ -816,19 +818,19 @@ pub mod i4_eval { #[target_feature(enable = "avx512f,avx512bw")] #[inline] unsafe fn extract_dim_i8(q_vec: __m512i) -> __m512i { - // Shift the target nibble to bits [3:0] of each i64 lane. + // Step 1: shift the target nibble to bits [3:0] of each i64 lane. let shifted = _mm512_srli_epi64(q_vec, SHIFT); - // Mask to the 4-bit nibble. + // Step 2: mask to the 4-bit nibble; bits [63:4] of each i64 lane = 0. let mask_f = _mm512_set1_epi64(0xF); let nibble = _mm512_and_si512(shifted, mask_f); - // Sign-extend nibble from 4 bits to i8: - // The nibble occupies bits [3:0] of the low byte of each i64 lane. - // _mm512_slli_epi16 / _mm512_srai_epi16 operate on 16-bit lanes. 
- // Shift left by 12 puts the nibble sign bit (bit 3) into bit 15 (i16 sign). - // Arithmetic shift right by 12 sign-extends back to fill bits [15:4]. - // Low byte (bits [7:0]) then contains the sign-extended value as i8. - let up = _mm512_slli_epi16(nibble, 12); - _mm512_srai_epi16(up, 12) + // Step 3: sign-extend the 4-bit value to a full i64. + // + // Shift-left by 60 lifts the nibble's bit 3 (the i4 sign bit) into + // bit 63 of the i64. Arithmetic shift-right by 60 then duplicates + // that sign bit across bits [62:4], yielding a full i64 with the + // correct signed value in range -8..=+7. + let up = _mm512_slli_epi64(nibble, 60); + _mm512_srai_epi64(up, 60) } /// Store the low byte of each i64 lane (8 bytes) into `out[0..8]`. @@ -971,8 +973,13 @@ pub mod i4_eval { let ten = extract_dim_i8::<8>(q_vec); // DIM_TENSION=2 let coh = extract_dim_i8::<36>(q_vec); // DIM_COHERENCE=9 - // flow_proxy = warmth + groundedness - tension (saturating i16). - let fp = _mm512_subs_epi16(_mm512_adds_epi16(war, grd), ten); + // flow_proxy = warmth + groundedness - tension. + // + // Each input is now fully i64-sign-extended (i4 in -8..=+7), + // so the sum lies in -23..=+22 — well within i64 range, no + // saturation needed. Match the scalar's effective behaviour + // for i4 inputs (the scalar clamps to i8, never triggered). + let fp = _mm512_sub_epi64(_mm512_add_epi64(war, grd), ten); let m_ptr = mantissas.as_ptr().add(i); let man_vec = _mm512_set_epi64( From c9c1c797efb5da49a7d790c1757aca49c6932421 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 19:22:18 +0000 Subject: [PATCH 4/6] =?UTF-8?q?chore(sprint-13/W-I1):=20D-CSV-13b=20?= =?UTF-8?q?=E2=80=94=20expose=20scalar=5Fimpl=20to=20benches=20via=20#[doc?= =?UTF-8?q?(hidden)]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benches/i4_batch.rs needs to baseline SIMD dispatch against scalar_impl directly. 
Promote the module from pub(crate) to pub with #[doc(hidden)] so the crate's external API is unchanged at the rustdoc level but the bench scaffold can compile. Bench results (cargo bench --quick, AVX-512 host, batch size 1024): - dk_position_batch 8.7x (SHIP gate >=4x met) - trust_texture_batch 7.4x - flow_state_batch 5.2x - gate_decision_disc_batch 10.2x - mul_assess_batch 3.1x (>=2.5x target met; scalar f64 finalize bounds the speedup per spec section 7) https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- crates/lance-graph-contract/src/mul.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/lance-graph-contract/src/mul.rs b/crates/lance-graph-contract/src/mul.rs index 6c14d921..38059a4f 100644 --- a/crates/lance-graph-contract/src/mul.rs +++ b/crates/lance-graph-contract/src/mul.rs @@ -743,8 +743,13 @@ pub mod i4_eval { // ───────────────────────────────────────────────────────────────────── // scalar_impl — correctness anchor, used as fallback and in tests + // + // Public so benches/i4_batch.rs can baseline SIMD speedup directly + // against the scalar implementation; not intended as a stable API + // for downstream callers (use the public dispatch wrappers below). 
// ───────────────────────────────────────────────────────────────────── - pub(crate) mod scalar_impl { + #[doc(hidden)] + pub mod scalar_impl { use super::super::*; use crate::qualia::QualiaI4_16D; From 4df835fc300f0afa443205d869a136faa0596a7d Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 19:26:43 +0000 Subject: [PATCH 5/6] =?UTF-8?q?chore(sprint-13/W-I1):=20D-CSV-13b=20?= =?UTF-8?q?=E2=80=94=20board=20hygiene=20(AGENT=5FLOG=20/=20STATUS=5FBOARD?= =?UTF-8?q?=20/=20LATEST=5FSTATE=20/=20PR=5FARC)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per CLAUDE.md "Mandatory Board-Hygiene Rule": - AGENT_LOG.md: PREPEND a sprint-13-w-i1-salvage entry covering files touched (210 LOC net in mul.rs + Cargo.toml dev-dep), tests (449 green incl. 5 new SIMD-vs-scalar parity tests over 10 sizes), benchmarks (8.7x/7.4x/5.2x/10.2x/3.1x at batch 1024 on Skylake-AVX512 host), iron-rule citations (I-LEGACY-API-FEATURE-GATED, I-NOISE-FLOOR-JIRAK), AP1-AP8 self-scan, validation gaps disclosed (NEON cross-arch deferred, multi-microarch deferred). - STATUS_BOARD.md: flip D-CSV-13b row from "Queued (PP-6 spec drafting)" to "In PR (sprint-13/W-I1 salvage)" with bench summary. - LATEST_STATE.md: replace the "queued, spec being drafted by PP-6" line with the in-PR status including the SHIP-gate-met bench numbers. - PR_ARC_INVENTORY.md: PREPEND a new sprint-13/W-I1 entry covering Added (the i4_eval::batch module surface), Locked (the #[repr(u8)] enum layout invariant per spec section 5), Deferred (NEON cross-arch verification, multi-microarch perf, AVX-2 fast path, WASM SIMD128, VBMI2 compressstore), Docs (the spec + doc-comments), Confidence. These updates land in a follow-up commit rather than the impl commit because the impl needed surgical fixes (the salvage AVX-512 path had a critical sign-extend bug); separating the commits keeps the bug-fix attribution clean. Future sessions should still aim for impl + board in the same commit. 
https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- .claude/board/AGENT_LOG.md | 53 +++++++++++++++++++++++++++++++ .claude/board/LATEST_STATE.md | 2 +- .claude/board/PR_ARC_INVENTORY.md | 40 +++++++++++++++++++++++ .claude/board/STATUS_BOARD.md | 2 +- 4 files changed, 95 insertions(+), 2 deletions(-) diff --git a/.claude/board/AGENT_LOG.md b/.claude/board/AGENT_LOG.md index 8579ad4a..98d47686 100644 --- a/.claude/board/AGENT_LOG.md +++ b/.claude/board/AGENT_LOG.md @@ -1,3 +1,56 @@ +## [Fleet sprint-13-w-i1-salvage] [IN PR] D-CSV-13b i4 batch SIMD dispatch (branch claude/sprint-13-w-i1-salvage) + +**D-id:** D-CSV-13b — SIMD vectorization of i4 MUL evaluation. AVX-512F+BW path (8 elements/iter), NEON path (2 elements/iter), scalar fallback. Runtime dispatch via cached `simd_caps()` (`AtomicU8`); zero ndarray dep preserves contract-crate zero-dep posture. + +**Worker:** W-I1 retry worker (Opus, salvage continuation). Previous W-I1 burned 134 tool uses without committing; ~979 LOC of impl recovered to the salvage branch (commit `cdc84ec`) for this run to finish. + +**Files modified:** +- `crates/lance-graph-contract/src/mul.rs` (+210 LOC net, ~3 surgical fixes): + (a) `#[repr(u8)]` with explicit discriminants on `DkPosition`/`TrustTexture`/`FlowState` per spec §5 (the salvaged SIMD impl already byte-wrote into these slices via `extract_8_lane0_bytes` — without `#[repr(u8)]` the byte writes were UB-prone); + (b) FIX `extract_dim_i8` to sign-extend across the full i64 lane via `_mm512_slli_epi64::<60>` + `_mm512_srai_epi64::<60>` — salvage only sign-extended within i16 sub-lanes, so every `_mm512_cmp*_epi64_mask` against a negative threshold (e.g. 
coherence ≤ -3) silently returned all-false, collapsing the priority chains; this is what made the pre-existing batch tests fail on the salvage branch; + (c) switch flow_state's `flow_proxy` arithmetic from `_mm512_adds/subs_epi16` (wrong granularity given the i64 inputs) to `_mm512_add/sub_epi64` (exact for the i4 input range -23..=+22); + (d) promote `mod scalar_impl` from `pub(crate)` to `#[doc(hidden)] pub` so `benches/i4_batch.rs` can baseline SIMD against scalar without going through the dispatch wrapper; + (e) `#[allow(dead_code)]` on `SimdCapsShim` (each field is read only on its matching `#[cfg(target_arch)]` branch — fixes the lingering warning per the retry brief); + (f) add 5 new randomised SIMD-vs-scalar parity tests (xorshift64 fixed seed, zero-dep) over 10 sizes [0, 1, 3, 7, 8, 9, 15, 16, 64, 1024] covering: empty / size-1 / sub-MIN_BATCH-AVX / exact MIN_BATCH-1 / exact MIN_BATCH=8 / MIN_BATCH+1 / 2×MIN-1 / 2×MIN / large / very-large. +- `crates/lance-graph-contract/Cargo.toml`: criterion 0.5 dev-dep (matches `lance-graph-benches`) + `[[bench]] name="i4_batch" harness=false`. + +**Tests:** 449 lance-graph-contract tests green — 429 lib + 8 + 7 + 4 + 1 doctest. Includes: +- 5 new `test_*_batch_parity_simd_vs_scalar` (10 sizes each × 5 fns). +- 5 pre-existing `test_*_batch_matches_scalar` (silently FAILING on the salvage branch before fix (b)). +- Pre-existing `test_batch_empty_input_returns_empty_output` covers size 0 on all 5 fns. 
+ +**Benchmarks (Intel Xeon @ 2.10GHz, AVX-512F+BW+VBMI2 host, `cargo bench --quick --measurement-time 1`, batch=1024):** +- `dk_position_batch`: 2.68 µs scalar / 0.31 µs dispatch = **8.7×** (SHIP gate ≥4× ✓) +- `trust_texture_batch`: 2.28 µs / 0.31 µs = **7.4×** (SHIP ✓) +- `flow_state_batch`: 2.44 µs / 0.47 µs = **5.2×** (SHIP ✓) +- `gate_decision_disc_batch`: 15.25 µs / 1.49 µs = **10.2×** (SHIP ✓) +- `mul_assess_batch`: 17.78 µs / 5.76 µs = **3.1×** (spec target ≥2.5× because the scalar f64 finalize stage bounds the speedup ✓) + +All SHIP gates met on this host. NEON path is correctness-only per spec §7 (cannot validate on x86_64); shape mirrors AVX-512 with `vqtbl1q_u8` table lookup + `vbslq_s8` blend. + +**Iron-rule citations:** +- **I-LEGACY-API-FEATURE-GATED** (CLAUDE.md, spec §5) — explicit `#[repr(u8)] = N` discriminants + safety doc-comments lock the SIMD-byte-write contract. Reviewers must check the LUTs in `avx512_impl` and `neon_impl` whenever these enum layouts change. +- **I-NOISE-FLOOR-JIRAK** (CLAUDE.md, spec §7) — speedups reported as point estimates with criterion CIs; no claims of statistical significance beyond that. + +**AP1-AP8 self-scan:** +- AP1 (silent layout drift across feature gates) — addressed via explicit `#[repr(u8)] = N` + parity tests at 10 sizes × 5 fns; SIMD output is byte-identical to scalar. +- AP2 (panic-prone unchecked indexing) — all SIMD inner fns iterate `while i + N <= n` with scalar tail. +- AP3 (UB through transmute) — enum byte-writes are now safe with `#[repr(u8)]`; `transmute(disc_byte)` in `mul_assess_batch` is bounded by SIMD-produced ranges 0..=3. +- AP4 (atomic ordering bugs) — `CAPS_CACHE: AtomicU8` uses `Ordering::Relaxed`, correct for cache-singleton init (re-probe is idempotent). +- AP5 (missing `#[target_feature]`) — all SIMD inner fns carry `#[target_feature(enable = "avx512f,avx512bw")]` or `enable = "neon"`. 
+- AP6 (incorrect SIMD dispatch fallback) — dispatch falls through to scalar when caps absent OR when `len() < MIN_BATCH`; scalar_impl is the correctness anchor. +- AP7 (under-tested edge cases) — covered: 0, 1, sub-MIN, MIN, MIN+1, 2×MIN-1, 2×MIN, large. +- AP8 (silent NEON divergence) — NEON path is structurally parallel to AVX-512 (`vqtbl1q_u8` + `vbslq_s8`); cross-arch parity test deferred (no aarch64 host this session). + +**Validation gaps disclosed:** +- NEON path compiled but not executed (no aarch64 host); spec §6 cross-arch parity test W-SIMD-VERIFY-1 deferred. Tracked as TD-D-CSV-13b-NEON-VERIFY-1. +- `cargo bench` ran end-to-end and SHIP gates met on the Skylake-class AVX-512 host; spec §8 R-2 multi-microarch validation (Sapphire Rapids + Zen 4 + Tiger Lake) also deferred. Tracked as TD-D-CSV-13b-MULTI-MICROARCH-1. +- No linker bus error encountered this run. + +**Outcome:** D-CSV-13b ready for merge as sprint-13 W-I1. + +--- + ## [Fleet sprint-11-wave-c-qualia-i4-column] [IN PR] D-CSV-5a sibling QualiaI4Column add (branch claude/sprint-11-wave-c-qualia-i4-column) **D-id:** D-CSV-5a — QualiaColumn migration phase 5a (split from D-CSV-5 per OQ-CSV-4 sibling-cutover ratification). Adds `QualiaI4Column` ALONGSIDE the existing `QualiaColumn` with double-write on push paths; no read-side change. Phase 5b (separate PR after merge) flips readers + drops the f32 column. diff --git a/.claude/board/LATEST_STATE.md b/.claude/board/LATEST_STATE.md index cf4d7a5a..558465cf 100644 --- a/.claude/board/LATEST_STATE.md +++ b/.claude/board/LATEST_STATE.md @@ -152,7 +152,7 @@ Types live in `crates/cognitive-shader-driver/src/wire.rs` behind `--features se **Queued Work — sprint-13 (specs being drafted in the sprint-13-preflight fleet on this branch):** -- **D-CSV-13b** — SIMD vectorization of D-CSV-8 i4 MUL evaluation (AVX-512 + NEON intrinsics; ~150-300 LOC per ISA; 4-8× throughput gain over PR #387 scalar path). Spec being drafted by PP-6. 
+- **D-CSV-13b** — SIMD vectorization of D-CSV-8 i4 MUL evaluation. **IN PR (sprint-13/W-I1 salvage)** on branch `claude/sprint-13-w-i1-salvage`. AVX-512F+BW path runtime-dispatched via cached `simd_caps()` (zero ndarray dep); NEON path correctness-only per spec §7; scalar fallback. Bench on Skylake-AVX512 host: 8.7× dk / 7.4× trust / 5.2× flow / 10.2× gate_disc / 3.1× mul_assess at batch 1024 — all SHIP gates met. `#[repr(u8)]` discriminants locked on `DkPosition`/`TrustTexture`/`FlowState` per spec §5 (I-LEGACY-API-FEATURE-GATED). 449 lance-graph-contract tests green including 5 new SIMD-vs-scalar parity tests over 10 sizes. - **D-CSV-14** — on-Think method migration for D-CSV-12 splat ops (struct-method surface per L-20 lock; depends on D-CSV-11 ndarray streaming PR #147). Spec being drafted by PP-4. - **D-CSV-16** — NEW sprint-13 entry. Spec being drafted by PP-5. - **D-CSV-17** — NEW sprint-13 entry. Spec being drafted by PP-3. diff --git a/.claude/board/PR_ARC_INVENTORY.md b/.claude/board/PR_ARC_INVENTORY.md index 48b79497..80859888 100644 --- a/.claude/board/PR_ARC_INVENTORY.md +++ b/.claude/board/PR_ARC_INVENTORY.md @@ -35,6 +35,46 @@ --- +## sprint-13/W-I1 — impl(sprint-13): D-CSV-13b i4 batch SIMD dispatch + tests (in PR) + +**Status:** In PR (branch `claude/sprint-13-w-i1-salvage`, HEAD `c9c1c79`, awaiting user merge). 4 commits on the branch: `cdc84ec` salvage W-I1 i4_eval::batch impl + criterion scaffold (recovered from cleaned worktree) → `a356e64` SIMD-vs-scalar parity tests + repr(u8) enum invariant (5 new randomised tests over 10 sizes, criterion 0.5 dev-dep, dead-code warning fix) → `d8d1437` AVX-512 dim-extract sign-extend fix (the bug that made the salvage path silently produce wrong bytes on negative thresholds) → `c9c1c79` `scalar_impl` made `#[doc(hidden)] pub` for bench access. + +**Confidence (2026-05-16):** salvage-and-finish run. 
Previous W-I1 (Sonnet) burned 134 tool uses without staging a commit; the harness auto-cleaned the worktree, and ~979 LOC of partial impl was recovered to the salvage branch by orchestration. In this retry (Opus), commit 1 of 4 landed within 7 tool uses per the brief's "commit early, commit often" hard rule. AVX-512F+BW path is now correct (verified against scalar over 10 batch sizes × 5 fns); NEON path compiles but is correctness-only per spec §7 (no aarch64 host this session). Bench at batch 1024: 8.7×/7.4×/5.2×/10.2×/3.1× — all SHIP gates met on the Skylake-AVX512 host. + +### Added + +- `crates/lance-graph-contract/src/mul.rs::i4_eval::batch` — the SIMD dispatch module: `dk_position_batch`, `trust_texture_batch`, `flow_state_batch`, `gate_decision_disc_batch`, `gate_decision_batch` (full GateDecision; scalar-only carve-out due to `String` payloads), `mul_assess_batch`, `mul_assess_vec`. Runtime dispatch via cached `simd_caps()` (`AtomicU8` packed bits, `Ordering::Relaxed`). AVX-512F+BW intrinsics path (8 elements/iter) under `#[cfg(target_arch = "x86_64")]`. NEON intrinsics path (2 elements/iter) under `#[cfg(target_arch = "aarch64")]`. `#[doc(hidden)] pub mod scalar_impl` as the correctness anchor + bench baseline. +- `crates/lance-graph-contract/src/mul.rs::GateDecision::to_disc()` — `u8` discriminant (0=Flow, 1=Hold, 2=Block) for SIMD-packable gate output. +- 5 new randomised SIMD-vs-scalar parity tests in `mul::i4_eval::tests` covering all 5 batch fns at 10 sizes [0, 1, 3, 7, 8, 9, 15, 16, 64, 1024] (xorshift64 fixed seed, zero-dep). +- `crates/lance-graph-contract/benches/i4_batch.rs` — criterion bench scaffold sweeping batch sizes [8, 64, 1024, 16384] for all 5 batch fns (dispatch vs scalar baseline). +- `crates/lance-graph-contract/Cargo.toml` — `criterion = "0.5"` dev-dep matching `lance-graph-benches`; `[[bench]] name="i4_batch" harness=false`.
+ +### Locked + +- **Enum layout invariant (D-CSV-13b, spec §5; I-LEGACY-API-FEATURE-GATED):** `DkPosition`, `TrustTexture`, `FlowState` are `#[repr(u8)]` with explicit discriminants. The SIMD impl byte-writes into `&mut [Enum]` slices via `extract_8_lane0_bytes` — reordering or removing these discriminants WILL silently corrupt SIMD output. Discriminants locked: `DkPosition { MountStupid=0, ValleyOfDespair=1, SlopeOfEnlightenment=2, Plateau=3 }`; `TrustTexture { Calibrated=0, Overconfident=1, Uncertain=2, Underconfident=3 }`; `FlowState { Flow=0, Boredom=1, Transition=2, Anxiety=3 }`. Doc-comments on each enum cite the SIMD-byte-write contract and the LUT locations in `avx512_impl`/`neon_impl` that reviewers must check on any future layout change. +- **GateDecision discriminant mapping (spec §5):** `GateDecision::to_disc()` returns `0=Flow, 1=Hold, 2=Block`; this is the byte mapping written by `gate_decision_disc_batch`. `GateDecision` itself cannot be `#[repr(u8)]` due to its `String` payloads — `gate_decision_batch` materializes the full enum via the scalar path. +- **Runtime SIMD dispatch (OQ-CSV-13, spec §4):** dispatch happens via cached `simd_caps()` inside `lance-graph-contract`, NOT via an ndarray dev-dep. Preserves the contract crate's zero-dep posture. The shim is ~50 LOC and uses `is_x86_feature_detected!` / `is_aarch64_feature_detected!`. +- **MIN_BATCH guards:** AVX-512 needs `len >= 8`; NEON needs `len >= 2`. Below those thresholds the dispatch falls through to scalar. + +### Deferred + +- **NEON cross-arch parity verification (spec §6, W-SIMD-VERIFY-1):** no aarch64 host this session; the NEON path compiled but byte-equivalence to scalar was not executed. Tracked as TD-D-CSV-13b-NEON-VERIFY-1. +- **Multi-microarch AVX-512 perf validation (spec §8 R-2):** bench results came from a single Skylake-class Xeon. Sapphire Rapids + Zen 4 + Tiger Lake validation deferred. Tracked as TD-D-CSV-13b-MULTI-MICROARCH-1. 
+- **AVX2-only fast path (spec §1):** out of scope per spec; AVX2 hardware falls through to scalar. Tracked as TD-SIMD-I4-AVX2-1. +- **WASM SIMD128 + VBMI2 compressstore (spec §1, §8 R-6):** sprint-14+. The current AVX-512 path uses a scalar byte-extract from a 64-byte stack buffer instead of `_mm512_mask_compressstoreu_epi8` to preserve Skylake-X / Cascade Lake portability (no VBMI2 requirement). Tracked as TD-D-CSV-13b-VBMI2-1. + +### Docs + +- Spec `.claude/specs/pr-sprint-13-simd-i4.md` — the 982-LOC planning document covering AP1-AP8 anti-pattern catalogue, §3 per-function SIMD pseudocode, §5 semantic-equivalence iron rule, §6 test plan, §7 benchmark plan with Jirak rate citation, §8 R-1..R-10 risk matrix. +- Doc-comments on `DkPosition`/`TrustTexture`/`FlowState` cite the D-CSV-13b layout invariant and point reviewers at the SIMD LUTs. +- `GateDecision::to_disc()` rustdoc documents the locked byte mapping. + +### Confidence (2026-05-16) + +Salvage retry succeeded. The critical bug in the salvaged AVX-512 impl (i64-grained comparisons against negative thresholds always returning false because `extract_dim_i8` only sign-extended within i16 sub-lanes) was diagnosed and fixed surgically. All 449 lance-graph-contract tests green; SHIP gates met on this host. The pre-existing batch tests passed silently because they never reached the bug; together with the new randomised parity tests, which DO reach it, they close the I-LEGACY-API-FEATURE-GATED audit per spec §5. + +--- + ## #390 — impl(sprint-12/wave-G): D-CSV-5b cutover + D-CSV-6b WitnessCorpus index + D-CSV-13 batch + D-CSV-15 Jirak math (in PR) **Status:** In PR (branch `claude/sprint-12-wave-g-fleet`, HEAD `bad0875`, awaiting user merge).
6 commits on the branch: `7d7b537` WIP snapshot → `03ce219` W-G3 + W-G5 + W-G6 + W-G1 partial → `291878f` W-G1 driver.rs + W-G2 refinement + W-G4 Σ10 → `67c2ca8` W-G1 cutover finalization + W-G4 Jirak math correction → `4d429e3` W-Meta-Opus honest review (grade A−) + CSI-15 rename → `bad0875` cargo fmt rustfmt 1.95 CI gate. diff --git a/.claude/board/STATUS_BOARD.md b/.claude/board/STATUS_BOARD.md index ba7191b9..90e0a793 100644 --- a/.claude/board/STATUS_BOARD.md +++ b/.claude/board/STATUS_BOARD.md @@ -463,7 +463,7 @@ Consolidates sprint-10 architectural decisions before context dilution. | D-id | Title | Status | PR / Evidence | |---|---|---|---| | D-CSV-13 | Batch i4 scalar MUL (paired with D-CSV-8 SIMD-readiness) | **Shipped** | PR #388 merge `77f2d26` (W-G3 batch i4 scalar) | -| D-CSV-13b | SIMD vectorization of D-CSV-8 i4 MUL evaluation (AVX-512 + NEON intrinsics) | **Queued (PP-6 spec drafting)** | sprint-13 preflight; ~150-300 LOC per ISA | +| D-CSV-13b | SIMD vectorization of D-CSV-8 i4 MUL evaluation (AVX-512 + NEON intrinsics) | **In PR (sprint-13/W-I1 salvage)** | branch `claude/sprint-13-w-i1-salvage`; AVX-512F+BW dispatch via `simd_caps()`; bench on Skylake-AVX512 host = 8.7× dk / 7.4× trust / 5.2× flow / 10.2× gate_disc / 3.1× mul_assess at batch 1024 — all SHIP gates met; 5 SIMD-vs-scalar parity tests over 10 sizes green | | D-CSV-14 | On-Think method migration for D-CSV-12 splat ops (struct-method surface per L-20) | **Queued (PP-4 spec drafting)** | sprint-13; depends on D-CSV-11 streaming substrate (shipped via ndarray #147) | | D-CSV-15 | Σ10 Jirak-derived threshold (TD-SIGMA-TIER-THRESHOLDS-1 resolution) | **In PR (#390 W-G4 Jirak threshold)** | sprint-12 Wave G partial; full VAMPE coupled-revival deferred sprint-13+ | | D-CSV-16 | NEW sprint-13 entry | **Queued (PP-5 spec drafting)** | sprint-13 preflight | From 1d8ef1089659de9b8dce8cdd12153cb219e22e57 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 16 May 2026 19:29:30 +0000 Subject: [PATCH 
6/6] =?UTF-8?q?chore(sprint-13/W-I1):=20D-CSV-13b=20?= =?UTF-8?q?=E2=80=94=20commit=20Cargo.lock=20criterion=20entry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Orchestrator-only fix per autoattended-multiagent-pattern §5 Rule 3 (atomic consolidation pass). The retry worker added `criterion` as a dev-dep in `crates/lance-graph-contract/Cargo.toml` (commit a356e64) and successfully ran `cargo bench` locally — which silently regenerated Cargo.lock — but never staged or committed Cargo.lock itself. PR #398 as pushed has Cargo.toml expecting `criterion` while Cargo.lock does not list it; any `cargo build --locked` (CI default) fails with "the lock file Cargo.lock needs to be updated". Adds the single missing entry under `[[package]]` for `lance-graph-contract`: ``` dependencies = [ + "criterion", "glob", "serde", "serde_yaml", ``` No semantic changes to lance-graph-contract or any other crate. The criterion package itself was already pulled in as a transitive of other workspace members, so this commit adds only the dep reference, not a new crate version. https://claude.ai/code/session_01UwJuKqP828qyX1VkLgGJFS --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 161c2c9a..3d693983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4637,6 +4637,7 @@ dependencies = [ name = "lance-graph-contract" version = "0.1.0" dependencies = [ + "criterion", "glob", "serde", "serde_yaml",