AdaWorldAPI · AdaWorldAPI · Mar 14, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
diff --git a/.claude/BELICHTUNGSMESSER.md b/.claude/BELICHTUNGSMESSER.md
@@ -635,208 +635,3 @@ In ladybug-rs:
   Uses rustynum-core::Belichtungsmesser directly.
   Integrated into Plane.distance() for alpha-aware cascade.
 ```
-
----
-
-### CRITICAL: SIMD Dispatch — Works Everywhere Without Configuration
-
-The cascade description above mentions "AVX-512 ops" for cycle counts. Those are BEST CASE numbers. The actual implementation MUST work on:
-
-- **GitHub Actions CI:** x86_64 with AVX2 but NO AVX-512
-- **Apple Silicon:** ARM64 with NEON, no x86 at all
-- **Old servers:** x86_64 with only SSE4.2
-- **Developer laptops:** mix of everything
-
-The dispatch is RUNTIME, not compile-time. The binary ships ALL paths. The CPU decides at first call:
-
-```rust
-use std::sync::OnceLock;
-
-/// Resolved once on first call. Zero overhead after.
-static HAMMING_FN: OnceLock<fn(&[u8], &[u8]) -> u64> = OnceLock::new();
-
-fn select_hamming_fn() -> fn(&[u8], &[u8]) -> u64 {
-    #[cfg(target_arch = "x86_64")]
-    {
-        if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
-            return hamming_avx512_safe;  // 32 iterations for 16K, ~32 cycles
-        }
-        if is_x86_feature_detected!("avx2") {
-            return hamming_avx2_safe;    // 64 iterations for 16K, ~64 cycles
-        }
-        // SSE4.2 has POPCNT instruction — scalar loop uses it via .count_ones()
-    }
-    #[cfg(target_arch = "aarch64")]
-    {
-        return hamming_neon_safe;        // NEON CNT instruction
-    }
-    hamming_scalar                       // works on literally anything
-}
-
-/// Get the resolved function pointer. First call detects CPU. All others: direct call.
-#[inline]
-fn hamming(a: &[u8], b: &[u8]) -> u64 {
-    let f = HAMMING_FN.get_or_init(select_hamming_fn);
-    f(a, b)
-}
-```
-
-The four implementations that MUST be present:
-
-```rust
-/// AVX-512 VPOPCNTDQ path. Ice Lake+ / Zen 4+.
-/// 64 bytes per iteration. 16K = 32 iterations.
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f,avx512vpopcntdq")]
-unsafe fn hamming_avx512_safe(a: &[u8], b: &[u8]) -> u64 {
-    use core::arch::x86_64::*;
-    let len = a.len();
-    let chunks = len / 64;
-    let mut total = _mm512_setzero_si512();
-    for i in 0..chunks {
-        let base = i * 64;
-        let av = _mm512_loadu_si512(a[base..].as_ptr() as *const __m512i);
-        let bv = _mm512_loadu_si512(b[base..].as_ptr() as *const __m512i);
-        let xor = _mm512_xor_si512(av, bv);
-        let popcnt = _mm512_popcnt_epi64(xor);
-        total = _mm512_add_epi64(total, popcnt);
-    }
-    let mut buf = [0i64; 8];
-    _mm512_storeu_si512(buf.as_mut_ptr() as *mut __m512i, total);
-    let mut sum: u64 = buf.iter().map(|&v| v as u64).sum();
-    for i in (chunks * 64)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
-    sum
-}
-
-/// AVX2 Harley-Seal path. Haswell+ / all modern x86.
-/// THIS IS WHAT GITHUB ACTIONS CI USES.
-/// 32 bytes per iteration. 16K = 64 iterations.
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx2")]
-unsafe fn hamming_avx2_safe(a: &[u8], b: &[u8]) -> u64 {
-    use core::arch::x86_64::*;
-    let len = a.len();
-    let chunks = len / 32;
-    let low_mask = _mm256_set1_epi8(0x0f);
-    let lookup = _mm256_setr_epi8(
-        0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
-        0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
-    );
-    let mut total = _mm256_setzero_si256();
-    // Process in blocks of 8 to avoid u8 saturation in vpshufb accumulation
-    let blocks = chunks / 8;
-    for block in 0..blocks {
-        let mut local = _mm256_setzero_si256();
-        for i in 0..8 {
-            let idx = (block * 8 + i) * 32;
-            let av = _mm256_loadu_si256(a[idx..].as_ptr() as *const __m256i);
-            let bv = _mm256_loadu_si256(b[idx..].as_ptr() as *const __m256i);
-            let xor = _mm256_xor_si256(av, bv);
-            let lo = _mm256_and_si256(xor, low_mask);
-            let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
-            let cnt = _mm256_add_epi8(
-                _mm256_shuffle_epi8(lookup, lo),
-                _mm256_shuffle_epi8(lookup, hi),
-            );
-            local = _mm256_add_epi8(local, cnt);
-        }
-        // Widen u8 counts to u64 via SAD (sum of absolute differences against zero)
-        total = _mm256_add_epi64(total, _mm256_sad_epu8(local, _mm256_setzero_si256()));
-    }
-    // Remaining chunks after last full block
-    let remaining_start = blocks * 8;
-    let mut local = _mm256_setzero_si256();
-    for c in remaining_start..chunks {
-        let idx = c * 32;
-        let av = _mm256_loadu_si256(a[idx..].as_ptr() as *const __m256i);
-        let bv = _mm256_loadu_si256(b[idx..].as_ptr() as *const __m256i);
-        let xor = _mm256_xor_si256(av, bv);
-        let lo = _mm256_and_si256(xor, low_mask);
-        let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
-        let cnt = _mm256_add_epi8(
-            _mm256_shuffle_epi8(lookup, lo),
-            _mm256_shuffle_epi8(lookup, hi),
-        );
-        local = _mm256_add_epi8(local, cnt);
-    }
-    total = _mm256_add_epi64(total, _mm256_sad_epu8(local, _mm256_setzero_si256()));
-    // Horizontal sum of 4 x u64
-    let mut buf = [0i64; 4];
-    _mm256_storeu_si256(buf.as_mut_ptr() as *mut __m256i, total);
-    let mut sum: u64 = buf.iter().map(|&v| v as u64).sum();
-    // Scalar tail
-    for i in (chunks * 32)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
-    sum
-}
-
-/// ARM NEON path. Apple Silicon, Graviton, Raspberry Pi 4+.
-#[cfg(target_arch = "aarch64")]
-fn hamming_neon_safe(a: &[u8], b: &[u8]) -> u64 {
-    // aarch64 has hardware POPCNT via CNT instruction.
-    // Rust's .count_ones() on aarch64 compiles to CNT.
-    // No explicit intrinsics needed — the scalar path IS fast on ARM.
-    hamming_scalar(a, b)
-}
-
-/// Scalar path. Works on literally every CPU since 2008.
-/// Uses hardware POPCNT instruction via u64::count_ones()
-/// which Rust/LLVM emits as POPCNT on x86 and CNT on ARM.
-fn hamming_scalar(a: &[u8], b: &[u8]) -> u64 {
-    let len = a.len();
-    let chunks = len / 8;
-    let mut sum: u64 = 0;
-    for i in 0..chunks {
-        let base = i * 8;
-        let a_u64 = u64::from_le_bytes([
-            a[base], a[base+1], a[base+2], a[base+3],
-            a[base+4], a[base+5], a[base+6], a[base+7],
-        ]);
-        let b_u64 = u64::from_le_bytes([
-            b[base], b[base+1], b[base+2], b[base+3],
-            b[base+4], b[base+5], b[base+6], b[base+7],
-        ]);
-        sum += (a_u64 ^ b_u64).count_ones() as u64;
-    }
-    for i in (chunks * 8)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
-    sum
-}
-```
-
-**CORRECTED cascade cycle counts per platform:**
-
-```
-                    STAGE 1 (1/16)   STAGE 2 (1/4)   STAGE 3 (full)
-                    128 bytes        512 bytes        2048 bytes
-AVX-512 (server):   2 iters ~4cy    8 iters ~16cy    32 iters ~64cy
-AVX2 (CI/laptop):   4 iters ~8cy    16 iters ~32cy   64 iters ~128cy
-Scalar (fallback):  16 popcnts       64 popcnts       256 popcnts
-ARM NEON (Mac):     ~16 CNTs         ~64 CNTs         ~256 CNTs
-```
-
-All platforms get the same cascade benefit (97%+ eliminated at stage 1).
-The absolute cycle counts differ. The RATIO is the same.
-CI tests pass on ALL platforms because the logic is identical —
-only the throughput changes.
-
-**Test that MUST pass on all CI platforms:**
-
-```rust
-#[test]
-fn dispatch_works_on_this_platform() {
-    let a = vec![0xAA_u8; 2048];  // alternating bits
-    let b = vec![0x55_u8; 2048];  // opposite alternating bits
-    let dist = hamming(&a, &b);
-    assert_eq!(dist, 16384, "All 16384 bits should differ");
-
-    let c = vec![0xAA_u8; 2048];
-    let dist_same = hamming(&a, &c);
-    assert_eq!(dist_same, 0, "Identical vectors should have distance 0");
-
-    // Partial: 128 bytes (stage 1 sample size)
-    let dist_partial = hamming(&a[..128], &b[..128]);
-    assert_eq!(dist_partial, 1024, "128 bytes = 1024 bits should all differ");
-}
-```
-
-This test runs on every CI platform without knowing which SIMD path was taken.
-If it passes, the dispatch works correctly for that platform.