Skip to content
Merged
205 changes: 0 additions & 205 deletions .claude/BELICHTUNGSMESSER.md
Original file line number Diff line number Diff line change
Expand Up @@ -635,208 +635,3 @@ In ladybug-rs:
Uses rustynum-core::Belichtungsmesser directly.
Integrated into Plane.distance() for alpha-aware cascade.
```

---

### CRITICAL: SIMD Dispatch — Works Everywhere Without Configuration

The cascade description above mentions "AVX-512 ops" for cycle counts. Those are BEST CASE numbers. The actual implementation MUST work on:

- **GitHub Actions CI:** x86_64 with AVX2 but NO AVX-512
- **Apple Silicon:** ARM64 with NEON, no x86 at all
- **Old servers:** x86_64 with only SSE4.2
- **Developer laptops:** mix of everything

The dispatch is RUNTIME, not compile-time. The binary ships ALL paths. The CPU decides at first call:

```rust
use std::sync::OnceLock;

/// Resolved once on first call. Zero overhead after.
static HAMMING_FN: OnceLock<fn(&[u8], &[u8]) -> u64> = OnceLock::new();

fn select_hamming_fn() -> fn(&[u8], &[u8]) -> u64 {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
return hamming_avx512_safe; // 32 iterations for 16K, ~32 cycles
}
if is_x86_feature_detected!("avx2") {
return hamming_avx2_safe; // 64 iterations for 16K, ~64 cycles
}
// SSE4.2 has POPCNT instruction — scalar loop uses it via .count_ones()
}
#[cfg(target_arch = "aarch64")]
{
return hamming_neon_safe; // NEON CNT instruction
}
hamming_scalar // works on literally anything
}

/// Get the resolved function pointer. First call detects CPU. All others: direct call.
#[inline]
fn hamming(a: &[u8], b: &[u8]) -> u64 {
let f = HAMMING_FN.get_or_init(select_hamming_fn);
f(a, b)
}
```

The four implementations that MUST be present:

```rust
/// AVX-512 VPOPCNTDQ path. Ice Lake+ / Zen 4+.
/// 64 bytes per iteration. 16K = 32 iterations.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vpopcntdq")]
unsafe fn hamming_avx512_safe(a: &[u8], b: &[u8]) -> u64 {
use core::arch::x86_64::*;
let len = a.len();
let chunks = len / 64;
let mut total = _mm512_setzero_si512();
for i in 0..chunks {
let base = i * 64;
let av = _mm512_loadu_si512(a[base..].as_ptr() as *const __m512i);
let bv = _mm512_loadu_si512(b[base..].as_ptr() as *const __m512i);
let xor = _mm512_xor_si512(av, bv);
let popcnt = _mm512_popcnt_epi64(xor);
total = _mm512_add_epi64(total, popcnt);
}
let mut buf = [0i64; 8];
_mm512_storeu_si512(buf.as_mut_ptr() as *mut __m512i, total);
let mut sum: u64 = buf.iter().map(|&v| v as u64).sum();
for i in (chunks * 64)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
sum
}

/// AVX2 Harley-Seal path. Haswell+ / all modern x86.
/// THIS IS WHAT GITHUB ACTIONS CI USES.
/// 32 bytes per iteration. 16K = 64 iterations.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn hamming_avx2_safe(a: &[u8], b: &[u8]) -> u64 {
use core::arch::x86_64::*;
let len = a.len();
let chunks = len / 32;
let low_mask = _mm256_set1_epi8(0x0f);
let lookup = _mm256_setr_epi8(
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
);
let mut total = _mm256_setzero_si256();
// Process in blocks of 8 to avoid u8 saturation in vpshufb accumulation
let blocks = chunks / 8;
for block in 0..blocks {
let mut local = _mm256_setzero_si256();
for i in 0..8 {
let idx = (block * 8 + i) * 32;
let av = _mm256_loadu_si256(a[idx..].as_ptr() as *const __m256i);
let bv = _mm256_loadu_si256(b[idx..].as_ptr() as *const __m256i);
let xor = _mm256_xor_si256(av, bv);
let lo = _mm256_and_si256(xor, low_mask);
let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
let cnt = _mm256_add_epi8(
_mm256_shuffle_epi8(lookup, lo),
_mm256_shuffle_epi8(lookup, hi),
);
local = _mm256_add_epi8(local, cnt);
}
// Widen u8 counts to u64 via SAD (sum of absolute differences against zero)
total = _mm256_add_epi64(total, _mm256_sad_epu8(local, _mm256_setzero_si256()));
}
// Remaining chunks after last full block
let remaining_start = blocks * 8;
let mut local = _mm256_setzero_si256();
for c in remaining_start..chunks {
let idx = c * 32;
let av = _mm256_loadu_si256(a[idx..].as_ptr() as *const __m256i);
let bv = _mm256_loadu_si256(b[idx..].as_ptr() as *const __m256i);
let xor = _mm256_xor_si256(av, bv);
let lo = _mm256_and_si256(xor, low_mask);
let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
let cnt = _mm256_add_epi8(
_mm256_shuffle_epi8(lookup, lo),
_mm256_shuffle_epi8(lookup, hi),
);
local = _mm256_add_epi8(local, cnt);
}
total = _mm256_add_epi64(total, _mm256_sad_epu8(local, _mm256_setzero_si256()));
// Horizontal sum of 4 x u64
let mut buf = [0i64; 4];
_mm256_storeu_si256(buf.as_mut_ptr() as *mut __m256i, total);
let mut sum: u64 = buf.iter().map(|&v| v as u64).sum();
// Scalar tail
for i in (chunks * 32)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
sum
}

/// ARM NEON path. Apple Silicon, Graviton, Raspberry Pi 4+.
#[cfg(target_arch = "aarch64")]
fn hamming_neon_safe(a: &[u8], b: &[u8]) -> u64 {
// aarch64 has hardware POPCNT via CNT instruction.
// Rust's .count_ones() on aarch64 compiles to CNT.
// No explicit intrinsics needed — the scalar path IS fast on ARM.
hamming_scalar(a, b)
}

/// Scalar path. Works on literally every CPU since 2008.
/// Uses hardware POPCNT instruction via u64::count_ones()
/// which Rust/LLVM emits as POPCNT on x86 and CNT on ARM.
fn hamming_scalar(a: &[u8], b: &[u8]) -> u64 {
let len = a.len();
let chunks = len / 8;
let mut sum: u64 = 0;
for i in 0..chunks {
let base = i * 8;
let a_u64 = u64::from_le_bytes([
a[base], a[base+1], a[base+2], a[base+3],
a[base+4], a[base+5], a[base+6], a[base+7],
]);
let b_u64 = u64::from_le_bytes([
b[base], b[base+1], b[base+2], b[base+3],
b[base+4], b[base+5], b[base+6], b[base+7],
]);
sum += (a_u64 ^ b_u64).count_ones() as u64;
}
for i in (chunks * 8)..len { sum += (a[i] ^ b[i]).count_ones() as u64; }
sum
}
```

**CORRECTED cascade cycle counts per platform:**

```
STAGE 1 (1/16) STAGE 2 (1/4) STAGE 3 (full)
128 bytes 512 bytes 2048 bytes
AVX-512 (server): 2 iters ~4cy 8 iters ~16cy 32 iters ~64cy
AVX2 (CI/laptop): 4 iters ~8cy 16 iters ~32cy 64 iters ~128cy
Scalar (fallback): 16 popcnts 64 popcnts 256 popcnts
ARM NEON (Mac): ~16 CNTs ~64 CNTs ~256 CNTs
```

All platforms get the same cascade benefit (97%+ eliminated at stage 1).
The absolute cycle counts differ. The RATIO is the same.
CI tests pass on ALL platforms because the logic is identical —
only the throughput changes.

**Test that MUST pass on all CI platforms:**

```rust
#[test]
fn dispatch_works_on_this_platform() {
let a = vec![0xAA_u8; 2048]; // alternating bits
let b = vec![0x55_u8; 2048]; // opposite alternating bits
let dist = hamming(&a, &b);
assert_eq!(dist, 16384, "All 16384 bits should differ");

let c = vec![0xAA_u8; 2048];
let dist_same = hamming(&a, &c);
assert_eq!(dist_same, 0, "Identical vectors should have distance 0");

// Partial: 128 bytes (stage 1 sample size)
let dist_partial = hamming(&a[..128], &b[..128]);
assert_eq!(dist_partial, 1024, "128 bytes = 1024 bits should all differ");
}
```

This test runs on every CI platform without knowing which SIMD path was taken.
If it passes, the dispatch works correctly for that platform.
Loading