From 10939a6117f1eaef46d7b88513dafcc0b0ac236a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 3 Jun 2026 16:46:17 +0000
Subject: [PATCH 01/14] bench: sweep bit-packed compare across all int types
 and bit widths

Add `bitpack_compare_sweep`, which exercises the public `array.binary(rhs,
op)` compare-against-constant path over all eight integer types and every
valid bit width (64Ki in-range elements per case, no patches). It isolates
the `<BitPacked as CompareKernel>` unpack + per-element compare kernel so a
kernel change shows up as a CodSpeed diff.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/fastlanes/Cargo.toml                |   4 +
 .../benches/bitpack_compare_sweep.rs          | 111 ++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 encodings/fastlanes/benches/bitpack_compare_sweep.rs
diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
index 08c96c481d7..e0aa5cbb724 100644
--- a/encodings/fastlanes/Cargo.toml
+++ b/encodings/fastlanes/Cargo.toml
@@ -64,6 +64,10 @@ required-features = ["_test-harness"]
 name = "bitpack_compare"
 harness = false
 
+[[bench]]
+name = "bitpack_compare_sweep"
+harness = false
+
 [[bench]]
 name = "cast_bitpacked"
 harness = false
diff --git a/encodings/fastlanes/benches/bitpack_compare_sweep.rs b/encodings/fastlanes/benches/bitpack_compare_sweep.rs
new file mode 100644
index 00000000000..3eb0ba3b9a2
--- /dev/null
+++ b/encodings/fastlanes/benches/bitpack_compare_sweep.rs
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Sweeps the public `BitPackedArray` compare-against-constant path (`array.binary(rhs, op)`) over
+//! every integer type and every valid bit width, so a kernel change shows up as a CodSpeed diff.
+//!
+//! The array holds in-range values (no patches, no out-of-range fast path), so each iteration runs
+//! the full unpack + per-element compare kernel that backs `<BitPacked as CompareKernel>`.
+//!
+//! Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare_sweep`.
+
+#![expect(clippy::unwrap_used)]
+#![expect(clippy::cast_possible_truncation)]
+
+use divan::Bencher;
+use divan::counter::ItemsCount;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::NativePType;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::BitPackedData;
+
+fn main() {
+    divan::main();
+}
+
+/// Number of elements per benchmarked array (64 full FastLanes blocks).
+const LEN: usize = 64 * 1024;
+
+/// Operator under test. `Lt` exercises the full unpack + per-element comparison path.
+const OP: Operator = Operator::Lt;
+
+/// Integer types we can build packed arrays for in the benchmark.
+trait BenchInt: NativePType + Copy + Into<Scalar> {
+    /// Build an in-range value from a small counter.
+    fn from_counter(v: u64) -> Self;
+}
+
+macro_rules! impl_bench_int {
+    ($($T:ty),+) => {
+        $(impl BenchInt for $T {
+            #[inline]
+            fn from_counter(v: u64) -> Self {
+                v as $T
+            }
+        })+
+    };
+}
+
+impl_bench_int!(u8, u16, u32, u64, i8, i16, i32, i64);
+
+/// Encode `LEN` in-range values of type `T` at the given bit width, returning the packed array, a
+/// mid-range constant to compare against, and an execution context.
+fn setup<T: BenchInt>(width: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let cap = 1u64 << width;
+    let buf: BufferMut<T> = (0..LEN)
+        .map(|i| T::from_counter((i as u64) % cap))
+        .collect();
+    let array = BitPackedData::encode(
+        &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(),
+        width as u8,
+        &mut ctx,
+    )
+    .unwrap()
+    .into_array();
+    let rhs = ConstantArray::new(T::from_counter(cap / 2), LEN).into_array();
+    (array, rhs, ctx)
+}
+
+/// Generate a compare benchmark over every valid bit width for one type. Valid widths are
+/// `1..native_bits` - bit-packing requires the target width to be strictly narrower than the type.
+macro_rules! bench_type {
+    ($modname:ident, $T:ty, $native_bits:expr) => {
+        mod $modname {
+            use super::*;
+
+            #[divan::bench(args = 1..$native_bits)]
+            fn compare(bencher: Bencher, width: usize) {
+                let (array, rhs, mut ctx) = setup::<$T>(width);
+                bencher.counter(ItemsCount::new(LEN)).bench_local(|| {
+                    array
+                        .clone()
+                        .binary(rhs.clone(), OP)
+                        .unwrap()
+                        .execute::<BoolArray>(&mut ctx)
+                        .unwrap()
+                });
+            }
+        }
+    };
+}
+
+bench_type!(u8, u8, 8usize);
+bench_type!(u16, u16, 16usize);
+bench_type!(u32, u32, 32usize);
+bench_type!(u64, u64, 64usize);
+bench_type!(i8, i8, 8usize);
+bench_type!(i16, i16, 16usize);
+bench_type!(i32, i32, 32usize);
+bench_type!(i64, i64, 64usize);

From 48da899354e048d5e0d73bce36c2be6afc30716b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 3 Jun 2026 16:47:23 +0000
Subject: [PATCH 02/14] perf(fastlanes): fuse bit-packed compare into a
 transposed mask + untranspose

Replace the unpack-then-compare streaming kernel for compare-against-constant
with the FastLanes fused `unpack_cmp`: compare each value as it is unpacked,
accumulating results straight into a transposed 1024-bit mask (`[u64; 16]`,
one register-resident word per lane - no `[bool; 1024]`/`[T; 1024]` scratch),
then a single SIMD `untranspose_bits` per block rotates the mask into logical
row order, copied directly into the output bit buffer. Inline patches are
spliced in afterwards; sliced (offset != 0) arrays fall back to the scalar
streaming predicate.

This requires the in-development FastLanes (PR #141 fused mask + PR #145
width-generic BMI2/VBMI untranspose), pinned via a git patch until released.

Benchmarked end-to-end through the public compare path (`bitpack_compare_sweep`,
64Ki elements, all integer types and bit widths): fused beats the streaming
baseline for every type and width -

  i8/u8   ~6.2-7.7x
  i16/u16 ~4.5-6.0x
  i32/u32 ~1.9-4.3x
  i64/u64 ~1.2-1.9x

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |  10 +-
 Cargo.toml                                    |   5 +
 .../src/bitpacking/compute/compare.rs         |  42 ++++--
 .../src/bitpacking/compute/compare_fused.rs   | 136 ++++++++++++++++++
 .../fastlanes/src/bitpacking/compute/mod.rs   |   1 +
 5 files changed, 182 insertions(+), 12 deletions(-)
 create mode 100644 encodings/fastlanes/src/bitpacking/compute/compare_fused.rs

diff --git a/Cargo.lock b/Cargo.lock
index ec89bf1161f..1dd83daba52 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1645,6 +1645,12 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "core_detect"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48"
+
 [[package]]
 name = "cpubits"
 version = "0.1.1"
@@ -3146,11 +3152,11 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
 [[package]]
 name = "fastlanes"
 version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "414cb755aee48ff7b0907995d2949c68c8c17900970076dff6a808e18e592d71"
+source = "git+https://github.com/spiraldb/fastlanes?rev=6c10ea72cf693a17e994aa6401604ebedbeda453#6c10ea72cf693a17e994aa6401604ebedbeda453"
 dependencies = [
  "arrayref",
  "const_for",
+ "core_detect",
  "num-traits",
  "paste",
  "seq-macro",
diff --git a/Cargo.toml b/Cargo.toml
index 9700d8d78ed..8dcbfadfe08 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -412,3 +412,8 @@ debug = false
 debug-assertions = false
 strip = "debuginfo"
 incremental = false
+
+# Pin to the in-development FastLanes branch (PR #141 fused [u64;16] cmp mask +
+# PR #145 width-generic BMI/VBMI untranspose) until a release is cut.
+[patch.crates-io]
+fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" }
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index f5c5c81c5cb..4e6755b9c36 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -8,6 +8,9 @@
 //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched
 //! indices with `predicate(patch_value)`.
 
+use fastlanes::BitPacking;
+use fastlanes::BitPackingCompare;
+use fastlanes::FastLanesComparable;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
@@ -20,7 +23,8 @@ use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::BitPacked;
-use crate::bitpacking::compute::stream_predicate::stream_predicate;
+use crate::bitpacking::compute::compare_fused::stream_compare_fused;
+use crate::unpack_iter::BitPacked as BitPackedIter;
 
 impl CompareKernel for BitPacked {
     fn compare(
@@ -55,6 +59,15 @@ impl CompareKernel for BitPacked {
     }
 }
 
+/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel.
+///
+/// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between
+/// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`.
+///
+/// The fused kernel (compare straight into a transposed 1024-bit mask, then a single SIMD
+/// untranspose into logical row order) beats the unpack-then-compare streaming baseline for every
+/// integer type and bit width - see `benches/bitpack_compare_fused.rs` (~6-7x for 8-bit lanes
+/// down to ~1.2-1.9x for 64-bit lanes), so it is used unconditionally.
 fn compare_constant_typed<T>(
     lhs: ArrayView<'_, BitPacked>,
     rhs: T,
@@ -63,19 +76,28 @@ fn compare_constant_typed<T>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    T: NativePType + Copy + crate::unpack_iter::BitPacked,
+    T: NativePType + BitPackedIter + FastLanesComparable,
+    <T as FastLanesComparable>::Bitpacked: BitPacking + NativePType + BitPackingCompare,
 {
-    // `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive
-    // between kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`.
     match operator {
-        CompareOperator::Eq => stream_predicate::<T, _>(lhs, nullability, |v| v.is_eq(rhs), ctx),
+        CompareOperator::Eq => {
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx)
+        }
         CompareOperator::NotEq => {
-            stream_predicate::<T, _>(lhs, nullability, |v| !v.is_eq(rhs), ctx)
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx)
+        }
+        CompareOperator::Lt => {
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx)
+        }
+        CompareOperator::Lte => {
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| a.is_le(b), ctx)
+        }
+        CompareOperator::Gt => {
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx)
+        }
+        CompareOperator::Gte => {
+            stream_compare_fused::<T, _>(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx)
         }
-        CompareOperator::Lt => stream_predicate::<T, _>(lhs, nullability, |v| v.is_lt(rhs), ctx),
-        CompareOperator::Lte => stream_predicate::<T, _>(lhs, nullability, |v| v.is_le(rhs), ctx),
-        CompareOperator::Gt => stream_predicate::<T, _>(lhs, nullability, |v| v.is_gt(rhs), ctx),
-        CompareOperator::Gte => stream_predicate::<T, _>(lhs, nullability, |v| v.is_ge(rhs), ctx),
     }
 }
 
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
new file mode 100644
index 00000000000..053b63e6b3b
--- /dev/null
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Fused compare kernel for [`BitPackedArray`] against a constant.
+//!
+//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch
+//! buffer and *then* folds a predicate over it, this path hands the comparison down into the
+//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against
+//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit
+//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no
+//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates
+//! that mask into logical row order, which is copied directly into the output bit buffer.
+//!
+//! Only the full-chunk fast path uses the fused kernel. Sliced arrays (non-zero block offset) fall
+//! back to the scalar streaming predicate, and inline patches are spliced in afterwards by
+//! overwriting the bits at the patched indices with `cmp(patch_value, rhs)`.
+
+use fastlanes::BitPacking;
+use fastlanes::BitPackingCompare;
+use fastlanes::FastLanesComparable;
+use fastlanes::untranspose_bits;
+use num_traits::AsPrimitive;
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::Nullability;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_buffer::BitBufferMut;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexResult;
+
+use super::stream_predicate::stream_predicate;
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+use crate::unpack_iter::BitPacked as BitPackedIter;
+
+const CHUNK_SIZE: usize = 1024;
+/// `u64` words spanning one FastLanes block (1024 bits / 64).
+const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize;
+
+/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes
+/// `unpack_cmp` kernel, producing a [`BoolArray`].
+///
+/// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the
+/// requested operator (e.g. `|a, b| a.is_lt(b)`).
+pub(super) fn stream_compare_fused<T, F>(
+    array: ArrayView<'_, BitPacked>,
+    rhs: T,
+    nullability: Nullability,
+    cmp: F,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef>
+where
+    T: NativePType + BitPackedIter + FastLanesComparable,
+    <T as FastLanesComparable>::Bitpacked: BitPacking + NativePType + BitPackingCompare,
+    F: Fn(T, T) -> bool + Copy,
+{
+    let len = array.len();
+    let bit_width = BitPackedArrayExt::bit_width(&array) as usize;
+    let offset = BitPackedArrayExt::offset(&array) as usize;
+
+    // The fused kernel consumes whole 1024-element blocks at a fixed packed width. A non-zero
+    // block offset (from slicing) or a degenerate width has no clean full-chunk form, so defer
+    // to the scalar streaming predicate, which handles every layout.
+    if offset != 0 || len == 0 || bit_width == 0 {
+        return stream_predicate::<T, _>(array, nullability, move |v| cmp(v, rhs), ctx);
+    }
+
+    let packed = BitPackedArrayExt::packed_slice::<<T as FastLanesComparable>::Bitpacked>(&array);
+    let elems_per_chunk = 128 * bit_width / size_of::<<T as FastLanesComparable>::Bitpacked>();
+    let num_chunks = len.div_ceil(CHUNK_SIZE);
+
+    let mut words: BufferMut<u64> = BufferMut::zeroed(len.div_ceil(u64::BITS as usize));
+    {
+        let words = words.as_mut_slice();
+        // Per block: fuse compare into a transposed 1024-bit mask, then untranspose into logical
+        // row order. The packed buffer is zero-padded out to a whole final block, so every chunk -
+        // including the trailing partial one - has exactly `elems_per_chunk` packed values; we just
+        // copy fewer than 16 words out of the last block's untransposed mask.
+        let mut transposed = [0u64; WORDS_PER_CHUNK];
+        let mut logical = [0u64; WORDS_PER_CHUNK];
+        for chunk in 0..num_chunks {
+            let packed_chunk = &packed[chunk * elems_per_chunk..][..elems_per_chunk];
+            // SAFETY: `packed_chunk` is exactly `128 * bit_width / size_of::<U>()` elements and
+            // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract.
+            unsafe {
+                <<T as FastLanesComparable>::Bitpacked as BitPackingCompare>::unchecked_unpack_cmp::<
+                    T,
+                    _,
+                >(bit_width, packed_chunk, &mut transposed, cmp, rhs);
+            }
+            untranspose_bits::<<T as FastLanesComparable>::Bitpacked>(&transposed, &mut logical);
+
+            let block_start = chunk * CHUNK_SIZE;
+            let block_bits = (len - block_start).min(CHUNK_SIZE);
+            let word_off = chunk * WORDS_PER_CHUNK;
+            let n_words = block_bits.div_ceil(u64::BITS as usize);
+            words[word_off..][..n_words].copy_from_slice(&logical[..n_words]);
+        }
+
+        // Patched indices hold placeholder packed values, so their fused result is meaningless;
+        // overwrite each with the comparison against the real patch value.
+        if let Some(p) = array.patches() {
+            let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_off = p.offset();
+            match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| {
+                let indices = p_idx.as_slice::<I>();
+                let values = p_val.as_slice::<T>();
+                for (&global, &value) in indices.iter().zip(values) {
+                    let global: usize = global.as_();
+                    set_bit(words, global - p_off, cmp(value, rhs));
+                }
+            });
+        }
+    }
+
+    let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len);
+    let validity = array.validity()?.union_nullability(nullability);
+    Ok(BoolArray::new(bits.freeze(), validity).into_array())
+}
+
+/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the
+/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the
+/// target word through a single bounds-checked `&mut`.
+#[inline]
+fn set_bit(words: &mut [u64], idx: usize, value: bool) {
+    let shift = idx % u64::BITS as usize;
+    let mask = 1u64 << shift;
+    let word = &mut words[idx / u64::BITS as usize];
+    *word = (*word & !mask) | (u64::from(value) << shift);
+}
diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs
index 518f8319eb1..06a4b4597b0 100644
--- a/encodings/fastlanes/src/bitpacking/compute/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs
@@ -4,6 +4,7 @@
 mod between;
 mod cast;
 mod compare;
+mod compare_fused;
 mod filter;
 pub(crate) mod is_constant;
 mod slice;

From 08ed4a4a033adf2558850ab904deb130de5a3197 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 4 Jun 2026 10:31:27 +0000
Subject: [PATCH 03/14] ci(wasm): pin in-development FastLanes in the excluded
 wasm-test workspace

wasm-test is excluded from the workspace, so it does not inherit the root
[patch.crates-io] and was building vortex-fastlanes against published fastlanes
0.5.0 (old `[bool;1024]` unpack_cmp, no `untranspose_bits`) -> compile error in
compare_fused.rs. Add the matching git `rev` pin here. Temporary, like the root
pin: both are removed when a FastLanes release is cut and the version is bumped.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 wasm-test/Cargo.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/wasm-test/Cargo.toml b/wasm-test/Cargo.toml
index 100c15970b0..20cf5f26cfd 100644
--- a/wasm-test/Cargo.toml
+++ b/wasm-test/Cargo.toml
@@ -16,3 +16,8 @@ vortex = { path = "../vortex", default-features = false }
 inherits = "dev"
 debug = "line-tables-only"
 incremental = false
+
+# wasm-test is excluded from the workspace, so it does not inherit the root
+# [patch.crates-io]; pin the in-development FastLanes here too until a release is cut.
+[patch.crates-io]
+fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" }

From 816032b92ff34c6018a19f0b2ef329348a5dfe3c Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 14:00:22 +0100
Subject: [PATCH 04/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.toml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index dbbde5bae05..cc5201a0ff7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -152,7 +152,7 @@ dirs = "6.0.0"
 divan = { package = "codspeed-divan-compat", version = "4.0.4" }
 enum-iterator = "2.0.0"
 env_logger = "0.11"
-fastlanes = "0.5"
+fastlanes = "0.5.1"
 flatbuffers = "25.2.10"
 fsst-rs = "0.5.11"
 futures = { version = "0.3.31", default-features = false }
@@ -412,8 +412,3 @@ debug = false
 debug-assertions = false
 strip = "debuginfo"
 incremental = false
-
-# Pin to the in-development FastLanes branch (PR #141 fused [u64;16] cmp mask +
-# PR #145 width-generic BMI/VBMI untranspose) until a release is cut.
-[patch.crates-io]
-fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" }

From e4dd660d74661eddf4eb59e7c53821b0cc83882a Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 14:05:05 +0100
Subject: [PATCH 05/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 wasm-test/Cargo.toml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/wasm-test/Cargo.toml b/wasm-test/Cargo.toml
index 20cf5f26cfd..100c15970b0 100644
--- a/wasm-test/Cargo.toml
+++ b/wasm-test/Cargo.toml
@@ -16,8 +16,3 @@ vortex = { path = "../vortex", default-features = false }
 inherits = "dev"
 debug = "line-tables-only"
 incremental = false
-
-# wasm-test is excluded from the workspace, so it does not inherit the root
-# [patch.crates-io]; pin the in-development FastLanes here too until a release is cut.
-[patch.crates-io]
-fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" }

From 933ca0e19715055069c49edcff8f55359a9aa24a Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 14:10:59 +0100
Subject: [PATCH 06/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3b785bb672f..6833328367d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3151,8 +3151,9 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
 
 [[package]]
 name = "fastlanes"
-version = "0.5.0"
-source = "git+https://github.com/spiraldb/fastlanes?rev=6c10ea72cf693a17e994aa6401604ebedbeda453#6c10ea72cf693a17e994aa6401604ebedbeda453"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20c597e23b8ec8506f589d18bc701ca83a3def6086748f628ad23092e1dfe577"
 dependencies = [
  "arrayref",
  "const_for",

From 211903c0110bf243b01a07f1075de4eff8fa2a4e Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 14:11:32 +0100
Subject: [PATCH 07/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/bitpacking/compute/compare.rs         |  13 +-
 .../src/bitpacking/compute/compare_fused.rs   | 118 +++++-------------
 .../bitpacking/compute/stream_predicate.rs    |   2 +-
 3 files changed, 33 insertions(+), 100 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 4e6755b9c36..8f7bf775f37 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -8,9 +8,6 @@
 //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched
 //! indices with `predicate(patch_value)`.
 
-use fastlanes::BitPacking;
-use fastlanes::BitPackingCompare;
-use fastlanes::FastLanesComparable;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
@@ -59,15 +56,10 @@ impl CompareKernel for BitPacked {
     }
 }
 
-/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel.
+/// Compare every value against the constant by streaming the regular FastLanes unpack iterator.
 ///
 /// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between
 /// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`.
-///
-/// The fused kernel (compare straight into a transposed 1024-bit mask, then a single SIMD
-/// untranspose into logical row order) beats the unpack-then-compare streaming baseline for every
-/// integer type and bit width - see `benches/bitpack_compare_fused.rs` (~6-7x for 8-bit lanes
-/// down to ~1.2-1.9x for 64-bit lanes), so it is used unconditionally.
 fn compare_constant_typed<T>(
     lhs: ArrayView<'_, BitPacked>,
     rhs: T,
@@ -76,8 +68,7 @@ fn compare_constant_typed<T>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    T: NativePType + BitPackedIter + FastLanesComparable,
-    <T as FastLanesComparable>::Bitpacked: BitPacking + NativePType + BitPackingCompare,
+    T: NativePType + BitPackedIter + Copy,
 {
     match operator {
         CompareOperator::Eq => {
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
index 053b63e6b3b..0a5146adb6a 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -1,25 +1,15 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Fused compare kernel for [`BitPackedArray`] against a constant.
+//! Block-streaming compare kernel for [`BitPackedArray`] against a constant.
 //!
-//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch
-//! buffer and *then* folds a predicate over it, this path hands the comparison down into the
-//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against
-//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit
-//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no
-//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates
-//! that mask into logical row order, which is copied directly into the output bit buffer.
-//!
-//! Only the full-chunk fast path uses the fused kernel. Sliced arrays (non-zero block offset) fall
-//! back to the scalar streaming predicate, and inline patches are spliced in afterwards by
-//! overwriting the bits at the patched indices with `cmp(patch_value, rhs)`.
+//! Walks the encoded array one 1024-element FastLanes block at a time through the regular
+//! [`crate::unpack_iter::BitUnpackedChunks`] iterator (the same path used by decompress,
+//! `is_constant`, and `stream_predicate`), splices any [`crate::patches::Patches`] into the
+//! unpacked block in place, then folds `cmp(value, rhs)` over the block into a [`BitBuffer`].
+//! The materialised primitive never appears: each block reuses a single scratch buffer and the
+//! per-element bool is packed straight into the output words.
 
-use fastlanes::BitPacking;
-use fastlanes::BitPackingCompare;
-use fastlanes::FastLanesComparable;
-use fastlanes::untranspose_bits;
-use num_traits::AsPrimitive;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
@@ -31,19 +21,16 @@ use vortex_array::dtype::Nullability;
 use vortex_array::match_each_unsigned_integer_ptype;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::BufferMut;
+use vortex_buffer::pack_bools_into_words;
 use vortex_error::VortexResult;
 
-use super::stream_predicate::stream_predicate;
+use super::stream_predicate::splice_patches;
 use crate::BitPacked;
 use crate::BitPackedArrayExt;
 use crate::unpack_iter::BitPacked as BitPackedIter;
 
-const CHUNK_SIZE: usize = 1024;
-/// `u64` words spanning one FastLanes block (1024 bits / 64).
-const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize;
-
-/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes
-/// `unpack_cmp` kernel, producing a [`BoolArray`].
+/// Compare the unpacked values of a [`BitPackedArray`] against `rhs`, one FastLanes block at a
+/// time, producing a [`BoolArray`].
 ///
 /// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the
 /// requested operator (e.g. `|a, b| a.is_lt(b)`).
@@ -55,66 +42,32 @@ pub(super) fn stream_compare_fused<T, F>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    T: NativePType + BitPackedIter + FastLanesComparable,
-    <T as FastLanesComparable>::Bitpacked: BitPacking + NativePType + BitPackingCompare,
-    F: Fn(T, T) -> bool + Copy,
+    T: NativePType + BitPackedIter + Copy,
+    F: Fn(T, T) -> bool,
 {
     let len = array.len();
-    let bit_width = BitPackedArrayExt::bit_width(&array) as usize;
-    let offset = BitPackedArrayExt::offset(&array) as usize;
-
-    // The fused kernel consumes whole 1024-element blocks at a fixed packed width. A non-zero
-    // block offset (from slicing) or a degenerate width has no clean full-chunk form, so defer
-    // to the scalar streaming predicate, which handles every layout.
-    if offset != 0 || len == 0 || bit_width == 0 {
-        return stream_predicate::<T, _>(array, nullability, move |v| cmp(v, rhs), ctx);
-    }
-
-    let packed = BitPackedArrayExt::packed_slice::<<T as FastLanesComparable>::Bitpacked>(&array);
-    let elems_per_chunk = 128 * bit_width / size_of::<<T as FastLanesComparable>::Bitpacked>();
-    let num_chunks = len.div_ceil(CHUNK_SIZE);
-
     let mut words: BufferMut<u64> = BufferMut::zeroed(len.div_ceil(u64::BITS as usize));
-    {
-        let words = words.as_mut_slice();
-        // Per block: fuse compare into a transposed 1024-bit mask, then untranspose into logical
-        // row order. The packed buffer is zero-padded out to a whole final block, so every chunk -
-        // including the trailing partial one - has exactly `elems_per_chunk` packed values; we just
-        // copy fewer than 16 words out of the last block's untransposed mask.
-        let mut transposed = [0u64; WORDS_PER_CHUNK];
-        let mut logical = [0u64; WORDS_PER_CHUNK];
-        for chunk in 0..num_chunks {
-            let packed_chunk = &packed[chunk * elems_per_chunk..][..elems_per_chunk];
-            // SAFETY: `packed_chunk` is exactly `128 * bit_width / size_of::<U>()` elements and
-            // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract.
-            unsafe {
-                <<T as FastLanesComparable>::Bitpacked as BitPackingCompare>::unchecked_unpack_cmp::<
-                    T,
-                    _,
-                >(bit_width, packed_chunk, &mut transposed, cmp, rhs);
-            }
-            untranspose_bits::<<T as FastLanesComparable>::Bitpacked>(&transposed, &mut logical);
 
-            let block_start = chunk * CHUNK_SIZE;
-            let block_bits = (len - block_start).min(CHUNK_SIZE);
-            let word_off = chunk * WORDS_PER_CHUNK;
-            let n_words = block_bits.div_ceil(u64::BITS as usize);
-            words[word_off..][..n_words].copy_from_slice(&logical[..n_words]);
-        }
+    if len > 0 {
+        let mut chunks = array.unpacked_chunks::<T>()?;
+        let words = words.as_mut_slice();
 
-        // Patched indices hold placeholder packed values, so their fused result is meaningless;
-        // overwrite each with the comparison against the real patch value.
         if let Some(p) = array.patches() {
-            let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
-            let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_idx_arr = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_val_arr = p.values().clone().execute::<PrimitiveArray>(ctx)?;
             let p_off = p.offset();
-            match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| {
-                let indices = p_idx.as_slice::<I>();
-                let values = p_val.as_slice::<T>();
-                for (&global, &value) in indices.iter().zip(values) {
-                    let global: usize = global.as_();
-                    set_bit(words, global - p_off, cmp(value, rhs));
-                }
+            match_each_unsigned_integer_ptype!(p_idx_arr.ptype(), |I| {
+                let p_idx = p_idx_arr.as_slice::<I>();
+                let p_val = p_val_arr.as_slice::<T>();
+                let mut p_cur: usize = 0;
+                chunks.for_each_unpacked_chunk(|block, range| {
+                    p_cur = splice_patches::<T, I>(block, range.start, p_cur, p_idx, p_val, p_off);
+                    pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs));
+                });
+            });
+        } else {
+            chunks.for_each_unpacked_chunk(|block, range| {
+                pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs));
             });
         }
     }
@@ -123,14 +76,3 @@ where
     let validity = array.validity()?.union_nullability(nullability);
     Ok(BoolArray::new(bits.freeze(), validity).into_array())
 }
-
-/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the
-/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the
-/// target word through a single bounds-checked `&mut`.
-#[inline]
-fn set_bit(words: &mut [u64], idx: usize, value: bool) {
-    let shift = idx % u64::BITS as usize;
-    let mask = 1u64 << shift;
-    let word = &mut words[idx / u64::BITS as usize];
-    *word = (*word & !mask) | (u64::from(value) << shift);
-}
diff --git a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
index 4ed9de913ec..df136755a7d 100644
--- a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
@@ -78,7 +78,7 @@ where
 /// `[chunk_start, chunk_start + block.len())`, starting from `cursor` and returning the
 /// advanced cursor. Sorted indices mean the cursor only moves forward across the walk.
 #[inline]
-fn splice_patches<T, I>(
+pub(super) fn splice_patches<T, I>(
     block: &mut [T],
     chunk_start: usize,
     mut cursor: usize,

From 0649f03e45a6a99d73d1306499c06c153ca43027 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 15:40:04 +0100
Subject: [PATCH 08/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/bitpacking/array/unpack_iter.rs       |  28 +++
 .../src/bitpacking/compute/compare.rs         |  83 ++++++++-
 .../src/bitpacking/compute/compare_fused.rs   | 161 ++++++++++++++----
 .../bitpacking/compute/stream_predicate.rs    |   2 +-
 4 files changed, 240 insertions(+), 34 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs
index 5e0673186e3..c9158d8bdac 100644
--- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs
+++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs
@@ -259,6 +259,34 @@ impl<T: PhysicalPType, S: UnpackStrategy<T>> UnpackedChunks<T, S> {
         debug_assert_eq!(local_idx, self.len);
     }
 
+    /// Walk every *packed* chunk in array order, yielding the raw packed FastLanes block and the
+    /// padded bit range it covers, without unpacking it.
+    ///
+    /// Unlike [`Self::for_each_unpacked_chunk`], this does not fill the scratch buffer: it hands
+    /// the still-packed block to the callback so fused kernels (e.g. compare) can unpack and
+    /// consume it in a single pass. Each yielded block holds exactly `elems_per_chunk` packed
+    /// values (the buffer is zero-padded out to a whole final chunk).
+    ///
+    /// The yielded range is in *padded* coordinates: block `c` covers
+    /// `[c * 1024, min((c + 1) * 1024, offset + len))`, so it includes the leading `offset` rows
+    /// that slicing skips. Block starts are therefore always 1024-aligned regardless of `offset`.
+    /// Callers must account for the array's `offset` when mapping a block's rows back to logical
+    /// output positions (e.g. by viewing the output buffer at a bit offset of `offset`).
+    pub(crate) fn for_each_packed_chunk<F>(&self, mut f: F)
+    where
+        F: FnMut(&[T::Physical], Range<usize>),
+    {
+        let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed);
+        let elems_per_chunk = self.elems_per_chunk();
+        let padded_len = self.offset + self.len;
+        for chunk in 0..self.num_chunks {
+            let packed_chunk = &packed_slice[chunk * elems_per_chunk..][..elems_per_chunk];
+            let start = chunk * CHUNK_SIZE;
+            let end = (start + CHUNK_SIZE).min(padded_len);
+            f(packed_chunk, start..end);
+        }
+    }
+
     /// Unpack full chunks into output range starting at the given index.
     fn decode_full_chunks_into_at(
         &mut self,
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 8f7bf775f37..4fbc4cf8b9d 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -8,11 +8,15 @@
 //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched
 //! indices with `predicate(patch_value)`.
 
+use fastlanes::BitPacking;
+use fastlanes::BitPackingCompare;
+use fastlanes::FastLanesComparable;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::dtype::NativePType;
 use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PhysicalPType;
 use vortex_array::match_each_integer_ptype;
 use vortex_array::scalar_fn::fns::binary::CompareKernel;
 use vortex_array::scalar_fn::fns::operators::CompareOperator;
@@ -56,7 +60,7 @@ impl CompareKernel for BitPacked {
     }
 }
 
-/// Compare every value against the constant by streaming the regular FastLanes unpack iterator.
+/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel.
 ///
 /// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between
 /// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`.
@@ -68,7 +72,10 @@ fn compare_constant_typed<T>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    T: NativePType + BitPackedIter + Copy,
+    T: NativePType
+        + BitPackedIter
+        + FastLanesComparable<Bitpacked = <T as PhysicalPType>::Physical>,
+    <T as PhysicalPType>::Physical: BitPacking + NativePType + BitPackingCompare,
 {
     match operator {
         CompareOperator::Eq => {
@@ -204,6 +211,78 @@ mod tests {
         Ok(())
     }
 
+    /// Sliced inputs: a non-zero block offset (and a length spanning several blocks) must still go
+    /// through the fused kernel and agree with the primitive fallback. Sweeps slice starts that
+    /// land both inside the first block and past it, with lengths that end mid-block and on a block
+    /// boundary.
+    #[rstest]
+    #[case(1, 4000)] // start mid-first-block, multi-block length
+    #[case(1023, 2)] // start at the last row of the first block
+    #[case(1024, 1024)] // start exactly on a block boundary, exactly one block long
+    #[case(1500, 1000)] // start mid-second-block
+    #[case(3, 1021)] // ends exactly on the first block boundary
+    fn sliced_matches_primitive(
+        #[case] start: usize,
+        #[case] slice_len: usize,
+    ) -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<u32> = (0..5000u32).map(|i| i % 128).collect();
+        let prim = PrimitiveArray::from_iter(values);
+        let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?;
+
+        let sliced = packed.into_array().slice(start..start + slice_len)?;
+        let rhs = ConstantArray::new(50u32, slice_len).into_array();
+        for op in [
+            CompareOperator::Eq,
+            CompareOperator::Lt,
+            CompareOperator::Gte,
+        ] {
+            let got = <BitPacked as CompareKernel>::compare(
+                sliced.as_::<BitPacked>(),
+                &rhs,
+                op,
+                &mut ctx,
+            )?
+            .expect("fused compare kernel must engage for sliced arrays")
+            .execute::<BoolArray>(&mut ctx)?;
+            let want = prim
+                .clone()
+                .into_array()
+                .slice(start..start + slice_len)?
+                .binary(rhs.clone(), Operator::from(op))?
+                .execute::<BoolArray>(&mut ctx)?;
+            assert_arrays_eq!(got, want);
+        }
+        Ok(())
+    }
+
+    /// Sliced *and* patched: combine a non-zero offset with out-of-range values that land in
+    /// `Patches`, exercising the `offset + (global - p_off)` patch-position math.
+    #[test]
+    fn sliced_with_patches_matches_primitive() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<i32> = (0..4096)
+            .map(|i| if i % 91 == 0 { 100_000 + i } else { i % 100 })
+            .collect();
+        let prim = PrimitiveArray::from_iter(values);
+        let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?;
+        assert!(packed.patches().is_some(), "test setup expects patches");
+
+        let (start, end) = (700usize, 3500usize);
+        let sliced = packed.into_array().slice(start..end)?;
+        let rhs = ConstantArray::new(50i32, end - start).into_array();
+        let got = sliced
+            .binary(rhs.clone(), Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        let want = prim
+            .into_array()
+            .slice(start..end)?
+            .binary(rhs, Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(got, want);
+        Ok(())
+    }
+
     /// Nullable input — the result must carry the array's validity.
     #[test]
     fn nullable_propagates_validity() -> VortexResult<()> {
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
index 0a5146adb6a..5caba4118e7 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -1,15 +1,35 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Block-streaming compare kernel for [`BitPackedArray`] against a constant.
+//! Fused compare kernel for [`BitPackedArray`] against a constant.
 //!
-//! Walks the encoded array one 1024-element FastLanes block at a time through the regular
-//! [`crate::unpack_iter::BitUnpackedChunks`] iterator (the same path used by decompress,
-//! `is_constant`, and `stream_predicate`), splices any [`crate::patches::Patches`] into the
-//! unpacked block in place, then folds `cmp(value, rhs)` over the block into a [`BitBuffer`].
-//! The materialised primitive never appears: each block reuses a single scratch buffer and the
-//! per-element bool is packed straight into the output words.
+//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch
+//! buffer and *then* folds a predicate over it, this path hands the comparison down into the
+//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against
+//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit
+//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no
+//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates
+//! that mask into logical row order.
+//!
+//! The packed blocks are walked through the regular [`crate::unpack_iter::BitUnpackedChunks`]
+//! iterator (via [`crate::unpack_iter::BitUnpackedChunks::for_each_packed_chunk`]) rather than a
+//! bespoke chunk loop, so chunk sizing and bounds live in one place.
+//!
+//! Slicing is handled by working in *padded* coordinates: bit `offset + i` holds element `i`. The
+//! output buffer is over-allocated to whole 1024-bit blocks, so every block - the sliced first
+//! block, the body, and the trailing partial - untransposes straight into a 64-bit-word-aligned
+//! slot with no per-block temporary and only one shared scratch `[u64; 16]`. The leading `offset`
+//! garbage rows and the trailing padding are trimmed afterwards: leading whole `u64` words are
+//! dropped with [`BufferMut::split_off`] (an O(1) re-slice) and the residual `offset % 64` bits plus
+//! `len` form the final [`BitBufferMut`] view. The result is therefore byte-aligned (offset 0) when
+//! `offset % 8 == 0`. Inline patches are spliced in afterwards by overwriting the bits at the
+//! patched indices with `cmp(patch_value, rhs)`.
 
+use fastlanes::BitPacking;
+use fastlanes::BitPackingCompare;
+use fastlanes::FastLanesComparable;
+use fastlanes::untranspose_bits;
+use num_traits::AsPrimitive;
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
@@ -18,19 +38,59 @@ use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::dtype::NativePType;
 use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PhysicalPType;
 use vortex_array::match_each_unsigned_integer_ptype;
 use vortex_buffer::BitBufferMut;
 use vortex_buffer::BufferMut;
-use vortex_buffer::pack_bools_into_words;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
-use super::stream_predicate::splice_patches;
+use super::stream_predicate::stream_predicate;
 use crate::BitPacked;
 use crate::BitPackedArrayExt;
 use crate::unpack_iter::BitPacked as BitPackedIter;
 
-/// Compare the unpacked values of a [`BitPackedArray`] against `rhs`, one FastLanes block at a
-/// time, producing a [`BoolArray`].
+const CHUNK_SIZE: usize = 1024;
+/// `u64` words spanning one FastLanes block (1024 bits / 64).
+const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize;
+
+/// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and
+/// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`.
+///
+/// `transposed` is caller-owned scratch reused across every block, so the hot loop makes no
+/// per-call stack allocation; its prior contents are irrelevant (the kernel overwrites it).
+#[inline]
+fn unpack_cmp_block<T, F>(
+    transposed: &mut [u64; WORDS_PER_CHUNK],
+    out: &mut [u64; WORDS_PER_CHUNK],
+    bit_width: usize,
+    packed_chunk: &[<T as PhysicalPType>::Physical],
+    cmp: F,
+    rhs: T,
+) where
+    T: NativePType
+        + PhysicalPType
+        + FastLanesComparable<Bitpacked = <T as PhysicalPType>::Physical>,
+    <T as PhysicalPType>::Physical: BitPacking + BitPackingCompare,
+    F: Fn(T, T) -> bool,
+{
+    transposed.fill(0);
+    // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::<U>()` packed elements and
+    // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract.
+    unsafe {
+        <<T as PhysicalPType>::Physical as BitPackingCompare>::unchecked_unpack_cmp::<T, _>(
+            bit_width,
+            packed_chunk,
+            transposed,
+            cmp,
+            rhs,
+        );
+    }
+    untranspose_bits::<<T as PhysicalPType>::Physical>(transposed, out);
+}
+
+/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes
+/// `unpack_cmp` kernel, producing a [`BoolArray`].
 ///
 /// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the
 /// requested operator (e.g. `|a, b| a.is_lt(b)`).
@@ -42,37 +102,76 @@ pub(super) fn stream_compare_fused<T, F>(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef>
 where
-    T: NativePType + BitPackedIter + Copy,
-    F: Fn(T, T) -> bool,
+    T: NativePType
+        + BitPackedIter
+        + FastLanesComparable<Bitpacked = <T as PhysicalPType>::Physical>,
+    <T as PhysicalPType>::Physical: BitPacking + NativePType + BitPackingCompare,
+    F: Fn(T, T) -> bool + Copy,
 {
     let len = array.len();
-    let mut words: BufferMut<u64> = BufferMut::zeroed(len.div_ceil(u64::BITS as usize));
+    let bit_width = array.bit_width() as usize;
+    let offset = array.offset() as usize;
+
+    // A degenerate width has no packed payload for the fused kernel to consume; defer to the scalar
+    // streaming predicate, which handles every layout (including the empty array).
+    if len == 0 || bit_width == 0 {
+        return stream_predicate::<T, _>(array, nullability, move |v| cmp(v, rhs), ctx);
+    }
 
-    if len > 0 {
-        let mut chunks = array.unpacked_chunks::<T>()?;
+    // Over-allocate to whole 1024-bit blocks in padded coordinates so every block - including the
+    // trailing partial - has room for a full untranspose at a 64-bit-word-aligned offset.
+    let num_chunks = (offset + len).div_ceil(CHUNK_SIZE);
+    let mut words: BufferMut<u64> = BufferMut::zeroed(num_chunks * WORDS_PER_CHUNK);
+
+    let chunks = array.unpacked_chunks::<T>()?;
+    {
         let words = words.as_mut_slice();
+        let mut transposed = [0u64; WORDS_PER_CHUNK];
+        chunks.for_each_packed_chunk(|packed_chunk, range| {
+            // Block starts are always 1024-aligned (padded coords), so the slot is a full block.
+            let out = words[range.start / u64::BITS as usize..]
+                .first_chunk_mut::<WORDS_PER_CHUNK>()
+                .vortex_expect("over-allocated buffer holds a full block per chunk");
+            unpack_cmp_block::<T, _>(&mut transposed, out, bit_width, packed_chunk, cmp, rhs);
+        });
 
+        // Patched indices hold placeholder packed values, so their fused result is meaningless;
+        // overwrite each with the comparison against the real patch value. Patch positions are
+        // logical (`global - p_off`); shift by `offset` into padded coordinates.
         if let Some(p) = array.patches() {
-            let p_idx_arr = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
-            let p_val_arr = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+            let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
             let p_off = p.offset();
-            match_each_unsigned_integer_ptype!(p_idx_arr.ptype(), |I| {
-                let p_idx = p_idx_arr.as_slice::<I>();
-                let p_val = p_val_arr.as_slice::<T>();
-                let mut p_cur: usize = 0;
-                chunks.for_each_unpacked_chunk(|block, range| {
-                    p_cur = splice_patches::<T, I>(block, range.start, p_cur, p_idx, p_val, p_off);
-                    pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs));
-                });
-            });
-        } else {
-            chunks.for_each_unpacked_chunk(|block, range| {
-                pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs));
+            match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| {
+                let indices = p_idx.as_slice::<I>();
+                let values = p_val.as_slice::<T>();
+                for (&global, &value) in indices.iter().zip(values) {
+                    let global: usize = global.as_();
+                    set_bit(words, offset + global - p_off, cmp(value, rhs));
+                }
             });
         }
     }
 
-    let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len);
+    // Trim the leading garbage: drop whole `u64` words covered entirely by the skipped `offset`
+    // region (an O(1) re-slice; `u64` granularity keeps the byte buffer 8-aligned). The residual
+    // `offset % 64` bits become the view offset, so the result is byte-aligned when `offset % 8 == 0`
+    // and offset 0 when `offset % 64 == 0`.
+    let head_words = offset / u64::BITS as usize;
+    let bit_offset = offset % u64::BITS as usize;
+    let bytes = words.split_off(head_words).into_byte_buffer();
+    let bits = BitBufferMut::from_buffer(bytes, bit_offset, len);
     let validity = array.validity()?.union_nullability(nullability);
     Ok(BoolArray::new(bits.freeze(), validity).into_array())
 }
+
+/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the
+/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the
+/// target word through a single bounds-checked `&mut`.
+#[inline]
+fn set_bit(words: &mut [u64], idx: usize, value: bool) {
+    let shift = idx % u64::BITS as usize;
+    let mask = 1u64 << shift;
+    let word = &mut words[idx / u64::BITS as usize];
+    *word = (*word & !mask) | (u64::from(value) << shift);
+}
diff --git a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
index df136755a7d..4ed9de913ec 100644
--- a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs
@@ -78,7 +78,7 @@ where
 /// `[chunk_start, chunk_start + block.len())`, starting from `cursor` and returning the
 /// advanced cursor. Sorted indices mean the cursor only moves forward across the walk.
 #[inline]
-pub(super) fn splice_patches<T, I>(
+fn splice_patches<T, I>(
     block: &mut [T],
     chunk_start: usize,
     mut cursor: usize,

From 83c90b90279efb7582b0727717597d6c09d552bc Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 16:12:40 +0100
Subject: [PATCH 09/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/bitpacking/compute/compare_fused.rs   | 31 +++++++------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
index 5caba4118e7..b8d8cb58f5e 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -19,11 +19,9 @@
 //! output buffer is over-allocated to whole 1024-bit blocks, so every block - the sliced first
 //! block, the body, and the trailing partial - untransposes straight into a 64-bit-word-aligned
 //! slot with no per-block temporary and only one shared scratch `[u64; 16]`. The leading `offset`
-//! garbage rows and the trailing padding are trimmed afterwards: leading whole `u64` words are
-//! dropped with [`BufferMut::split_off`] (an O(1) re-slice) and the residual `offset % 64` bits plus
-//! `len` form the final [`BitBufferMut`] view. The result is therefore byte-aligned (offset 0) when
-//! `offset % 8 == 0`. Inline patches are spliced in afterwards by overwriting the bits at the
-//! patched indices with `cmp(patch_value, rhs)`.
+//! garbage rows are represented as the final [`BitBuffer`] bit offset, which naturally handles
+//! sub-byte slices without copy-aligning. Inline patches are spliced in afterwards by overwriting
+//! the bits at the patched indices with `cmp(patch_value, rhs)`.
 
 use fastlanes::BitPacking;
 use fastlanes::BitPackingCompare;
@@ -51,8 +49,9 @@ use crate::BitPackedArrayExt;
 use crate::unpack_iter::BitPacked as BitPackedIter;
 
 const CHUNK_SIZE: usize = 1024;
+const U64_BITS: usize = u64::BITS as usize;
 /// `u64` words spanning one FastLanes block (1024 bits / 64).
-const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize;
+const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS;
 
 /// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and
 /// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`.
@@ -129,15 +128,14 @@ where
         let mut transposed = [0u64; WORDS_PER_CHUNK];
         chunks.for_each_packed_chunk(|packed_chunk, range| {
             // Block starts are always 1024-aligned (padded coords), so the slot is a full block.
-            let out = words[range.start / u64::BITS as usize..]
+            let out = words[range.start / U64_BITS..]
                 .first_chunk_mut::<WORDS_PER_CHUNK>()
                 .vortex_expect("over-allocated buffer holds a full block per chunk");
             unpack_cmp_block::<T, _>(&mut transposed, out, bit_width, packed_chunk, cmp, rhs);
         });
 
         // Patched indices hold placeholder packed values, so their fused result is meaningless;
-        // overwrite each with the comparison against the real patch value. Patch positions are
-        // logical (`global - p_off`); shift by `offset` into padded coordinates.
+        // overwrite each with the comparison against the real patch value.
         if let Some(p) = array.patches() {
             let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
             let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
@@ -153,16 +151,9 @@ where
         }
     }
 
-    // Trim the leading garbage: drop whole `u64` words covered entirely by the skipped `offset`
-    // region (an O(1) re-slice; `u64` granularity keeps the byte buffer 8-aligned). The residual
-    // `offset % 64` bits become the view offset, so the result is byte-aligned when `offset % 8 == 0`
-    // and offset 0 when `offset % 64 == 0`.
-    let head_words = offset / u64::BITS as usize;
-    let bit_offset = offset % u64::BITS as usize;
-    let bytes = words.split_off(head_words).into_byte_buffer();
-    let bits = BitBufferMut::from_buffer(bytes, bit_offset, len);
+    let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len).freeze();
     let validity = array.validity()?.union_nullability(nullability);
-    Ok(BoolArray::new(bits.freeze(), validity).into_array())
+    Ok(BoolArray::new(bits, validity).into_array())
 }
 
 /// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the
@@ -170,8 +161,8 @@ where
 /// target word through a single bounds-checked `&mut`.
 #[inline]
 fn set_bit(words: &mut [u64], idx: usize, value: bool) {
-    let shift = idx % u64::BITS as usize;
+    let shift = idx % U64_BITS;
     let mask = 1u64 << shift;
-    let word = &mut words[idx / u64::BITS as usize];
+    let word = &mut words[idx / U64_BITS];
     *word = (*word & !mask) | (u64::from(value) << shift);
 }

From 4d50ff1f2d6dddc1f667256e5a394c488f4792c2 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 16:17:53 +0100
Subject: [PATCH 10/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/bitpacking/compute/compare.rs         | 20 +++++---
 .../src/bitpacking/compute/compare_fused.rs   | 50 ++++++-------------
 2 files changed, 27 insertions(+), 43 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 4fbc4cf8b9d..0ddff6ed5a7 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -101,9 +101,10 @@ where
 
 #[cfg(test)]
 mod tests {
+    use std::sync::LazyLock;
+
     use rstest::rstest;
     use vortex_array::IntoArray;
-    use vortex_array::LEGACY_SESSION;
     use vortex_array::VortexSessionExecute;
     use vortex_array::arrays::BoolArray;
     use vortex_array::arrays::ConstantArray;
@@ -113,12 +114,17 @@ mod tests {
     use vortex_array::scalar_fn::fns::binary::CompareKernel;
     use vortex_array::scalar_fn::fns::operators::CompareOperator;
     use vortex_array::scalar_fn::fns::operators::Operator;
+    use vortex_array::session::ArraySession;
     use vortex_error::VortexResult;
+    use vortex_session::VortexSession;
 
     use crate::BitPacked;
     use crate::BitPackedArrayExt;
     use crate::BitPackedData;
 
+    static SESSION: LazyLock<VortexSession> =
+        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
     /// All six operators on a small in-range input.
     #[rstest]
     #[case(Operator::Eq, vec![false, false, false, true, false, false, true])]
@@ -128,7 +134,7 @@ mod tests {
     #[case(Operator::Gt, vec![false, false, false, false, true, true, false])]
     #[case(Operator::Gte, vec![false, false, false, true, true, true, true])]
     fn small(#[case] op: Operator, #[case] expected: Vec<bool>) {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut ctx = SESSION.create_execution_ctx();
         let values = PrimitiveArray::from_iter([0u32, 1, 2, 3, 4, 5, 3]);
         let packed = BitPackedData::encode(&values.into_array(), 3, &mut ctx).unwrap();
         let rhs = ConstantArray::new(3u32, packed.len()).into_array();
@@ -150,7 +156,7 @@ mod tests {
         ($name:ident, $T:ty, $($bw:expr),+) => {
             #[test]
             fn $name() -> VortexResult<()> {
-                let mut ctx = LEGACY_SESSION.create_execution_ctx();
+                let mut ctx = SESSION.create_execution_ctx();
                 for bw in [$($bw),+] {
                     let cap: u128 = 1u128 << bw;
                     let values: Vec<$T> = (0..2048u128).map(|i| (i % cap) as $T).collect();
@@ -191,7 +197,7 @@ mod tests {
     /// predicate runs.
     #[test]
     fn signed_with_patches_matches_primitive() -> VortexResult<()> {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut ctx = SESSION.create_execution_ctx();
         let values: Vec<i32> = (0..1500)
             .map(|i| if i % 73 == 0 { 100_000 + i } else { i % 100 })
             .collect();
@@ -225,7 +231,7 @@ mod tests {
         #[case] start: usize,
         #[case] slice_len: usize,
     ) -> VortexResult<()> {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut ctx = SESSION.create_execution_ctx();
         let values: Vec<u32> = (0..5000u32).map(|i| i % 128).collect();
         let prim = PrimitiveArray::from_iter(values);
         let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?;
@@ -260,7 +266,7 @@ mod tests {
     /// `Patches`, exercising the `offset + (global - p_off)` patch-position math.
     #[test]
     fn sliced_with_patches_matches_primitive() -> VortexResult<()> {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut ctx = SESSION.create_execution_ctx();
         let values: Vec<i32> = (0..4096)
             .map(|i| if i % 91 == 0 { 100_000 + i } else { i % 100 })
             .collect();
@@ -286,7 +292,7 @@ mod tests {
     /// Nullable input — the result must carry the array's validity.
     #[test]
     fn nullable_propagates_validity() -> VortexResult<()> {
-        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut ctx = SESSION.create_execution_ctx();
         let prim = PrimitiveArray::from_option_iter([Some(1u32), None, Some(3), Some(4), None]);
         let packed = BitPackedData::encode(&prim.clone().into_array(), 3, &mut ctx)?;
         let rhs = ConstantArray::new(3u32, packed.len()).into_array();
diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
index b8d8cb58f5e..1d7f8ed9dea 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -53,41 +53,6 @@ const U64_BITS: usize = u64::BITS as usize;
 /// `u64` words spanning one FastLanes block (1024 bits / 64).
 const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS;
 
-/// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and
-/// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`.
-///
-/// `transposed` is caller-owned scratch reused across every block, so the hot loop makes no
-/// per-call stack allocation; its prior contents are irrelevant (the kernel overwrites it).
-#[inline]
-fn unpack_cmp_block<T, F>(
-    transposed: &mut [u64; WORDS_PER_CHUNK],
-    out: &mut [u64; WORDS_PER_CHUNK],
-    bit_width: usize,
-    packed_chunk: &[<T as PhysicalPType>::Physical],
-    cmp: F,
-    rhs: T,
-) where
-    T: NativePType
-        + PhysicalPType
-        + FastLanesComparable<Bitpacked = <T as PhysicalPType>::Physical>,
-    <T as PhysicalPType>::Physical: BitPacking + BitPackingCompare,
-    F: Fn(T, T) -> bool,
-{
-    transposed.fill(0);
-    // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::<U>()` packed elements and
-    // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract.
-    unsafe {
-        <<T as PhysicalPType>::Physical as BitPackingCompare>::unchecked_unpack_cmp::<T, _>(
-            bit_width,
-            packed_chunk,
-            transposed,
-            cmp,
-            rhs,
-        );
-    }
-    untranspose_bits::<<T as PhysicalPType>::Physical>(transposed, out);
-}
-
 /// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes
 /// `unpack_cmp` kernel, producing a [`BoolArray`].
 ///
@@ -131,11 +96,24 @@ where
             let out = words[range.start / U64_BITS..]
                 .first_chunk_mut::<WORDS_PER_CHUNK>()
                 .vortex_expect("over-allocated buffer holds a full block per chunk");
-            unpack_cmp_block::<T, _>(&mut transposed, out, bit_width, packed_chunk, cmp, rhs);
+            // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::<U>()` packed
+            // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. The
+            // kernel assigns every word in `transposed`, so its previous contents are irrelevant.
+            unsafe {
+                <<T as PhysicalPType>::Physical as BitPackingCompare>::unchecked_unpack_cmp::<T, _>(
+                    bit_width,
+                    packed_chunk,
+                    &mut transposed,
+                    cmp,
+                    rhs,
+                );
+            }
+            untranspose_bits::<<T as PhysicalPType>::Physical>(&transposed, out);
         });
 
         // Patched indices hold placeholder packed values, so their fused result is meaningless;
         // overwrite each with the comparison against the real patch value.
+        // TODO(joe): apply patches per `packed_chunked`.
         if let Some(p) = array.patches() {
             let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
             let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;

From c117649906dfbf4a2326480998eb6b414d6482b2 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:05:58 +0100
Subject: [PATCH 11/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/bitpacking/compute/compare_fused.rs   | 50 ++++++++-----------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
index 1d7f8ed9dea..93002a04382 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs
@@ -110,37 +110,29 @@ where
             }
             untranspose_bits::<<T as PhysicalPType>::Physical>(&transposed, out);
         });
+    }
+
+    let mut bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len);
 
-        // Patched indices hold placeholder packed values, so their fused result is meaningless;
-        // overwrite each with the comparison against the real patch value.
-        // TODO(joe): apply patches per `packed_chunked`.
-        if let Some(p) = array.patches() {
-            let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
-            let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
-            let p_off = p.offset();
-            match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| {
-                let indices = p_idx.as_slice::<I>();
-                let values = p_val.as_slice::<T>();
-                for (&global, &value) in indices.iter().zip(values) {
-                    let global: usize = global.as_();
-                    set_bit(words, offset + global - p_off, cmp(value, rhs));
-                }
-            });
-        }
+    // Patched indices hold placeholder packed values, so their fused result is meaningless;
+    // overwrite each with the comparison against the real patch value.
+    // TODO(joe): apply patches per `packed_chunked`.
+    if let Some(p) = array.patches() {
+        let p_idx = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+        // TODO(joe): push down cmp??
+        let p_val = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+        let p_off = p.offset();
+        match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| {
+            let indices = p_idx.as_slice::<I>();
+            let values = p_val.as_slice::<T>();
+            for (&global, &value) in indices.iter().zip(values) {
+                let global: usize = global.as_();
+                let idx = global - p_off;
+                bits.set_to(idx, cmp(value, rhs))
+            }
+        });
     }
 
-    let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len).freeze();
     let validity = array.validity()?.union_nullability(nullability);
-    Ok(BoolArray::new(bits, validity).into_array())
-}
-
-/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the
-/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the
-/// target word through a single bounds-checked `&mut`.
-#[inline]
-fn set_bit(words: &mut [u64], idx: usize, value: bool) {
-    let shift = idx % U64_BITS;
-    let mask = 1u64 << shift;
-    let word = &mut words[idx / U64_BITS];
-    *word = (*word & !mask) | (u64::from(value) << shift);
+    Ok(BoolArray::new(bits.freeze(), validity).into_array())
 }

From 10e904cfebd44b9d1284dede32255ee084f2898b Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 17:07:45 +0100
Subject: [PATCH 12/14] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/fastlanes/src/bitpacking/compute/compare.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 0ddff6ed5a7..3c9a6d527d3 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -277,9 +277,14 @@ mod tests {
         let (start, end) = (700usize, 3500usize);
         let sliced = packed.into_array().slice(start..end)?;
         let rhs = ConstantArray::new(50i32, end - start).into_array();
-        let got = sliced
-            .binary(rhs.clone(), Operator::Eq)?
-            .execute::<BoolArray>(&mut ctx)?;
+        let got = <BitPacked as CompareKernel>::compare(
+            sliced.as_::<BitPacked>(),
+            &rhs,
+            CompareOperator::Eq,
+            &mut ctx,
+        )?
+        .expect("fused compare kernel must engage for sliced arrays with patches")
+        .execute::<BoolArray>(&mut ctx)?;
         let want = prim
             .into_array()
             .slice(start..end)?

From c495055ff9771c65e0dd6b920b82e4a10186d312 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 16:26:31 +0000
Subject: [PATCH 13/14] Fix sliced_with_patches compare test to obtain a
 BitPacked via SliceKernel

ArrayRef::slice on a patched BitPackedArray leaves a lazy SliceArray
(the buffer-free SliceReduce path bails when patches are present), so
as_::<BitPacked>() panicked before the fused compare kernel ran. Acquire
the sliced BitPacked through SliceKernel, which reads the buffers and
produces a sliced BitPacked with sliced patches, so the test exercises
the fused unpack_cmp + patch-fixup path it was written for.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/fastlanes/src/bitpacking/compute/compare.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 3c9a6d527d3..056e9840eef 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -275,7 +275,15 @@ mod tests {
         assert!(packed.patches().is_some(), "test setup expects patches");
 
         let (start, end) = (700usize, 3500usize);
-        let sliced = packed.into_array().slice(start..end)?;
+        // `ArrayRef::slice` leaves a lazy `SliceArray` over a patched `BitPacked` (the
+        // `SliceReduce` path bails when patches are present), so go through the `SliceKernel`,
+        // which reads the buffers and produces a sliced `BitPacked` with sliced patches.
+        let sliced = <BitPacked as vortex_array::arrays::slice::SliceKernel>::slice(
+            packed.as_view(),
+            start..end,
+            &mut ctx,
+        )?
+        .expect("slice kernel produces a sliced bitpacked array");
         let rhs = ConstantArray::new(50i32, end - start).into_array();
         let got = <BitPacked as CompareKernel>::compare(
             sliced.as_::<BitPacked>(),

From bd3fbaa2d5ba9d6602dcc7f901984fcb7ef8e1d9 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 4 Jun 2026 16:31:47 +0000
Subject: [PATCH 14/14] Import SliceKernel instead of absolute path to satisfy
 clippy::absolute-paths

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/fastlanes/src/bitpacking/compute/compare.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs
index 056e9840eef..5c9aa31c1ac 100644
--- a/encodings/fastlanes/src/bitpacking/compute/compare.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs
@@ -109,6 +109,7 @@ mod tests {
     use vortex_array::arrays::BoolArray;
     use vortex_array::arrays::ConstantArray;
     use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::slice::SliceKernel;
     use vortex_array::assert_arrays_eq;
     use vortex_array::builtins::ArrayBuiltins;
     use vortex_array::scalar_fn::fns::binary::CompareKernel;
@@ -278,12 +279,8 @@ mod tests {
         // `ArrayRef::slice` leaves a lazy `SliceArray` over a patched `BitPacked` (the
         // `SliceReduce` path bails when patches are present), so go through the `SliceKernel`,
         // which reads the buffers and produces a sliced `BitPacked` with sliced patches.
-        let sliced = <BitPacked as vortex_array::arrays::slice::SliceKernel>::slice(
-            packed.as_view(),
-            start..end,
-            &mut ctx,
-        )?
-        .expect("slice kernel produces a sliced bitpacked array");
+        let sliced = <BitPacked as SliceKernel>::slice(packed.as_view(), start..end, &mut ctx)?
+            .expect("slice kernel produces a sliced bitpacked array");
         let rhs = ConstantArray::new(50i32, end - start).into_array();
         let got = <BitPacked as CompareKernel>::compare(
             sliced.as_::<BitPacked>(),