From 10939a6117f1eaef46d7b88513dafcc0b0ac236a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:46:17 +0000 Subject: [PATCH 01/14] bench: sweep bit-packed compare across all int types and bit widths Add `bitpack_compare_sweep`, which exercises the public `array.binary(rhs, op)` compare-against-constant path over all eight integer types and every valid bit width (64Ki in-range elements per case, no patches). It isolates the `` unpack + per-element compare kernel so a kernel change shows up as a CodSpeed diff. Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 4 + .../benches/bitpack_compare_sweep.rs | 111 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 encodings/fastlanes/benches/bitpack_compare_sweep.rs diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 08c96c481d7..e0aa5cbb724 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -64,6 +64,10 @@ required-features = ["_test-harness"] name = "bitpack_compare" harness = false +[[bench]] +name = "bitpack_compare_sweep" +harness = false + [[bench]] name = "cast_bitpacked" harness = false diff --git a/encodings/fastlanes/benches/bitpack_compare_sweep.rs b/encodings/fastlanes/benches/bitpack_compare_sweep.rs new file mode 100644 index 00000000000..3eb0ba3b9a2 --- /dev/null +++ b/encodings/fastlanes/benches/bitpack_compare_sweep.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sweeps the public `BitPackedArray` compare-against-constant path (`array.binary(rhs, op)`) over +//! every integer type and every valid bit width, so a kernel change shows up as a CodSpeed diff. +//! +//! The array holds in-range values (no patches, no out-of-range fast path), so each iteration runs +//! the full unpack + per-element compare kernel that backs ``. +//! +//! Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare_sweep`. + +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use divan::Bencher; +use divan::counter::ItemsCount; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::NativePType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedData; + +fn main() { + divan::main(); +} + +/// Number of elements per benchmarked array (64 full FastLanes blocks). +const LEN: usize = 64 * 1024; + +/// Operator under test. `Lt` exercises the full unpack + per-element comparison path. +const OP: Operator = Operator::Lt; + +/// Integer types we can build packed arrays for in the benchmark. +trait BenchInt: NativePType + Copy + Into { + /// Build an in-range value from a small counter. + fn from_counter(v: u64) -> Self; +} + +macro_rules! impl_bench_int { + ($($T:ty),+) => { + $(impl BenchInt for $T { + #[inline] + fn from_counter(v: u64) -> Self { + v as $T + } + })+ + }; +} + +impl_bench_int!(u8, u16, u32, u64, i8, i16, i32, i64); + +/// Encode `LEN` in-range values of type `T` at the given bit width, returning the packed array, a +/// mid-range constant to compare against, and an execution context. +fn setup(width: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let cap = 1u64 << width; + let buf: BufferMut = (0..LEN) + .map(|i| T::from_counter((i as u64) % cap)) + .collect(); + let array = BitPackedData::encode( + &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(), + width as u8, + &mut ctx, + ) + .unwrap() + .into_array(); + let rhs = ConstantArray::new(T::from_counter(cap / 2), LEN).into_array(); + (array, rhs, ctx) +} + +/// Generate a compare benchmark over every valid bit width for one type. Valid widths are +/// `1..native_bits` - bit-packing requires the target width to be strictly narrower than the type. +macro_rules! bench_type { + ($modname:ident, $T:ty, $native_bits:expr) => { + mod $modname { + use super::*; + + #[divan::bench(args = 1..$native_bits)] + fn compare(bencher: Bencher, width: usize) { + let (array, rhs, mut ctx) = setup::<$T>(width); + bencher.counter(ItemsCount::new(LEN)).bench_local(|| { + array + .clone() + .binary(rhs.clone(), OP) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); + } + } + }; +} + +bench_type!(u8, u8, 8usize); +bench_type!(u16, u16, 16usize); +bench_type!(u32, u32, 32usize); +bench_type!(u64, u64, 64usize); +bench_type!(i8, i8, 8usize); +bench_type!(i16, i16, 16usize); +bench_type!(i32, i32, 32usize); +bench_type!(i64, i64, 64usize); From 48da899354e048d5e0d73bce36c2be6afc30716b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:47:23 +0000 Subject: [PATCH 02/14] perf(fastlanes): fuse bit-packed compare into a transposed mask + untranspose Replace the unpack-then-compare streaming kernel for compare-against-constant with the FastLanes fused `unpack_cmp`: compare each value as it is unpacked, accumulating results straight into a transposed 1024-bit mask (`[u64; 16]`, one register-resident word per lane - no `[bool; 1024]`/`[T; 1024]` scratch), then a single SIMD `untranspose_bits` per block rotates the mask into logical row order, copied directly into the output bit buffer. Inline patches are spliced in afterwards; sliced (offset != 0) arrays fall back to the scalar streaming predicate. This requires the in-development FastLanes (PR #141 fused mask + PR #145 width-generic BMI2/VBMI untranspose), pinned via a git patch until released. Benchmarked end-to-end through the public compare path (`bitpack_compare_sweep`, 64Ki elements, all integer types and bit widths): fused beats the streaming baseline for every type and width - i8/u8 ~6.2-7.7x i16/u16 ~4.5-6.0x i32/u32 ~1.9-4.3x i64/u64 ~1.2-1.9x Signed-off-by: Joe Isaacs --- Cargo.lock | 10 +- Cargo.toml | 5 + .../src/bitpacking/compute/compare.rs | 42 ++++-- .../src/bitpacking/compute/compare_fused.rs | 136 ++++++++++++++++++ .../fastlanes/src/bitpacking/compute/mod.rs | 1 + 5 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 encodings/fastlanes/src/bitpacking/compute/compare_fused.rs diff --git a/Cargo.lock b/Cargo.lock index ec89bf1161f..1dd83daba52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1645,6 +1645,12 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core_detect" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48" + [[package]] name = "cpubits" version = "0.1.1" @@ -3146,11 +3152,11 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414cb755aee48ff7b0907995d2949c68c8c17900970076dff6a808e18e592d71" +source = "git+https://github.com/spiraldb/fastlanes?rev=6c10ea72cf693a17e994aa6401604ebedbeda453#6c10ea72cf693a17e994aa6401604ebedbeda453" dependencies = [ "arrayref", "const_for", + "core_detect", "num-traits", "paste", "seq-macro", diff --git a/Cargo.toml b/Cargo.toml index 9700d8d78ed..8dcbfadfe08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -412,3 +412,8 @@ debug = false debug-assertions = false strip = "debuginfo" incremental = false + +# Pin to the in-development FastLanes branch (PR #141 fused [u64;16] cmp mask + +# PR #145 width-generic BMI/VBMI untranspose) until a release is cut. +[patch.crates-io] +fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" } diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index f5c5c81c5cb..4e6755b9c36 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -8,6 +8,9 @@ //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched //! indices with `predicate(patch_value)`. +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -20,7 +23,8 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::BitPacked; -use crate::bitpacking::compute::stream_predicate::stream_predicate; +use crate::bitpacking::compute::compare_fused::stream_compare_fused; +use crate::unpack_iter::BitPacked as BitPackedIter; impl CompareKernel for BitPacked { fn compare( @@ -55,6 +59,15 @@ impl CompareKernel for BitPacked { } } +/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. +/// +/// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between +/// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. +/// +/// The fused kernel (compare straight into a transposed 1024-bit mask, then a single SIMD +/// untranspose into logical row order) beats the unpack-then-compare streaming baseline for every +/// integer type and bit width - see `benches/bitpack_compare_fused.rs` (~6-7x for 8-bit lanes +/// down to ~1.2-1.9x for 64-bit lanes), so it is used unconditionally. fn compare_constant_typed( lhs: ArrayView<'_, BitPacked>, rhs: T, @@ -63,19 +76,28 @@ fn compare_constant_typed( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + Copy + crate::unpack_iter::BitPacked, + T: NativePType + BitPackedIter + FastLanesComparable, + ::Bitpacked: BitPacking + NativePType + BitPackingCompare, { - // `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive - // between kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. match operator { - CompareOperator::Eq => stream_predicate::(lhs, nullability, |v| v.is_eq(rhs), ctx), + CompareOperator::Eq => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) + } CompareOperator::NotEq => { - stream_predicate::(lhs, nullability, |v| !v.is_eq(rhs), ctx) + stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) + } + CompareOperator::Lt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) + } + CompareOperator::Lte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) + } + CompareOperator::Gt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx) + } + CompareOperator::Gte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) } - CompareOperator::Lt => stream_predicate::(lhs, nullability, |v| v.is_lt(rhs), ctx), - CompareOperator::Lte => stream_predicate::(lhs, nullability, |v| v.is_le(rhs), ctx), - CompareOperator::Gt => stream_predicate::(lhs, nullability, |v| v.is_gt(rhs), ctx), - CompareOperator::Gte => stream_predicate::(lhs, nullability, |v| v.is_ge(rhs), ctx), } } diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs new file mode 100644 index 00000000000..053b63e6b3b --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Fused compare kernel for [`BitPackedArray`] against a constant. +//! +//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch +//! buffer and *then* folds a predicate over it, this path hands the comparison down into the +//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against +//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit +//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no +//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates +//! that mask into logical row order, which is copied directly into the output bit buffer. +//! +//! Only the full-chunk fast path uses the fused kernel. Sliced arrays (non-zero block offset) fall +//! back to the scalar streaming predicate, and inline patches are spliced in afterwards by +//! overwriting the bits at the patched indices with `cmp(patch_value, rhs)`. + +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; +use fastlanes::untranspose_bits; +use num_traits::AsPrimitive; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_unsigned_integer_ptype; +use vortex_buffer::BitBufferMut; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; + +use super::stream_predicate::stream_predicate; +use crate::BitPacked; +use crate::BitPackedArrayExt; +use crate::unpack_iter::BitPacked as BitPackedIter; + +const CHUNK_SIZE: usize = 1024; +/// `u64` words spanning one FastLanes block (1024 bits / 64). +const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize; + +/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes +/// `unpack_cmp` kernel, producing a [`BoolArray`]. +/// +/// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the +/// requested operator (e.g. `|a, b| a.is_lt(b)`). +pub(super) fn stream_compare_fused( + array: ArrayView<'_, BitPacked>, + rhs: T, + nullability: Nullability, + cmp: F, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType + BitPackedIter + FastLanesComparable, + ::Bitpacked: BitPacking + NativePType + BitPackingCompare, + F: Fn(T, T) -> bool + Copy, +{ + let len = array.len(); + let bit_width = BitPackedArrayExt::bit_width(&array) as usize; + let offset = BitPackedArrayExt::offset(&array) as usize; + + // The fused kernel consumes whole 1024-element blocks at a fixed packed width. A non-zero + // block offset (from slicing) or a degenerate width has no clean full-chunk form, so defer + // to the scalar streaming predicate, which handles every layout. + if offset != 0 || len == 0 || bit_width == 0 { + return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); + } + + let packed = BitPackedArrayExt::packed_slice::<::Bitpacked>(&array); + let elems_per_chunk = 128 * bit_width / size_of::<::Bitpacked>(); + let num_chunks = len.div_ceil(CHUNK_SIZE); + + let mut words: BufferMut = BufferMut::zeroed(len.div_ceil(u64::BITS as usize)); + { + let words = words.as_mut_slice(); + // Per block: fuse compare into a transposed 1024-bit mask, then untranspose into logical + // row order. The packed buffer is zero-padded out to a whole final block, so every chunk - + // including the trailing partial one - has exactly `elems_per_chunk` packed values; we just + // copy fewer than 16 words out of the last block's untransposed mask. + let mut transposed = [0u64; WORDS_PER_CHUNK]; + let mut logical = [0u64; WORDS_PER_CHUNK]; + for chunk in 0..num_chunks { + let packed_chunk = &packed[chunk * elems_per_chunk..][..elems_per_chunk]; + // SAFETY: `packed_chunk` is exactly `128 * bit_width / size_of::()` elements and + // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. + unsafe { + <::Bitpacked as BitPackingCompare>::unchecked_unpack_cmp::< + T, + _, + >(bit_width, packed_chunk, &mut transposed, cmp, rhs); + } + untranspose_bits::<::Bitpacked>(&transposed, &mut logical); + + let block_start = chunk * CHUNK_SIZE; + let block_bits = (len - block_start).min(CHUNK_SIZE); + let word_off = chunk * WORDS_PER_CHUNK; + let n_words = block_bits.div_ceil(u64::BITS as usize); + words[word_off..][..n_words].copy_from_slice(&logical[..n_words]); + } + + // Patched indices hold placeholder packed values, so their fused result is meaningless; + // overwrite each with the comparison against the real patch value. + if let Some(p) = array.patches() { + let p_idx = p.indices().clone().execute::(ctx)?; + let p_val = p.values().clone().execute::(ctx)?; + let p_off = p.offset(); + match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { + let indices = p_idx.as_slice::(); + let values = p_val.as_slice::(); + for (&global, &value) in indices.iter().zip(values) { + let global: usize = global.as_(); + set_bit(words, global - p_off, cmp(value, rhs)); + } + }); + } + } + + let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len); + let validity = array.validity()?.union_nullability(nullability); + Ok(BoolArray::new(bits.freeze(), validity).into_array()) +} + +/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the +/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the +/// target word through a single bounds-checked `&mut`. +#[inline] +fn set_bit(words: &mut [u64], idx: usize, value: bool) { + let shift = idx % u64::BITS as usize; + let mask = 1u64 << shift; + let word = &mut words[idx / u64::BITS as usize]; + *word = (*word & !mask) | (u64::from(value) << shift); +} diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs index 518f8319eb1..06a4b4597b0 100644 --- a/encodings/fastlanes/src/bitpacking/compute/mod.rs +++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs @@ -4,6 +4,7 @@ mod between; mod cast; mod compare; +mod compare_fused; mod filter; pub(crate) mod is_constant; mod slice; From 08ed4a4a033adf2558850ab904deb130de5a3197 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 4 Jun 2026 10:31:27 +0000 Subject: [PATCH 03/14] ci(wasm): pin in-development FastLanes in the excluded wasm-test workspace wasm-test is excluded from the workspace, so it does not inherit the root [patch.crates-io] and was building vortex-fastlanes against published fastlanes 0.5.0 (old `[bool;1024]` unpack_cmp, no `untranspose_bits`) -> compile error in compare_fused.rs. Add the matching git `rev` pin here. Temporary, like the root pin: both are removed when a FastLanes release is cut and the version is bumped. Signed-off-by: Joe Isaacs --- wasm-test/Cargo.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/wasm-test/Cargo.toml b/wasm-test/Cargo.toml index 100c15970b0..20cf5f26cfd 100644 --- a/wasm-test/Cargo.toml +++ b/wasm-test/Cargo.toml @@ -16,3 +16,8 @@ vortex = { path = "../vortex", default-features = false } inherits = "dev" debug = "line-tables-only" incremental = false + +# wasm-test is excluded from the workspace, so it does not inherit the root +# [patch.crates-io]; pin the in-development FastLanes here too until a release is cut. +[patch.crates-io] +fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" } From 816032b92ff34c6018a19f0b2ef329348a5dfe3c Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 14:00:22 +0100 Subject: [PATCH 04/14] wip Signed-off-by: Joe Isaacs --- Cargo.toml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dbbde5bae05..cc5201a0ff7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,7 +152,7 @@ dirs = "6.0.0" divan = { package = "codspeed-divan-compat", version = "4.0.4" } enum-iterator = "2.0.0" env_logger = "0.11" -fastlanes = "0.5" +fastlanes = "0.5.1" flatbuffers = "25.2.10" fsst-rs = "0.5.11" futures = { version = "0.3.31", default-features = false } @@ -412,8 +412,3 @@ debug = false debug-assertions = false strip = "debuginfo" incremental = false - -# Pin to the in-development FastLanes branch (PR #141 fused [u64;16] cmp mask + -# PR #145 width-generic BMI/VBMI untranspose) until a release is cut. -[patch.crates-io] -fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" } From e4dd660d74661eddf4eb59e7c53821b0cc83882a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 14:05:05 +0100 Subject: [PATCH 05/14] wip Signed-off-by: Joe Isaacs --- wasm-test/Cargo.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/wasm-test/Cargo.toml b/wasm-test/Cargo.toml index 20cf5f26cfd..100c15970b0 100644 --- a/wasm-test/Cargo.toml +++ b/wasm-test/Cargo.toml @@ -16,8 +16,3 @@ vortex = { path = "../vortex", default-features = false } inherits = "dev" debug = "line-tables-only" incremental = false - -# wasm-test is excluded from the workspace, so it does not inherit the root -# [patch.crates-io]; pin the in-development FastLanes here too until a release is cut. -[patch.crates-io] -fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" } From 933ca0e19715055069c49edcff8f55359a9aa24a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 14:10:59 +0100 Subject: [PATCH 06/14] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b785bb672f..6833328367d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3151,8 +3151,9 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" -version = "0.5.0" -source = "git+https://github.com/spiraldb/fastlanes?rev=6c10ea72cf693a17e994aa6401604ebedbeda453#6c10ea72cf693a17e994aa6401604ebedbeda453" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20c597e23b8ec8506f589d18bc701ca83a3def6086748f628ad23092e1dfe577" dependencies = [ "arrayref", "const_for", From 211903c0110bf243b01a07f1075de4eff8fa2a4e Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 14:11:32 +0100 Subject: [PATCH 07/14] wip Signed-off-by: Joe Isaacs --- .../src/bitpacking/compute/compare.rs | 13 +- .../src/bitpacking/compute/compare_fused.rs | 118 +++++------------- .../bitpacking/compute/stream_predicate.rs | 2 +- 3 files changed, 33 insertions(+), 100 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 4e6755b9c36..8f7bf775f37 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -8,9 +8,6 @@ //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched //! indices with `predicate(patch_value)`. -use fastlanes::BitPacking; -use fastlanes::BitPackingCompare; -use fastlanes::FastLanesComparable; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -59,15 +56,10 @@ impl CompareKernel for BitPacked { } } -/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. +/// Compare every value against the constant by streaming the regular FastLanes unpack iterator. /// /// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between /// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. -/// -/// The fused kernel (compare straight into a transposed 1024-bit mask, then a single SIMD -/// untranspose into logical row order) beats the unpack-then-compare streaming baseline for every -/// integer type and bit width - see `benches/bitpack_compare_fused.rs` (~6-7x for 8-bit lanes -/// down to ~1.2-1.9x for 64-bit lanes), so it is used unconditionally. fn compare_constant_typed( lhs: ArrayView<'_, BitPacked>, rhs: T, @@ -76,8 +68,7 @@ fn compare_constant_typed( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + BitPackedIter + FastLanesComparable, - ::Bitpacked: BitPacking + NativePType + BitPackingCompare, + T: NativePType + BitPackedIter + Copy, { match operator { CompareOperator::Eq => { diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index 053b63e6b3b..0a5146adb6a 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -1,25 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Fused compare kernel for [`BitPackedArray`] against a constant. +//! Block-streaming compare kernel for [`BitPackedArray`] against a constant. //! -//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch -//! buffer and *then* folds a predicate over it, this path hands the comparison down into the -//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against -//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit -//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no -//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates -//! that mask into logical row order, which is copied directly into the output bit buffer. -//! -//! Only the full-chunk fast path uses the fused kernel. Sliced arrays (non-zero block offset) fall -//! back to the scalar streaming predicate, and inline patches are spliced in afterwards by -//! overwriting the bits at the patched indices with `cmp(patch_value, rhs)`. +//! Walks the encoded array one 1024-element FastLanes block at a time through the regular +//! [`crate::unpack_iter::BitUnpackedChunks`] iterator (the same path used by decompress, +//! `is_constant`, and `stream_predicate`), splices any [`crate::patches::Patches`] into the +//! unpacked block in place, then folds `cmp(value, rhs)` over the block into a [`BitBuffer`]. +//! The materialised primitive never appears: each block reuses a single scratch buffer and the +//! per-element bool is packed straight into the output words. -use fastlanes::BitPacking; -use fastlanes::BitPackingCompare; -use fastlanes::FastLanesComparable; -use fastlanes::untranspose_bits; -use num_traits::AsPrimitive; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -31,19 +21,16 @@ use vortex_array::dtype::Nullability; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; +use vortex_buffer::pack_bools_into_words; use vortex_error::VortexResult; -use super::stream_predicate::stream_predicate; +use super::stream_predicate::splice_patches; use crate::BitPacked; use crate::BitPackedArrayExt; use crate::unpack_iter::BitPacked as BitPackedIter; -const CHUNK_SIZE: usize = 1024; -/// `u64` words spanning one FastLanes block (1024 bits / 64). -const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize; - -/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes -/// `unpack_cmp` kernel, producing a [`BoolArray`]. +/// Compare the unpacked values of a [`BitPackedArray`] against `rhs`, one FastLanes block at a +/// time, producing a [`BoolArray`]. /// /// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the /// requested operator (e.g. `|a, b| a.is_lt(b)`). @@ -55,66 +42,32 @@ pub(super) fn stream_compare_fused( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + BitPackedIter + FastLanesComparable, - ::Bitpacked: BitPacking + NativePType + BitPackingCompare, - F: Fn(T, T) -> bool + Copy, + T: NativePType + BitPackedIter + Copy, + F: Fn(T, T) -> bool, { let len = array.len(); - let bit_width = BitPackedArrayExt::bit_width(&array) as usize; - let offset = BitPackedArrayExt::offset(&array) as usize; - - // The fused kernel consumes whole 1024-element blocks at a fixed packed width. A non-zero - // block offset (from slicing) or a degenerate width has no clean full-chunk form, so defer - // to the scalar streaming predicate, which handles every layout. - if offset != 0 || len == 0 || bit_width == 0 { - return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); - } - - let packed = BitPackedArrayExt::packed_slice::<::Bitpacked>(&array); - let elems_per_chunk = 128 * bit_width / size_of::<::Bitpacked>(); - let num_chunks = len.div_ceil(CHUNK_SIZE); - let mut words: BufferMut = BufferMut::zeroed(len.div_ceil(u64::BITS as usize)); - { - let words = words.as_mut_slice(); - // Per block: fuse compare into a transposed 1024-bit mask, then untranspose into logical - // row order. The packed buffer is zero-padded out to a whole final block, so every chunk - - // including the trailing partial one - has exactly `elems_per_chunk` packed values; we just - // copy fewer than 16 words out of the last block's untransposed mask. - let mut transposed = [0u64; WORDS_PER_CHUNK]; - let mut logical = [0u64; WORDS_PER_CHUNK]; - for chunk in 0..num_chunks { - let packed_chunk = &packed[chunk * elems_per_chunk..][..elems_per_chunk]; - // SAFETY: `packed_chunk` is exactly `128 * bit_width / size_of::()` elements and - // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. - unsafe { - <::Bitpacked as BitPackingCompare>::unchecked_unpack_cmp::< - T, - _, - >(bit_width, packed_chunk, &mut transposed, cmp, rhs); - } - untranspose_bits::<::Bitpacked>(&transposed, &mut logical); - let block_start = chunk * CHUNK_SIZE; - let block_bits = (len - block_start).min(CHUNK_SIZE); - let word_off = chunk * WORDS_PER_CHUNK; - let n_words = block_bits.div_ceil(u64::BITS as usize); - words[word_off..][..n_words].copy_from_slice(&logical[..n_words]); - } + if len > 0 { + let mut chunks = array.unpacked_chunks::()?; + let words = words.as_mut_slice(); - // Patched indices hold placeholder packed values, so their fused result is meaningless; - // overwrite each with the comparison against the real patch value. if let Some(p) = array.patches() { - let p_idx = p.indices().clone().execute::(ctx)?; - let p_val = p.values().clone().execute::(ctx)?; + let p_idx_arr = p.indices().clone().execute::(ctx)?; + let p_val_arr = p.values().clone().execute::(ctx)?; let p_off = p.offset(); - match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { - let indices = p_idx.as_slice::(); - let values = p_val.as_slice::(); - for (&global, &value) in indices.iter().zip(values) { - let global: usize = global.as_(); - set_bit(words, global - p_off, cmp(value, rhs)); - } + match_each_unsigned_integer_ptype!(p_idx_arr.ptype(), |I| { + let p_idx = p_idx_arr.as_slice::(); + let p_val = p_val_arr.as_slice::(); + let mut p_cur: usize = 0; + chunks.for_each_unpacked_chunk(|block, range| { + p_cur = splice_patches::(block, range.start, p_cur, p_idx, p_val, p_off); + pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs)); + }); + }); + } else { + chunks.for_each_unpacked_chunk(|block, range| { + pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs)); }); } } @@ -123,14 +76,3 @@ where let validity = array.validity()?.union_nullability(nullability); Ok(BoolArray::new(bits.freeze(), validity).into_array()) } - -/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the -/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the -/// target word through a single bounds-checked `&mut`. -#[inline] -fn set_bit(words: &mut [u64], idx: usize, value: bool) { - let shift = idx % u64::BITS as usize; - let mask = 1u64 << shift; - let word = &mut words[idx / u64::BITS as usize]; - *word = (*word & !mask) | (u64::from(value) << shift); -} diff --git a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs index 4ed9de913ec..df136755a7d 100644 --- a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs +++ b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs @@ -78,7 +78,7 @@ where /// `[chunk_start, chunk_start + block.len())`, starting from `cursor` and returning the /// advanced cursor. Sorted indices mean the cursor only moves forward across the walk. #[inline] -fn splice_patches( +pub(super) fn splice_patches( block: &mut [T], chunk_start: usize, mut cursor: usize, From 0649f03e45a6a99d73d1306499c06c153ca43027 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 15:40:04 +0100 Subject: [PATCH 08/14] wip Signed-off-by: Joe Isaacs --- .../src/bitpacking/array/unpack_iter.rs | 28 +++ .../src/bitpacking/compute/compare.rs | 83 ++++++++- .../src/bitpacking/compute/compare_fused.rs | 161 ++++++++++++++---- .../bitpacking/compute/stream_predicate.rs | 2 +- 4 files changed, 240 insertions(+), 34 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index 5e0673186e3..c9158d8bdac 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -259,6 +259,34 @@ impl> UnpackedChunks { debug_assert_eq!(local_idx, self.len); } + /// Walk every *packed* chunk in array order, yielding the raw packed FastLanes block and the + /// padded bit range it covers, without unpacking it. + /// + /// Unlike [`Self::for_each_unpacked_chunk`], this does not fill the scratch buffer: it hands + /// the still-packed block to the callback so fused kernels (e.g. compare) can unpack and + /// consume it in a single pass. Each yielded block holds exactly `elems_per_chunk` packed + /// values (the buffer is zero-padded out to a whole final chunk). + /// + /// The yielded range is in *padded* coordinates: block `c` covers + /// `[c * 1024, min((c + 1) * 1024, offset + len))`, so it includes the leading `offset` rows + /// that slicing skips. Block starts are therefore always 1024-aligned regardless of `offset`. + /// Callers must account for the array's `offset` when mapping a block's rows back to logical + /// output positions (e.g. by viewing the output buffer at a bit offset of `offset`). + pub(crate) fn for_each_packed_chunk(&self, mut f: F) + where + F: FnMut(&[T::Physical], Range), + { + let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed); + let elems_per_chunk = self.elems_per_chunk(); + let padded_len = self.offset + self.len; + for chunk in 0..self.num_chunks { + let packed_chunk = &packed_slice[chunk * elems_per_chunk..][..elems_per_chunk]; + let start = chunk * CHUNK_SIZE; + let end = (start + CHUNK_SIZE).min(padded_len); + f(packed_chunk, start..end); + } + } + /// Unpack full chunks into output range starting at the given index. fn decode_full_chunks_into_at( &mut self, diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 8f7bf775f37..4fbc4cf8b9d 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -8,11 +8,15 @@ //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched //! indices with `predicate(patch_value)`. +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; +use vortex_array::dtype::PhysicalPType; use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::binary::CompareKernel; use vortex_array::scalar_fn::fns::operators::CompareOperator; @@ -56,7 +60,7 @@ impl CompareKernel for BitPacked { } } -/// Compare every value against the constant by streaming the regular FastLanes unpack iterator. +/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. /// /// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between /// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. @@ -68,7 +72,10 @@ fn compare_constant_typed( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + BitPackedIter + Copy, + T: NativePType + + BitPackedIter + + FastLanesComparable::Physical>, + ::Physical: BitPacking + NativePType + BitPackingCompare, { match operator { CompareOperator::Eq => { @@ -204,6 +211,78 @@ mod tests { Ok(()) } + /// Sliced inputs: a non-zero block offset (and a length spanning several blocks) must still go + /// through the fused kernel and agree with the primitive fallback. Sweeps slice starts that + /// land both inside the first block and past it, with lengths that end mid-block and on a block + /// boundary. + #[rstest] + #[case(1, 4000)] // start mid-first-block, multi-block length + #[case(1023, 2)] // start at the last row of the first block + #[case(1024, 1024)] // start exactly on a block boundary, exactly one block long + #[case(1500, 1000)] // start mid-second-block + #[case(3, 1021)] // ends exactly on the first block boundary + fn sliced_matches_primitive( + #[case] start: usize, + #[case] slice_len: usize, + ) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = (0..5000u32).map(|i| i % 128).collect(); + let prim = PrimitiveArray::from_iter(values); + let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?; + + let sliced = packed.into_array().slice(start..start + slice_len)?; + let rhs = ConstantArray::new(50u32, slice_len).into_array(); + for op in [ + CompareOperator::Eq, + CompareOperator::Lt, + CompareOperator::Gte, + ] { + let got = ::compare( + sliced.as_::(), + &rhs, + op, + &mut ctx, + )? + .expect("fused compare kernel must engage for sliced arrays") + .execute::(&mut ctx)?; + let want = prim + .clone() + .into_array() + .slice(start..start + slice_len)? + .binary(rhs.clone(), Operator::from(op))? + .execute::(&mut ctx)?; + assert_arrays_eq!(got, want); + } + Ok(()) + } + + /// Sliced *and* patched: combine a non-zero offset with out-of-range values that land in + /// `Patches`, exercising the `offset + (global - p_off)` patch-position math. + #[test] + fn sliced_with_patches_matches_primitive() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = (0..4096) + .map(|i| if i % 91 == 0 { 100_000 + i } else { i % 100 }) + .collect(); + let prim = PrimitiveArray::from_iter(values); + let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?; + assert!(packed.patches().is_some(), "test setup expects patches"); + + let (start, end) = (700usize, 3500usize); + let sliced = packed.into_array().slice(start..end)?; + let rhs = ConstantArray::new(50i32, end - start).into_array(); + let got = sliced + .binary(rhs.clone(), Operator::Eq)? + .execute::(&mut ctx)?; + let want = prim + .into_array() + .slice(start..end)? + .binary(rhs, Operator::Eq)? + .execute::(&mut ctx)?; + assert_arrays_eq!(got, want); + Ok(()) + } + /// Nullable input — the result must carry the array's validity. #[test] fn nullable_propagates_validity() -> VortexResult<()> { diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index 0a5146adb6a..5caba4118e7 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -1,15 +1,35 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Block-streaming compare kernel for [`BitPackedArray`] against a constant. +//! Fused compare kernel for [`BitPackedArray`] against a constant. //! -//! Walks the encoded array one 1024-element FastLanes block at a time through the regular -//! [`crate::unpack_iter::BitUnpackedChunks`] iterator (the same path used by decompress, -//! `is_constant`, and `stream_predicate`), splices any [`crate::patches::Patches`] into the -//! unpacked block in place, then folds `cmp(value, rhs)` over the block into a [`BitBuffer`]. -//! The materialised primitive never appears: each block reuses a single scratch buffer and the -//! per-element bool is packed straight into the output words. +//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch +//! buffer and *then* folds a predicate over it, this path hands the comparison down into the +//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against +//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit +//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no +//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates +//! that mask into logical row order. +//! +//! The packed blocks are walked through the regular [`crate::unpack_iter::BitUnpackedChunks`] +//! iterator (via [`crate::unpack_iter::BitUnpackedChunks::for_each_packed_chunk`]) rather than a +//! bespoke chunk loop, so chunk sizing and bounds live in one place. +//! +//! Slicing is handled by working in *padded* coordinates: bit `offset + i` holds element `i`. The +//! output buffer is over-allocated to whole 1024-bit blocks, so every block - the sliced first +//! block, the body, and the trailing partial - untransposes straight into a 64-bit-word-aligned +//! slot with no per-block temporary and only one shared scratch `[u64; 16]`. The leading `offset` +//! garbage rows and the trailing padding are trimmed afterwards: leading whole `u64` words are +//! dropped with [`BufferMut::split_off`] (an O(1) re-slice) and the residual `offset % 64` bits plus +//! `len` form the final [`BitBufferMut`] view. The result is therefore byte-aligned (offset 0) when +//! `offset % 8 == 0`. Inline patches are spliced in afterwards by overwriting the bits at the +//! patched indices with `cmp(patch_value, rhs)`. +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; +use fastlanes::untranspose_bits; +use num_traits::AsPrimitive; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -18,19 +38,59 @@ use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; +use vortex_array::dtype::PhysicalPType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::BitBufferMut; use vortex_buffer::BufferMut; -use vortex_buffer::pack_bools_into_words; +use vortex_error::VortexExpect; use vortex_error::VortexResult; -use super::stream_predicate::splice_patches; +use super::stream_predicate::stream_predicate; use crate::BitPacked; use crate::BitPackedArrayExt; use crate::unpack_iter::BitPacked as BitPackedIter; -/// Compare the unpacked values of a [`BitPackedArray`] against `rhs`, one FastLanes block at a -/// time, producing a [`BoolArray`]. +const CHUNK_SIZE: usize = 1024; +/// `u64` words spanning one FastLanes block (1024 bits / 64). +const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize; + +/// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and +/// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`. +/// +/// `transposed` is caller-owned scratch reused across every block, so the hot loop makes no +/// per-call stack allocation; its prior contents are irrelevant (the kernel overwrites it). +#[inline] +fn unpack_cmp_block( + transposed: &mut [u64; WORDS_PER_CHUNK], + out: &mut [u64; WORDS_PER_CHUNK], + bit_width: usize, + packed_chunk: &[::Physical], + cmp: F, + rhs: T, +) where + T: NativePType + + PhysicalPType + + FastLanesComparable::Physical>, + ::Physical: BitPacking + BitPackingCompare, + F: Fn(T, T) -> bool, +{ + transposed.fill(0); + // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed elements and + // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. + unsafe { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + transposed, + cmp, + rhs, + ); + } + untranspose_bits::<::Physical>(transposed, out); +} + +/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes +/// `unpack_cmp` kernel, producing a [`BoolArray`]. /// /// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the /// requested operator (e.g. `|a, b| a.is_lt(b)`). @@ -42,37 +102,76 @@ pub(super) fn stream_compare_fused( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + BitPackedIter + Copy, - F: Fn(T, T) -> bool, + T: NativePType + + BitPackedIter + + FastLanesComparable::Physical>, + ::Physical: BitPacking + NativePType + BitPackingCompare, + F: Fn(T, T) -> bool + Copy, { let len = array.len(); - let mut words: BufferMut = BufferMut::zeroed(len.div_ceil(u64::BITS as usize)); + let bit_width = array.bit_width() as usize; + let offset = array.offset() as usize; + + // A degenerate width has no packed payload for the fused kernel to consume; defer to the scalar + // streaming predicate, which handles every layout (including the empty array). + if len == 0 || bit_width == 0 { + return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); + } - if len > 0 { - let mut chunks = array.unpacked_chunks::()?; + // Over-allocate to whole 1024-bit blocks in padded coordinates so every block - including the + // trailing partial - has room for a full untranspose at a 64-bit-word-aligned offset. + let num_chunks = (offset + len).div_ceil(CHUNK_SIZE); + let mut words: BufferMut = BufferMut::zeroed(num_chunks * WORDS_PER_CHUNK); + + let chunks = array.unpacked_chunks::()?; + { let words = words.as_mut_slice(); + let mut transposed = [0u64; WORDS_PER_CHUNK]; + chunks.for_each_packed_chunk(|packed_chunk, range| { + // Block starts are always 1024-aligned (padded coords), so the slot is a full block. + let out = words[range.start / u64::BITS as usize..] + .first_chunk_mut::() + .vortex_expect("over-allocated buffer holds a full block per chunk"); + unpack_cmp_block::(&mut transposed, out, bit_width, packed_chunk, cmp, rhs); + }); + // Patched indices hold placeholder packed values, so their fused result is meaningless; + // overwrite each with the comparison against the real patch value. Patch positions are + // logical (`global - p_off`); shift by `offset` into padded coordinates. if let Some(p) = array.patches() { - let p_idx_arr = p.indices().clone().execute::(ctx)?; - let p_val_arr = p.values().clone().execute::(ctx)?; + let p_idx = p.indices().clone().execute::(ctx)?; + let p_val = p.values().clone().execute::(ctx)?; let p_off = p.offset(); - match_each_unsigned_integer_ptype!(p_idx_arr.ptype(), |I| { - let p_idx = p_idx_arr.as_slice::(); - let p_val = p_val_arr.as_slice::(); - let mut p_cur: usize = 0; - chunks.for_each_unpacked_chunk(|block, range| { - p_cur = splice_patches::(block, range.start, p_cur, p_idx, p_val, p_off); - pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs)); - }); - }); - } else { - chunks.for_each_unpacked_chunk(|block, range| { - pack_bools_into_words(words, range.start, block.len(), |i| cmp(block[i], rhs)); + match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { + let indices = p_idx.as_slice::(); + let values = p_val.as_slice::(); + for (&global, &value) in indices.iter().zip(values) { + let global: usize = global.as_(); + set_bit(words, offset + global - p_off, cmp(value, rhs)); + } }); } } - let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len); + // Trim the leading garbage: drop whole `u64` words covered entirely by the skipped `offset` + // region (an O(1) re-slice; `u64` granularity keeps the byte buffer 8-aligned). The residual + // `offset % 64` bits become the view offset, so the result is byte-aligned when `offset % 8 == 0` + // and offset 0 when `offset % 64 == 0`. + let head_words = offset / u64::BITS as usize; + let bit_offset = offset % u64::BITS as usize; + let bytes = words.split_off(head_words).into_byte_buffer(); + let bits = BitBufferMut::from_buffer(bytes, bit_offset, len); let validity = array.validity()?.union_nullability(nullability); Ok(BoolArray::new(bits.freeze(), validity).into_array()) } + +/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the +/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the +/// target word through a single bounds-checked `&mut`. +#[inline] +fn set_bit(words: &mut [u64], idx: usize, value: bool) { + let shift = idx % u64::BITS as usize; + let mask = 1u64 << shift; + let word = &mut words[idx / u64::BITS as usize]; + *word = (*word & !mask) | (u64::from(value) << shift); +} diff --git a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs index df136755a7d..4ed9de913ec 100644 --- a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs +++ b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs @@ -78,7 +78,7 @@ where /// `[chunk_start, chunk_start + block.len())`, starting from `cursor` and returning the /// advanced cursor. Sorted indices mean the cursor only moves forward across the walk. #[inline] -pub(super) fn splice_patches( +fn splice_patches( block: &mut [T], chunk_start: usize, mut cursor: usize, From 83c90b90279efb7582b0727717597d6c09d552bc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 16:12:40 +0100 Subject: [PATCH 09/14] wip Signed-off-by: Joe Isaacs --- .../src/bitpacking/compute/compare_fused.rs | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index 5caba4118e7..b8d8cb58f5e 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -19,11 +19,9 @@ //! output buffer is over-allocated to whole 1024-bit blocks, so every block - the sliced first //! block, the body, and the trailing partial - untransposes straight into a 64-bit-word-aligned //! slot with no per-block temporary and only one shared scratch `[u64; 16]`. The leading `offset` -//! garbage rows and the trailing padding are trimmed afterwards: leading whole `u64` words are -//! dropped with [`BufferMut::split_off`] (an O(1) re-slice) and the residual `offset % 64` bits plus -//! `len` form the final [`BitBufferMut`] view. The result is therefore byte-aligned (offset 0) when -//! `offset % 8 == 0`. Inline patches are spliced in afterwards by overwriting the bits at the -//! patched indices with `cmp(patch_value, rhs)`. +//! garbage rows are represented as the final [`BitBuffer`] bit offset, which naturally handles +//! sub-byte slices without copy-aligning. Inline patches are spliced in afterwards by overwriting +//! the bits at the patched indices with `cmp(patch_value, rhs)`. use fastlanes::BitPacking; use fastlanes::BitPackingCompare; @@ -51,8 +49,9 @@ use crate::BitPackedArrayExt; use crate::unpack_iter::BitPacked as BitPackedIter; const CHUNK_SIZE: usize = 1024; +const U64_BITS: usize = u64::BITS as usize; /// `u64` words spanning one FastLanes block (1024 bits / 64). -const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize; +const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS; /// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and /// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`. @@ -129,15 +128,14 @@ where let mut transposed = [0u64; WORDS_PER_CHUNK]; chunks.for_each_packed_chunk(|packed_chunk, range| { // Block starts are always 1024-aligned (padded coords), so the slot is a full block. - let out = words[range.start / u64::BITS as usize..] + let out = words[range.start / U64_BITS..] .first_chunk_mut::() .vortex_expect("over-allocated buffer holds a full block per chunk"); unpack_cmp_block::(&mut transposed, out, bit_width, packed_chunk, cmp, rhs); }); // Patched indices hold placeholder packed values, so their fused result is meaningless; - // overwrite each with the comparison against the real patch value. Patch positions are - // logical (`global - p_off`); shift by `offset` into padded coordinates. + // overwrite each with the comparison against the real patch value. if let Some(p) = array.patches() { let p_idx = p.indices().clone().execute::(ctx)?; let p_val = p.values().clone().execute::(ctx)?; @@ -153,16 +151,9 @@ where } } - // Trim the leading garbage: drop whole `u64` words covered entirely by the skipped `offset` - // region (an O(1) re-slice; `u64` granularity keeps the byte buffer 8-aligned). The residual - // `offset % 64` bits become the view offset, so the result is byte-aligned when `offset % 8 == 0` - // and offset 0 when `offset % 64 == 0`. - let head_words = offset / u64::BITS as usize; - let bit_offset = offset % u64::BITS as usize; - let bytes = words.split_off(head_words).into_byte_buffer(); - let bits = BitBufferMut::from_buffer(bytes, bit_offset, len); + let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len).freeze(); let validity = array.validity()?.union_nullability(nullability); - Ok(BoolArray::new(bits.freeze(), validity).into_array()) + Ok(BoolArray::new(bits, validity).into_array()) } /// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the @@ -170,8 +161,8 @@ where /// target word through a single bounds-checked `&mut`. #[inline] fn set_bit(words: &mut [u64], idx: usize, value: bool) { - let shift = idx % u64::BITS as usize; + let shift = idx % U64_BITS; let mask = 1u64 << shift; - let word = &mut words[idx / u64::BITS as usize]; + let word = &mut words[idx / U64_BITS]; *word = (*word & !mask) | (u64::from(value) << shift); } From 4d50ff1f2d6dddc1f667256e5a394c488f4792c2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 16:17:53 +0100 Subject: [PATCH 10/14] wip Signed-off-by: Joe Isaacs --- .../src/bitpacking/compute/compare.rs | 20 +++++--- .../src/bitpacking/compute/compare_fused.rs | 50 ++++++------------- 2 files changed, 27 insertions(+), 43 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 4fbc4cf8b9d..0ddff6ed5a7 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -101,9 +101,10 @@ where #[cfg(test)] mod tests { + use std::sync::LazyLock; + use rstest::rstest; use vortex_array::IntoArray; - use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; @@ -113,12 +114,17 @@ mod tests { use vortex_array::scalar_fn::fns::binary::CompareKernel; use vortex_array::scalar_fn::fns::operators::CompareOperator; use vortex_array::scalar_fn::fns::operators::Operator; + use vortex_array::session::ArraySession; use vortex_error::VortexResult; + use vortex_session::VortexSession; use crate::BitPacked; use crate::BitPackedArrayExt; use crate::BitPackedData; + static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + /// All six operators on a small in-range input. #[rstest] #[case(Operator::Eq, vec![false, false, false, true, false, false, true])] @@ -128,7 +134,7 @@ mod tests { #[case(Operator::Gt, vec![false, false, false, false, true, true, false])] #[case(Operator::Gte, vec![false, false, false, true, true, true, true])] fn small(#[case] op: Operator, #[case] expected: Vec) { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let values = PrimitiveArray::from_iter([0u32, 1, 2, 3, 4, 5, 3]); let packed = BitPackedData::encode(&values.into_array(), 3, &mut ctx).unwrap(); let rhs = ConstantArray::new(3u32, packed.len()).into_array(); @@ -150,7 +156,7 @@ mod tests { ($name:ident, $T:ty, $($bw:expr),+) => { #[test] fn $name() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); for bw in [$($bw),+] { let cap: u128 = 1u128 << bw; let values: Vec<$T> = (0..2048u128).map(|i| (i % cap) as $T).collect(); @@ -191,7 +197,7 @@ mod tests { /// predicate runs. #[test] fn signed_with_patches_matches_primitive() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let values: Vec = (0..1500) .map(|i| if i % 73 == 0 { 100_000 + i } else { i % 100 }) .collect(); @@ -225,7 +231,7 @@ mod tests { #[case] start: usize, #[case] slice_len: usize, ) -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let values: Vec = (0..5000u32).map(|i| i % 128).collect(); let prim = PrimitiveArray::from_iter(values); let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?; @@ -260,7 +266,7 @@ mod tests { /// `Patches`, exercising the `offset + (global - p_off)` patch-position math. #[test] fn sliced_with_patches_matches_primitive() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let values: Vec = (0..4096) .map(|i| if i % 91 == 0 { 100_000 + i } else { i % 100 }) .collect(); @@ -286,7 +292,7 @@ mod tests { /// Nullable input — the result must carry the array's validity. #[test] fn nullable_propagates_validity() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let mut ctx = SESSION.create_execution_ctx(); let prim = PrimitiveArray::from_option_iter([Some(1u32), None, Some(3), Some(4), None]); let packed = BitPackedData::encode(&prim.clone().into_array(), 3, &mut ctx)?; let rhs = ConstantArray::new(3u32, packed.len()).into_array(); diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index b8d8cb58f5e..1d7f8ed9dea 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -53,41 +53,6 @@ const U64_BITS: usize = u64::BITS as usize; /// `u64` words spanning one FastLanes block (1024 bits / 64). const WORDS_PER_CHUNK: usize = CHUNK_SIZE / U64_BITS; -/// Unpack one packed FastLanes block, comparing each value against `rhs` *as it is unpacked*, and -/// write the resulting 1024-bit mask (logical row order, LSB-first) into `out`. -/// -/// `transposed` is caller-owned scratch reused across every block, so the hot loop makes no -/// per-call stack allocation; its prior contents are irrelevant (the kernel overwrites it). -#[inline] -fn unpack_cmp_block( - transposed: &mut [u64; WORDS_PER_CHUNK], - out: &mut [u64; WORDS_PER_CHUNK], - bit_width: usize, - packed_chunk: &[::Physical], - cmp: F, - rhs: T, -) where - T: NativePType - + PhysicalPType - + FastLanesComparable::Physical>, - ::Physical: BitPacking + BitPackingCompare, - F: Fn(T, T) -> bool, -{ - transposed.fill(0); - // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed elements and - // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. - unsafe { - <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( - bit_width, - packed_chunk, - transposed, - cmp, - rhs, - ); - } - untranspose_bits::<::Physical>(transposed, out); -} - /// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes /// `unpack_cmp` kernel, producing a [`BoolArray`]. /// @@ -131,11 +96,24 @@ where let out = words[range.start / U64_BITS..] .first_chunk_mut::() .vortex_expect("over-allocated buffer holds a full block per chunk"); - unpack_cmp_block::(&mut transposed, out, bit_width, packed_chunk, cmp, rhs); + // SAFETY: `packed_chunk` holds exactly `128 * bit_width / size_of::()` packed + // elements and `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. The + // kernel assigns every word in `transposed`, so its previous contents are irrelevant. + unsafe { + <::Physical as BitPackingCompare>::unchecked_unpack_cmp::( + bit_width, + packed_chunk, + &mut transposed, + cmp, + rhs, + ); + } + untranspose_bits::<::Physical>(&transposed, out); }); // Patched indices hold placeholder packed values, so their fused result is meaningless; // overwrite each with the comparison against the real patch value. + // TODO(joe): apply patches per `packed_chunked`. if let Some(p) = array.patches() { let p_idx = p.indices().clone().execute::(ctx)?; let p_val = p.values().clone().execute::(ctx)?; From c117649906dfbf4a2326480998eb6b414d6482b2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:05:58 +0100 Subject: [PATCH 11/14] wip Signed-off-by: Joe Isaacs --- .../src/bitpacking/compute/compare_fused.rs | 50 ++++++++----------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs index 1d7f8ed9dea..93002a04382 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -110,37 +110,29 @@ where } untranspose_bits::<::Physical>(&transposed, out); }); + } + + let mut bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len); - // Patched indices hold placeholder packed values, so their fused result is meaningless; - // overwrite each with the comparison against the real patch value. - // TODO(joe): apply patches per `packed_chunked`. - if let Some(p) = array.patches() { - let p_idx = p.indices().clone().execute::(ctx)?; - let p_val = p.values().clone().execute::(ctx)?; - let p_off = p.offset(); - match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { - let indices = p_idx.as_slice::(); - let values = p_val.as_slice::(); - for (&global, &value) in indices.iter().zip(values) { - let global: usize = global.as_(); - set_bit(words, offset + global - p_off, cmp(value, rhs)); - } - }); - } + // Patched indices hold placeholder packed values, so their fused result is meaningless; + // overwrite each with the comparison against the real patch value. + // TODO(joe): apply patches per `packed_chunked`. + if let Some(p) = array.patches() { + let p_idx = p.indices().clone().execute::(ctx)?; + // TODO(joe): push down cmp?? + let p_val = p.values().clone().execute::(ctx)?; + let p_off = p.offset(); + match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { + let indices = p_idx.as_slice::(); + let values = p_val.as_slice::(); + for (&global, &value) in indices.iter().zip(values) { + let global: usize = global.as_(); + let idx = global - p_off; + bits.set_to(idx, cmp(value, rhs)) + } + }); } - let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), offset, len).freeze(); let validity = array.validity()?.union_nullability(nullability); - Ok(BoolArray::new(bits, validity).into_array()) -} - -/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the -/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the -/// target word through a single bounds-checked `&mut`. -#[inline] -fn set_bit(words: &mut [u64], idx: usize, value: bool) { - let shift = idx % U64_BITS; - let mask = 1u64 << shift; - let word = &mut words[idx / U64_BITS]; - *word = (*word & !mask) | (u64::from(value) << shift); + Ok(BoolArray::new(bits.freeze(), validity).into_array()) } From 10e904cfebd44b9d1284dede32255ee084f2898b Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 17:07:45 +0100 Subject: [PATCH 12/14] wip Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/bitpacking/compute/compare.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 0ddff6ed5a7..3c9a6d527d3 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -277,9 +277,14 @@ mod tests { let (start, end) = (700usize, 3500usize); let sliced = packed.into_array().slice(start..end)?; let rhs = ConstantArray::new(50i32, end - start).into_array(); - let got = sliced - .binary(rhs.clone(), Operator::Eq)? - .execute::(&mut ctx)?; + let got = ::compare( + sliced.as_::(), + &rhs, + CompareOperator::Eq, + &mut ctx, + )? + .expect("fused compare kernel must engage for sliced arrays with patches") + .execute::(&mut ctx)?; let want = prim .into_array() .slice(start..end)? From c495055ff9771c65e0dd6b920b82e4a10186d312 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 16:26:31 +0000 Subject: [PATCH 13/14] Fix sliced_with_patches compare test to obtain a BitPacked via SliceKernel ArrayRef::slice on a patched BitPackedArray leaves a lazy SliceArray (the buffer-free SliceReduce path bails when patches are present), so as_::() panicked before the fused compare kernel ran. Acquire the sliced BitPacked through SliceKernel, which reads the buffers and produces a sliced BitPacked with sliced patches, so the test exercises the fused unpack_cmp + patch-fixup path it was written for. Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/bitpacking/compute/compare.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 3c9a6d527d3..056e9840eef 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -275,7 +275,15 @@ mod tests { assert!(packed.patches().is_some(), "test setup expects patches"); let (start, end) = (700usize, 3500usize); - let sliced = packed.into_array().slice(start..end)?; + // `ArrayRef::slice` leaves a lazy `SliceArray` over a patched `BitPacked` (the + // `SliceReduce` path bails when patches are present), so go through the `SliceKernel`, + // which reads the buffers and produces a sliced `BitPacked` with sliced patches. + let sliced = ::slice( + packed.as_view(), + start..end, + &mut ctx, + )? + .expect("slice kernel produces a sliced bitpacked array"); let rhs = ConstantArray::new(50i32, end - start).into_array(); let got = ::compare( sliced.as_::(), From bd3fbaa2d5ba9d6602dcc7f901984fcb7ef8e1d9 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 16:31:47 +0000 Subject: [PATCH 14/14] Import SliceKernel instead of absolute path to satisfy clippy::absolute-paths Signed-off-by: Joe Isaacs --- encodings/fastlanes/src/bitpacking/compute/compare.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index 056e9840eef..5c9aa31c1ac 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -109,6 +109,7 @@ mod tests { use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::slice::SliceKernel; use vortex_array::assert_arrays_eq; use vortex_array::builtins::ArrayBuiltins; use vortex_array::scalar_fn::fns::binary::CompareKernel; @@ -278,12 +279,8 @@ mod tests { // `ArrayRef::slice` leaves a lazy `SliceArray` over a patched `BitPacked` (the // `SliceReduce` path bails when patches are present), so go through the `SliceKernel`, // which reads the buffers and produces a sliced `BitPacked` with sliced patches. - let sliced = ::slice( - packed.as_view(), - start..end, - &mut ctx, - )? - .expect("slice kernel produces a sliced bitpacked array"); + let sliced = ::slice(packed.as_view(), start..end, &mut ctx)? + .expect("slice kernel produces a sliced bitpacked array"); let rhs = ConstantArray::new(50i32, end - start).into_array(); let got = ::compare( sliced.as_::(),