diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs index 02cc30e44a6..b653ab8e159 100644 --- a/encodings/alp/benches/alp_compress.rs +++ b/encodings/alp/benches/alp_compress.rs @@ -14,8 +14,6 @@ use vortex_alp::ALPRDFloat; use vortex_alp::RDEncoder; use vortex_alp::alp_encode; use vortex_alp::decompress_into_array; -use vortex_array::Canonical; -use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::NativePType; @@ -150,8 +148,21 @@ fn compress_rd(bencher: Bencher, args: (usize, f64) .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx)) } +// Excluded from CodSpeed's CPU simulation. This benchmark canonicalizes the decoded array, so its +// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove` +// (whose `ifunc`-selected implementation varies across runner images) rather than by the ALP-RD +// decode itself. That makes it report spurious, bidirectional regressions under simulation even when +// the Vortex code is byte-identical, and it cannot be stabilized by tuning inputs because the data +// movement is the thing being measured. The compute-bound `compress_rd` encode benchmark above does +// not have this problem and is kept. Per `docs/developer-guide/benchmarking.md` such benchmarks are +// gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`. See +// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis. 
+#[cfg(not(codspeed))] #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)] fn decompress_rd(bencher: Bencher, args: (usize, f64)) { + use vortex_array::Canonical; + use vortex_array::IntoArray; + let (n, fraction_patch) = args; let primitive = make_rd_array::(n, fraction_patch); let encoder = RDEncoder::new(primitive.as_slice::()); diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs index eb072017ae3..a3e26b24a9c 100644 --- a/encodings/fastlanes/benches/bitpacking_take.rs +++ b/encodings/fastlanes/benches/bitpacking_take.rs @@ -70,6 +70,17 @@ fn take_10_contiguous(bencher: Bencher) { }) } +// Four of the `*take_10k_*` benches below are excluded from CodSpeed's CPU simulation. Each gathers +// 10k elements and canonicalizes the result, so its simulated instruction count is bimodal: it is +// driven by output-buffer allocation plus glibc `memcpy` and by the SIMD bit-unpack's code-layout +// sensitivity across runner images, rather than by stable Vortex compute. That makes them report +// spurious, bidirectional regressions under simulation even when the code is unchanged, and they +// cannot be stabilized by tuning inputs because the data movement is the thing being measured. The +// smaller take variants here are compute-bound and stable, so they are kept. Per +// `docs/developer-guide/benchmarking.md` such benchmarks are gated with `#[cfg(not(codspeed))]` and +// remain available via local `cargo bench`. See https://github.com/vortex-data/vortex/pull/8519 for +// the supporting analysis. `patched_take_10k_contiguous_not_patches` is not one of the four. +#[cfg(not(codspeed))] #[divan::bench] fn take_10k_random(bencher: Bencher) { let values = fixture(65_536, 8); @@ -92,6 +103,8 @@ fn take_10k_random(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). 
+#[cfg(not(codspeed))] #[divan::bench] fn take_10k_contiguous(bencher: Bencher) { let values = fixture(65_536, 8); @@ -221,6 +234,8 @@ fn patched_take_10_contiguous(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). +#[cfg(not(codspeed))] #[divan::bench] fn patched_take_10k_random(bencher: Bencher) { let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::>(); @@ -262,6 +277,8 @@ fn patched_take_10k_contiguous_not_patches(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). +#[cfg(not(codspeed))] #[divan::bench] fn patched_take_10k_contiguous_patches(bencher: Bencher) { let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::>(); diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs index 6582071a6e1..ba8237abe10 100644 --- a/vortex-array/benches/chunk_array_builder.rs +++ b/vortex-array/benches/chunk_array_builder.rs @@ -14,9 +14,15 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ChunkedArray; use vortex_array::arrays::ConstantArray; +// `ArrayBuilder`, `VarBinViewBuilder`, and `DType` are used only by the VarBinView canonicalization +// benches and their helpers, all of which are excluded from CodSpeed (see the gating note below), so +// they are gated to match and avoid unused-import errors under `--cfg codspeed`. +#[cfg(not(codspeed))] use vortex_array::builders::ArrayBuilder; +#[cfg(not(codspeed))] use vortex_array::builders::VarBinViewBuilder; use vortex_array::builders::builder_with_capacity; +#[cfg(not(codspeed))] use vortex_array::dtype::DType; use vortex_error::VortexExpect; use vortex_session::VortexSession; @@ -34,6 +40,17 @@ const BENCH_ARGS: &[(usize, usize)] = &[ static SESSION: LazyLock = LazyLock::new(vortex_array::array_session); +// The canonicalization benchmarks gated below are excluded from CodSpeed's CPU simulation. 
Their +// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove` +// (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute, +// so under simulation they report spurious, bidirectional regressions even when the code is +// unchanged. `chunked_bool_canonical_into` is additionally small enough to sit near the simulation +// noise floor. They cannot be stabilized by tuning inputs because the data movement is the thing +// being measured. The `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound +// and stable under simulation, so they are kept. Per `docs/developer-guide/benchmarking.md` such +// benchmarks are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`. +// See https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis. +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunk = make_bool_chunks(len, chunk_count); @@ -73,6 +90,8 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize, .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::(ctx)) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); @@ -91,6 +110,8 @@ fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usiz }) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). 
+#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); @@ -100,6 +121,8 @@ fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usiz .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::(ctx)) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(true, len, chunk_count); @@ -118,6 +141,8 @@ fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): ( }) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_opt_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(true, len, chunk_count); @@ -211,6 +236,9 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +// Only used by `chunked_bool_canonical_into`, which is excluded from CodSpeed (see above), so this +// helper is gated to match and avoid dead-code errors under `--cfg codspeed`. +#[cfg(not(codspeed))] fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(0); @@ -220,6 +248,9 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +// Only used by the gated VarBinView canonicalization benches above, which are excluded from +// CodSpeed, so this helper is gated to match and avoid dead-code errors under `--cfg codspeed`. +#[cfg(not(codspeed))] fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(123);