From 98e41d4b602b5db3353031349589a4bc8606cfe8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 07:06:35 +0000 Subject: [PATCH 1/3] bench: gate CodSpeed-unstable canonicalization benches from simulation A small set of microbenchmarks report false-positive regressions in nearly every PR. Their CodSpeed CPU-simulation instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove` (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute, so they move bidirectionally by 10-90% for unchanged code and CodSpeed flags "different runtime environments" on the comparisons. They cannot be stabilized under simulation, so per `docs/developer-guide/benchmarking.md` they are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`. Gated from CodSpeed (kept for local runs): - alp_compress.rs: `decompress_rd` (decode-to-canonical; moved in 7/9 sampled PRs, 842-1025 us for identical code). `compress_rd` (encode, compute-bound, never flaky) is kept. - chunk_array_builder.rs: `chunked_varbinview_*` (string canonicalization, memcpy-bound; flaky in 6/9 PRs) and `chunked_bool_canonical_into` (also below the ~16-35 us noise floor, ~2x swings). The compute-bound `chunked_opt_bool_*` and `chunked_constant_*` benches are kept. Verified: both suites build and run under `cargo codspeed` (Simulation mode), the gated benches are excluded while the kept benches still execute, and the local `cargo bench` path, `cargo fmt`, and `cargo clippy` are clean. Signed-off-by: Claude Claude-Session: https://claude.ai/code/session_01GXdjWYp7AbSKwn2bw6GYsf --- encodings/alp/benches/alp_compress.rs | 11 +++++++++ vortex-array/benches/chunk_array_builder.rs | 26 +++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs index 02cc30e44a6..fd4e8178ef1 100644 --- a/encodings/alp/benches/alp_compress.rs +++ b/encodings/alp/benches/alp_compress.rs @@ -14,7 +14,10 @@ use vortex_alp::ALPRDFloat; use vortex_alp::RDEncoder; use vortex_alp::alp_encode; use vortex_alp::decompress_into_array; +// Only used by `decompress_rd`, which is excluded from CodSpeed (see below). +#[cfg(not(codspeed))] use vortex_array::Canonical; +#[cfg(not(codspeed))] use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; @@ -150,6 +153,14 @@ fn compress_rd(bencher: Bencher, args: (usize, f64) .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx)) } +// Excluded from CodSpeed's CPU simulation: this benchmark canonicalizes the decoded array, so its +// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose glibc +// `ifunc`-selected implementation differs across runner images) rather than by the ALP-RD decode +// itself. Under simulation it produces only false-positive regressions (it moved in 7 of the last +// 9 PRs, bidirectionally, ranging 842-1025 us for the same code, while CodSpeed flagged "different +// runtime environments"). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible +// benchmarks are gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`. +#[cfg(not(codspeed))] #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)] fn decompress_rd(bencher: Bencher, args: (usize, f64)) { let (n, fraction_patch) = args; diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs index 6582071a6e1..ddd2561c5ca 100644 --- a/vortex-array/benches/chunk_array_builder.rs +++ b/vortex-array/benches/chunk_array_builder.rs @@ -14,9 +14,13 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ChunkedArray; use vortex_array::arrays::ConstantArray; +// Both traits/builders below are only used by benches excluded from CodSpeed (see below). +#[cfg(not(codspeed))] use vortex_array::builders::ArrayBuilder; +#[cfg(not(codspeed))] use vortex_array::builders::VarBinViewBuilder; use vortex_array::builders::builder_with_capacity; +#[cfg(not(codspeed))] use vortex_array::dtype::DType; use vortex_error::VortexExpect; use vortex_session::VortexSession; @@ -34,6 +38,18 @@ const BENCH_ARGS: &[(usize, usize)] = &[ static SESSION: LazyLock = LazyLock::new(vortex_array::array_session); +// The following canonicalization benchmarks are excluded from CodSpeed's CPU simulation. Their +// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose +// glibc `ifunc`-selected implementation differs across runner images) rather than by Vortex +// compute, so under simulation they only ever report false-positive regressions: each moved in a +// majority of recent PRs (bidirectionally, e.g. `chunked_bool_canonical_into` swung 16-35 us, a +// ~2x range, for unchanged code) and CodSpeed flagged "different runtime environments" on the +// comparisons. `chunked_bool_canonical_into` is additionally below the simulation noise floor +// (~16-35 us). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible benchmarks are +// gated with `#[cfg(not(codspeed))]`; they remain available via local `cargo bench`. The +// `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound and stable under +// simulation, so they are kept. +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunk = make_bool_chunks(len, chunk_count); @@ -73,6 +89,8 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize, .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::(ctx)) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); @@ -91,6 +109,8 @@ fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usiz }) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); @@ -100,6 +120,8 @@ fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usiz .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::(ctx)) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(true, len, chunk_count); @@ -118,6 +140,8 @@ fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): ( }) } +// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above). +#[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_varbinview_opt_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(true, len, chunk_count); @@ -211,6 +235,7 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +#[cfg(not(codspeed))] fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(0); @@ -220,6 +245,7 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +#[cfg(not(codspeed))] fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(123); From 50560a6443768aa75c75026db214a7ba271435f4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 14:32:40 +0000 Subject: [PATCH 2/3] bench: gate flaky take_10k bitpacking benches from CodSpeed simulation `take_10k_random`, `take_10k_contiguous`, `patched_take_10k_random`, and `patched_take_10k_contiguous_patches` gather 10k elements and canonicalize the result, so their CodSpeed-simulation instruction count is bimodal (~195 us vs ~255 us, +-23% for unchanged code) from output-buffer allocation + `memcpy` and the SIMD bit-unpack's code-layout sensitivity across runner images. They fired in 4+ recent unrelated PRs and on this PR itself, so they are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`. The other take variants in this file have not exhibited the instability and are kept. Verified: `cargo codspeed build` is clean, the four benches are excluded from the built suite while the other take variants remain, and the local `cargo bench` build, `cargo fmt`, and `cargo clippy` pass. Signed-off-by: Claude Claude-Session: https://claude.ai/code/session_01GXdjWYp7AbSKwn2bw6GYsf --- encodings/fastlanes/benches/bitpacking_take.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs index eb072017ae3..ed1acc871df 100644 --- a/encodings/fastlanes/benches/bitpacking_take.rs +++ b/encodings/fastlanes/benches/bitpacking_take.rs @@ -70,6 +70,14 @@ fn take_10_contiguous(bencher: Bencher) { }) } +// The four `*take_10k_{random,contiguous,...}` benches below are excluded from CodSpeed's CPU +// simulation. Each gathers 10k elements and canonicalizes the result, so its instruction count is +// bimodal under simulation (~195 us vs ~255 us, ±23% for unchanged code) due to output-buffer +// allocation + `memcpy` and the SIMD bit-unpack's code-layout sensitivity across runner images +// ("different runtime environments"). They fired in 4+ recent unrelated PRs and on this PR itself. +// Per `docs/developer-guide/benchmarking.md` they are gated with `#[cfg(not(codspeed))]` and remain +// available via local `cargo bench`. The other take variants here have not shown this and are kept. +#[cfg(not(codspeed))] #[divan::bench] fn take_10k_random(bencher: Bencher) { let values = fixture(65_536, 8); @@ -92,6 +100,8 @@ fn take_10k_random(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). +#[cfg(not(codspeed))] #[divan::bench] fn take_10k_contiguous(bencher: Bencher) { let values = fixture(65_536, 8); @@ -221,6 +231,8 @@ fn patched_take_10_contiguous(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). +#[cfg(not(codspeed))] #[divan::bench] fn patched_take_10k_random(bencher: Bencher) { let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::>(); @@ -262,6 +274,8 @@ fn patched_take_10k_contiguous_not_patches(bencher: Bencher) { }) } +// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above). +#[cfg(not(codspeed))] #[divan::bench] fn patched_take_10k_contiguous_patches(bencher: Bencher) { let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::>(); From 17ae8e0db5ea20542051f9f8a75412727a28519f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 17:38:36 +0000 Subject: [PATCH 3/3] bench: declarative CodSpeed gate comments and localized imports Make the `#[cfg(not(codspeed))]` exclusions self-documenting and durable: - Every gate now carries a comment stating exactly why the benchmark (or its helper/import) is excluded, focused on the root cause (output-buffer allocation + glibc `memcpy`/`memmove` ifunc variance and SIMD code-layout sensitivity across runner images) rather than per-PR flake counts, which go stale as the codebase moves. Each links to the analysis in PR #8519. - Drop time-relative phrasing (e.g. "last N PRs", "fired in M PRs") so the comments stay accurate over time. - Move `Canonical`/`IntoArray` in alp_compress.rs into `decompress_rd`, the only gated consumer, removing two per-import cfg attributes. In chunk_array_builder.rs the gated imports serve several gated benches, so they stay at module scope with a clear note; the two gated helper fns now explain their gating too. Verified: `cargo build`, `RUSTFLAGS="--cfg codspeed" cargo build`, and `cargo +nightly fmt --check` are clean for vortex-alp, vortex-array, and vortex-fastlanes bench targets; `cargo clippy -p vortex-alp --benches` is clean. Signed-off-by: Claude Claude-Session: https://claude.ai/code/session_01HGegegFqsRfGJuA8X9aHT7 --- encodings/alp/benches/alp_compress.rs | 24 +++++++-------- .../fastlanes/benches/bitpacking_take.rs | 17 ++++++----- vortex-array/benches/chunk_array_builder.rs | 29 +++++++++++-------- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs index fd4e8178ef1..b653ab8e159 100644 --- a/encodings/alp/benches/alp_compress.rs +++ b/encodings/alp/benches/alp_compress.rs @@ -14,11 +14,6 @@ use vortex_alp::ALPRDFloat; use vortex_alp::RDEncoder; use vortex_alp::alp_encode; use vortex_alp::decompress_into_array; -// Only used by `decompress_rd`, which is excluded from CodSpeed (see below). -#[cfg(not(codspeed))] -use vortex_array::Canonical; -#[cfg(not(codspeed))] -use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::dtype::NativePType; @@ -153,16 +148,21 @@ fn compress_rd(bencher: Bencher, args: (usize, f64) .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx)) } -// Excluded from CodSpeed's CPU simulation: this benchmark canonicalizes the decoded array, so its -// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose glibc -// `ifunc`-selected implementation differs across runner images) rather than by the ALP-RD decode -// itself. Under simulation it produces only false-positive regressions (it moved in 7 of the last -// 9 PRs, bidirectionally, ranging 842-1025 us for the same code, while CodSpeed flagged "different -// runtime environments"). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible -// benchmarks are gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`. +// Excluded from CodSpeed's CPU simulation. This benchmark canonicalizes the decoded array, so its +// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove` +// (whose `ifunc`-selected implementation varies across runner images) rather than by the ALP-RD +// decode itself. That makes it report spurious, bidirectional regressions under simulation even when +// the Vortex code is byte-identical, and it cannot be stabilized by tuning inputs because the data +// movement is the thing being measured. The compute-bound `compress_rd` encode benchmark above does +// not have this problem and is kept. Per `docs/developer-guide/benchmarking.md` such benchmarks are +// gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`. See +// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis. #[cfg(not(codspeed))] #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)] fn decompress_rd(bencher: Bencher, args: (usize, f64)) { + use vortex_array::Canonical; + use vortex_array::IntoArray; + let (n, fraction_patch) = args; let primitive = make_rd_array::(n, fraction_patch); let encoder = RDEncoder::new(primitive.as_slice::()); diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs index ed1acc871df..a3e26b24a9c 100644 --- a/encodings/fastlanes/benches/bitpacking_take.rs +++ b/encodings/fastlanes/benches/bitpacking_take.rs @@ -70,13 +70,16 @@ fn take_10_contiguous(bencher: Bencher) { }) } -// The four `*take_10k_{random,contiguous,...}` benches below are excluded from CodSpeed's CPU -// simulation. Each gathers 10k elements and canonicalizes the result, so its instruction count is -// bimodal under simulation (~195 us vs ~255 us, ±23% for unchanged code) due to output-buffer -// allocation + `memcpy` and the SIMD bit-unpack's code-layout sensitivity across runner images -// ("different runtime environments"). They fired in 4+ recent unrelated PRs and on this PR itself. -// Per `docs/developer-guide/benchmarking.md` they are gated with `#[cfg(not(codspeed))]` and remain -// available via local `cargo bench`. The other take variants here have not shown this and are kept. +// The four `*take_10k_*` benches below are excluded from CodSpeed's CPU simulation. Each gathers 10k +// elements and canonicalizes the result, so its simulated instruction count is bimodal: it is driven +// by output-buffer allocation plus glibc `memcpy` and by the SIMD bit-unpack's code-layout +// sensitivity across runner images, rather than by stable Vortex compute. That makes them report +// spurious, bidirectional regressions under simulation even when the code is unchanged, and they +// cannot be stabilized by tuning inputs because the data movement is the thing being measured. The +// smaller take variants here are compute-bound and stable, so they are kept. Per +// `docs/developer-guide/benchmarking.md` such benchmarks are gated with `#[cfg(not(codspeed))]` and +// remain available via local `cargo bench`. See https://github.com/vortex-data/vortex/pull/8519 for +// the supporting analysis. #[cfg(not(codspeed))] #[divan::bench] fn take_10k_random(bencher: Bencher) { diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs index ddd2561c5ca..ba8237abe10 100644 --- a/vortex-array/benches/chunk_array_builder.rs +++ b/vortex-array/benches/chunk_array_builder.rs @@ -14,7 +14,9 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ChunkedArray; use vortex_array::arrays::ConstantArray; -// Both traits/builders below are only used by benches excluded from CodSpeed (see below). +// `ArrayBuilder`, `VarBinViewBuilder`, and `DType` are used only by the VarBinView canonicalization +// benches and their helpers, all of which are excluded from CodSpeed (see the gating note below), so +// they are gated to match and avoid unused-import errors under `--cfg codspeed`. #[cfg(not(codspeed))] use vortex_array::builders::ArrayBuilder; #[cfg(not(codspeed))] @@ -38,17 +40,16 @@ const BENCH_ARGS: &[(usize, usize)] = &[ static SESSION: LazyLock = LazyLock::new(vortex_array::array_session); -// The following canonicalization benchmarks are excluded from CodSpeed's CPU simulation. Their -// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose -// glibc `ifunc`-selected implementation differs across runner images) rather than by Vortex -// compute, so under simulation they only ever report false-positive regressions: each moved in a -// majority of recent PRs (bidirectionally, e.g. `chunked_bool_canonical_into` swung 16-35 us, a -// ~2x range, for unchanged code) and CodSpeed flagged "different runtime environments" on the -// comparisons. `chunked_bool_canonical_into` is additionally below the simulation noise floor -// (~16-35 us). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible benchmarks are -// gated with `#[cfg(not(codspeed))]`; they remain available via local `cargo bench`. The -// `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound and stable under -// simulation, so they are kept. +// The canonicalization benchmarks gated below are excluded from CodSpeed's CPU simulation. Their +// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove` +// (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute, +// so under simulation they report spurious, bidirectional regressions even when the code is +// unchanged. `chunked_bool_canonical_into` is additionally small enough to sit near the simulation +// noise floor. They cannot be stabilized by tuning inputs because the data movement is the thing +// being measured. The `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound +// and stable under simulation, so they are kept. Per `docs/developer-guide/benchmarking.md` such +// benchmarks are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`. +// See https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis. #[cfg(not(codspeed))] #[divan::bench(args = BENCH_ARGS)] fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { @@ -235,6 +236,8 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +// Only used by `chunked_bool_canonical_into`, which is excluded from CodSpeed (see above), so this +// helper is gated to match and avoid dead-code errors under `--cfg codspeed`. #[cfg(not(codspeed))] fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(0); @@ -245,6 +248,8 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef { .into_array() } +// Only used by the gated VarBinView canonicalization benches above, which are excluded from +// CodSpeed, so this helper is gated to match and avoid dead-code errors under `--cfg codspeed`. #[cfg(not(codspeed))] fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(123);