From 98e41d4b602b5db3353031349589a4bc8606cfe8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 20 Jun 2026 07:06:35 +0000
Subject: [PATCH 1/3] bench: gate CodSpeed-unstable canonicalization benches
 from simulation

A small set of microbenchmarks report false-positive regressions in nearly
every PR. Their CodSpeed CPU-simulation instruction count is dominated by
output-buffer allocation and glibc `memcpy`/`memmove` (whose `ifunc`-selected
implementation varies across runner images) rather than by Vortex compute, so
they move bidirectionally by 10-90% for unchanged code and CodSpeed flags
"different runtime environments" on the comparisons. They cannot be stabilized
under simulation, so per `docs/developer-guide/benchmarking.md` they are gated
with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`.

Gated from CodSpeed (kept for local runs):
- alp_compress.rs: `decompress_rd` (decode-to-canonical; moved in 7/9 sampled
  PRs, 842-1025 us for identical code). `compress_rd` (encode, compute-bound,
  never flaky) is kept.
- chunk_array_builder.rs: `chunked_varbinview_*` (string canonicalization,
  memcpy-bound; flaky in 6/9 PRs) and `chunked_bool_canonical_into` (also
  below the ~16-35 us noise floor, ~2x swings). The compute-bound
  `chunked_opt_bool_*` and `chunked_constant_*` benches are kept.

Verified: both suites build and run under `cargo codspeed` (Simulation mode),
the gated benches are excluded while the kept benches still execute, and the
local `cargo bench` path, `cargo fmt`, and `cargo clippy` are clean.

Signed-off-by: Claude <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01GXdjWYp7AbSKwn2bw6GYsf
---
 encodings/alp/benches/alp_compress.rs       | 11 +++++++++
 vortex-array/benches/chunk_array_builder.rs | 26 +++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
index 02cc30e44a6..fd4e8178ef1 100644
--- a/encodings/alp/benches/alp_compress.rs
+++ b/encodings/alp/benches/alp_compress.rs
@@ -14,7 +14,10 @@ use vortex_alp::ALPRDFloat;
 use vortex_alp::RDEncoder;
 use vortex_alp::alp_encode;
 use vortex_alp::decompress_into_array;
+// Only used by `decompress_rd`, which is excluded from CodSpeed (see below).
+#[cfg(not(codspeed))]
 use vortex_array::Canonical;
+#[cfg(not(codspeed))]
 use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::PrimitiveArray;
@@ -150,6 +153,14 @@ fn compress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)
         .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx))
 }
 
+// Excluded from CodSpeed's CPU simulation: this benchmark canonicalizes the decoded array, so its
+// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose glibc
+// `ifunc`-selected implementation differs across runner images) rather than by the ALP-RD decode
+// itself. Under simulation it produces only false-positive regressions (it moved in 7 of the last
+// 9 PRs, bidirectionally, ranging 842-1025 us for the same code, while CodSpeed flagged "different
+// runtime environments"). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible
+// benchmarks are gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`.
+#[cfg(not(codspeed))]
 #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
 fn decompress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
     let (n, fraction_patch) = args;
diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs
index 6582071a6e1..ddd2561c5ca 100644
--- a/vortex-array/benches/chunk_array_builder.rs
+++ b/vortex-array/benches/chunk_array_builder.rs
@@ -14,9 +14,13 @@ use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ChunkedArray;
 use vortex_array::arrays::ConstantArray;
+// Both traits/builders below are only used by benches excluded from CodSpeed (see below).
+#[cfg(not(codspeed))]
 use vortex_array::builders::ArrayBuilder;
+#[cfg(not(codspeed))]
 use vortex_array::builders::VarBinViewBuilder;
 use vortex_array::builders::builder_with_capacity;
+#[cfg(not(codspeed))]
 use vortex_array::dtype::DType;
 use vortex_error::VortexExpect;
 use vortex_session::VortexSession;
@@ -34,6 +38,18 @@ const BENCH_ARGS: &[(usize, usize)] = &[
 
 static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);
 
+// The following canonicalization benchmarks are excluded from CodSpeed's CPU simulation. Their
+// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose
+// glibc `ifunc`-selected implementation differs across runner images) rather than by Vortex
+// compute, so under simulation they only ever report false-positive regressions: each moved in a
+// majority of recent PRs (bidirectionally, e.g. `chunked_bool_canonical_into` swung 16-35 us, a
+// ~2x range, for unchanged code) and CodSpeed flagged "different runtime environments" on the
+// comparisons. `chunked_bool_canonical_into` is additionally below the simulation noise floor
+// (~16-35 us). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible benchmarks are
+// gated with `#[cfg(not(codspeed))]`; they remain available via local `cargo bench`. The
+// `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound and stable under
+// simulation, so they are kept.
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunk = make_bool_chunks(len, chunk_count);
@@ -73,6 +89,8 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize,
         .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(false, len, chunk_count);
@@ -91,6 +109,8 @@ fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usiz
         })
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(false, len, chunk_count);
@@ -100,6 +120,8 @@ fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usiz
         .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(true, len, chunk_count);
@@ -118,6 +140,8 @@ fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (
         })
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_opt_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(true, len, chunk_count);
@@ -211,6 +235,7 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+#[cfg(not(codspeed))]
 fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(0);
 
@@ -220,6 +245,7 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+#[cfg(not(codspeed))]
 fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(123);
 

From 50560a6443768aa75c75026db214a7ba271435f4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 20 Jun 2026 14:32:40 +0000
Subject: [PATCH 2/3] bench: gate flaky take_10k bitpacking benches from
 CodSpeed simulation

`take_10k_random`, `take_10k_contiguous`, `patched_take_10k_random`, and
`patched_take_10k_contiguous_patches` gather 10k elements and canonicalize the
result, so their CodSpeed-simulation instruction count is bimodal (~195 us vs
~255 us, +-23% for unchanged code) from output-buffer allocation + `memcpy` and
the SIMD bit-unpack's code-layout sensitivity across runner images. They fired
in 4+ recent unrelated PRs and on this PR itself, so they are gated with
`#[cfg(not(codspeed))]` and remain available via local `cargo bench`. The other
take variants in this file have not exhibited the instability and are kept.

Verified: `cargo codspeed build` is clean, the four benches are excluded from
the built suite while the other take variants remain, and the local
`cargo bench` build, `cargo fmt`, and `cargo clippy` pass.

Signed-off-by: Claude <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01GXdjWYp7AbSKwn2bw6GYsf
---
 encodings/fastlanes/benches/bitpacking_take.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs
index eb072017ae3..ed1acc871df 100644
--- a/encodings/fastlanes/benches/bitpacking_take.rs
+++ b/encodings/fastlanes/benches/bitpacking_take.rs
@@ -70,6 +70,14 @@ fn take_10_contiguous(bencher: Bencher) {
         })
 }
 
+// The four `*take_10k_{random,contiguous,...}` benches below are excluded from CodSpeed's CPU
+// simulation. Each gathers 10k elements and canonicalizes the result, so its instruction count is
+// bimodal under simulation (~195 us vs ~255 us, ±23% for unchanged code) due to output-buffer
+// allocation + `memcpy` and the SIMD bit-unpack's code-layout sensitivity across runner images
+// ("different runtime environments"). They fired in 4+ recent unrelated PRs and on this PR itself.
+// Per `docs/developer-guide/benchmarking.md` they are gated with `#[cfg(not(codspeed))]` and remain
+// available via local `cargo bench`. The other take variants here have not shown this and are kept.
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn take_10k_random(bencher: Bencher) {
     let values = fixture(65_536, 8);
@@ -92,6 +100,8 @@ fn take_10k_random(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn take_10k_contiguous(bencher: Bencher) {
     let values = fixture(65_536, 8);
@@ -221,6 +231,8 @@ fn patched_take_10_contiguous(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn patched_take_10k_random(bencher: Bencher) {
     let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();
@@ -262,6 +274,8 @@ fn patched_take_10k_contiguous_not_patches(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn patched_take_10k_contiguous_patches(bencher: Bencher) {
     let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();

From 17ae8e0db5ea20542051f9f8a75412727a28519f Mon Sep 17 00:00:00 2001
From: Claude <connor@spiraldb.com>
Date: Sat, 20 Jun 2026 17:38:36 +0000
Subject: [PATCH 3/3] bench: declarative CodSpeed gate comments and localized
 imports

Make the `#[cfg(not(codspeed))]` exclusions self-documenting and durable:

- Every gate now carries a comment stating exactly why the benchmark (or its
  helper/import) is excluded, focused on the root cause (output-buffer
  allocation + glibc `memcpy`/`memmove` ifunc variance and SIMD code-layout
  sensitivity across runner images) rather than per-PR flake counts, which go
  stale as the codebase moves. Each links to the analysis in PR #8519.
- Drop time-relative phrasing (e.g. "last N PRs", "fired in M PRs") so the
  comments stay accurate over time.
- Move `Canonical`/`IntoArray` in alp_compress.rs into `decompress_rd`, the only
  gated consumer, removing two per-import cfg attributes. In chunk_array_builder.rs
  the gated imports serve several gated benches, so they stay at module scope with
  a clear note; the two gated helper fns now explain their gating too.

Verified: `cargo build`, `RUSTFLAGS="--cfg codspeed" cargo build`, and
`cargo +nightly fmt --check` are clean for vortex-alp, vortex-array, and
vortex-fastlanes bench targets; `cargo clippy -p vortex-alp --benches` is clean.

Signed-off-by: Claude <connor@spiraldb.com>
Claude-Session: https://claude.ai/code/session_01HGegegFqsRfGJuA8X9aHT7
---
 encodings/alp/benches/alp_compress.rs         | 24 +++++++--------
 .../fastlanes/benches/bitpacking_take.rs      | 17 ++++++-----
 vortex-array/benches/chunk_array_builder.rs   | 29 +++++++++++--------
 3 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
index fd4e8178ef1..b653ab8e159 100644
--- a/encodings/alp/benches/alp_compress.rs
+++ b/encodings/alp/benches/alp_compress.rs
@@ -14,11 +14,6 @@ use vortex_alp::ALPRDFloat;
 use vortex_alp::RDEncoder;
 use vortex_alp::alp_encode;
 use vortex_alp::decompress_into_array;
-// Only used by `decompress_rd`, which is excluded from CodSpeed (see below).
-#[cfg(not(codspeed))]
-use vortex_array::Canonical;
-#[cfg(not(codspeed))]
-use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::dtype::NativePType;
@@ -153,16 +148,21 @@ fn compress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)
         .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx))
 }
 
-// Excluded from CodSpeed's CPU simulation: this benchmark canonicalizes the decoded array, so its
-// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose glibc
-// `ifunc`-selected implementation differs across runner images) rather than by the ALP-RD decode
-// itself. Under simulation it produces only false-positive regressions (it moved in 7 of the last
-// 9 PRs, bidirectionally, ranging 842-1025 us for the same code, while CodSpeed flagged "different
-// runtime environments"). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible
-// benchmarks are gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`.
+// Excluded from CodSpeed's CPU simulation. This benchmark canonicalizes the decoded array, so its
+// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
+// (whose `ifunc`-selected implementation varies across runner images) rather than by the ALP-RD
+// decode itself. That makes it report spurious, bidirectional regressions under simulation even when
+// the Vortex code is byte-identical, and it cannot be stabilized by tuning inputs because the data
+// movement is the thing being measured. The compute-bound `compress_rd` encode benchmark above does
+// not have this problem and is kept. Per `docs/developer-guide/benchmarking.md` such benchmarks are
+// gated with `#[cfg(not(codspeed))]`; it remains available via local `cargo bench`. See
+// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
 #[cfg(not(codspeed))]
 #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
 fn decompress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    use vortex_array::Canonical;
+    use vortex_array::IntoArray;
+
     let (n, fraction_patch) = args;
     let primitive = make_rd_array::<T>(n, fraction_patch);
     let encoder = RDEncoder::new(primitive.as_slice::<T>());
diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs
index ed1acc871df..a3e26b24a9c 100644
--- a/encodings/fastlanes/benches/bitpacking_take.rs
+++ b/encodings/fastlanes/benches/bitpacking_take.rs
@@ -70,13 +70,16 @@ fn take_10_contiguous(bencher: Bencher) {
         })
 }
 
-// The four `*take_10k_{random,contiguous,...}` benches below are excluded from CodSpeed's CPU
-// simulation. Each gathers 10k elements and canonicalizes the result, so its instruction count is
-// bimodal under simulation (~195 us vs ~255 us, ±23% for unchanged code) due to output-buffer
-// allocation + `memcpy` and the SIMD bit-unpack's code-layout sensitivity across runner images
-// ("different runtime environments"). They fired in 4+ recent unrelated PRs and on this PR itself.
-// Per `docs/developer-guide/benchmarking.md` they are gated with `#[cfg(not(codspeed))]` and remain
-// available via local `cargo bench`. The other take variants here have not shown this and are kept.
+// The four `*take_10k_*` benches below are excluded from CodSpeed's CPU simulation. Each gathers 10k
+// elements and canonicalizes the result, so its simulated instruction count is bimodal: it is driven
+// by output-buffer allocation plus glibc `memcpy` and by the SIMD bit-unpack's code-layout
+// sensitivity across runner images, rather than by stable Vortex compute. That makes them report
+// spurious, bidirectional regressions under simulation even when the code is unchanged, and they
+// cannot be stabilized by tuning inputs because the data movement is the thing being measured. The
+// smaller take variants here are compute-bound and stable, so they are kept. Per
+// `docs/developer-guide/benchmarking.md` such benchmarks are gated with `#[cfg(not(codspeed))]` and
+// remain available via local `cargo bench`. See https://github.com/vortex-data/vortex/pull/8519 for
+// the supporting analysis.
 #[cfg(not(codspeed))]
 #[divan::bench]
 fn take_10k_random(bencher: Bencher) {
diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs
index ddd2561c5ca..ba8237abe10 100644
--- a/vortex-array/benches/chunk_array_builder.rs
+++ b/vortex-array/benches/chunk_array_builder.rs
@@ -14,7 +14,9 @@ use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ChunkedArray;
 use vortex_array::arrays::ConstantArray;
-// Both traits/builders below are only used by benches excluded from CodSpeed (see below).
+// `ArrayBuilder`, `VarBinViewBuilder`, and `DType` are used only by the VarBinView canonicalization
+// benches and their helpers, all of which are excluded from CodSpeed (see the gating note below), so
+// they are gated to match and avoid unused-import errors under `--cfg codspeed`.
 #[cfg(not(codspeed))]
 use vortex_array::builders::ArrayBuilder;
 #[cfg(not(codspeed))]
@@ -38,17 +40,16 @@ const BENCH_ARGS: &[(usize, usize)] = &[
 
 static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);
 
-// The following canonicalization benchmarks are excluded from CodSpeed's CPU simulation. Their
-// instruction count is dominated by output-buffer allocation and `memcpy`/`memmove` (whose
-// glibc `ifunc`-selected implementation differs across runner images) rather than by Vortex
-// compute, so under simulation they only ever report false-positive regressions: each moved in a
-// majority of recent PRs (bidirectionally, e.g. `chunked_bool_canonical_into` swung 16-35 us, a
-// ~2x range, for unchanged code) and CodSpeed flagged "different runtime environments" on the
-// comparisons. `chunked_bool_canonical_into` is additionally below the simulation noise floor
-// (~16-35 us). Per `docs/developer-guide/benchmarking.md`, CodSpeed-incompatible benchmarks are
-// gated with `#[cfg(not(codspeed))]`; they remain available via local `cargo bench`. The
-// `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound and stable under
-// simulation, so they are kept.
+// The canonicalization benchmarks gated below are excluded from CodSpeed's CPU simulation. Their
+// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
+// (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute,
+// so under simulation they report spurious, bidirectional regressions even when the code is
+// unchanged. `chunked_bool_canonical_into` is additionally small enough to sit near the simulation
+// noise floor. They cannot be stabilized by tuning inputs because the data movement is the thing
+// being measured. The `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound
+// and stable under simulation, so they are kept. Per `docs/developer-guide/benchmarking.md` such
+// benchmarks are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`.
+// See https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
 #[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
@@ -235,6 +236,8 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+// Only used by `chunked_bool_canonical_into`, which is excluded from CodSpeed (see above), so this
+// helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
 #[cfg(not(codspeed))]
 fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(0);
@@ -245,6 +248,8 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+// Only used by the gated VarBinView canonicalization benches above, which are excluded from
+// CodSpeed, so this helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
 #[cfg(not(codspeed))]
 fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(123);