diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
index 02cc30e44a6..b653ab8e159 100644
--- a/encodings/alp/benches/alp_compress.rs
+++ b/encodings/alp/benches/alp_compress.rs
@@ -14,8 +14,6 @@ use vortex_alp::ALPRDFloat;
 use vortex_alp::RDEncoder;
 use vortex_alp::alp_encode;
 use vortex_alp::decompress_into_array;
-use vortex_array::Canonical;
-use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::dtype::NativePType;
@@ -150,8 +148,21 @@ fn compress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)
         .bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx))
 }
 
+// Excluded from CodSpeed's CPU simulation. This benchmark canonicalizes the decoded array, so its
+// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
+// (whose `ifunc`-selected implementation varies across runner images) rather than by the ALP-RD
+// decode itself. That makes it report spurious, bidirectional regressions under simulation even when
+// the Vortex code is byte-identical, and it cannot be stabilized by tuning inputs because the data
+// movement is the thing being measured. The compute-bound `compress_rd` encode benchmark above does
+// not have this problem and is kept. Per `docs/developer-guide/benchmarking.md` such benchmarks are
+// gated with `#[cfg(not(codspeed))]`; this one remains available via local `cargo bench`. See
+// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
+#[cfg(not(codspeed))]
 #[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
 fn decompress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    use vortex_array::Canonical;
+    use vortex_array::IntoArray;
+
     let (n, fraction_patch) = args;
     let primitive = make_rd_array::<T>(n, fraction_patch);
     let encoder = RDEncoder::new(primitive.as_slice::<T>());
diff --git a/encodings/fastlanes/benches/bitpacking_take.rs b/encodings/fastlanes/benches/bitpacking_take.rs
index eb072017ae3..a3e26b24a9c 100644
--- a/encodings/fastlanes/benches/bitpacking_take.rs
+++ b/encodings/fastlanes/benches/bitpacking_take.rs
@@ -70,6 +70,17 @@ fn take_10_contiguous(bencher: Bencher) {
         })
 }
 
+// The four gated `*take_10k_*` benches below are excluded from CodSpeed's CPU simulation. Each
+// gathers 10k elements and canonicalizes the result, so its simulated instruction count is
+// bimodal: it is driven by output-buffer allocation plus glibc `memcpy` and by the SIMD
+// bit-unpack's code-layout sensitivity across runner images, rather than by stable Vortex compute.
+// That makes them report spurious, bidirectional regressions under simulation even when the code
+// is unchanged, and they cannot be stabilized by tuning inputs because the data movement is the
+// thing being measured. The smaller take variants here are compute-bound and stable, so they are
+// kept; `patched_take_10k_contiguous_not_patches` is also left ungated. Such benchmarks are gated
+// with `#[cfg(not(codspeed))]` per `docs/developer-guide/benchmarking.md` and remain available via
+// local `cargo bench`. See https://github.com/vortex-data/vortex/pull/8519 for the analysis.
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn take_10k_random(bencher: Bencher) {
     let values = fixture(65_536, 8);
@@ -92,6 +103,8 @@ fn take_10k_random(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn take_10k_contiguous(bencher: Bencher) {
     let values = fixture(65_536, 8);
@@ -221,6 +234,8 @@ fn patched_take_10_contiguous(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn patched_take_10k_random(bencher: Bencher) {
     let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();
@@ -262,6 +277,8 @@ fn patched_take_10k_contiguous_not_patches(bencher: Bencher) {
         })
 }
 
+// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench]
 fn patched_take_10k_contiguous_patches(bencher: Bencher) {
     let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();
diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs
index 6582071a6e1..ba8237abe10 100644
--- a/vortex-array/benches/chunk_array_builder.rs
+++ b/vortex-array/benches/chunk_array_builder.rs
@@ -14,9 +14,15 @@ use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ChunkedArray;
 use vortex_array::arrays::ConstantArray;
+// `ArrayBuilder`, `VarBinViewBuilder`, and `DType` are used only by the VarBinView canonicalization
+// benches and their helpers, all of which are excluded from CodSpeed (see the gating note below), so
+// they are gated to match and avoid unused-import errors under `--cfg codspeed`.
+#[cfg(not(codspeed))]
 use vortex_array::builders::ArrayBuilder;
+#[cfg(not(codspeed))]
 use vortex_array::builders::VarBinViewBuilder;
 use vortex_array::builders::builder_with_capacity;
+#[cfg(not(codspeed))]
 use vortex_array::dtype::DType;
 use vortex_error::VortexExpect;
 use vortex_session::VortexSession;
@@ -34,6 +40,17 @@ const BENCH_ARGS: &[(usize, usize)] = &[
 
 static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);
 
+// The canonicalization benchmarks gated below are excluded from CodSpeed's CPU simulation. Their
+// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
+// (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute,
+// so under simulation they report spurious, bidirectional regressions even when the code is
+// unchanged. `chunked_bool_canonical_into` is additionally small enough to sit near the simulation
+// noise floor. They cannot be stabilized by tuning inputs because the data movement is the thing
+// being measured. The `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound
+// and stable under simulation, so they are kept. Per `docs/developer-guide/benchmarking.md` such
+// benchmarks are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`.
+// See https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunk = make_bool_chunks(len, chunk_count);
@@ -73,6 +90,8 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize,
         .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(false, len, chunk_count);
@@ -91,6 +110,8 @@ fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usiz
         })
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(false, len, chunk_count);
@@ -100,6 +121,8 @@ fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usiz
         .bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(true, len, chunk_count);
@@ -118,6 +141,8 @@ fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (
         })
 }
 
+// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
+#[cfg(not(codspeed))]
 #[divan::bench(args = BENCH_ARGS)]
 fn chunked_varbinview_opt_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
     let chunks = make_string_chunks(true, len, chunk_count);
@@ -211,6 +236,9 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+// Only used by `chunked_bool_canonical_into`, which is excluded from CodSpeed (see above), so this
+// helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
+#[cfg(not(codspeed))]
 fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(0);
 
@@ -220,6 +248,9 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
         .into_array()
 }
 
+// Only used by the gated VarBinView canonicalization benches above, which are excluded from
+// CodSpeed, so this helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
+#[cfg(not(codspeed))]
 fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef {
     let mut rng = StdRng::seed_from_u64(123);