Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions encodings/alp/benches/alp_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ use vortex_alp::ALPRDFloat;
use vortex_alp::RDEncoder;
use vortex_alp::alp_encode;
use vortex_alp::decompress_into_array;
use vortex_array::Canonical;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::dtype::NativePType;
Expand Down Expand Up @@ -150,8 +148,21 @@ fn compress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)
.bench_refs(|(primitive, encoder, ctx)| encoder.encode(primitive.as_view(), ctx))
}

// Excluded from CodSpeed's CPU simulation. This benchmark canonicalizes the decoded array, so its
// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
// (whose `ifunc`-selected implementation varies across runner images) rather than by the ALP-RD
// decode itself. That makes it report spurious, bidirectional regressions under simulation even when
// the Vortex code is byte-identical, and it cannot be stabilized by tuning inputs because the data
// movement is the thing being measured. The compute-bound `compress_rd` encode benchmark above does
// not have this problem and is kept. Per `docs/developer-guide/benchmarking.md` such benchmarks are
// gated with `#[cfg(not(codspeed))]`; this one remains available via local `cargo bench`. See
// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
#[cfg(not(codspeed))]
#[divan::bench(types = [f32, f64], args = RD_BENCH_ARGS)]
fn decompress_rd<T: ALPRDFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
use vortex_array::Canonical;
use vortex_array::IntoArray;

let (n, fraction_patch) = args;
let primitive = make_rd_array::<T>(n, fraction_patch);
let encoder = RDEncoder::new(primitive.as_slice::<T>());
Expand Down
17 changes: 17 additions & 0 deletions encodings/fastlanes/benches/bitpacking_take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ fn take_10_contiguous(bencher: Bencher) {
})
}

// Four 10k-element take benches in this file are excluded from CodSpeed's CPU simulation:
// `take_10k_random`, `take_10k_contiguous`, `patched_take_10k_random`, and
// `patched_take_10k_contiguous_patches`. Each gathers 10k elements and canonicalizes the
// result, so its simulated instruction count is bimodal: it is driven by output-buffer
// allocation plus glibc `memcpy` and by the SIMD bit-unpack's code-layout sensitivity across
// runner images, rather than by stable Vortex compute. That makes them report spurious,
// bidirectional regressions under simulation even when the code is unchanged, and they cannot
// be stabilized by tuning inputs because the data movement is the thing being measured. The
// smaller take variants here are compute-bound and stable, so they are kept. Per
// `docs/developer-guide/benchmarking.md` such benchmarks are gated with `#[cfg(not(codspeed))]`
// and remain available via local `cargo bench`. See
// https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
#[cfg(not(codspeed))]
#[divan::bench]
fn take_10k_random(bencher: Bencher) {
let values = fixture(65_536, 8);
Expand All @@ -92,6 +103,8 @@ fn take_10k_random(bencher: Bencher) {
})
}

// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
#[cfg(not(codspeed))]
#[divan::bench]
fn take_10k_contiguous(bencher: Bencher) {
let values = fixture(65_536, 8);
Expand Down Expand Up @@ -221,6 +234,8 @@ fn patched_take_10_contiguous(bencher: Bencher) {
})
}

// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
#[cfg(not(codspeed))]
#[divan::bench]
fn patched_take_10k_random(bencher: Bencher) {
let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();
Expand Down Expand Up @@ -262,6 +277,8 @@ fn patched_take_10k_contiguous_not_patches(bencher: Bencher) {
})
}

// Excluded from CodSpeed: bimodal 10k take+canonicalize (see note above).
#[cfg(not(codspeed))]
#[divan::bench]
fn patched_take_10k_contiguous_patches(bencher: Bencher) {
let values = (0u32..BIG_BASE2 + NUM_EXCEPTIONS).collect::<Buffer<u32>>();
Expand Down
31 changes: 31 additions & 0 deletions vortex-array/benches/chunk_array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@ use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ChunkedArray;
use vortex_array::arrays::ConstantArray;
// `ArrayBuilder`, `VarBinViewBuilder`, and `DType` are used only by the VarBinView canonicalization
// benches and their helpers, all of which are excluded from CodSpeed (see the gating note below), so
// they are gated to match and avoid unused-import errors under `--cfg codspeed`.
#[cfg(not(codspeed))]
use vortex_array::builders::ArrayBuilder;
#[cfg(not(codspeed))]
use vortex_array::builders::VarBinViewBuilder;
use vortex_array::builders::builder_with_capacity;
#[cfg(not(codspeed))]
use vortex_array::dtype::DType;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;
Expand All @@ -34,6 +40,17 @@ const BENCH_ARGS: &[(usize, usize)] = &[

static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);

// The canonicalization benchmarks gated below are excluded from CodSpeed's CPU simulation. Their
// simulated instruction count is dominated by output-buffer allocation and glibc `memcpy`/`memmove`
// (whose `ifunc`-selected implementation varies across runner images) rather than by Vortex compute,
// so under simulation they report spurious, bidirectional regressions even when the code is
// unchanged. `chunked_bool_canonical_into` is additionally small enough to sit near the simulation
// noise floor. They cannot be stabilized by tuning inputs because the data movement is the thing
// being measured. The `chunked_opt_bool_*` and `chunked_constant_*` benches below are compute-bound
// and stable under simulation, so they are kept. Per `docs/developer-guide/benchmarking.md` such
// benchmarks are gated with `#[cfg(not(codspeed))]` and remain available via local `cargo bench`.
// See https://github.com/vortex-data/vortex/pull/8519 for the supporting analysis.
#[cfg(not(codspeed))]
#[divan::bench(args = BENCH_ARGS)]
fn chunked_bool_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
let chunk = make_bool_chunks(len, chunk_count);
Expand Down Expand Up @@ -73,6 +90,8 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize,
.bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
}

// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
#[cfg(not(codspeed))]
#[divan::bench(args = BENCH_ARGS)]
fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
let chunks = make_string_chunks(false, len, chunk_count);
Expand All @@ -91,6 +110,8 @@ fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usiz
})
}

// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
#[cfg(not(codspeed))]
#[divan::bench(args = BENCH_ARGS)]
fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
let chunks = make_string_chunks(false, len, chunk_count);
Expand All @@ -100,6 +121,8 @@ fn chunked_varbinview_into_canonical(bencher: Bencher, (len, chunk_count): (usiz
.bench_refs(|(chunk, ctx)| (**chunk).clone().execute::<Canonical>(ctx))
}

// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
#[cfg(not(codspeed))]
#[divan::bench(args = BENCH_ARGS)]
fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) {
let chunks = make_string_chunks(true, len, chunk_count);
Expand All @@ -118,6 +141,8 @@ fn chunked_varbinview_opt_canonical_into(bencher: Bencher, (len, chunk_count): (
})
}

// Excluded from CodSpeed: VarBinView canonicalization is `memcpy`-bound (see note above).
#[cfg(not(codspeed))]
#[divan::bench(args = BENCH_ARGS)]
fn chunked_varbinview_opt_into_canonical(bencher: Bencher, (len, chunk_count): (usize, usize)) {
let chunks = make_string_chunks(true, len, chunk_count);
Expand Down Expand Up @@ -211,6 +236,9 @@ fn make_opt_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
.into_array()
}

// Only used by `chunked_bool_canonical_into`, which is excluded from CodSpeed (see above), so this
// helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
#[cfg(not(codspeed))]
fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(0);

Expand All @@ -220,6 +248,9 @@ fn make_bool_chunks(len: usize, chunk_count: usize) -> ArrayRef {
.into_array()
}

// Only used by the gated VarBinView canonicalization benches above, which are excluded from
// CodSpeed, so this helper is gated to match and avoid dead-code errors under `--cfg codspeed`.
#[cfg(not(codspeed))]
fn make_string_chunks(nullable: bool, len: usize, chunk_count: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(123);

Expand Down
Loading