From 74e2dbdf0075d87d28eb6ddf2f2fb3535daa677b Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 18:27:12 +0000 Subject: [PATCH 1/3] perf: add `#[inline]` to recover cross-crate inlining under `codegen-units = 16` #8257 set `[profile.bench]` `codegen-units = 16` (cargo's release default) to speed up benchmark/CI builds, but `[profile.release]` uses `codegen-units = 1`. With 16 CGUs the compiler no longer inlines small hot functions across codegen-unit / crate boundaries unless they are explicitly `#[inline]`, so a number of benchmarks regressed (CodSpeed reported ~16% overall, 147 benches). This adds `#[inline]` to the small, hot, per-element / per-dispatch functions that were previously inlined implicitly at `codegen-units = 1`, so the bench profile once again reflects release performance. Measured locally (divan `fastest`, cu=16 baseline -> cu=16 + these inlines, cu=1 shown as the target): search_sorted::binary_search_vortex 27.4ns -> 24.1ns (cu=1 23.4ns) listview_rebuild::varbinview_large 24.2us -> 12.0us (cu=1 11.7us) chunk_array::varbinview_canonical_into 50.4us -> 28.7us (cu=1 24.1us) [1000,10] vortex_bitbuffer::from_iter[65536] 58.1us -> 32.4us (cu=1 32.4us) vortex_bitbuffer::from_iter[16384] 14.7us -> 8.2us (cu=1 8.2us) Functions: - search_sorted: the whole `SearchSorted` / `IndexOrd` chain is co-dependent; ablation shows inlining only the entry method is *worse* than none (29.4ns), because the per-iteration `index_cmp` / `search_sorted_side_idx` calls then go out of line. All links are required. - builders::varbinview: `adjust_view` / `push_view` run once per element in the `VarBinViewBuilder` extend loop (~24% of self-time when out of line). - array dispatch (`Matcher`/`is`/`as_opt`, `AnyCanonical`): folded per chunk in the canonical execute path. - vortex_buffer `BitBuffer`/`BitBufferMut` `FromIterator`: inlining into the caller lets the optimizer specialise the fill loop for the concrete iterator. `as_any` was evaluated and intentionally left un-annotated: ablation showed its `#[inline]` had no measurable effect (LLVM already inlines the trivial body). fastlanes `bitpack_compare_sweep` was also checked but showed no local cu=1 vs cu=16 difference (the SIMD unpack kernel dominates), so it is left unchanged. Signed-off-by: Joe Isaacs https://claude.ai/code/session_019w6k1b8tcA9FohTse8PNrh --- vortex-array/src/array/erased.rs | 5 +++++ vortex-array/src/builders/varbinview.rs | 3 +++ vortex-array/src/canonical.rs | 2 ++ vortex-array/src/matcher.rs | 1 + vortex-array/src/search_sorted.rs | 9 +++++++++ vortex-buffer/src/bit/buf.rs | 1 + vortex-buffer/src/bit/buf_mut.rs | 1 + 7 files changed, 22 insertions(+) diff --git a/vortex-array/src/array/erased.rs b/vortex-array/src/array/erased.rs index 77800377f1c..4543390b073 100644 --- a/vortex-array/src/array/erased.rs +++ b/vortex-array/src/array/erased.rs @@ -390,16 +390,19 @@ impl ArrayRef { } /// Does the array match the given matcher. + #[inline] pub fn is(&self) -> bool { M::matches(self) } /// Returns the array downcast by the given matcher. + #[inline] pub fn as_(&self) -> M::Match<'_> { self.as_opt::().vortex_expect("Failed to downcast") } /// Returns the array downcast by the given matcher. + #[inline] pub fn as_opt(&self) -> Option> { M::try_match(self) } @@ -738,10 +741,12 @@ impl IntoArray for ArrayRef { impl Matcher for V { type Match<'a> = ArrayView<'a, V>; + #[inline] fn matches(array: &ArrayRef) -> bool { array.0.data.as_any().is::>() } + #[inline] fn try_match(array: &'_ ArrayRef) -> Option> { let inner = array.0.data.as_any().downcast_ref::>()?; // # Safety checked by `downcast_ref`. diff --git a/vortex-array/src/builders/varbinview.rs b/vortex-array/src/builders/varbinview.rs index e06b54dd78c..392bef59997 100644 --- a/vortex-array/src/builders/varbinview.rs +++ b/vortex-array/src/builders/varbinview.rs @@ -367,6 +367,7 @@ impl ArrayBuilder for VarBinViewBuilder { } impl VarBinViewBuilder { + #[inline] fn push_view( &mut self, view: BinaryView, @@ -758,6 +759,7 @@ enum PrecomputedViewAdjustment { } impl PrecomputedViewAdjustment { + #[inline] fn adjust_view(&self, view: &BinaryView) -> BinaryView { if view.is_inlined() { return *view; @@ -815,6 +817,7 @@ struct RewritingViewAdjustment { impl RewritingViewAdjustment { /// Can return None if this view can't be adjusted, because there is no precomputed lookup /// for the current buffer. + #[inline] fn adjust_view(&self, view: &BinaryView) -> Option { if view.is_inlined() { return Some(*view); diff --git a/vortex-array/src/canonical.rs b/vortex-array/src/canonical.rs index 6fbdc154a6c..28589ec8c20 100644 --- a/vortex-array/src/canonical.rs +++ b/vortex-array/src/canonical.rs @@ -1072,6 +1072,7 @@ pub struct AnyCanonical; impl Matcher for AnyCanonical { type Match<'a> = CanonicalView<'a>; + #[inline] fn matches(array: &ArrayRef) -> bool { array.is::() || array.is::() @@ -1085,6 +1086,7 @@ impl Matcher for AnyCanonical { || array.is::() } + #[inline] fn try_match(array: &ArrayRef) -> Option> { if let Some(a) = array.as_opt::() { Some(CanonicalView::Null(a)) diff --git a/vortex-array/src/matcher.rs b/vortex-array/src/matcher.rs index 532931df083..c3f9e542f13 100644 --- a/vortex-array/src/matcher.rs +++ b/vortex-array/src/matcher.rs @@ -8,6 +8,7 @@ pub trait Matcher { type Match<'a>; /// Check if the given array matches this matcher type + #[inline] fn matches(array: &ArrayRef) -> bool { Self::try_match(array).is_some() } diff --git a/vortex-array/src/search_sorted.rs b/vortex-array/src/search_sorted.rs index f3514708aec..fc582715ddd 100644 --- a/vortex-array/src/search_sorted.rs +++ b/vortex-array/src/search_sorted.rs @@ -103,18 +103,22 @@ pub trait IndexOrd { /// For example, if self\[idx\] > elem, return Some(Greater). fn index_cmp(&self, idx: usize, elem: &V) -> VortexResult>; + #[inline] fn index_lt(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Less))) } + #[inline] fn index_le(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Less | Equal))) } + #[inline] fn index_gt(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Greater))) } + #[inline] fn index_ge(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Greater | Equal))) } @@ -132,6 +136,7 @@ pub trait IndexOrd { /// |left |array\[i-1\] < value <= array\[i\]| /// |right|array\[i-1\] <= value < array\[i\]| pub trait SearchSorted { + #[inline] fn search_sorted(&self, value: &T, side: SearchSortedSide) -> VortexResult where Self: IndexOrd, @@ -180,6 +185,7 @@ impl SearchSorted for S where S: IndexOrd + ?Sized, { + #[inline] fn search_sorted_by< F: FnMut(usize) -> VortexResult, N: FnMut(usize) -> VortexResult, @@ -210,6 +216,7 @@ where } // Code adapted from Rust standard library slice::binary_search_by +#[inline] fn search_sorted_side_idx VortexResult>( mut find: F, from: usize, @@ -276,11 +283,13 @@ impl IndexOrd for ArrayRef { } impl IndexOrd for [T] { + #[inline] fn index_cmp(&self, idx: usize, elem: &T) -> VortexResult> { // SAFETY: Used in search_sorted_by same as the standard library. The search_sorted ensures idx is in bounds Ok(unsafe { self.get_unchecked(idx) }.partial_cmp(elem)) } + #[inline] fn index_len(&self) -> usize { self.len() } diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs index 83b28e24b4a..cad7233b372 100644 --- a/vortex-buffer/src/bit/buf.rs +++ b/vortex-buffer/src/bit/buf.rs @@ -392,6 +392,7 @@ impl From> for BitBuffer { } impl FromIterator for BitBuffer { + #[inline] fn from_iter>(iter: T) -> Self { BitBufferMut::from_iter(iter).freeze() } diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index 6dac40db8ec..829b0fcf9cc 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -581,6 +581,7 @@ impl From> for BitBufferMut { } impl FromIterator for BitBufferMut { + #[inline] fn from_iter>(iter: T) -> Self { let mut iter = iter.into_iter(); From 167249070c71166ea679e67788fad75a02ae0b49 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 4 Jun 2026 19:35:25 +0000 Subject: [PATCH 2/3] perf: inline `varbinview::zip::push_view` and `BufferHandle::as_host` Continuing the `codegen-units = 16` inlining sweep. A cu=1 vs cu=16 timing diff across all vortex-array / vortex-buffer / vortex-mask benches surfaced a further set of regressions; this fixes the one with a clean, verified cause. `varbinview_zip`: `push_view` is called once per element from `push_range`'s loops, and `BufferHandle::as_host` is a per-buffer accessor. Both were inlined implicitly at cu=1 but went out of line at cu=16 (samply self-time: push_view 17.7%, as_host 5.2% at cu=16, both absent from the cu=1 profile). Measured (divan fastest, cu=16 baseline -> cu=16 + these inlines, cu=1 target): varbinview_zip::fragmented_mask 1.358ms -> 1.222ms (cu=1 1.085ms) `block_mask` (slice-copy dominated) is unchanged and not regressed. The residual gap is a per-call `DeduplicatedBuffers` collect, which has no clean inline target. Investigated but deliberately NOT changed (proven not real inline wins): - `scalar_at_struct::execute_scalar_struct_simple` (cu=1 125us, cu=16 154us): inlining the `execute_scalar` / `OperationsVTable::scalar_at` chain made it *worse* (171-180us) -- partial inlining across the dyn-dispatch boundary backfires, like inlining a search_sorted entry without its helpers. Reverted. - `dict_compare` (~1.5x): the hot `bool::take::take_valid_indices` is out of line in both cu=1 and cu=16 with its inner `collect_bool` loop already fully inlined; the slowdown is CGU codegen variance, not a missing `#[inline]`. - `filter_bool` (~1.3x): the bench profile is dominated by per-sample RNG/mask setup; the mask `BitBuffer::from_iter` is already inlined (prior commit). Signed-off-by: Joe Isaacs https://claude.ai/code/session_019w6k1b8tcA9FohTse8PNrh --- vortex-array/src/arrays/varbinview/compute/zip.rs | 1 + vortex-array/src/buffer.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/vortex-array/src/arrays/varbinview/compute/zip.rs b/vortex-array/src/arrays/varbinview/compute/zip.rs index f4a5c26a9ce..9c198fe6f2c 100644 --- a/vortex-array/src/arrays/varbinview/compute/zip.rs +++ b/vortex-array/src/arrays/varbinview/compute/zip.rs @@ -177,6 +177,7 @@ fn push_range( } } +#[inline] fn push_view( view: BinaryView, buffer_lookup: &[u32], diff --git a/vortex-array/src/buffer.rs b/vortex-array/src/buffer.rs index 29f9d3582be..faf7874b6ca 100644 --- a/vortex-array/src/buffer.rs +++ b/vortex-array/src/buffer.rs @@ -249,6 +249,7 @@ impl BufferHandle { } /// Downcast this handle as a handle to a host-resident buffer, or `None`. + #[inline] pub fn as_host_opt(&self) -> Option<&ByteBuffer> { match &self.0 { Inner::Host(buffer) => Some(buffer), @@ -266,6 +267,7 @@ impl BufferHandle { /// A version of [`as_host_opt`][Self::as_host_opt] that panics if the allocation is /// not a host allocation. + #[inline] pub fn as_host(&self) -> &ByteBuffer { self.as_host_opt().vortex_expect("expected host buffer") } From 5d9bb1f67ff0bef0224e04fa44ab6c5e3a263e76 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 5 Jun 2026 08:15:13 +0000 Subject: [PATCH 3/3] perf: drop three unjustified `#[inline]` annotations after ablation audit Audited every `#[inline]` added in this branch by removing it and re-measuring (or static call-graph analysis). Three are not justified and are removed: - `IndexOrd::index_gt` / `index_ge`: zero call sites anywhere in the workspace (only `index_lt` (Left) and `index_le` (Right) are ever invoked by `search_sorted`). Dead trait defaults -- the annotation can never fire. - `BitBuffer::from_iter` (the outer forwarder to `BitBufferMut::from_iter`): removing it left `from_iter_bit_buffer[65536]` unchanged (27.79us vs 27.82us); LLVM already inlines the trivial `.freeze()` delegation. Redundant. Everything else was confirmed load-bearing by ablation, e.g. dropping `BufferHandle::as_host`/`as_host_opt` regressed `varbinview_zip` 996us -> 1136us (+14%), and dropping `is`/`as_`/`as_opt` regressed `chunk_array_builder` ~3%. Signed-off-by: Joe Isaacs https://claude.ai/code/session_019w6k1b8tcA9FohTse8PNrh --- vortex-array/src/search_sorted.rs | 2 -- vortex-buffer/src/bit/buf.rs | 1 - 2 files changed, 3 deletions(-) diff --git a/vortex-array/src/search_sorted.rs b/vortex-array/src/search_sorted.rs index fc582715ddd..3db5215e049 100644 --- a/vortex-array/src/search_sorted.rs +++ b/vortex-array/src/search_sorted.rs @@ -113,12 +113,10 @@ pub trait IndexOrd { Ok(matches!(self.index_cmp(idx, elem)?, Some(Less | Equal))) } - #[inline] fn index_gt(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Greater))) } - #[inline] fn index_ge(&self, idx: usize, elem: &V) -> VortexResult { Ok(matches!(self.index_cmp(idx, elem)?, Some(Greater | Equal))) } diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs index cad7233b372..83b28e24b4a 100644 --- a/vortex-buffer/src/bit/buf.rs +++ b/vortex-buffer/src/bit/buf.rs @@ -392,7 +392,6 @@ impl From> for BitBuffer { } impl FromIterator for BitBuffer { - #[inline] fn from_iter>(iter: T) -> Self { BitBufferMut::from_iter(iter).freeze() }