From 9aa382e595da8e31306ac7e490648c0a02b67da7 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 24 Oct 2025 15:35:01 -0400 Subject: [PATCH 1/2] fix from_iter impls in vectors Signed-off-by: Connor Tsui --- vortex-vector/src/bool/from_iter.rs | 21 ++++++++++++--------- vortex-vector/src/primitive/from_iter.rs | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/vortex-vector/src/bool/from_iter.rs b/vortex-vector/src/bool/from_iter.rs index fab1b90bf6c..00faf6f7287 100644 --- a/vortex-vector/src/bool/from_iter.rs +++ b/vortex-vector/src/bool/from_iter.rs @@ -26,28 +26,31 @@ impl FromIterator> for BoolVectorMut { I: IntoIterator>, { let iter = iter.into_iter(); - let (lower_bound, _) = iter.size_hint(); + // Since we do not know the length of the iterator, we can only guess how much memory we + // need to reserve. Note that these hints may be inaccurate. + let (lower_bound, upper_bound_opt) = iter.size_hint(); - let mut bits = Vec::with_capacity(lower_bound); - let mut validity = MaskMut::with_capacity(lower_bound); + // In the case that the upper bound is adversarial, we put a hard limit on the amount of + // memory we reserve (and the OS should handle the rest with zero pages). + let reserve_amount = upper_bound_opt.unwrap_or(lower_bound); + + let mut bits = BitBufferMut::with_capacity(reserve_amount); + let mut validity = MaskMut::with_capacity(reserve_amount); for opt_val in iter { match opt_val { Some(val) => { - bits.push(val); + bits.append(val); validity.append_n(true, 1); } None => { - bits.push(false); // Value doesn't matter for invalid entries. + bits.append(false); // Value doesn't matter for invalid entries. validity.append_n(false, 1); } } } - BoolVectorMut { - bits: BitBufferMut::from_iter(bits), - validity, - } + BoolVectorMut { bits, validity } } } diff --git a/vortex-vector/src/primitive/from_iter.rs b/vortex-vector/src/primitive/from_iter.rs index 3613f90497c..bf248eda980 100644 --- a/vortex-vector/src/primitive/from_iter.rs +++ b/vortex-vector/src/primitive/from_iter.rs @@ -27,10 +27,18 @@ impl FromIterator> for PVectorMut { I: IntoIterator>, { let iter = iter.into_iter(); - let (lower_bound, _) = iter.size_hint(); + // Since we do not know the length of the iterator, we can only guess how much memory we + // need to reserve. Note that these hints may be inaccurate. + let (lower_bound, upper_bound_opt) = iter.size_hint(); - let mut elements = Vec::with_capacity(lower_bound); - let mut validity = MaskMut::with_capacity(lower_bound); + // In the case that the upper bound is adversarial, we put a hard limit on the amount of + // memory we reserve (and the OS should handle the rest with zero pages). + let reserve_amount = upper_bound_opt + .unwrap_or(lower_bound) + .min(i32::MAX as usize); + + let mut elements = BufferMut::with_capacity(reserve_amount); + let mut validity = MaskMut::with_capacity(reserve_amount); for opt_val in iter { match opt_val { @@ -45,10 +53,7 @@ impl FromIterator> for PVectorMut { } } - PVectorMut { - elements: BufferMut::from_iter(elements), - validity, - } + PVectorMut { elements, validity } } } From 93d9b90eed39cb01a090018925abd9da86f5cc69 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Mon, 27 Oct 2025 10:32:06 -0400 Subject: [PATCH 2/2] revert upper bound size hints Signed-off-by: Connor Tsui --- vortex-buffer/src/bit/buf_mut.rs | 37 +++++++++++++++++++----- vortex-buffer/src/buffer.rs | 7 +++-- vortex-buffer/src/buffer_mut.rs | 13 ++++----- vortex-buffer/src/trusted_len.rs | 8 ++--- vortex-mask/src/lib.rs | 10 ++++--- vortex-vector/src/bool/from_iter.rs | 10 +++---- vortex-vector/src/primitive/from_iter.rs | 12 +++----- 7 files changed, 56 insertions(+), 41 deletions(-) diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index f005612c097..299712426d4 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -72,6 +72,11 @@ impl BitBufferMut { } } + /// Consumes the buffer and return the underlying byte buffer. + pub fn into_inner(self) -> ByteBufferMut { + self.buffer + } + /// Create a new mutable buffer with requested `len` and all bits set to `true`. pub fn new_set(len: usize) -> Self { Self { @@ -205,20 +210,24 @@ impl BitBufferMut { /// Set the bit at `index` to `true` without checking bounds. /// + /// Note: Do not call this in a tight loop. Prefer to use [`set_bit_unchecked`]. + /// /// # Safety /// /// The caller must ensure that `index` does not exceed the largest bit index in the backing buffer. - pub unsafe fn set_unchecked(&mut self, index: usize) { + unsafe fn set_unchecked(&mut self, index: usize) { // SAFETY: checked by caller unsafe { set_bit_unchecked(self.buffer.as_mut_ptr(), self.offset + index) } } /// Unset the bit at `index` without checking bounds. /// + /// Note: Do not call this in a tight loop. Prefer to use [`unset_bit_unchecked`]. + /// /// # Safety /// /// The caller must ensure that `index` does not exceed the largest bit index in the backing buffer. - pub unsafe fn unset_unchecked(&mut self, index: usize) { + unsafe fn unset_unchecked(&mut self, index: usize) { // SAFETY: checked by caller unsafe { unset_bit_unchecked(self.buffer.as_mut_ptr(), self.offset + index) } } @@ -229,6 +238,7 @@ impl BitBufferMut { /// /// - `new_len` must be less than or equal to [`capacity()`](Self::capacity) /// - The elements at `old_len..new_len` must be initialized + #[inline(always)] pub unsafe fn set_len(&mut self, new_len: usize) { debug_assert!( new_len <= self.capacity(), @@ -470,6 +480,11 @@ impl BitBufferMut { pub fn as_mut_slice(&mut self) -> &mut [u8] { self.buffer.as_mut_slice() } + + /// Returns a raw mutable pointer to the internal buffer. + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.buffer.as_mut_ptr() + } } impl Default for BitBufferMut { @@ -511,14 +526,20 @@ impl FromIterator for BitBufferMut { fn from_iter>(iter: T) -> Self { let mut iter = iter.into_iter(); - // Note that these hints might be incorrect. - let (lower_bound, upper_bound_opt) = iter.size_hint(); - let capacity = upper_bound_opt.unwrap_or(lower_bound); + // Since we do not know the length of the iterator, we can only guess how much memory we + // need to reserve. Note that these hints may be inaccurate. + let (lower_bound, _) = iter.size_hint(); - let mut buf = BitBufferMut::new_unset(capacity); + // We choose not to use the optional upper bound size hint to match the standard library. + + // Initialize all bits to 0 with the given length. By doing this, we only need to set bits + // that are true (and this is faster from benchmarks). + let mut buf = BitBufferMut::new_unset(lower_bound); + assert_eq!(buf.offset, 0); // Directly write within our known capacity. - for i in 0..capacity { + let ptr = buf.buffer.as_mut_ptr(); + for i in 0..lower_bound { let Some(v) = iter.next() else { // SAFETY: We are definitely under the capacity and all values are already // initialized from `new_unset`. @@ -528,7 +549,7 @@ impl FromIterator for BitBufferMut { if v { // SAFETY: We have ensured that we are within the capacity. - unsafe { buf.set_unchecked(i) } + unsafe { set_bit_unchecked(ptr, i) } } } diff --git a/vortex-buffer/src/buffer.rs b/vortex-buffer/src/buffer.rs index b8524f103ec..ff839455a77 100644 --- a/vortex-buffer/src/buffer.rs +++ b/vortex-buffer/src/buffer.rs @@ -193,9 +193,10 @@ impl Buffer { /// Create a buffer with values from the TrustedLen iterator. /// Should be preferred over `from_iter` when the iterator is known to be `TrustedLen`. pub fn from_trusted_len_iter>(iter: I) -> Self { - let (_, high) = iter.size_hint(); - let mut buffer = - BufferMut::with_capacity(high.vortex_expect("TrustedLen iterator has no upper bound")); + let (_, upper_bound) = iter.size_hint(); + let mut buffer = BufferMut::with_capacity( + upper_bound.vortex_expect("TrustedLen iterator has no upper bound"), + ); buffer.extend_trusted(iter); buffer.freeze() } diff --git a/vortex-buffer/src/buffer_mut.rs b/vortex-buffer/src/buffer_mut.rs index 96557189f1e..75df8ebbcfe 100644 --- a/vortex-buffer/src/buffer_mut.rs +++ b/vortex-buffer/src/buffer_mut.rs @@ -484,14 +484,11 @@ impl BufferMut { fn extend_iter(&mut self, mut iter: impl Iterator) { // Since we do not know the length of the iterator, we can only guess how much memory we // need to reserve. Note that these hints may be inaccurate. - let (lower_bound, upper_bound_opt) = iter.size_hint(); - - // In the case that the upper bound is adversarial, we put a hard limit on the amount of - // memory we reserve (and the OS should handle the rest with zero pages). - let reserve_amount = upper_bound_opt - .unwrap_or(lower_bound) - .min(i32::MAX as usize); - self.reserve(reserve_amount); + let (lower_bound, _) = iter.size_hint(); + + // We choose not to use the optional upper bound size hint to match the standard library. + + self.reserve(lower_bound); let unwritten = self.capacity() - self.len(); diff --git a/vortex-buffer/src/trusted_len.rs b/vortex-buffer/src/trusted_len.rs index a163be2dfe9..eca196aaa70 100644 --- a/vortex-buffer/src/trusted_len.rs +++ b/vortex-buffer/src/trusted_len.rs @@ -72,17 +72,17 @@ pub trait TrustedLenExt: Iterator + Sized { /// /// The caller must guarantee that the iterator does indeed have an exact length. unsafe fn trusted_len(self) -> TrustedLenAdapter { - let (lower, maybe_upper) = self.size_hint(); - if let Some(upper) = maybe_upper { + let (lower_bound, upper_bound_opt) = self.size_hint(); + if let Some(upper_bound) = upper_bound_opt { assert_eq!( - lower, upper, + lower_bound, upper_bound, "TrustedLenExt: iterator size hints must match if upper bound is given" ); } TrustedLenAdapter { inner: self, - len: lower, + len: lower_bound, #[cfg(debug_assertions)] count: 0, } diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs index 1cda19eecc0..88a12be6ef4 100644 --- a/vortex-mask/src/lib.rs +++ b/vortex-mask/src/lib.rs @@ -20,7 +20,7 @@ use std::sync::{Arc, OnceLock}; use itertools::Itertools; pub use mask_mut::*; -use vortex_buffer::{BitBuffer, BitBufferMut}; +use vortex_buffer::{BitBuffer, BitBufferMut, set_bit_unchecked}; use vortex_error::{VortexResult, vortex_panic}; /// Represents a set of values that are all included, all excluded, or some mixture of both. @@ -601,11 +601,13 @@ impl Mask { let existing_buffer = mask_values.bit_buffer(); let mut new_buffer_builder = BitBufferMut::new_unset(mask_values.len()); + debug_assert!(limit < mask_values.len()); + let ptr = new_buffer_builder.as_mut_ptr(); for index in existing_buffer.set_indices().take(limit) { - unsafe { - new_buffer_builder.set_unchecked(index); - } + // SAFETY: We checked that `limit` was less than the mask values length, + // therefore `index` must be within the bounds of the bit buffer. + unsafe { set_bit_unchecked(ptr, index) } } Self::from(new_buffer_builder.freeze()) diff --git a/vortex-vector/src/bool/from_iter.rs b/vortex-vector/src/bool/from_iter.rs index 00faf6f7287..9a807b6b63a 100644 --- a/vortex-vector/src/bool/from_iter.rs +++ b/vortex-vector/src/bool/from_iter.rs @@ -28,14 +28,12 @@ impl FromIterator> for BoolVectorMut { let iter = iter.into_iter(); // Since we do not know the length of the iterator, we can only guess how much memory we // need to reserve. Note that these hints may be inaccurate. - let (lower_bound, upper_bound_opt) = iter.size_hint(); + let (lower_bound, _) = iter.size_hint(); - // In the case that the upper bound is adversarial, we put a hard limit on the amount of - // memory we reserve (and the OS should handle the rest with zero pages). - let reserve_amount = upper_bound_opt.unwrap_or(lower_bound); + // We choose not to use the optional upper bound size hint to match the standard library. - let mut bits = BitBufferMut::with_capacity(reserve_amount); - let mut validity = MaskMut::with_capacity(reserve_amount); + let mut bits = BitBufferMut::with_capacity(lower_bound); + let mut validity = MaskMut::with_capacity(lower_bound); for opt_val in iter { match opt_val { diff --git a/vortex-vector/src/primitive/from_iter.rs b/vortex-vector/src/primitive/from_iter.rs index bf248eda980..9cd06b2e1d1 100644 --- a/vortex-vector/src/primitive/from_iter.rs +++ b/vortex-vector/src/primitive/from_iter.rs @@ -29,16 +29,12 @@ impl FromIterator> for PVectorMut { let iter = iter.into_iter(); // Since we do not know the length of the iterator, we can only guess how much memory we // need to reserve. Note that these hints may be inaccurate. - let (lower_bound, upper_bound_opt) = iter.size_hint(); + let (lower_bound, _) = iter.size_hint(); - // In the case that the upper bound is adversarial, we put a hard limit on the amount of - // memory we reserve (and the OS should handle the rest with zero pages). - let reserve_amount = upper_bound_opt - .unwrap_or(lower_bound) - .min(i32::MAX as usize); + // We choose not to use the optional upper bound size hint to match the standard library. - let mut elements = BufferMut::with_capacity(reserve_amount); - let mut validity = MaskMut::with_capacity(reserve_amount); + let mut elements = BufferMut::with_capacity(lower_bound); + let mut validity = MaskMut::with_capacity(lower_bound); for opt_val in iter { match opt_val {