From 454b0538cac339b77028b015444d1f8d3baa41fe Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 30 Oct 2025 08:54:27 -0400 Subject: [PATCH 1/7] feat: rest of BinaryViewVector{,Mut} Signed-off-by: Andrew Duffy --- Cargo.lock | 8 +- vortex-buffer/src/buffer.rs | 10 +- .../src/arrow/{varbin.rs => binaryview.rs} | 4 +- vortex-compute/src/arrow/mod.rs | 2 +- vortex-compute/src/mask/mod.rs | 7 +- .../src/{varbin => binaryview}/mod.rs | 31 +- vortex-vector/src/binaryview/types.rs | 120 +++++++ vortex-vector/src/binaryview/vector.rs | 198 ++++++++++ vortex-vector/src/binaryview/vector_mut.rs | 340 ++++++++++++++++++ .../src/{varbin => binaryview}/view.rs | 86 ++++- vortex-vector/src/lib.rs | 4 +- vortex-vector/src/private.rs | 4 +- vortex-vector/src/varbin/types.rs | 93 ----- vortex-vector/src/varbin/vector.rs | 76 ---- vortex-vector/src/varbin/vector_mut.rs | 67 ---- vortex-vector/src/vector_mut.rs | 2 +- 16 files changed, 782 insertions(+), 270 deletions(-) rename vortex-compute/src/arrow/{varbin.rs => binaryview.rs} (91%) rename vortex-vector/src/{varbin => binaryview}/mod.rs (62%) create mode 100644 vortex-vector/src/binaryview/types.rs create mode 100644 vortex-vector/src/binaryview/vector.rs create mode 100644 vortex-vector/src/binaryview/vector_mut.rs rename vortex-vector/src/{varbin => binaryview}/view.rs (72%) delete mode 100644 vortex-vector/src/varbin/types.rs delete mode 100644 vortex-vector/src/varbin/vector.rs delete mode 100644 vortex-vector/src/varbin/vector_mut.rs diff --git a/Cargo.lock b/Cargo.lock index f68cac8f57e..1150d340edb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1542,7 +1542,7 @@ checksum = "ba7a06c0b31fff5ff2e1e7d37dbf940864e2a974b336e1a2938d10af6e8fb283" dependencies = [ "serde", "termcolor", - "unicode-width 0.1.14", + "unicode-width 0.2.0", ] [[package]] @@ -2897,7 +2897,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3072,7 +3072,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -5305,7 +5305,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] diff --git a/vortex-buffer/src/buffer.rs b/vortex-buffer/src/buffer.rs index ff839455a77..81e8a04c79c 100644 --- a/vortex-buffer/src/buffer.rs +++ b/vortex-buffer/src/buffer.rs @@ -226,6 +226,12 @@ impl Buffer { unsafe { std::slice::from_raw_parts(self.bytes.as_ptr().cast(), self.length) } } + /// Return a view over the buffer as an opaque byte slice. + #[inline(always)] + pub fn as_bytes(&self) -> &[u8] { + self.bytes.as_ref() + } + /// Returns an iterator over the buffer of elements of type T. pub fn iter(&self) -> Iter<'_, T> { Iter { @@ -319,7 +325,7 @@ impl Buffer { /// Returns a slice of self that is equivalent to the given subset. /// - /// When processing the buffer you will often end up with &\[T\] that is a subset + /// When processing the buffer you will often end up with `&[T]` that is a subset /// of the underlying buffer. This function turns the slice into a slice of the buffer /// it has been taken from. /// @@ -332,7 +338,7 @@ impl Buffer { /// Returns a slice of self that is equivalent to the given subset. /// - /// When processing the buffer you will often end up with &\[T\] that is a subset + /// When processing the buffer you will often end up with `&[T]` that is a subset /// of the underlying buffer. This function turns the slice into a slice of the buffer /// it has been taken from. /// diff --git a/vortex-compute/src/arrow/varbin.rs b/vortex-compute/src/arrow/binaryview.rs similarity index 91% rename from vortex-compute/src/arrow/varbin.rs rename to vortex-compute/src/arrow/binaryview.rs index e38e8dc33f8..92406767800 100644 --- a/vortex-compute/src/arrow/varbin.rs +++ b/vortex-compute/src/arrow/binaryview.rs @@ -6,13 +6,13 @@ use std::sync::Arc; use arrow_array::{ArrayRef, GenericByteViewArray}; use vortex_buffer::Buffer; use vortex_error::VortexResult; -use vortex_vector::{BinaryType, StringType, VarBinVector}; +use vortex_vector::{BinaryType, BinaryViewVector, StringType}; use crate::arrow::IntoArrow; macro_rules! impl_varbin { ($T:ty, $A:ty) => { - impl IntoArrow for VarBinVector<$T> { + impl IntoArrow for BinaryViewVector<$T> { fn into_arrow(self) -> VortexResult { let (views, buffers, validity) = self.into_parts(); diff --git a/vortex-compute/src/arrow/mod.rs b/vortex-compute/src/arrow/mod.rs index f8c3bdcda1d..816ecbd4a5f 100644 --- a/vortex-compute/src/arrow/mod.rs +++ b/vortex-compute/src/arrow/mod.rs @@ -5,13 +5,13 @@ use vortex_error::VortexResult; +mod binaryview; mod bool; mod decimal; mod mask; mod null; mod primitive; mod struct_; -mod varbin; mod vector; /// Trait for converting Vortex vector types into Arrow types. diff --git a/vortex-compute/src/mask/mod.rs b/vortex-compute/src/mask/mod.rs index 125c081b367..f1c2f004ac8 100644 --- a/vortex-compute/src/mask/mod.rs +++ b/vortex-compute/src/mask/mod.rs @@ -8,8 +8,9 @@ use std::ops::BitAnd; use vortex_dtype::{NativeDecimalType, NativePType}; use vortex_mask::Mask; use vortex_vector::{ - BoolVector, DVector, DecimalVector, NullVector, PVector, PrimitiveVector, StructVector, - VarBinType, VarBinVector, Vector, match_each_dvector, match_each_pvector, match_each_vector, + BinaryViewType, BinaryViewVector, BoolVector, DVector, DecimalVector, NullVector, PVector, + PrimitiveVector, StructVector, Vector, match_each_dvector, match_each_pvector, + match_each_vector, }; /// Trait for masking the validity of an array or vector. @@ -70,7 +71,7 @@ impl MaskValidity for PVector { } } -impl MaskValidity for VarBinVector { +impl MaskValidity for BinaryViewVector { fn mask_validity(self, mask: &Mask) -> Self { let (views, buffers, validity) = self.into_parts(); // SAFETY: we are preserving the original views and buffers, only modifying the validity. diff --git a/vortex-vector/src/varbin/mod.rs b/vortex-vector/src/binaryview/mod.rs similarity index 62% rename from vortex-vector/src/varbin/mod.rs rename to vortex-vector/src/binaryview/mod.rs index a594f5a1735..45cfe76389c 100644 --- a/vortex-vector/src/varbin/mod.rs +++ b/vortex-vector/src/binaryview/mod.rs @@ -1,6 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Definition and implementation of variable-length binary types. +//! +//! All types are specializations of the [`BinaryViewVector`] type, which is represented internally +//! by `BinaryView`s. `BinaryView`s are identical to the `BinaryView` type defined by the Arrow +//! [specification](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout), +//! which are inspired by "German" strings. + pub use types::*; pub use vector::*; pub use vector_mut::*; @@ -14,16 +21,16 @@ mod vector_mut; mod view; /// Type alias for non-utf8 variable-length binary vectors. -pub type BinaryVector = VarBinVector; +pub type BinaryVector = BinaryViewVector; /// Type alias for mutable non-utf8 variable-length binary vectors. -pub type BinaryVectorMut = VarBinVectorMut; +pub type BinaryVectorMut = BinaryViewVectorMut; /// Type alias for UTF-8 variable-length string vectors. -pub type StringVector = VarBinVector; +pub type StringVector = BinaryViewVector; /// Type alias for mutable UTF-8 variable-length string vectors. -pub type StringVectorMut = VarBinVectorMut; +pub type StringVectorMut = BinaryViewVectorMut; -impl VarBinTypeDowncast for Vector { - type Output = VarBinVector; +impl BinaryViewDowncast for Vector { + type Output = BinaryViewVector; fn into_binary(self) -> Self::Output { if let Vector::Binary(v) = self { @@ -40,8 +47,8 @@ impl VarBinTypeDowncast for Vector { } } -impl VarBinTypeUpcast for Vector { - type Input = VarBinVector; +impl BinaryViewTypeUpcast for Vector { + type Input = BinaryViewVector; fn from_binary(input: Self::Input) -> Self { Vector::Binary(input) @@ -52,8 +59,8 @@ impl VarBinTypeUpcast for Vector { } } -impl VarBinTypeDowncast for VectorMut { - type Output = VarBinVectorMut; +impl BinaryViewDowncast for VectorMut { + type Output = BinaryViewVectorMut; fn into_binary(self) -> Self::Output { if let VectorMut::Binary(v) = self { @@ -70,8 +77,8 @@ impl VarBinTypeDowncast for VectorMut { } } -impl VarBinTypeUpcast for VectorMut { - type Input = VarBinVectorMut; +impl BinaryViewTypeUpcast for VectorMut { + type Input = BinaryViewVectorMut; fn from_binary(input: Self::Input) -> Self { VectorMut::Binary(input) diff --git a/vortex-vector/src/binaryview/types.rs b/vortex-vector/src/binaryview/types.rs new file mode 100644 index 00000000000..6a3418ac974 --- /dev/null +++ b/vortex-vector/src/binaryview/types.rs @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Variable-length binary types and related traits. + +use std::fmt::Debug; + +use crate::{BinaryViewVector, BinaryViewVectorMut, Vector, VectorMut}; + +impl From> for Vector { + fn from(value: BinaryViewVector) -> Self { + T::upcast(value) + } +} + +impl From> for VectorMut { + fn from(value: BinaryViewVectorMut) -> Self { + T::upcast(value) + } +} + +/// Trait to mark supported binary view types. +pub trait BinaryViewType: Debug + Sized + private::Sealed { + /// The slice type for this variable binary type. + type Slice: ?Sized + AsRef<[u8]>; + + /// Validate if a set of bytes conforms to the logical type constraints of the native `Slice`. + fn validate(bytes: &[u8]) -> bool; + + /// Returns the bytes as the native `Slice` type + /// for this binary view vector. + fn from_bytes(bytes: &[u8]) -> &Self::Slice; + + /// Downcast the provided object to a type-specific instance. + fn downcast(visitor: V) -> V::Output; + + /// Upcast a type-specific instance to a generic instance. + fn upcast(input: V::Input) -> V; +} + +/// [`BinaryType`] for UTF-8 strings. +#[derive(Clone, Debug)] +pub struct StringType; +impl BinaryViewType for StringType { + type Slice = str; + + #[inline(always)] + fn validate(bytes: &[u8]) -> bool { + std::str::from_utf8(bytes).is_ok() + } + + fn from_bytes(bytes: &[u8]) -> &Self::Slice { + // SAFETY: vectors should be checked at the boundary for upholding the UTF8 variant, + // or only be built from vectors that are known to satisfy the variant. + unsafe { std::str::from_utf8_unchecked(bytes) } + } + + fn downcast(visitor: V) -> V::Output { + visitor.into_string() + } + + fn upcast(input: V::Input) -> V { + V::from_string(input) + } +} + +/// [`BinaryType`] for raw binary data. +#[derive(Clone, Debug)] +pub struct BinaryType; +impl BinaryViewType for BinaryType { + type Slice = [u8]; + + #[inline(always)] + fn validate(_bytes: &[u8]) -> bool { + true + } + + fn from_bytes(bytes: &[u8]) -> &Self::Slice { + bytes + } + + fn downcast(visitor: V) -> V::Output { + visitor.into_binary() + } + + fn upcast(input: V::Input) -> V { + V::from_binary(input) + } +} + +/// Trait for downcasting generic variable binary types to specific types. +pub trait BinaryViewDowncast { + /// The output type after downcasting. + type Output; + + /// Downcast to a binary type. + fn into_binary(self) -> Self::Output; + /// Downcast to a string type. + fn into_string(self) -> Self::Output; +} + +/// Trait for upcasting specific variable binary types to generic types. +pub trait BinaryViewTypeUpcast { + /// The input type for upcasting. + type Input; + + /// Upcast from a binary type. + fn from_binary(input: Self::Input) -> Self; + /// Upcast from a string type. + fn from_string(input: Self::Input) -> Self; +} + +/// Private module to seal the `BinaryViewType` trait. +mod private { + /// Sealed trait to prevent external implementations of [`VarBinType`]. + pub trait Sealed {} + + impl Sealed for super::StringType {} + impl Sealed for super::BinaryType {} +} diff --git a/vortex-vector/src/binaryview/vector.rs b/vortex-vector/src/binaryview/vector.rs new file mode 100644 index 00000000000..3c348951410 --- /dev/null +++ b/vortex-vector/src/binaryview/vector.rs @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Variable-length binary vector implementation. + +use std::sync::Arc; + +use vortex_buffer::{Buffer, ByteBuffer}; +use vortex_error::{VortexExpect, VortexResult}; +use vortex_mask::Mask; + +use crate::VectorOps; +use crate::binaryview::BinaryViewType; +use crate::binaryview::vector_mut::BinaryViewVectorMut; +use crate::binaryview::view::{BinaryView, validate_views}; + +/// A variable-length binary vector. +/// +/// This is the core vector for string and binary data. +#[derive(Debug, Clone)] +pub struct BinaryViewVector { + /// Views into the binary data. + views: Buffer, + /// Buffers holding the referenced binary data. + buffers: Arc>, + /// Validity mask for the vector. + validity: Mask, + /// Marker trait for the [`BinaryViewType`]. + _marker: std::marker::PhantomData, +} + +impl BinaryViewVector { + /// Creates a new [`BinaryViewVector`] from the provided components. + /// + /// # Safety + /// + /// This function is unsafe because it does not validate the consistency of the provided + /// components. + /// + /// The caller must uphold all validation that would otherwise be validated by + /// the [safe constructor][Self::try_new]. + pub unsafe fn new_unchecked( + views: Buffer, + buffers: Arc>, + validity: Mask, + ) -> Self { + Self { + views, + validity, + buffers, + _marker: std::marker::PhantomData, + } + } + + /// Create a new `BinaryViewVector` from its components, panicking if validation fails. + /// + /// # Errors + /// + /// This function will panic if any of the validation checks performed by [`try_new`][Self::try_new] + /// fails. + pub fn new(views: Buffer, buffers: Arc>, validity: Mask) -> Self { + Self::try_new(views, buffers, validity).vortex_expect("Failed to create `BinaryViewVector`") + } + + /// Create a new [`BinaryViewVector`] from the provided components with validation. + /// + /// # Errors + /// + /// This function will return an error if any of the following validation checks fails: + /// + /// 1. The length of the `views` does not match the length of the provided `validity` + /// 2. Any non-null `views` point to invalid `buffers` or buffer offset ranges + /// 3. Any data stored inlined or in the `buffers` and referenced by the `views` does not + /// conform to the [validation constraints][BinaryViewType::validate] of this view type. + pub fn try_new( + views: Buffer, + buffers: Arc>, + validity: Mask, + ) -> VortexResult { + validate_views( + &views, + &*buffers, + |index| validity.value(index), + T::validate, + )?; + + Ok(Self { + views, + buffers, + validity, + _marker: std::marker::PhantomData, + }) + } + + /// Decomposes the vector into its constituent parts. + pub fn into_parts(self) -> (Buffer, Arc>, Mask) { + (self.views, self.buffers, self.validity) + } + + /// Get the `index` item from the vector as a native `Slice` type. + /// + /// This function will panic is `index` is out of range for the vector's length. + pub fn get(&self, index: usize) -> Option<&T::Slice> { + if !self.validity.value(index) { + return None; + } + + let view = self.views[index]; + if view.is_inlined() { + // start = seek to the `index`-th BinaryView, plus 4 bytes for the leading u32 field + let start = index * size_of::() + 4; + let length = view.as_view().size as usize; + let slice = &self.views().as_bytes()[start..start + length]; + Some(T::from_bytes(slice)) + } else { + // Get a pointer into the buffer range + let view_ref = view.as_view(); + let buffer = &self.buffers[view_ref.buffer_index as usize]; + + let start = view_ref.offset as usize; + let length = view_ref.size as usize; + Some(T::from_bytes(&buffer.as_bytes()[start..start + length])) + } + } + + /// Buffers + pub fn buffers(&self) -> &Arc> { + &self.buffers + } + + /// Views + pub fn views(&self) -> &Buffer { + &self.views + } +} + +impl VectorOps for BinaryViewVector { + type Mutable = BinaryViewVectorMut; + + fn len(&self) -> usize { + self.views.len() + } + + fn validity(&self) -> &Mask { + &self.validity + } + + fn try_into_mut(self) -> Result + where + Self: Sized, + { + let views_mut = match self.views.try_into_mut() { + Ok(views_mut) => views_mut, + Err(views) => { + return Err(Self { + views, + validity: self.validity, + buffers: self.buffers, + _marker: std::marker::PhantomData, + }); + } + }; + + let validity_mut = match self.validity.try_into_mut() { + Ok(validity_mut) => validity_mut, + Err(validity) => { + return Err(Self { + views: views_mut.freeze(), + validity, + buffers: self.buffers, + _marker: std::marker::PhantomData, + }); + } + }; + + let buffers_mut = match Arc::try_unwrap(self.buffers) { + Ok(buffers) => buffers.into_vec(), + Err(arc_buffers) => { + return Err(Self { + views: views_mut.freeze(), + validity: validity_mut.freeze(), + buffers: arc_buffers, + _marker: std::marker::PhantomData, + }); + } + }; + + // SAFETY: the BinaryViewVector maintains the same invariants that are + // otherwise checked in the safe BinaryViewVectorMut constructor. + unsafe { + Ok(BinaryViewVectorMut::new_unchecked( + views_mut, + validity_mut, + buffers_mut, + )) + } + } +} diff --git a/vortex-vector/src/binaryview/vector_mut.rs b/vortex-vector/src/binaryview/vector_mut.rs new file mode 100644 index 00000000000..7436723e699 --- /dev/null +++ b/vortex-vector/src/binaryview/vector_mut.rs @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Mutable variable-length binary vector. + +use std::sync::Arc; + +use vortex_buffer::{BufferMut, ByteBuffer, ByteBufferMut}; +use vortex_error::{VortexExpect, VortexResult, vortex_ensure}; +use vortex_mask::MaskMut; + +use crate::binaryview::BinaryViewType; +use crate::binaryview::vector::BinaryViewVector; +use crate::binaryview::view::{BinaryView, validate_views}; +use crate::{VectorMutOps, VectorOps}; + +/// Mutable variable-length binary vector. +#[derive(Clone, Debug)] + +/// A mutable vector of binary view data. +/// +/// The immutable equivalent of this type is [`BinaryViewVector`]. +pub struct BinaryViewVectorMut { + /// Views into the binary data. + views: BufferMut, + /// Validity mask for the vector. + validity: MaskMut, + + /// The completed buffers holding referenced binary data. + buffers: Vec, + /// The current buffer being appended to, if any. + open_buffer: Option, + + /// Marker trait for the [`BinaryViewType`]. + _marker: std::marker::PhantomData, +} + +impl BinaryViewVectorMut { + /// Create a new [`BinaryViewVectorMut`] from its components, panicking if validation fails. + /// + /// # Errors + /// + /// This function will panic if any of the validation checks performed by [`try_new`][Self::try_new] + /// fails. + pub fn new(views: BufferMut, buffers: Vec, validity: MaskMut) -> Self { + Self::try_new(views, buffers, validity) + .vortex_expect("Failed to create `BinaryViewVectorMut`") + } + + /// Tries to create a new [`BinaryViewVectorMut`] from its components. + /// + /// # Errors + /// + /// Returns an error if the length of the validity mask does not match the length of the views. + /// + /// Returns an error if the views reference any data that is not a valid buffer + pub fn try_new( + views: BufferMut, + buffers: Vec, + validity: MaskMut, + ) -> VortexResult { + vortex_ensure!( + views.len() == validity.len(), + "views buffer length {} != validity length {}", + views.len(), + validity.len() + ); + + validate_views(&views, &buffers, |index| validity.value(index), T::validate)?; + + Ok(Self { + views, + buffers, + validity, + open_buffer: None, + _marker: std::marker::PhantomData, + }) + } + + /// Creates a new [`BinaryViewVectorMut`] from the given bits and validity mask without validation. + /// + /// # Safety + /// + /// The caller must ensure that the validity mask has the same length as the views. + pub unsafe fn new_unchecked( + views: BufferMut, + validity: MaskMut, + buffers: Vec, + ) -> Self { + Self { + views, + buffers, + validity, + open_buffer: None, + _marker: std::marker::PhantomData, + } + } +} + +impl VectorMutOps for BinaryViewVectorMut { + type Immutable = BinaryViewVector; + + fn len(&self) -> usize { + self.views.len() + } + + fn capacity(&self) -> usize { + self.views.capacity() + } + + fn reserve(&mut self, additional: usize) { + self.views.reserve(additional); + self.validity.reserve(additional); + } + + fn extend_from_vector(&mut self, other: &Self::Immutable) { + // Close any existing views into a new buffer + if let Some(open) = self.open_buffer.take() { + self.buffers.push(open.freeze()); + } + + // We build a lookup table to map BinaryView's from the `other` to have + // valid buffer indices in the current array. + let mut buf_index_lookup: Vec = Vec::with_capacity(other.buffers().len()); + let mut new_buffers = Vec::new(); + for buffer in other.buffers().iter() { + let ptr = buffer.as_ptr().addr(); + let new_index: u32 = self + .buffers + .iter() + .position(|b| b.as_ptr().addr() == ptr) + .unwrap_or_else(|| self.buffers.len() + new_buffers.len()) + .try_into() + .vortex_expect("buffer index must fit in u32"); + + if new_index as usize == new_buffers.len() { + // We need to append the buffer + new_buffers.push(buffer.clone()); + } + + buf_index_lookup.push(new_index); + } + + // rewrite the views using our lookup table + let new_views = rewrite_views(other.views().iter().copied(), &buf_index_lookup); + + self.buffers.extend(new_buffers); + self.views.extend(new_views); + self.validity.append_mask(other.validity()) + } + + fn append_nulls(&mut self, n: usize) { + self.views.push_n(BinaryView::empty_view(), n); + self.validity.append_n(false, n); + } + + fn freeze(mut self) -> Self::Immutable { + // Freeze all components, close any in-progress views + if let Some(open) = self.open_buffer.take() { + self.buffers.push(open.freeze()); + } + + unsafe { + BinaryViewVector::new_unchecked( + self.views.freeze(), + Arc::new(self.buffers.into()), + self.validity.freeze(), + ) + } + } + + fn split_off(&mut self, _at: usize) -> Self { + todo!() + } + + fn unsplit(&mut self, _other: Self) { + todo!() + } +} + +/// Create a new iterator that yields views rewritten with a new buffer index. +#[inline] +fn rewrite_views( + views: impl Iterator, + buf_index_lookup: &[u32], +) -> impl Iterator { + views.map(|mut view| { + if view.is_inlined() { + return view; + } + let view = view.as_view_mut(); + let old_index = view.buffer_index; + let new_index = *buf_index_lookup + .get(old_index as usize) + .unwrap_or(&old_index); + view.buffer_index = new_index; + BinaryView { _ref: *view } + }) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use vortex_buffer::{ByteBuffer, buffer, buffer_mut}; + use vortex_mask::{Mask, MaskMut}; + + use crate::binaryview::view::BinaryView; + use crate::{StringVector, StringVectorMut, VectorMutOps, VectorOps}; + + #[test] + fn test_basic() { + let strings_mut = StringVectorMut::new( + buffer_mut![ + BinaryView::new_inlined(b"inlined1"), + BinaryView::make_view(b"long string 1", 0, 0), + BinaryView::new_inlined(b"inlined2"), + BinaryView::make_view(b"long string 2", 0, 13), + BinaryView::new_inlined(b"inlined3"), + BinaryView::make_view(b"long string 3", 0, 26), + ], + vec![ByteBuffer::copy_from( + "long string 1long string 2long string 3", + )], + MaskMut::new_true(6), + ); + + let strings = strings_mut.freeze(); + assert_eq!(strings.get(0), Some("inlined1")); + assert_eq!(strings.get(1), Some("long string 1")); + assert_eq!(strings.get(2), Some("inlined2")); + assert_eq!(strings.get(3), Some("long string 2")); + assert_eq!(strings.get(4), Some("inlined3")); + assert_eq!(strings.get(5), Some("long string 3")); + } + + #[test] + fn test_extend_self_reference() { + let buf0 = ByteBuffer::copy_from( + b"a really very quite long string 1a really very quite long string 2", + ); + let buf1 = ByteBuffer::copy_from( + b"a really very quite long string 3a really very quite long string 4", + ); + + let mut strings_mut = StringVectorMut::new( + buffer_mut![ + BinaryView::new_inlined(b"inlined0"), + BinaryView::new_inlined(b"inlined1"), + BinaryView::make_view(b"a really very quite long string 4", 1, 33), + BinaryView::make_view(b"a really very quite long string 3", 1, 0), + BinaryView::make_view(b"a really very quite long string 2", 0, 33), + BinaryView::make_view(b"a really very quite long string 1", 0, 0), + ], + vec![buf0, buf1.clone()], + MaskMut::new_true(6), + ); + + // The `StringVector` we extend from + let strings = StringVector::new( + buffer![BinaryView::make_view( + b"a really very quite long string 2", + 0, + 33 + )], + Arc::new(Box::new([buf1])), + Mask::new_true(1), + ); + + strings_mut.extend_from_vector(&strings); + + let strings_finished = strings_mut.freeze(); + assert!(strings_finished.validity().all_true()); + + assert_eq!(strings_finished.get(0).unwrap(), "inlined0"); + assert_eq!(strings_finished.get(1).unwrap(), "inlined1"); + assert_eq!( + strings_finished.get(2).unwrap(), + "a really very quite long string 4" + ); + assert_eq!( + strings_finished.get(3).unwrap(), + "a really very quite long string 3" + ); + assert_eq!( + strings_finished.get(4).unwrap(), + "a really very quite long string 2", + ); + assert_eq!( + strings_finished.get(5).unwrap(), + "a really very quite long string 1" + ); + assert_eq!( + strings_finished.get(6).unwrap(), + "a really very quite long string 4" + ); + + assert_eq!(strings_finished.buffers().len(), 2); + } + + #[test] + fn test_extend_nulls() { + // Extend multiple times, with nulls. + let mut mask1 = MaskMut::with_capacity(4); + mask1.append_n(false, 2); + mask1.append_n(true, 2); + + let mut strings_mut = StringVectorMut::new( + buffer_mut![ + BinaryView::empty_view(), + BinaryView::empty_view(), + BinaryView::new_inlined(b"nonnull1"), + BinaryView::new_inlined(b"nonnull2"), + ], + vec![ByteBuffer::empty()], + mask1, + ); + + let strings = StringVector::new( + buffer![ + BinaryView::new_inlined(b"extend1"), + BinaryView::empty_view(), + BinaryView::new_inlined(b"extend2"), + ], + Arc::new(Box::new([ByteBuffer::empty()])), + Mask::from_iter([true, false, true]), + ); + + strings_mut.extend_from_vector(&strings); + let strings_finished = strings_mut.freeze(); + + assert_eq!(strings_finished.get(0), None); + assert_eq!(strings_finished.get(1), None); + assert_eq!(strings_finished.get(2), Some("nonnull1")); + assert_eq!(strings_finished.get(3), Some("nonnull2")); + assert_eq!(strings_finished.get(4), Some("extend1")); + assert_eq!(strings_finished.get(5), None); + assert_eq!(strings_finished.get(6), Some("extend2")); + } +} diff --git a/vortex-vector/src/varbin/view.rs b/vortex-vector/src/binaryview/view.rs similarity index 72% rename from vortex-vector/src/varbin/view.rs rename to vortex-vector/src/binaryview/view.rs index cbcd20919c3..1cd19111f1c 100644 --- a/vortex-vector/src/varbin/view.rs +++ b/vortex-vector/src/binaryview/view.rs @@ -8,7 +8,8 @@ use std::hash::{Hash, Hasher}; use std::ops::Range; use static_assertions::{assert_eq_align, assert_eq_size}; -use vortex_error::VortexUnwrap; +use vortex_buffer::ByteBuffer; +use vortex_error::{VortexResult, VortexUnwrap, vortex_ensure, vortex_err}; /// A view over a variable-length binary value. /// @@ -19,13 +20,13 @@ use vortex_error::VortexUnwrap; pub union BinaryView { /// Numeric representation. This is logically `u128`, but we split it into the high and low /// bits to preserve the alignment. - le_bytes: [u8; 16], + pub(crate) le_bytes: [u8; 16], /// Inlined representation: strings <= 12 bytes - inlined: Inlined, + pub(crate) inlined: Inlined, /// Reference type: strings > 12 bytes. - _ref: Ref, + pub(crate) _ref: Ref, } assert_eq_align!(BinaryView, u128); @@ -183,7 +184,7 @@ impl BinaryView { /// Create a new empty view #[inline] pub fn empty_view() -> Self { - Self::new_inlined(&[]) + Self { le_bytes: [0; 16] } } /// Create a new inlined binary view @@ -227,6 +228,11 @@ impl BinaryView { unsafe { &self._ref } } + /// Returns a mutable reference to the reference representation of the binary value. + pub fn as_view_mut(&mut self) -> &mut Ref { + unsafe { &mut self._ref } + } + /// Returns the binary view as u128 representation. pub fn as_u128(&self) -> u128 { // SAFETY: binary view always safe to read as u128 LE bytes @@ -259,3 +265,73 @@ impl fmt::Debug for BinaryView { s.finish() } } + +/// Validate that all views either +/// +/// 1. Contain valid inline data that conforms to type constraints as defined by the `validator` +/// 2. Points at a valid range of owned buffer memory, and the bytes stored there conform to +/// the type constraints as defined by the `validator`. +pub(super) fn validate_views( + views: impl AsRef<[BinaryView]>, + buffers: impl AsRef<[ByteBuffer]>, + validity: IsValidFn, + validator: ValidateFn, +) -> VortexResult<()> +where + IsValidFn: Fn(usize) -> bool, + ValidateFn: Fn(&[u8]) -> bool, +{ + let buffers = buffers.as_ref(); + let views = views.as_ref(); + for (idx, &view) in views.iter().enumerate() { + if !validity(idx) { + continue; + } + + if view.is_inlined() { + // Validate the inline bytestring + let bytes = &unsafe { view.inlined }.data[..view.len() as usize]; + vortex_ensure!( + validator(bytes), + "view at index {idx}: inlined bytes failed utf-8 validation" + ); + } else { + // Validate the view pointer + let view = view.as_view(); + let buf_index = view.buffer_index as usize; + let start_offset = view.offset as usize; + let end_offset = start_offset.saturating_add(view.size as usize); + + let buf = buffers.get(buf_index).ok_or_else(|| + vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers", + buffers.len()))?; + + vortex_ensure!( + start_offset < buf.len(), + "start offset {start_offset} out of bounds for buffer {buf_index} with size {}", + buf.len(), + ); + + vortex_ensure!( + end_offset <= buf.len(), + "end offset {end_offset} out of bounds for buffer {buf_index} with size {}", + buf.len(), + ); + + // Make sure the prefix data matches the buffer data. + let bytes = &buf[start_offset..end_offset]; + vortex_ensure!( + view.prefix == bytes[..4], + "VarBinView prefix does not match full string" + ); + + // Validate the full string + vortex_ensure!( + validator(bytes), + "view at index {idx}: outlined bytes fails utf-8 validation" + ); + } + } + + Ok(()) +} diff --git a/vortex-vector/src/lib.rs b/vortex-vector/src/lib.rs index 4a94540e788..8adf9016b89 100644 --- a/vortex-vector/src/lib.rs +++ b/vortex-vector/src/lib.rs @@ -10,19 +10,19 @@ #![deny(clippy::missing_panics_doc)] #![deny(clippy::missing_safety_doc)] +mod binaryview; mod bool; mod decimal; mod null; mod primitive; mod struct_; -mod varbin; +pub use binaryview::*; pub use bool::*; pub use decimal::*; pub use null::*; pub use primitive::*; pub use struct_::*; -pub use varbin::*; mod ops; mod vector; diff --git a/vortex-vector/src/private.rs b/vortex-vector/src/private.rs index fcd9fb66a2f..2ae860b6894 100644 --- a/vortex-vector/src/private.rs +++ b/vortex-vector/src/private.rs @@ -34,8 +34,8 @@ impl Sealed for PrimitiveVectorMut {} impl Sealed for PVector {} impl Sealed for PVectorMut {} -impl Sealed for VarBinVector {} -impl Sealed for VarBinVectorMut {} +impl Sealed for BinaryViewVector {} +impl Sealed for BinaryViewVectorMut {} impl Sealed for StructVector {} impl Sealed for StructVectorMut {} diff --git a/vortex-vector/src/varbin/types.rs b/vortex-vector/src/varbin/types.rs deleted file mode 100644 index c4cce77c42f..00000000000 --- a/vortex-vector/src/varbin/types.rs +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Variable-length binary types and related traits. - -use std::fmt::Debug; - -use crate::{VarBinVector, VarBinVectorMut, Vector, VectorMut}; - -impl From> for Vector { - fn from(value: VarBinVector) -> Self { - T::upcast(value) - } -} - -impl From> for VectorMut { - fn from(value: VarBinVectorMut) -> Self { - T::upcast(value) - } -} - -/// Trait to mark supported binary view types. -pub trait VarBinType: Debug + Sized + private::Sealed { - /// The slice type for this variable binary type. - type Slice: ?Sized + AsRef<[u8]>; - - /// Downcast the provided object to a type-specific instance. - fn downcast(visitor: V) -> V::Output; - - /// Upcast a type-specific instance to a generic instance. - fn upcast(input: V::Input) -> V; -} - -/// [`BinaryType`] for UTF-8 strings. -#[derive(Clone, Debug)] -pub struct StringType; -impl VarBinType for StringType { - type Slice = str; - - fn downcast(visitor: V) -> V::Output { - visitor.into_string() - } - - fn upcast(input: V::Input) -> V { - V::from_string(input) - } -} - -/// [`BinaryType`] for raw binary data. -#[derive(Clone, Debug)] -pub struct BinaryType; -impl VarBinType for BinaryType { - type Slice = [u8]; - - fn downcast(visitor: V) -> V::Output { - visitor.into_binary() - } - - fn upcast(input: V::Input) -> V { - V::from_binary(input) - } -} - -/// Trait for downcasting generic variable binary types to specific types. -pub trait VarBinTypeDowncast { - /// The output type after downcasting. - type Output; - - /// Downcast to a binary type. - fn into_binary(self) -> Self::Output; - /// Downcast to a string type. - fn into_string(self) -> Self::Output; -} - -/// Trait for upcasting specific variable binary types to generic types. -pub trait VarBinTypeUpcast { - /// The input type for upcasting. - type Input; - - /// Upcast from a binary type. - fn from_binary(input: Self::Input) -> Self; - /// Upcast from a string type. - fn from_string(input: Self::Input) -> Self; -} - -/// Private module to seal the [`VarBinType`] trait. -mod private { - /// Sealed trait to prevent external implementations of [`VarBinType`]. - pub trait Sealed {} - - impl Sealed for super::StringType {} - impl Sealed for super::BinaryType {} -} diff --git a/vortex-vector/src/varbin/vector.rs b/vortex-vector/src/varbin/vector.rs deleted file mode 100644 index e468151e94c..00000000000 --- a/vortex-vector/src/varbin/vector.rs +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Variable-length binary vector implementation. - -use std::sync::Arc; - -use vortex_buffer::{Buffer, ByteBuffer}; -use vortex_mask::Mask; - -use crate::VectorOps; -use crate::varbin::VarBinType; -use crate::varbin::vector_mut::VarBinVectorMut; -use crate::varbin::view::BinaryView; - -/// A variable-length binary vector. -#[derive(Debug, Clone)] -pub struct VarBinVector { - /// Views into the binary data. - views: Buffer, - /// Buffers holding the referenced binary data. - buffers: Arc>, - /// Validity mask for the vector. - validity: Mask, - /// Marker trait for the [`VarBinType`]. - _marker: std::marker::PhantomData, -} - -impl VarBinVector { - /// Creates a new [`VarBinVector`] from the provided components. - /// - /// # Safety - /// - /// This function is unsafe because it does not validate the consistency of the provided - /// components. - /// - /// The caller must ensure that: - /// - The length of the `validity` mask matches the length of the `views` buffer. - /// - The `views` buffer correctly references the data in the `buffers`. - pub unsafe fn new_unchecked( - views: Buffer, - buffers: Arc>, - validity: Mask, - ) -> Self { - Self { - views, - buffers, - validity, - _marker: std::marker::PhantomData, - } - } - - /// Decomposes the vector into its constituent parts. - pub fn into_parts(self) -> (Buffer, Arc>, Mask) { - (self.views, self.buffers, self.validity) - } -} - -impl VectorOps for VarBinVector { - type Mutable = VarBinVectorMut; - - fn len(&self) -> usize { - self.views.len() - } - - fn validity(&self) -> &Mask { - &self.validity - } - - fn try_into_mut(self) -> Result - where - Self: Sized, - { - todo!() - } -} diff --git a/vortex-vector/src/varbin/vector_mut.rs b/vortex-vector/src/varbin/vector_mut.rs deleted file mode 100644 index e6ffcc7bd52..00000000000 --- a/vortex-vector/src/varbin/vector_mut.rs +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Mutable variable-length binary vector. - -use vortex_buffer::{BufferMut, ByteBuffer, ByteBufferMut}; -use vortex_mask::MaskMut; - -use crate::VectorMutOps; -use crate::varbin::VarBinType; -use crate::varbin::vector::VarBinVector; -use crate::varbin::view::BinaryView; - -/// Mutable variable-length binary vector. -#[allow(dead_code)] // FIXME(ngates): remove after implementing the methods -#[derive(Clone, Debug)] -pub struct VarBinVectorMut { - /// Views into the binary data. - views: BufferMut, - /// Validity mask for the vector. - validity: MaskMut, - - /// The completed buffers holding referenced binary data. - buffers: Vec, - /// The current buffer being appended to, if any. - open_buffer: Option, - - /// Marker trait for the [`VarBinType`]. - _marker: std::marker::PhantomData, -} - -impl VectorMutOps for VarBinVectorMut { - type Immutable = VarBinVector; - - fn len(&self) -> usize { - self.views.len() - } - - fn capacity(&self) -> usize { - self.views.capacity() - } - - fn reserve(&mut self, additional: usize) { - self.views.reserve(additional); - } - - fn extend_from_vector(&mut self, _other: &Self::Immutable) { - todo!() - } - - fn append_nulls(&mut self, n: usize) { - self.views.push_n(BinaryView::empty_view(), n); - self.validity.append_n(false, n); - } - - fn freeze(self) -> Self::Immutable { - todo!() - } - - fn split_off(&mut self, _at: usize) -> Self { - todo!() - } - - fn unsplit(&mut self, _other: Self) { - todo!() - } -} diff --git a/vortex-vector/src/vector_mut.rs b/vortex-vector/src/vector_mut.rs index f52029765be..4be682dc9b7 100644 --- a/vortex-vector/src/vector_mut.rs +++ b/vortex-vector/src/vector_mut.rs @@ -9,7 +9,7 @@ use vortex_dtype::DType; use vortex_error::vortex_panic; -use crate::varbin::{BinaryVectorMut, StringVectorMut}; +use crate::binaryview::{BinaryVectorMut, StringVectorMut}; use crate::{ BoolVectorMut, DecimalVectorMut, NullVectorMut, PrimitiveVectorMut, StructVectorMut, Vector, VectorMutOps, match_each_vector_mut, match_vector_pair, From 058cb80eca5bd4cb244a73e9e518b5d6161c63f2 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 14:46:08 -0500 Subject: [PATCH 2/7] cleaner Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/types.rs | 6 +++--- vortex-vector/src/binaryview/vector.rs | 18 +++++++++++------- vortex-vector/src/binaryview/view.rs | 2 ++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vortex-vector/src/binaryview/types.rs b/vortex-vector/src/binaryview/types.rs index 6a3418ac974..54115a81e1f 100644 --- a/vortex-vector/src/binaryview/types.rs +++ b/vortex-vector/src/binaryview/types.rs @@ -29,7 +29,7 @@ pub trait BinaryViewType: Debug + Sized + private::Sealed { /// Returns the bytes as the native `Slice` type /// for this binary view vector. - fn from_bytes(bytes: &[u8]) -> &Self::Slice; + unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice; /// Downcast the provided object to a type-specific instance. fn downcast(visitor: V) -> V::Output; @@ -49,7 +49,7 @@ impl BinaryViewType for StringType { std::str::from_utf8(bytes).is_ok() } - fn from_bytes(bytes: &[u8]) -> &Self::Slice { + unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice { // SAFETY: vectors should be checked at the boundary for upholding the UTF8 variant, // or only be built from vectors that are known to satisfy the variant. unsafe { std::str::from_utf8_unchecked(bytes) } @@ -75,7 +75,7 @@ impl BinaryViewType for BinaryType { true } - fn from_bytes(bytes: &[u8]) -> &Self::Slice { + unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice { bytes } diff --git a/vortex-vector/src/binaryview/vector.rs b/vortex-vector/src/binaryview/vector.rs index 3c348951410..b4edbd8a844 100644 --- a/vortex-vector/src/binaryview/vector.rs +++ b/vortex-vector/src/binaryview/vector.rs @@ -105,13 +105,13 @@ impl BinaryViewVector { return None; } - let view = self.views[index]; + let view = &self.views[index]; if view.is_inlined() { - // start = seek to the `index`-th BinaryView, plus 4 bytes for the leading u32 field - let start = index * size_of::() + 4; - let length = view.as_view().size as usize; - let slice = &self.views().as_bytes()[start..start + length]; - Some(T::from_bytes(slice)) + let view = view.as_inlined(); + // SAFETY: validation that the string data contained in this vector is performed + // at construction time, either in the constructor for safe construction, or by + // the caller (when using the unchecked constructor). + Some(unsafe { T::from_bytes_unchecked(&view.data[..view.size as usize]) }) } else { // Get a pointer into the buffer range let view_ref = view.as_view(); @@ -119,7 +119,11 @@ impl BinaryViewVector { let start = view_ref.offset as usize; let length = view_ref.size as usize; - Some(T::from_bytes(&buffer.as_bytes()[start..start + length])) + + // SAFETY: validation that the string data contained in this vector is performed + // at construction time, either in the constructor for safe construction, or by + // the caller (when using the unchecked constructor). + Some(unsafe { T::from_bytes_unchecked(&buffer.as_bytes()[start..start + length]) }) } } diff --git a/vortex-vector/src/binaryview/view.rs b/vortex-vector/src/binaryview/view.rs index 1cd19111f1c..d20477ce761 100644 --- a/vortex-vector/src/binaryview/view.rs +++ b/vortex-vector/src/binaryview/view.rs @@ -220,11 +220,13 @@ impl BinaryView { /// Returns the inlined representation of the binary value. pub fn as_inlined(&self) -> &Inlined { + debug_assert!(self.is_inlined()); unsafe { &self.inlined } } /// Returns the reference representation of the binary value. pub fn as_view(&self) -> &Ref { + debug_assert!(!self.is_inlined()); unsafe { &self._ref } } From ad45b97805e444592de185ed950eeb0c618664f8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 14:48:30 -0500 Subject: [PATCH 3/7] do validation in debug mode Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/vector.rs | 14 +++++++++----- vortex-vector/src/binaryview/vector_mut.rs | 16 ++++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/vortex-vector/src/binaryview/vector.rs b/vortex-vector/src/binaryview/vector.rs index b4edbd8a844..849b647934a 100644 --- a/vortex-vector/src/binaryview/vector.rs +++ b/vortex-vector/src/binaryview/vector.rs @@ -44,11 +44,15 @@ impl BinaryViewVector { buffers: Arc>, validity: Mask, ) -> Self { - Self { - views, - validity, - buffers, - _marker: std::marker::PhantomData, + if cfg!(debug_assertions) { + Self::new(views, buffers, validity) + } else { + Self { + views, + validity, + buffers, + _marker: std::marker::PhantomData, + } } } diff --git a/vortex-vector/src/binaryview/vector_mut.rs b/vortex-vector/src/binaryview/vector_mut.rs index 7436723e699..0af442ddc49 100644 --- a/vortex-vector/src/binaryview/vector_mut.rs +++ b/vortex-vector/src/binaryview/vector_mut.rs @@ -87,12 +87,16 @@ impl BinaryViewVectorMut { validity: MaskMut, buffers: Vec, ) -> Self { - Self { - views, - buffers, - validity, - open_buffer: None, - _marker: std::marker::PhantomData, + if cfg!(debug_assertions) { + Self::new(views, buffers, validity) + } else { + Self { + views, + buffers, + validity, + open_buffer: None, + _marker: std::marker::PhantomData, + } } } } From 8a038fc46087ae1952f2c5e2f3b2bc3a77e07f8d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 15:23:20 -0500 Subject: [PATCH 4/7] few more Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/vector_mut.rs | 4 ++-- vortex-vector/src/binaryview/view.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vortex-vector/src/binaryview/vector_mut.rs b/vortex-vector/src/binaryview/vector_mut.rs index 0af442ddc49..9c783710796 100644 --- a/vortex-vector/src/binaryview/vector_mut.rs +++ b/vortex-vector/src/binaryview/vector_mut.rs @@ -146,10 +146,10 @@ impl VectorMutOps for BinaryViewVectorMut { } // rewrite the views using our lookup table - let new_views = rewrite_views(other.views().iter().copied(), &buf_index_lookup); + let new_views_iter = rewrite_views(other.views().iter().copied(), &buf_index_lookup); self.buffers.extend(new_buffers); - self.views.extend(new_views); + self.views.extend(new_views_iter); self.validity.append_mask(other.validity()) } diff --git a/vortex-vector/src/binaryview/view.rs b/vortex-vector/src/binaryview/view.rs index d20477ce761..05ae4de3707 100644 --- a/vortex-vector/src/binaryview/view.rs +++ b/vortex-vector/src/binaryview/view.rs @@ -274,7 +274,7 @@ impl fmt::Debug for BinaryView { /// 2. Points at a valid range of owned buffer memory, and the bytes stored there conform to /// the type constraints as defined by the `validator`. pub(super) fn validate_views( - views: impl AsRef<[BinaryView]>, + views: &[BinaryView], buffers: impl AsRef<[ByteBuffer]>, validity: IsValidFn, validator: ValidateFn, From 2c7a6415613fb59890dffedeb3f61d4d6bd6fbd9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 15:25:44 -0500 Subject: [PATCH 5/7] clippy Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/types.rs | 8 ++++++++ vortex-vector/src/binaryview/view.rs | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vortex-vector/src/binaryview/types.rs b/vortex-vector/src/binaryview/types.rs index 54115a81e1f..e50048f2ce4 100644 --- a/vortex-vector/src/binaryview/types.rs +++ b/vortex-vector/src/binaryview/types.rs @@ -29,6 +29,14 @@ pub trait BinaryViewType: Debug + Sized + private::Sealed { /// Returns the bytes as the native `Slice` type /// for this binary view vector. + /// + /// # Safety + /// + /// The caller must check beforehand that bytes return from the vector conform to the type + /// requirements of this binary type. + /// + /// Failure to do so can result in undefined behavior or incorrect results in downstream + /// vector operations. unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice; /// Downcast the provided object to a type-specific instance. diff --git a/vortex-vector/src/binaryview/view.rs b/vortex-vector/src/binaryview/view.rs index 05ae4de3707..294064fab2b 100644 --- a/vortex-vector/src/binaryview/view.rs +++ b/vortex-vector/src/binaryview/view.rs @@ -284,7 +284,6 @@ where ValidateFn: Fn(&[u8]) -> bool, { let buffers = buffers.as_ref(); - let views = views.as_ref(); for (idx, &view) in views.iter().enumerate() { if !validity(idx) { continue; From a2eb260674685821ecf081685be7cc4555caf240 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 17:41:30 -0500 Subject: [PATCH 6/7] add tests, remove dedupe code path Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/vector.rs | 37 +++++-- vortex-vector/src/binaryview/vector_mut.rs | 116 ++++++++++++--------- vortex-vector/src/binaryview/view.rs | 2 +- 3 files changed, 100 insertions(+), 55 deletions(-) diff --git a/vortex-vector/src/binaryview/vector.rs b/vortex-vector/src/binaryview/vector.rs index 849b647934a..c0173325294 100644 --- a/vortex-vector/src/binaryview/vector.rs +++ b/vortex-vector/src/binaryview/vector.rs @@ -183,13 +183,9 @@ impl VectorOps for BinaryViewVector { let buffers_mut = match Arc::try_unwrap(self.buffers) { Ok(buffers) => buffers.into_vec(), - Err(arc_buffers) => { - return Err(Self { - views: views_mut.freeze(), - validity: validity_mut.freeze(), - buffers: arc_buffers, - _marker: std::marker::PhantomData, - }); + Err(buffers) => { + // Backup: collect a new Vec with clones of each buffer + buffers.iter().cloned().collect() } }; @@ -204,3 +200,30 @@ impl VectorOps for BinaryViewVector { } } } + +#[cfg(test)] +mod tests { + use crate::{StringVectorMut, VectorMutOps, VectorOps}; + + #[test] + fn test_try_into_mut() { + let mut shared_vec = StringVectorMut::with_capacity(5); + shared_vec.append_nulls(2); + shared_vec.append_values("an example value", 2); + shared_vec.append_values("another example value", 1); + + let shared_vec = shared_vec.freeze(); + + // Making a copy aliases the vector, preventing us from converting it back into mutable + let shared_vec2 = shared_vec.clone(); + + // The Err variant is returned, because the aliasing borrow from shared_vec2 is blocking us + // from taking unique ownership of the memory. + let shared_vec = shared_vec.try_into_mut().unwrap_err(); + + // Dropping the aliasing borrow makes it possible to cast the unique reference to mut + drop(shared_vec2); + + assert!(shared_vec.try_into_mut().is_ok()); + } +} diff --git a/vortex-vector/src/binaryview/vector_mut.rs b/vortex-vector/src/binaryview/vector_mut.rs index 9c783710796..45735dfbe4e 100644 --- a/vortex-vector/src/binaryview/vector_mut.rs +++ b/vortex-vector/src/binaryview/vector_mut.rs @@ -14,6 +14,8 @@ use crate::binaryview::vector::BinaryViewVector; use crate::binaryview::view::{BinaryView, validate_views}; use crate::{VectorMutOps, VectorOps}; +// Default capacity for new string data buffers of 2MiB. +const BUFFER_CAPACITY: usize = 2 * 1024 * 1024; /// Mutable variable-length binary vector. #[derive(Clone, Debug)] @@ -47,6 +49,17 @@ impl BinaryViewVectorMut { .vortex_expect("Failed to create `BinaryViewVectorMut`") } + /// Create a new empty [`BinaryViewVectorMut`], pre-allocated to hold the specified number + /// of items. This does not reserve any memory for string data itself, only for the binary views + /// and the validity bits. + pub fn with_capacity(capacity: usize) -> Self { + Self::new( + BufferMut::with_capacity(capacity), + Vec::new(), + MaskMut::with_capacity(capacity), + ) + } + /// Tries to create a new [`BinaryViewVectorMut`] from its components. /// /// # Errors @@ -99,6 +112,43 @@ impl BinaryViewVectorMut { } } } + + /// Append a repeated sequence of binary data to a vector. + /// + /// ``` + /// # use vortex_vector::{StringVectorMut, VectorMutOps}; + /// let mut strings = StringVectorMut::with_capacity(4); + /// strings.append_values("inlined", 2); + /// strings.append_nulls(1); + /// strings.append_values("large not inlined", 1); + /// + /// let strings = strings.freeze(); + /// + /// assert_eq!( + /// [strings.get(0), strings.get(1), strings.get(2), strings.get(3)], + /// [Some("inlined"), Some("inlined"), None, Some("large not inlined")], + /// ); + /// ``` + pub fn append_values(&mut self, value: &T::Slice, n: usize) { + let bytes = value.as_ref(); + if bytes.len() <= BinaryView::MAX_INLINED_SIZE { + self.views.push_n(BinaryView::new_inlined(bytes), n); + } else { + let buffer_index = + u32::try_from(self.buffers.len()).vortex_expect("buffer count exceeds u32::MAX"); + + let buf = self + .open_buffer + .get_or_insert_with(|| ByteBufferMut::with_capacity(BUFFER_CAPACITY)); + let offset = u32::try_from(buf.len()).vortex_expect("buffer length exceeds u32::MAX"); + buf.extend_from_slice(value.as_ref()); + + self.views + .push_n(BinaryView::make_view(bytes, buffer_index, offset), n); + } + + self.validity.append_n(true, n); + } } impl VectorMutOps for BinaryViewVectorMut { @@ -123,33 +173,21 @@ impl VectorMutOps for BinaryViewVectorMut { self.buffers.push(open.freeze()); } - // We build a lookup table to map BinaryView's from the `other` to have - // valid buffer indices in the current array. - let mut buf_index_lookup: Vec = Vec::with_capacity(other.buffers().len()); - let mut new_buffers = Vec::new(); - for buffer in other.buffers().iter() { - let ptr = buffer.as_ptr().addr(); - let new_index: u32 = self - .buffers - .iter() - .position(|b| b.as_ptr().addr() == ptr) - .unwrap_or_else(|| self.buffers.len() + new_buffers.len()) - .try_into() - .vortex_expect("buffer index must fit in u32"); - - if new_index as usize == new_buffers.len() { - // We need to append the buffer - new_buffers.push(buffer.clone()); - } - - buf_index_lookup.push(new_index); - } + let offset = + u32::try_from(self.buffers.len()).vortex_expect("buffer count exceeds u32::MAX"); - // rewrite the views using our lookup table - let new_views_iter = rewrite_views(other.views().iter().copied(), &buf_index_lookup); + self.buffers.extend(other.buffers().iter().cloned()); - self.buffers.extend(new_buffers); + let new_views_iter = other.views().iter().copied().map(|mut v| { + if v.is_inlined() { + v + } else { + v.as_view_mut().buffer_index += offset; + v + } + }); self.views.extend(new_views_iter); + self.validity.append_mask(other.validity()) } @@ -182,28 +220,9 @@ impl VectorMutOps for BinaryViewVectorMut { } } -/// Create a new iterator that yields views rewritten with a new buffer index. -#[inline] -fn rewrite_views( - views: impl Iterator, - buf_index_lookup: &[u32], -) -> impl Iterator { - views.map(|mut view| { - if view.is_inlined() { - return view; - } - let view = view.as_view_mut(); - let old_index = view.buffer_index; - let new_index = *buf_index_lookup - .get(old_index as usize) - .unwrap_or(&old_index); - view.buffer_index = new_index; - BinaryView { _ref: *view } - }) -} - #[cfg(test)] mod tests { + use std::ops::Deref; use std::sync::Arc; use vortex_buffer::{ByteBuffer, buffer, buffer_mut}; @@ -256,7 +275,7 @@ mod tests { BinaryView::make_view(b"a really very quite long string 2", 0, 33), BinaryView::make_view(b"a really very quite long string 1", 0, 0), ], - vec![buf0, buf1.clone()], + vec![buf0.clone(), buf1.clone()], MaskMut::new_true(6), ); @@ -267,7 +286,7 @@ mod tests { 0, 33 )], - Arc::new(Box::new([buf1])), + Arc::new(Box::new([buf1.clone()])), Mask::new_true(1), ); @@ -299,7 +318,10 @@ mod tests { "a really very quite long string 4" ); - assert_eq!(strings_finished.buffers().len(), 2); + assert_eq!( + strings_finished.buffers().deref().as_ref(), + &[buf0, buf1.clone(), buf1] + ); } #[test] diff --git a/vortex-vector/src/binaryview/view.rs b/vortex-vector/src/binaryview/view.rs index 294064fab2b..814daef3c6f 100644 --- a/vortex-vector/src/binaryview/view.rs +++ b/vortex-vector/src/binaryview/view.rs @@ -119,7 +119,7 @@ impl Default for BinaryView { impl BinaryView { /// Maximum size of an inlined binary value. - const MAX_INLINED_SIZE: usize = 12; + pub const MAX_INLINED_SIZE: usize = 12; /// Create a view from a value, block and offset /// From 00bebf8c7e27b6ea2042d680cbe9d73bb0274ee8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 3 Nov 2025 17:42:51 -0500 Subject: [PATCH 7/7] move Signed-off-by: Andrew Duffy --- vortex-vector/src/binaryview/vector_mut.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vortex-vector/src/binaryview/vector_mut.rs b/vortex-vector/src/binaryview/vector_mut.rs index 45735dfbe4e..0f13038f80b 100644 --- a/vortex-vector/src/binaryview/vector_mut.rs +++ b/vortex-vector/src/binaryview/vector_mut.rs @@ -16,12 +16,11 @@ use crate::{VectorMutOps, VectorOps}; // Default capacity for new string data buffers of 2MiB. const BUFFER_CAPACITY: usize = 2 * 1024 * 1024; -/// Mutable variable-length binary vector. -#[derive(Clone, Debug)] /// A mutable vector of binary view data. /// /// The immutable equivalent of this type is [`BinaryViewVector`]. +#[derive(Clone, Debug)] pub struct BinaryViewVectorMut { /// Views into the binary data. views: BufferMut,