From 33e3681139cd3f3347635f323e82899fbdbf57c5 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Thu, 11 Jun 2026 11:34:21 +0000 Subject: [PATCH 1/2] clean: simplify Arrow device export internals Cleanups from a review of the Arrow device export path, no behavior change: - Inline the single-caller list_layout_parts 5-tuple helper and the export_list_layout_with_child indirection into export_list and export_list_layout. ListChildExport itself stays: the take-based host list-view rebuild wraps children in transient Dict encodings that the pre-computed schema does not include, so the rebuilt-child arm must keep canonicalizing; its doc now spells that out. - Remove the dead end < offset wrap check in arrow_binary.cu: both addends are u32 widened to u64, so the sum cannot wrap. - Replace the panicking as_ptype on the dyn-dispatch cast target with PType::try_from, matching the source-side check three lines up. - Correct the export_binary rationale: cuDF's Arrow Device import rejects both Binary and BinaryView today (arrow_to_cudf_type maps neither), matching the e2e harness comment. - Fix the misplaced doc comment on export_list_view, and document the validity buffer padding contract on export_arrow_validity_buffer (every path is backed by a 4-byte-padded, tail-zeroed allocation via the device copy layer). Co-Authored-By: Claude Fable 5 Signed-off-by: Alexander Droste --- vortex-cuda/kernels/src/arrow_binary.cu | 3 +- vortex-cuda/src/arrow/canonical.rs | 67 ++++++++----------- .../src/dynamic_dispatch/plan_builder.rs | 4 +- 3 files changed, 32 insertions(+), 42 deletions(-) diff --git a/vortex-cuda/kernels/src/arrow_binary.cu b/vortex-cuda/kernels/src/arrow_binary.cu index ae9e2fa8ecb..836d6d1ce29 100644 --- a/vortex-cuda/kernels/src/arrow_binary.cu +++ b/vortex-cuda/kernels/src/arrow_binary.cu @@ -63,8 +63,9 @@ __device__ void init_scan_device(const BinaryView *const __restrict views, const BinaryViewRef *const view_ref = reinterpret_cast(&view); const uint64_t buffer_index = static_cast(view_ref->buffer_index); const uint64_t offset = static_cast(view_ref->offset); + // Both addends are u32 widened to u64, so the end position cannot wrap. const uint64_t end = offset + static_cast(size); - if (buffer_index >= data_buffer_count || end < offset || end > data_buffer_lens[buffer_index]) { + if (buffer_index >= data_buffer_count || end > data_buffer_lens[buffer_index]) { scan[idx] = 0; atomicMax(status, 1u); continue; diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 0c9f28cc325..f0cf4fb37c5 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -448,9 +448,11 @@ where /// Export Vortex binary views as standard Arrow `Binary`. /// -/// cuDF imports Arrow `Binary` through the Arrow Device path, but does not currently accept -/// Arrow `BinaryView`. This path keeps conversion on the CUDA stream by building `i32` offsets -/// from view sizes and gathering inline/out-of-line view bytes into one contiguous values buffer. +/// cuDF's Arrow Device import currently rejects both Arrow `Binary` and Arrow `BinaryView` +/// (unlike `Utf8View`, which it accepts as strings), so standard `Binary` is exported as the +/// layout other Arrow Device consumers accept most widely. This path keeps conversion on the +/// CUDA stream by building `i32` offsets from view sizes and gathering inline/out-of-line view +/// bytes into one contiguous values buffer. async fn export_binary( varbinview: VarBinViewArray, ctx: &mut CudaExecutionCtx, @@ -679,6 +681,11 @@ fn gather_binary_values( /// Export Vortex validity as an Arrow validity byte buffer on the CUDA device. /// /// Returns `None` for the buffer when Arrow can omit validity because all rows are valid. +/// +/// Every returned buffer is backed by an allocation padded to a 4-byte multiple with zeroed +/// padding so cuDF's word-sized mask reads stay in bounds: the fast path through the device +/// copy's tail zeroing, the other paths through their own padded allocations. Bits at positions +/// `>= len + arrow_offset` within the final data byte are unspecified, as Arrow permits. pub(super) async fn export_arrow_validity_buffer( validity: Validity, len: usize, @@ -788,7 +795,7 @@ pub(super) fn repack_arrow_validity_buffer( Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes)) } -/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array. +/// Export a Vortex list-view as Arrow `List`. async fn export_list_view( listview: ListViewArray, ctx: &mut CudaExecutionCtx, @@ -815,29 +822,12 @@ async fn export_list_view( .await } +/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array. async fn export_list( array: ListArray, child_export: ListChildExport, ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { - let (elements, len, validity_buffer, null_count, offsets_buffer) = - list_layout_parts(array, ctx).await?; - export_list_layout( - elements, - len, - validity_buffer, - null_count, - offsets_buffer, - child_export, - ctx, - ) - .await -} - -async fn list_layout_parts( - array: ListArray, - ctx: &mut CudaExecutionCtx, -) -> VortexResult<(ArrayRef, usize, Option, i64, BufferHandle)> { let len = array.len(); let ListDataParts { elements, @@ -848,7 +838,16 @@ async fn list_layout_parts( let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?; let offsets_buffer = export_arrow_list_offsets(offsets, ctx).await?; - Ok((elements, len, validity_buffer, null_count, offsets_buffer)) + export_list_layout( + elements, + len, + validity_buffer, + null_count, + offsets_buffer, + child_export, + ctx, + ) + .await } #[derive(Clone, Copy)] @@ -858,6 +857,11 @@ pub(super) enum ListChildExport { PreserveConcreteLayout, /// Canonicalize temporary encodings introduced by the host ListView /// rebuild, while still preserving rebuilt dictionary children. + /// + /// This is not equivalent to `export_array`: the take-based rebuild wraps + /// children in transient encodings (for example `take` returns `Dict` + /// arrays nested inside struct fields) that the pre-computed export schema + /// does not include, so they must be canonicalized away before export. RebuiltListViewChild, } @@ -891,24 +895,7 @@ pub(super) async fn export_list_layout( ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { let elements_child = child_export.export(elements, ctx).await?; - export_list_layout_with_child( - elements_child, - len, - validity_buffer, - null_count, - offsets_buffer, - ctx, - ) -} -fn export_list_layout_with_child( - elements_child: ArrowArray, - len: usize, - validity_buffer: Option, - null_count: i64, - offsets_buffer: BufferHandle, - ctx: &mut CudaExecutionCtx, -) -> VortexResult<(ArrowArray, SyncEvent)> { let mut private_data = PrivateData::new( vec![validity_buffer, Some(offsets_buffer)], vec![elements_child], diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index f98dbb74a28..381e40f37dd 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -139,7 +139,9 @@ fn is_dyn_dispatch_cast_compatible(array: &ArrayRef) -> bool { let Ok(source_ptype) = PType::try_from(cast.child_at(0).dtype()) else { return false; }; - let target_ptype = cast.scalar_fn().as_::().as_ptype(); + let Ok(target_ptype) = PType::try_from(cast.scalar_fn().as_::()) else { + return false; + }; // Implemented as unsigned dictionary-code casts to cuDF's signed index types. // LOAD/BITUNPACK materialize directly into the target-width output type. From 6e35e91fdfb3681a6a0180a4278b597fa323f705 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Fri, 12 Jun 2026 11:08:53 +0000 Subject: [PATCH 2/2] clean up readme Signed-off-by: Alexander Droste --- vortex-cuda/README.md | 34 +++++++------------ vortex-cuda/src/arrow/canonical.rs | 29 ++++++---------- vortex-cuda/src/arrow/list_view.rs | 6 ++-- .../src/dynamic_dispatch/plan_builder.rs | 26 ++++++++++++++ 4 files changed, 53 insertions(+), 42 deletions(-) diff --git a/vortex-cuda/README.md b/vortex-cuda/README.md index 1579bececdc..24b8f420290 100644 --- a/vortex-cuda/README.md +++ b/vortex-cuda/README.md @@ -10,35 +10,27 @@ Key files: - `vortex-cuda/src/arrow/canonical.rs`: canonical-array export to `ArrowDeviceArray`. - `vortex-test/e2e-cuda/src/lib.rs`: cuDF interop harness. -Current export coverage includes primitive, bool, decimal/temporal, string/binary view, and struct arrays. Remaining work includes null masks, broader dtype coverage, `ArrowDeviceArrayStream`, and PyVortex integration. +## Building cuDF for Arrow Device interop -## cuDF compatibility +The `cudf-test-harness` repository provides prebuilt cuDF binaries for Arrow Device interop testing on +x86_64 and aarch64. -Vortex exports string and binary columns as Arrow `Utf8View` / `BinaryView` device arrays with producer-owned `ArrowArray.private_data`. cuDF string/binary interop requires a build containing `rapidsai/cudf#22620`; until a release version is identified, test with a cuDF commit that includes that change. - -## Building cuDF for interop testing - -Pass a single CUDA architecture, e.g. `-DCMAKE_CUDA_ARCHITECTURES=90a`; otherwise cuDF builds for many architectures and local builds are much slower. +From the cuDF repository root, compile the Arrow Device interop target locally without exporting additional +environment variables: ```sh -export PATH=/usr/local/cuda-13.1/bin:$PATH +cmake -E rm -rf cpp/build cmake -S cpp -B cpp/build \ - -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX:-/usr/local} \ - -DCMAKE_CUDA_ARCHITECTURES=90a \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_CUDA_ARCHITECTURES=NATIVE \ -DBUILD_TESTS=ON \ -DDISABLE_DEPRECATION_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCUDF_BUILD_STREAMS_TEST_UTIL=OFF \ - -DCUDAToolkit_ROOT=/usr/local/cuda-13.1 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc \ - -DCMAKE_CXX_COMPILER=/usr/bin/g++-13 \ - -DCMAKE_C_COMPILER=/usr/bin/gcc-13 \ - -GNinja - -cmake --build cpp/build --target INTEROP_TEST -j$(nproc) - -LD_LIBRARY_PATH=/usr/local/cuda-13.1/compat:$LD_LIBRARY_PATH ./cpp/build/gtests/INTEROP_TEST + -DCUDAToolkit_ROOT=/usr/local/cuda \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -GNinja && cmake --build cpp/build --target INTEROP_TEST --parallel ``` - -Adjust architecture, compiler paths, and CUDA paths for the machine under test. diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index f0cf4fb37c5..8552d2a4767 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -270,7 +270,7 @@ fn export_canonical( }) } -/// Export a Vortex dictionary array as an Arrow dictionary array. +/// Export a Vortex dictionary array as an Arrow Device dictionary array. /// /// Owns the codes buffers and recursively exported dictionary values. async fn export_dict( @@ -446,13 +446,7 @@ where Ok(BufferHandle::new_device(Arc::new(output_device))) } -/// Export Vortex binary views as standard Arrow `Binary`. -/// -/// cuDF's Arrow Device import currently rejects both Arrow `Binary` and Arrow `BinaryView` -/// (unlike `Utf8View`, which it accepts as strings), so standard `Binary` is exported as the -/// layout other Arrow Device consumers accept most widely. This path keeps conversion on the -/// CUDA stream by building `i32` offsets from view sizes and gathering inline/out-of-line view -/// bytes into one contiguous values buffer. +/// Export Vortex binary views as an Arrow Device array with standard `Binary` layout. async fn export_binary( varbinview: VarBinViewArray, ctx: &mut CudaExecutionCtx, @@ -678,14 +672,13 @@ fn gather_binary_values( ) } -/// Export Vortex validity as an Arrow validity byte buffer on the CUDA device. +/// Export Vortex validity as an Arrow Device validity byte buffer. /// /// Returns `None` for the buffer when Arrow can omit validity because all rows are valid. /// -/// Every returned buffer is backed by an allocation padded to a 4-byte multiple with zeroed -/// padding so cuDF's word-sized mask reads stay in bounds: the fast path through the device -/// copy's tail zeroing, the other paths through their own padded allocations. Bits at positions -/// `>= len + arrow_offset` within the final data byte are unspecified, as Arrow permits. +/// Returned buffers use zeroed 4-byte padding so cuDF's word-sized mask reads stay in bounds. +/// Bits at positions `>= len + arrow_offset` within the final data byte are unspecified, as +/// Arrow permits. pub(super) async fn export_arrow_validity_buffer( validity: Validity, len: usize, @@ -795,7 +788,7 @@ pub(super) fn repack_arrow_validity_buffer( Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes)) } -/// Export a Vortex list-view as Arrow `List`. +/// Export a Vortex list-view as an Arrow Device array with `List` layout. async fn export_list_view( listview: ListViewArray, ctx: &mut CudaExecutionCtx, @@ -822,7 +815,7 @@ async fn export_list_view( .await } -/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array. +/// Export a standard Vortex list as an Arrow Device array with `List` layout. async fn export_list( array: ListArray, child_export: ListChildExport, @@ -884,7 +877,7 @@ impl ListChildExport { } } -/// Build the shared Arrow `List` parent once offsets and validity are ready on device. +/// Build the shared Arrow Device `List` parent once offsets and validity are ready. pub(super) async fn export_list_layout( elements: ArrayRef, len: usize, @@ -916,7 +909,7 @@ pub(super) async fn export_list_layout( Ok((arrow_list, sync_event)) } -/// Export a Vortex fixed-size-list as Arrow `List`. +/// Export a Vortex fixed-size-list as an Arrow Device array with `List` layout. /// /// cuDF's Arrow Device import accepts `List`/`LargeList` as cuDF `LIST`, but rejects /// `FixedSizeList`, so emit equivalent standard Arrow `List` offsets. @@ -968,7 +961,7 @@ async fn fixed_size_list_offsets( .await } -/// Return cuDF-supported Arrow `List` offsets as an `i32` device buffer. +/// Return Arrow Device `List` offsets as an `i32` device buffer. async fn export_arrow_list_offsets( offsets: ArrayRef, ctx: &mut CudaExecutionCtx, diff --git a/vortex-cuda/src/arrow/list_view.rs b/vortex-cuda/src/arrow/list_view.rs index b18aaf8c015..6e3f25a939f 100644 --- a/vortex-cuda/src/arrow/list_view.rs +++ b/vortex-cuda/src/arrow/list_view.rs @@ -39,7 +39,7 @@ use crate::CudaExecutionCtx; use crate::cub::exclusive_sum_i32; use crate::executor::CudaArrayExt; -/// Export a Vortex list-view as Arrow `List` using device kernels. +/// Export a Vortex list-view as an Arrow Device array with `List` layout using device kernels. /// /// Reuses contiguous children; rebuilds non-contiguous primitive or dictionary-code children. pub(super) async fn export_device_list_view( @@ -121,7 +121,7 @@ enum DeviceListViewOffsets { RequiresRebuild, } -/// Build cuDF-supported `i32` Arrow `List` offsets from list-view offset/size device buffers. +/// Build Arrow Device `List` offsets from list-view offset/size device buffers. #[expect(clippy::cognitive_complexity)] async fn export_device_list_view_offsets( offsets_ptype: PType, @@ -391,7 +391,7 @@ async fn export_rebuilt_dict_list_view( .await } -/// Rebuild a non-contiguous primitive list-view child and export it as an Arrow List. +/// Rebuild a non-contiguous primitive list-view child and export it as Arrow Device `List`. #[expect(clippy::too_many_arguments)] async fn export_rebuilt_primitive_list_view( elements: ArrayRef, diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index 381e40f37dd..1c95d8c4ba1 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -894,3 +894,29 @@ impl FusedPlan { len * final_elem_bytes.max(output_elem_bytes) } } + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::builtins::ArrayBuiltins; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + + use super::*; + + #[test] + fn cast_to_non_primitive_target_is_not_dyn_dispatch_compatible() -> VortexResult<()> { + let cast = PrimitiveArray::from_iter([0u8, 1]) + .into_array() + .cast(DType::Bool(Nullability::NonNullable))?; + + assert!(!is_dyn_dispatch_cast_compatible(&cast)); + assert!(matches!( + DispatchPlan::new(&cast, CudaDispatchMode::DynDispatchOnly)?, + DispatchPlan::Unfused + )); + + Ok(()) + } +}