diff --git a/vortex-cuda/README.md b/vortex-cuda/README.md index 1579bececdc..24b8f420290 100644 --- a/vortex-cuda/README.md +++ b/vortex-cuda/README.md @@ -10,35 +10,27 @@ Key files: - `vortex-cuda/src/arrow/canonical.rs`: canonical-array export to `ArrowDeviceArray`. - `vortex-test/e2e-cuda/src/lib.rs`: cuDF interop harness. -Current export coverage includes primitive, bool, decimal/temporal, string/binary view, and struct arrays. Remaining work includes null masks, broader dtype coverage, `ArrowDeviceArrayStream`, and PyVortex integration. +## Building cuDF for Arrow Device interop -## cuDF compatibility +The `cudf-test-harness` repository provides prebuilt cuDF binaries for Arrow Device interop testing on +x86_64 and aarch64. -Vortex exports string and binary columns as Arrow `Utf8View` / `BinaryView` device arrays with producer-owned `ArrowArray.private_data`. cuDF string/binary interop requires a build containing `rapidsai/cudf#22620`; until a release version is identified, test with a cuDF commit that includes that change. - -## Building cuDF for interop testing - -Pass a single CUDA architecture, e.g. `-DCMAKE_CUDA_ARCHITECTURES=90a`; otherwise cuDF builds for many architectures and local builds are much slower. +From the cuDF repository root, compile the Arrow Device interop target locally without exporting additional +environment variables: ```sh -export PATH=/usr/local/cuda-13.1/bin:$PATH +cmake -E rm -rf cpp/build cmake -S cpp -B cpp/build \ - -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX:-/usr/local} \ - -DCMAKE_CUDA_ARCHITECTURES=90a \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_CUDA_ARCHITECTURES=NATIVE \ -DBUILD_TESTS=ON \ -DDISABLE_DEPRECATION_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCUDF_BUILD_STREAMS_TEST_UTIL=OFF \ - -DCUDAToolkit_ROOT=/usr/local/cuda-13.1 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc \ - -DCMAKE_CXX_COMPILER=/usr/bin/g++-13 \ - -DCMAKE_C_COMPILER=/usr/bin/gcc-13 \ - -GNinja - -cmake --build cpp/build --target INTEROP_TEST -j$(nproc) - -LD_LIBRARY_PATH=/usr/local/cuda-13.1/compat:$LD_LIBRARY_PATH ./cpp/build/gtests/INTEROP_TEST + -DCUDAToolkit_ROOT=/usr/local/cuda \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -GNinja && cmake --build cpp/build --target INTEROP_TEST --parallel ``` - -Adjust architecture, compiler paths, and CUDA paths for the machine under test. diff --git a/vortex-cuda/kernels/src/arrow_binary.cu b/vortex-cuda/kernels/src/arrow_binary.cu index ae9e2fa8ecb..836d6d1ce29 100644 --- a/vortex-cuda/kernels/src/arrow_binary.cu +++ b/vortex-cuda/kernels/src/arrow_binary.cu @@ -63,8 +63,9 @@ __device__ void init_scan_device(const BinaryView *const __restrict views, const BinaryViewRef *const view_ref = reinterpret_cast(&view); const uint64_t buffer_index = static_cast(view_ref->buffer_index); const uint64_t offset = static_cast(view_ref->offset); + // Both addends are u32 widened to u64, so the end position cannot wrap. const uint64_t end = offset + static_cast(size); - if (buffer_index >= data_buffer_count || end < offset || end > data_buffer_lens[buffer_index]) { + if (buffer_index >= data_buffer_count || end > data_buffer_lens[buffer_index]) { scan[idx] = 0; atomicMax(status, 1u); continue; diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 0c9f28cc325..8552d2a4767 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -270,7 +270,7 @@ fn export_canonical( }) } -/// Export a Vortex dictionary array as an Arrow dictionary array. +/// Export a Vortex dictionary array as an Arrow Device dictionary array. /// /// Owns the codes buffers and recursively exported dictionary values. async fn export_dict( @@ -446,11 +446,7 @@ where Ok(BufferHandle::new_device(Arc::new(output_device))) } -/// Export Vortex binary views as standard Arrow `Binary`. -/// -/// cuDF imports Arrow `Binary` through the Arrow Device path, but does not currently accept -/// Arrow `BinaryView`. This path keeps conversion on the CUDA stream by building `i32` offsets -/// from view sizes and gathering inline/out-of-line view bytes into one contiguous values buffer. +/// Export Vortex binary views as an Arrow Device array with standard `Binary` layout. async fn export_binary( varbinview: VarBinViewArray, ctx: &mut CudaExecutionCtx, @@ -676,9 +672,13 @@ fn gather_binary_values( ) } -/// Export Vortex validity as an Arrow validity byte buffer on the CUDA device. +/// Export Vortex validity as an Arrow Device validity byte buffer. /// /// Returns `None` for the buffer when Arrow can omit validity because all rows are valid. +/// +/// Returned buffers use zeroed 4-byte padding so cuDF's word-sized mask reads stay in bounds. +/// Bits at positions `>= len + arrow_offset` within the final data byte are unspecified, as +/// Arrow permits. pub(super) async fn export_arrow_validity_buffer( validity: Validity, len: usize, @@ -788,7 +788,7 @@ pub(super) fn repack_arrow_validity_buffer( Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes)) } -/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array. +/// Export a Vortex list-view as an Arrow Device array with `List` layout. async fn export_list_view( listview: ListViewArray, ctx: &mut CudaExecutionCtx, @@ -815,29 +815,12 @@ async fn export_list_view( .await } +/// Export a standard Vortex list as an Arrow Device array with `List` layout. async fn export_list( array: ListArray, child_export: ListChildExport, ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { - let (elements, len, validity_buffer, null_count, offsets_buffer) = - list_layout_parts(array, ctx).await?; - export_list_layout( - elements, - len, - validity_buffer, - null_count, - offsets_buffer, - child_export, - ctx, - ) - .await -} - -async fn list_layout_parts( - array: ListArray, - ctx: &mut CudaExecutionCtx, -) -> VortexResult<(ArrayRef, usize, Option, i64, BufferHandle)> { let len = array.len(); let ListDataParts { elements, @@ -848,7 +831,16 @@ async fn list_layout_parts( let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?; let offsets_buffer = export_arrow_list_offsets(offsets, ctx).await?; - Ok((elements, len, validity_buffer, null_count, offsets_buffer)) + export_list_layout( + elements, + len, + validity_buffer, + null_count, + offsets_buffer, + child_export, + ctx, + ) + .await } #[derive(Clone, Copy)] @@ -858,6 +850,11 @@ pub(super) enum ListChildExport { PreserveConcreteLayout, /// Canonicalize temporary encodings introduced by the host ListView /// rebuild, while still preserving rebuilt dictionary children. + /// + /// This is not equivalent to `export_array`: the take-based rebuild wraps + /// children in transient encodings (for example `take` returns `Dict` + /// arrays nested inside struct fields) that the pre-computed export schema + /// does not include, so they must be canonicalized away before export. RebuiltListViewChild, } @@ -880,7 +877,7 @@ impl ListChildExport { } } -/// Build the shared Arrow `List` parent once offsets and validity are ready on device. +/// Build the shared Arrow Device `List` parent once offsets and validity are ready. pub(super) async fn export_list_layout( elements: ArrayRef, len: usize, @@ -891,24 +888,7 @@ pub(super) async fn export_list_layout( ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { let elements_child = child_export.export(elements, ctx).await?; - export_list_layout_with_child( - elements_child, - len, - validity_buffer, - null_count, - offsets_buffer, - ctx, - ) -} -fn export_list_layout_with_child( - elements_child: ArrowArray, - len: usize, - validity_buffer: Option, - null_count: i64, - offsets_buffer: BufferHandle, - ctx: &mut CudaExecutionCtx, -) -> VortexResult<(ArrowArray, SyncEvent)> { let mut private_data = PrivateData::new( vec![validity_buffer, Some(offsets_buffer)], vec![elements_child], @@ -929,7 +909,7 @@ fn export_list_layout_with_child( Ok((arrow_list, sync_event)) } -/// Export a Vortex fixed-size-list as Arrow `List`. +/// Export a Vortex fixed-size-list as an Arrow Device array with `List` layout. /// /// cuDF's Arrow Device import accepts `List`/`LargeList` as cuDF `LIST`, but rejects /// `FixedSizeList`, so emit equivalent standard Arrow `List` offsets. @@ -981,7 +961,7 @@ async fn fixed_size_list_offsets( .await } -/// Return cuDF-supported Arrow `List` offsets as an `i32` device buffer. +/// Return Arrow Device `List` offsets as an `i32` device buffer. async fn export_arrow_list_offsets( offsets: ArrayRef, ctx: &mut CudaExecutionCtx, diff --git a/vortex-cuda/src/arrow/list_view.rs b/vortex-cuda/src/arrow/list_view.rs index b18aaf8c015..6e3f25a939f 100644 --- a/vortex-cuda/src/arrow/list_view.rs +++ b/vortex-cuda/src/arrow/list_view.rs @@ -39,7 +39,7 @@ use crate::CudaExecutionCtx; use crate::cub::exclusive_sum_i32; use crate::executor::CudaArrayExt; -/// Export a Vortex list-view as Arrow `List` using device kernels. +/// Export a Vortex list-view as an Arrow Device array with `List` layout using device kernels. /// /// Reuses contiguous children; rebuilds non-contiguous primitive or dictionary-code children. pub(super) async fn export_device_list_view( @@ -121,7 +121,7 @@ enum DeviceListViewOffsets { RequiresRebuild, } -/// Build cuDF-supported `i32` Arrow `List` offsets from list-view offset/size device buffers. +/// Build Arrow Device `List` offsets from list-view offset/size device buffers. #[expect(clippy::cognitive_complexity)] async fn export_device_list_view_offsets( offsets_ptype: PType, @@ -391,7 +391,7 @@ async fn export_rebuilt_dict_list_view( .await } -/// Rebuild a non-contiguous primitive list-view child and export it as an Arrow List. +/// Rebuild a non-contiguous primitive list-view child and export it as Arrow Device `List`. #[expect(clippy::too_many_arguments)] async fn export_rebuilt_primitive_list_view( elements: ArrayRef, diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index f98dbb74a28..1c95d8c4ba1 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -139,7 +139,9 @@ fn is_dyn_dispatch_cast_compatible(array: &ArrayRef) -> bool { let Ok(source_ptype) = PType::try_from(cast.child_at(0).dtype()) else { return false; }; - let target_ptype = cast.scalar_fn().as_::().as_ptype(); + let Ok(target_ptype) = PType::try_from(cast.scalar_fn().as_::()) else { + return false; + }; // Implemented as unsigned dictionary-code casts to cuDF's signed index types. // LOAD/BITUNPACK materialize directly into the target-width output type. @@ -892,3 +894,29 @@ impl FusedPlan { len * final_elem_bytes.max(output_elem_bytes) } } + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::arrays::PrimitiveArray; + use vortex::array::builtins::ArrayBuiltins; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + + use super::*; + + #[test] + fn cast_to_non_primitive_target_is_not_dyn_dispatch_compatible() -> VortexResult<()> { + let cast = PrimitiveArray::from_iter([0u8, 1]) + .into_array() + .cast(DType::Bool(Nullability::NonNullable))?; + + assert!(!is_dyn_dispatch_cast_compatible(&cast)); + assert!(matches!( + DispatchPlan::new(&cast, CudaDispatchMode::DynDispatchOnly)?, + DispatchPlan::Unfused + )); + + Ok(()) + } +}