Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 13 additions & 21 deletions vortex-cuda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,35 +10,27 @@ Key files:
- `vortex-cuda/src/arrow/canonical.rs`: canonical-array export to `ArrowDeviceArray`.
- `vortex-test/e2e-cuda/src/lib.rs`: cuDF interop harness.

Current export coverage includes primitive, bool, decimal/temporal, string/binary view, and struct arrays. Remaining work includes null masks, broader dtype coverage, `ArrowDeviceArrayStream`, and PyVortex integration.
## Building cuDF for Arrow Device interop

## cuDF compatibility
The `cudf-test-harness` repository provides prebuilt cuDF binaries for Arrow Device interop testing on
x86_64 and aarch64.

Vortex exports string and binary columns as Arrow `Utf8View` / `BinaryView` device arrays with producer-owned `ArrowArray.private_data`. cuDF string/binary interop requires a build containing `rapidsai/cudf#22620`; until a release version is identified, test with a cuDF commit that includes that change.

## Building cuDF for interop testing

Pass a single CUDA architecture, e.g. `-DCMAKE_CUDA_ARCHITECTURES=90a`; otherwise cuDF builds for many architectures and local builds are much slower.
From the cuDF repository root, compile the Arrow Device interop target locally without exporting additional
environment variables:

```sh
export PATH=/usr/local/cuda-13.1/bin:$PATH
cmake -E rm -rf cpp/build

cmake -S cpp -B cpp/build \
-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX:-/usr/local} \
-DCMAKE_CUDA_ARCHITECTURES=90a \
-DCMAKE_INSTALL_PREFIX=/usr/local \
-DCMAKE_CUDA_ARCHITECTURES=NATIVE \
-DBUILD_TESTS=ON \
-DDISABLE_DEPRECATION_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DCUDF_BUILD_STREAMS_TEST_UTIL=OFF \
-DCUDAToolkit_ROOT=/usr/local/cuda-13.1 \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc \
-DCMAKE_CXX_COMPILER=/usr/bin/g++-13 \
-DCMAKE_C_COMPILER=/usr/bin/gcc-13 \
-GNinja

cmake --build cpp/build --target INTEROP_TEST -j$(nproc)

LD_LIBRARY_PATH=/usr/local/cuda-13.1/compat:$LD_LIBRARY_PATH ./cpp/build/gtests/INTEROP_TEST
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-DCMAKE_C_COMPILER=gcc \
-DCMAKE_CXX_COMPILER=g++ \
-GNinja && cmake --build cpp/build --target INTEROP_TEST --parallel
```

Adjust architecture, compiler paths, and CUDA paths for the machine under test.
3 changes: 2 additions & 1 deletion vortex-cuda/kernels/src/arrow_binary.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ __device__ void init_scan_device(const BinaryView *const __restrict views,
const BinaryViewRef *const view_ref = reinterpret_cast<const BinaryViewRef *>(&view);
const uint64_t buffer_index = static_cast<uint64_t>(view_ref->buffer_index);
const uint64_t offset = static_cast<uint64_t>(view_ref->offset);
// Both addends are u32 widened to u64, so the end position cannot wrap.
const uint64_t end = offset + static_cast<uint64_t>(size);
if (buffer_index >= data_buffer_count || end < offset || end > data_buffer_lens[buffer_index]) {
if (buffer_index >= data_buffer_count || end > data_buffer_lens[buffer_index]) {
scan[idx] = 0;
atomicMax(status, 1u);
continue;
Expand Down
74 changes: 27 additions & 47 deletions vortex-cuda/src/arrow/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ fn export_canonical(
})
}

/// Export a Vortex dictionary array as an Arrow dictionary array.
/// Export a Vortex dictionary array as an Arrow Device dictionary array.
///
/// Owns the codes buffers and recursively exported dictionary values.
async fn export_dict(
Expand Down Expand Up @@ -446,11 +446,7 @@ where
Ok(BufferHandle::new_device(Arc::new(output_device)))
}

/// Export Vortex binary views as standard Arrow `Binary`.
///
/// cuDF imports Arrow `Binary` through the Arrow Device path, but does not currently accept
/// Arrow `BinaryView`. This path keeps conversion on the CUDA stream by building `i32` offsets
/// from view sizes and gathering inline/out-of-line view bytes into one contiguous values buffer.
/// Export Vortex binary views as an Arrow Device array with standard `Binary` layout.
async fn export_binary(
varbinview: VarBinViewArray,
ctx: &mut CudaExecutionCtx,
Expand Down Expand Up @@ -676,9 +672,13 @@ fn gather_binary_values(
)
}

/// Export Vortex validity as an Arrow validity byte buffer on the CUDA device.
/// Export Vortex validity as an Arrow Device validity byte buffer.
///
/// Returns `None` for the buffer when Arrow can omit validity because all rows are valid.
///
/// Returned buffers use zeroed 4-byte padding so cuDF's word-sized mask reads stay in bounds.
/// Bits at positions `>= len + arrow_offset` within the final data byte are unspecified, as
/// Arrow permits.
pub(super) async fn export_arrow_validity_buffer(
validity: Validity,
len: usize,
Expand Down Expand Up @@ -788,7 +788,7 @@ pub(super) fn repack_arrow_validity_buffer(
Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes))
}

/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array.
/// Export a Vortex list-view as an Arrow Device array with `List` layout.
async fn export_list_view(
listview: ListViewArray,
ctx: &mut CudaExecutionCtx,
Expand All @@ -815,29 +815,12 @@ async fn export_list_view(
.await
}

/// Export a standard Vortex list as an Arrow Device array with `List` layout.
async fn export_list(
array: ListArray,
child_export: ListChildExport,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<(ArrowArray, SyncEvent)> {
let (elements, len, validity_buffer, null_count, offsets_buffer) =
list_layout_parts(array, ctx).await?;
export_list_layout(
elements,
len,
validity_buffer,
null_count,
offsets_buffer,
child_export,
ctx,
)
.await
}

async fn list_layout_parts(
array: ListArray,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<(ArrayRef, usize, Option<BufferHandle>, i64, BufferHandle)> {
let len = array.len();
let ListDataParts {
elements,
Expand All @@ -848,7 +831,16 @@ async fn list_layout_parts(

let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?;
let offsets_buffer = export_arrow_list_offsets(offsets, ctx).await?;
Ok((elements, len, validity_buffer, null_count, offsets_buffer))
export_list_layout(
elements,
len,
validity_buffer,
null_count,
offsets_buffer,
child_export,
ctx,
)
.await
}

#[derive(Clone, Copy)]
Expand All @@ -858,6 +850,11 @@ pub(super) enum ListChildExport {
PreserveConcreteLayout,
/// Canonicalize temporary encodings introduced by the host ListView
/// rebuild, while still preserving rebuilt dictionary children.
///
/// This is not equivalent to `export_array`: the take-based rebuild wraps
/// children in transient encodings (for example `take` returns `Dict`
/// arrays nested inside struct fields) that the pre-computed export schema
/// does not include, so they must be canonicalized away before export.
RebuiltListViewChild,
}

Expand All @@ -880,7 +877,7 @@ impl ListChildExport {
}
}

/// Build the shared Arrow `List` parent once offsets and validity are ready on device.
/// Build the shared Arrow Device `List` parent once offsets and validity are ready.
pub(super) async fn export_list_layout(
elements: ArrayRef,
len: usize,
Expand All @@ -891,24 +888,7 @@ pub(super) async fn export_list_layout(
ctx: &mut CudaExecutionCtx,
) -> VortexResult<(ArrowArray, SyncEvent)> {
let elements_child = child_export.export(elements, ctx).await?;
export_list_layout_with_child(
elements_child,
len,
validity_buffer,
null_count,
offsets_buffer,
ctx,
)
}

fn export_list_layout_with_child(
elements_child: ArrowArray,
len: usize,
validity_buffer: Option<BufferHandle>,
null_count: i64,
offsets_buffer: BufferHandle,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<(ArrowArray, SyncEvent)> {
let mut private_data = PrivateData::new(
vec![validity_buffer, Some(offsets_buffer)],
vec![elements_child],
Expand All @@ -929,7 +909,7 @@ fn export_list_layout_with_child(
Ok((arrow_list, sync_event))
}

/// Export a Vortex fixed-size-list as Arrow `List`.
/// Export a Vortex fixed-size-list as an Arrow Device array with `List` layout.
///
/// cuDF's Arrow Device import accepts `List`/`LargeList` as cuDF `LIST`, but rejects
/// `FixedSizeList`, so emit equivalent standard Arrow `List` offsets.
Expand Down Expand Up @@ -981,7 +961,7 @@ async fn fixed_size_list_offsets(
.await
}

/// Return cuDF-supported Arrow `List` offsets as an `i32` device buffer.
/// Return Arrow Device `List` offsets as an `i32` device buffer.
async fn export_arrow_list_offsets(
offsets: ArrayRef,
ctx: &mut CudaExecutionCtx,
Expand Down
6 changes: 3 additions & 3 deletions vortex-cuda/src/arrow/list_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ use crate::CudaExecutionCtx;
use crate::cub::exclusive_sum_i32;
use crate::executor::CudaArrayExt;

/// Export a Vortex list-view as Arrow `List` using device kernels.
/// Export a Vortex list-view as an Arrow Device array with `List` layout using device kernels.
///
/// Reuses contiguous children; rebuilds non-contiguous primitive or dictionary-code children.
pub(super) async fn export_device_list_view(
Expand Down Expand Up @@ -121,7 +121,7 @@ enum DeviceListViewOffsets {
RequiresRebuild,
}

/// Build cuDF-supported `i32` Arrow `List` offsets from list-view offset/size device buffers.
/// Build Arrow Device `List` offsets from list-view offset/size device buffers.
#[expect(clippy::cognitive_complexity)]
async fn export_device_list_view_offsets(
offsets_ptype: PType,
Expand Down Expand Up @@ -391,7 +391,7 @@ async fn export_rebuilt_dict_list_view(
.await
}

/// Rebuild a non-contiguous primitive list-view child and export it as an Arrow List.
/// Rebuild a non-contiguous primitive list-view child and export it as Arrow Device `List`.
#[expect(clippy::too_many_arguments)]
async fn export_rebuilt_primitive_list_view(
elements: ArrayRef,
Expand Down
30 changes: 29 additions & 1 deletion vortex-cuda/src/dynamic_dispatch/plan_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ fn is_dyn_dispatch_cast_compatible(array: &ArrayRef) -> bool {
let Ok(source_ptype) = PType::try_from(cast.child_at(0).dtype()) else {
return false;
};
let target_ptype = cast.scalar_fn().as_::<Cast>().as_ptype();
let Ok(target_ptype) = PType::try_from(cast.scalar_fn().as_::<Cast>()) else {
return false;
};

// Implemented as unsigned dictionary-code casts to cuDF's signed index types.
// LOAD/BITUNPACK materialize directly into the target-width output type.
Expand Down Expand Up @@ -892,3 +894,29 @@ impl FusedPlan {
len * final_elem_bytes.max(output_elem_bytes)
}
}

#[cfg(test)]
mod tests {
use vortex::array::IntoArray;
use vortex::array::arrays::PrimitiveArray;
use vortex::array::builtins::ArrayBuiltins;
use vortex::dtype::DType;
use vortex::dtype::Nullability;

use super::*;

#[test]
fn cast_to_non_primitive_target_is_not_dyn_dispatch_compatible() -> VortexResult<()> {
let cast = PrimitiveArray::from_iter([0u8, 1])
.into_array()
.cast(DType::Bool(Nullability::NonNullable))?;

assert!(!is_dyn_dispatch_cast_compatible(&cast));
assert!(matches!(
DispatchPlan::new(&cast, CudaDispatchMode::DynDispatchOnly)?,
DispatchPlan::Unfused
));

Ok(())
}
}
Loading