vortex-data · 0ax1 · Jun 12, 2026 · Jun 11, 2026 · Jun 12, 2026
diff --git a/vortex-cuda/README.md b/vortex-cuda/README.md
@@ -10,35 +10,27 @@ Key files:
 - `vortex-cuda/src/arrow/canonical.rs`: canonical-array export to `ArrowDeviceArray`.
 - `vortex-test/e2e-cuda/src/lib.rs`: cuDF interop harness.
 
-Current export coverage includes primitive, bool, decimal/temporal, string/binary view, and struct arrays. Remaining work includes null masks, broader dtype coverage, `ArrowDeviceArrayStream`, and PyVortex integration.
+## Building cuDF for Arrow Device interop
 
-## cuDF compatibility
+The `cudf-test-harness` repository provides prebuilt cuDF binaries for Arrow Device interop testing on
+x86_64 and aarch64.
 
-Vortex exports string and binary columns as Arrow `Utf8View` / `BinaryView` device arrays with producer-owned `ArrowArray.private_data`. cuDF string/binary interop requires a build containing `rapidsai/cudf#22620`; until a release version is identified, test with a cuDF commit that includes that change.
-
-## Building cuDF for interop testing
-
-Pass a single CUDA architecture, e.g. `-DCMAKE_CUDA_ARCHITECTURES=90a`; otherwise cuDF builds for many architectures and local builds are much slower.
+From the cuDF repository root, compile the Arrow Device interop target locally without exporting additional
+environment variables:
 
 ```sh
-export PATH=/usr/local/cuda-13.1/bin:$PATH
+cmake -E rm -rf cpp/build
 
 cmake -S cpp -B cpp/build \
-  -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX:-/usr/local} \
-  -DCMAKE_CUDA_ARCHITECTURES=90a \
+  -DCMAKE_INSTALL_PREFIX=/usr/local \
+  -DCMAKE_CUDA_ARCHITECTURES=NATIVE \
   -DBUILD_TESTS=ON \
   -DDISABLE_DEPRECATION_WARNINGS=ON \
   -DCMAKE_BUILD_TYPE=Debug \
   -DCUDF_BUILD_STREAMS_TEST_UTIL=OFF \
-  -DCUDAToolkit_ROOT=/usr/local/cuda-13.1 \
-  -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc \
-  -DCMAKE_CXX_COMPILER=/usr/bin/g++-13 \
-  -DCMAKE_C_COMPILER=/usr/bin/gcc-13 \
-  -GNinja
-
-cmake --build cpp/build --target INTEROP_TEST -j$(nproc)
-
-LD_LIBRARY_PATH=/usr/local/cuda-13.1/compat:$LD_LIBRARY_PATH ./cpp/build/gtests/INTEROP_TEST
+  -DCUDAToolkit_ROOT=/usr/local/cuda \
+  -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+  -DCMAKE_C_COMPILER=gcc \
+  -DCMAKE_CXX_COMPILER=g++ \
+  -GNinja && cmake --build cpp/build --target INTEROP_TEST --parallel
 ```
-
-Adjust architecture, compiler paths, and CUDA paths for the machine under test.
diff --git a/vortex-cuda/kernels/src/arrow_binary.cu b/vortex-cuda/kernels/src/arrow_binary.cu
@@ -63,8 +63,9 @@ __device__ void init_scan_device(const BinaryView *const __restrict views,
             const BinaryViewRef *const view_ref = reinterpret_cast<const BinaryViewRef *>(&view);
             const uint64_t buffer_index = static_cast<uint64_t>(view_ref->buffer_index);
             const uint64_t offset = static_cast<uint64_t>(view_ref->offset);
+            // Both addends are u32 widened to u64, so the end position cannot wrap.
             const uint64_t end = offset + static_cast<uint64_t>(size);
-            if (buffer_index >= data_buffer_count || end < offset || end > data_buffer_lens[buffer_index]) {
+            if (buffer_index >= data_buffer_count || end > data_buffer_lens[buffer_index]) {
                 scan[idx] = 0;
                 atomicMax(status, 1u);
                 continue;

diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
@@ -270,7 +270,7 @@ fn export_canonical(
     })
 }
 
-/// Export a Vortex dictionary array as an Arrow dictionary array.
+/// Export a Vortex dictionary array as an Arrow Device dictionary array.
 ///
 /// Owns the codes buffers and recursively exported dictionary values.
 async fn export_dict(
@@ -446,11 +446,7 @@ where
     Ok(BufferHandle::new_device(Arc::new(output_device)))
 }
 
-/// Export Vortex binary views as standard Arrow `Binary`.
-///
-/// cuDF imports Arrow `Binary` through the Arrow Device path, but does not currently accept
-/// Arrow `BinaryView`. This path keeps conversion on the CUDA stream by building `i32` offsets
-/// from view sizes and gathering inline/out-of-line view bytes into one contiguous values buffer.
+/// Export Vortex binary views as an Arrow Device array with standard `Binary` layout.
 async fn export_binary(
     varbinview: VarBinViewArray,
     ctx: &mut CudaExecutionCtx,
@@ -676,9 +672,13 @@ fn gather_binary_values(
     )
 }
 
-/// Export Vortex validity as an Arrow validity byte buffer on the CUDA device.
+/// Export Vortex validity as an Arrow Device validity byte buffer.
 ///
 /// Returns `None` for the buffer when Arrow can omit validity because all rows are valid.
+///
+/// Returned buffers use zeroed 4-byte padding so cuDF's word-sized mask reads stay in bounds.
+/// Bits at positions `>= len + arrow_offset` within the final data byte are unspecified, as
+/// Arrow permits.
 pub(super) async fn export_arrow_validity_buffer(
     validity: Validity,
     len: usize,
@@ -788,7 +788,7 @@ pub(super) fn repack_arrow_validity_buffer(
     Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes))
 }
 
-/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array.
+/// Export a Vortex list-view as an Arrow Device array with `List` layout.
 async fn export_list_view(
     listview: ListViewArray,
     ctx: &mut CudaExecutionCtx,
@@ -815,29 +815,12 @@ async fn export_list_view(
     .await
 }
 
+/// Export a standard Vortex list as an Arrow Device array with `List` layout.
 async fn export_list(
     array: ListArray,
     child_export: ListChildExport,
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<(ArrowArray, SyncEvent)> {
-    let (elements, len, validity_buffer, null_count, offsets_buffer) =
-        list_layout_parts(array, ctx).await?;
-    export_list_layout(
-        elements,
-        len,
-        validity_buffer,
-        null_count,
-        offsets_buffer,
-        child_export,
-        ctx,
-    )
-    .await
-}
-
-async fn list_layout_parts(
-    array: ListArray,
-    ctx: &mut CudaExecutionCtx,
-) -> VortexResult<(ArrayRef, usize, Option<BufferHandle>, i64, BufferHandle)> {
     let len = array.len();
     let ListDataParts {
         elements,
@@ -848,7 +831,16 @@ async fn list_layout_parts(
 
     let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?;
     let offsets_buffer = export_arrow_list_offsets(offsets, ctx).await?;
-    Ok((elements, len, validity_buffer, null_count, offsets_buffer))
+    export_list_layout(
+        elements,
+        len,
+        validity_buffer,
+        null_count,
+        offsets_buffer,
+        child_export,
+        ctx,
+    )
+    .await
 }
 
 #[derive(Clone, Copy)]
@@ -858,6 +850,11 @@ pub(super) enum ListChildExport {
     PreserveConcreteLayout,
     /// Canonicalize temporary encodings introduced by the host ListView
     /// rebuild, while still preserving rebuilt dictionary children.
+    ///
+    /// This is not equivalent to `export_array`: the take-based rebuild wraps
+    /// children in transient encodings (for example `take` returns `Dict`
+    /// arrays nested inside struct fields) that the pre-computed export schema
+    /// does not include, so they must be canonicalized away before export.
     RebuiltListViewChild,
 }
 
@@ -880,7 +877,7 @@ impl ListChildExport {
     }
 }
 
-/// Build the shared Arrow `List` parent once offsets and validity are ready on device.
+/// Build the shared Arrow Device `List` parent once offsets and validity are ready.
 pub(super) async fn export_list_layout(
     elements: ArrayRef,
     len: usize,
@@ -891,24 +888,7 @@ pub(super) async fn export_list_layout(
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<(ArrowArray, SyncEvent)> {
     let elements_child = child_export.export(elements, ctx).await?;
-    export_list_layout_with_child(
-        elements_child,
-        len,
-        validity_buffer,
-        null_count,
-        offsets_buffer,
-        ctx,
-    )
-}
 
-fn export_list_layout_with_child(
-    elements_child: ArrowArray,
-    len: usize,
-    validity_buffer: Option<BufferHandle>,
-    null_count: i64,
-    offsets_buffer: BufferHandle,
-    ctx: &mut CudaExecutionCtx,
-) -> VortexResult<(ArrowArray, SyncEvent)> {
     let mut private_data = PrivateData::new(
         vec![validity_buffer, Some(offsets_buffer)],
         vec![elements_child],
@@ -929,7 +909,7 @@ fn export_list_layout_with_child(
     Ok((arrow_list, sync_event))
 }
 
-/// Export a Vortex fixed-size-list as Arrow `List`.
+/// Export a Vortex fixed-size-list as an Arrow Device array with `List` layout.
 ///
 /// cuDF's Arrow Device import accepts `List`/`LargeList` as cuDF `LIST`, but rejects
 /// `FixedSizeList`, so emit equivalent standard Arrow `List` offsets.
@@ -981,7 +961,7 @@ async fn fixed_size_list_offsets(
     .await
 }
 
-/// Return cuDF-supported Arrow `List` offsets as an `i32` device buffer.
+/// Return Arrow Device `List` offsets as an `i32` device buffer.
 async fn export_arrow_list_offsets(
     offsets: ArrayRef,
     ctx: &mut CudaExecutionCtx,

diff --git a/vortex-cuda/src/arrow/list_view.rs b/vortex-cuda/src/arrow/list_view.rs
@@ -39,7 +39,7 @@ use crate::CudaExecutionCtx;
 use crate::cub::exclusive_sum_i32;
 use crate::executor::CudaArrayExt;
 
-/// Export a Vortex list-view as Arrow `List` using device kernels.
+/// Export a Vortex list-view as an Arrow Device array with `List` layout using device kernels.
 ///
 /// Reuses contiguous children; rebuilds non-contiguous primitive or dictionary-code children.
 pub(super) async fn export_device_list_view(
@@ -121,7 +121,7 @@ enum DeviceListViewOffsets {
     RequiresRebuild,
 }
 
-/// Build cuDF-supported `i32` Arrow `List` offsets from list-view offset/size device buffers.
+/// Build Arrow Device `List` offsets from list-view offset/size device buffers.
 #[expect(clippy::cognitive_complexity)]
 async fn export_device_list_view_offsets(
     offsets_ptype: PType,
@@ -391,7 +391,7 @@ async fn export_rebuilt_dict_list_view(
     .await
 }
 
-/// Rebuild a non-contiguous primitive list-view child and export it as an Arrow List.
+/// Rebuild a non-contiguous primitive list-view child and export it as Arrow Device `List`.
 #[expect(clippy::too_many_arguments)]
 async fn export_rebuilt_primitive_list_view(
     elements: ArrayRef,

diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
@@ -139,7 +139,9 @@ fn is_dyn_dispatch_cast_compatible(array: &ArrayRef) -> bool {
     let Ok(source_ptype) = PType::try_from(cast.child_at(0).dtype()) else {
         return false;
     };
-    let target_ptype = cast.scalar_fn().as_::<Cast>().as_ptype();
+    let Ok(target_ptype) = PType::try_from(cast.scalar_fn().as_::<Cast>()) else {
+        return false;
+    };
 
     // Implemented as unsigned dictionary-code casts to cuDF's signed index types.
     // LOAD/BITUNPACK materialize directly into the target-width output type.
@@ -892,3 +894,29 @@ impl FusedPlan {
         len * final_elem_bytes.max(output_elem_bytes)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use vortex::array::IntoArray;
+    use vortex::array::arrays::PrimitiveArray;
+    use vortex::array::builtins::ArrayBuiltins;
+    use vortex::dtype::DType;
+    use vortex::dtype::Nullability;
+
+    use super::*;
+
+    #[test]
+    fn cast_to_non_primitive_target_is_not_dyn_dispatch_compatible() -> VortexResult<()> {
+        let cast = PrimitiveArray::from_iter([0u8, 1])
+            .into_array()
+            .cast(DType::Bool(Nullability::NonNullable))?;
+
+        assert!(!is_dyn_dispatch_cast_compatible(&cast));
+        assert!(matches!(
+            DispatchPlan::new(&cast, CudaDispatchMode::DynDispatchOnly)?,
+            DispatchPlan::Unfused
+        ));
+
+        Ok(())
+    }
+}