From ea1bc69693e127e1bfa00c2056eaea9039a7d2bd Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Mon, 1 Jun 2026 15:41:49 +0000
Subject: [PATCH 1/6] feat: emit Decimal32/Decimal64 from the Arrow exporter

Cherry-pick of #8197 (Joe Isaacs). Enables narrow Arrow decimal types
in the shared Vortex-to-Arrow mapping now that DataFusion 51+ and
arrow-rs 56+ support them.

Signed-off-by: Alexander Droste <alex@spiraldb.com>
Co-Authored-By: Joe Isaacs <joe.isaacs@live.co.uk>

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-array/src/dtype/arrow.rs | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)
diff --git a/vortex-array/src/dtype/arrow.rs b/vortex-array/src/dtype/arrow.rs
index 8dd80082533..d0dd1fc48ac 100644
--- a/vortex-array/src/dtype/arrow.rs
+++ b/vortex-array/src/dtype/arrow.rs
@@ -298,13 +298,12 @@ pub(crate) fn to_data_type_naive(dtype: &DType) -> VortexResult<DataType> {
             let scale = dt.scale();
 
             match precision {
-                // This code is commented out until DataFusion improves its support for smaller decimals.
-                // // DECIMAL32_MAX_PRECISION
-                // 0..=9 => DataType::Decimal32(precision, scale),
-                // // DECIMAL64_MAX_PRECISION
-                // 10..=18 => DataType::Decimal64(precision, scale),
+                // DECIMAL32_MAX_PRECISION
+                0..=9 => DataType::Decimal32(precision, scale),
+                // DECIMAL64_MAX_PRECISION
+                10..=18 => DataType::Decimal64(precision, scale),
                 // DECIMAL128_MAX_PRECISION
-                0..=38 => DataType::Decimal128(precision, scale),
+                19..=38 => DataType::Decimal128(precision, scale),
                 // DECIMAL256_MAX_PRECISION
                 39.. => DataType::Decimal256(precision, scale),
             }
@@ -450,6 +449,25 @@ mod test {
         );
     }
 
+    #[rstest]
+    #[case(1, DataType::Decimal32(1, 0))]
+    #[case(9, DataType::Decimal32(9, 0))]
+    #[case(10, DataType::Decimal64(10, 0))]
+    #[case(18, DataType::Decimal64(18, 0))]
+    #[case(19, DataType::Decimal128(19, 0))]
+    #[case(38, DataType::Decimal128(38, 0))]
+    #[case(39, DataType::Decimal256(39, 0))]
+    #[case(76, DataType::Decimal256(76, 0))]
+    fn test_decimal_dtype_to_arrow_smallest_width(
+        #[case] precision: u8,
+        #[case] expected: DataType,
+    ) {
+        use crate::dtype::DecimalDType;
+
+        let dtype = DType::Decimal(DecimalDType::new(precision, 0), Nullability::NonNullable);
+        assert_eq!(dtype.to_arrow_dtype().unwrap(), expected);
+    }
+
     #[test]
     fn test_variant_dtype_to_arrow_dtype_errors() {
         let err = DType::Variant(Nullability::NonNullable)

From 8e2edf602547eed7768270d6861208b08e2adf51 Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Mon, 1 Jun 2026 15:42:00 +0000
Subject: [PATCH 2/6] feat(cuda): widen decimals on-device for Arrow C Device
 export

Vortex DecimalArray decouples physical storage width from declared
precision, but Arrow fixes the buffer width by precision. Add a CUDA
kernel that widens decimal value buffers on the GPU during export so
they match the Arrow-required layout (Decimal32/64/128/256).

The kernel handles all source widths (i8..i256) since compression can
produce narrow storage for any precision. Narrowing (storage wider than
the target) is rejected rather than silently truncated, since detecting
overflow on-device would need a device-to-host sync.

Signed-off-by: Alexander Droste <alex@spiraldb.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-cuda/kernels/src/decimal_cast.cu | 107 ++++++
 vortex-cuda/src/arrow/canonical.rs      | 428 ++++++++++++++++++++++--
 vortex-cuda/src/arrow/mod.rs            |  16 +-
 vortex-test/e2e-cuda/src/lib.rs         |  10 +-
 4 files changed, 520 insertions(+), 41 deletions(-)
 create mode 100644 vortex-cuda/kernels/src/decimal_cast.cu

diff --git a/vortex-cuda/kernels/src/decimal_cast.cu b/vortex-cuda/kernels/src/decimal_cast.cu
new file mode 100644
index 00000000000..22a8a130729
--- /dev/null
+++ b/vortex-cuda/kernels/src/decimal_cast.cu
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include "config.cuh"
+#include "types.cuh"
+#include <stdint.h>
+#include <type_traits>
+
+// Arrow decimal schemas fix the physical values buffer width:
+//   - Decimal32: 4 bytes per value.
+//   - Decimal64: 8 bytes per value.
+//   - Decimal128: 16 bytes per value.
+//   - Decimal256: 32 bytes per value.
+//
+// Vortex storage width can differ, so export casts to the schema-implied width.
+// Rust-side export rejects narrowing casts because detecting overflow on-device
+// would require synchronizing an overflow flag back to the host.
+
+// Low 64-bit conversion for Decimal32/64 outputs.
+template <typename Input>
+__device__ __forceinline__ int64_t decimal_to_i64(Input value) {
+    if constexpr (std::is_same_v<Input, int128_t>) {
+        return value.lo;
+    } else if constexpr (std::is_same_v<Input, int256_t>) {
+        return value.parts[0];
+    } else {
+        return static_cast<int64_t>(value);
+    }
+}
+
+// 128-bit conversion for Decimal128 outputs.
+template <typename Input>
+__device__ __forceinline__ int128_t decimal_to_i128(Input value) {
+    if constexpr (std::is_same_v<Input, int128_t>) {
+        return value;
+    } else if constexpr (std::is_same_v<Input, int256_t>) {
+        return int128_t {value.parts[0], value.parts[1]};
+    } else {
+        const int64_t lo = static_cast<int64_t>(value);
+        const int64_t hi = value < 0 ? -1 : 0;
+        return int128_t {lo, hi};
+    }
+}
+
+// Convert one value to the Arrow schema's physical width.
+template <typename Output, typename Input>
+__device__ __forceinline__ Output decimal_cast_value(Input value) {
+    if constexpr (std::is_same_v<Output, int32_t>) {
+        return static_cast<int32_t>(decimal_to_i64(value));
+    } else if constexpr (std::is_same_v<Output, int64_t>) {
+        return decimal_to_i64(value);
+    } else if constexpr (std::is_same_v<Output, int128_t>) {
+        return decimal_to_i128(value);
+    } else {
+        static_assert(std::is_same_v<Output, int256_t>);
+        if constexpr (std::is_same_v<Input, int256_t>) {
+            return value;
+        } else {
+            const int128_t value128 = decimal_to_i128(value);
+            const int64_t sign = value128.hi < 0 ? -1 : 0;
+            return int256_t {{value128.lo, value128.hi, sign, sign}};
+        }
+    }
+}
+
+// Cast a contiguous values buffer to the Arrow schema's physical width.
+template <typename Input, typename Output>
+__device__ void
+decimal_cast_device(const Input *__restrict input, Output *__restrict output, uint64_t array_len) {
+    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t startElem = start_elem(worker, array_len);
+    const uint64_t stopElem = stop_elem(worker, array_len);
+
+    if (startElem >= array_len) {
+        return;
+    }
+
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
+        output[idx] = decimal_cast_value<Output>(input[idx]);
+    }
+}
+
+// Generate Decimal32/64/128/256 cast kernels for one input storage type.
+#define GENERATE_DECIMAL_CAST_KERNELS(input_suffix, InputType)                                               \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i32(const InputType *__restrict input,          \
+                                                                 int32_t *__restrict output,                 \
+                                                                 uint64_t array_len) {                       \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }                                                                                                        \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i64(const InputType *__restrict input,          \
+                                                                 int64_t *__restrict output,                 \
+                                                                 uint64_t array_len) {                       \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }                                                                                                        \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i128(const InputType *__restrict input,         \
+                                                                  int128_t *__restrict output,               \
+                                                                  uint64_t array_len) {                      \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }                                                                                                        \
+    extern "C" __global__ void decimal_cast_##input_suffix##_i256(const InputType *__restrict input,         \
+                                                                  int256_t *__restrict output,               \
+                                                                  uint64_t array_len) {                      \
+        decimal_cast_device(input, output, array_len);                                                       \
+    }
+
+FOR_EACH_SIGNED_INT(GENERATE_DECIMAL_CAST_KERNELS)
+FOR_EACH_LARGE_DECIMAL(GENERATE_DECIMAL_CAST_KERNELS)
diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
index b1bee7c018f..fd6d6c75174 100644
--- a/vortex-cuda/src/arrow/canonical.rs
+++ b/vortex-cuda/src/arrow/canonical.rs
@@ -3,11 +3,15 @@
 
 use std::mem;
 use std::ptr;
+use std::sync::Arc;
 
 use async_trait::async_trait;
+use cudarc::driver::DeviceRepr;
+use cudarc::driver::PushKernelArg;
 use futures::future::BoxFuture;
 use vortex::array::ArrayRef;
 use vortex::array::Canonical;
+use vortex::array::arrays::DecimalArray;
 use vortex::array::arrays::FixedSizeListArray;
 use vortex::array::arrays::ListArray;
 use vortex::array::arrays::PrimitiveArray;
@@ -24,13 +28,17 @@ use vortex::array::arrays::struct_::StructDataParts;
 use vortex::array::arrays::varbinview::VarBinViewDataParts;
 use vortex::array::buffer::BufferHandle;
 use vortex::array::builtins::ArrayBuiltins;
+use vortex::array::match_each_decimal_value_type;
 use vortex::array::validity::Validity;
 use vortex::buffer::Buffer;
 use vortex::buffer::ByteBuffer;
 use vortex::dtype::DType;
+use vortex::dtype::DecimalDType;
 use vortex::dtype::DecimalType;
+use vortex::dtype::NativeDecimalType;
 use vortex::dtype::Nullability;
 use vortex::dtype::PType;
+use vortex::dtype::i256;
 use vortex::error::VortexResult;
 use vortex::error::vortex_bail;
 use vortex::error::vortex_ensure;
@@ -39,6 +47,8 @@ use vortex::extension::datetime::AnyTemporal;
 use vortex::mask::Mask;
 
 use super::list_view::export_device_list_view;
+use crate::CudaBufferExt;
+use crate::CudaDeviceBuffer;
 use crate::CudaExecutionCtx;
 use crate::arrow::ARROW_DEVICE_CUDA;
 use crate::arrow::ArrowArray;
@@ -106,27 +116,7 @@ fn export_canonical(
                 // we don't need a sync event for Null since no data is copied.
                 Ok((array, ptr::null_mut()))
             }
-            Canonical::Decimal(decimal) => {
-                let len = decimal.len();
-                let DecimalDataParts {
-                    values,
-                    values_type,
-                    validity,
-                    ..
-                } = decimal.into_data_parts();
-
-                // TODO(aduffy): GPU kernel for upcasting.
-                vortex_ensure!(
-                    values_type >= DecimalType::I32,
-                    "cannot export DecimalArray with values type {values_type}. must be i32 or wider."
-                );
-
-                let (validity_buffer, null_count) =
-                    export_arrow_validity_buffer(validity, len, 0, ctx).await?;
-                let buffer = ctx.ensure_on_device(values).await?;
-
-                export_fixed_size(buffer, len, 0, validity_buffer, null_count, ctx)
-            }
+            Canonical::Decimal(decimal) => export_decimal(decimal, ctx).await,
             Canonical::Extension(extension) => {
                 if !extension.ext_dtype().is::<AnyTemporal>() {
                     vortex_bail!("only support temporal extension types currently");
@@ -242,6 +232,130 @@ fn export_canonical(
     })
 }
 
+/// Exports decimals with value buffers cast to Arrow's Decimal32/64/128/256 layout.
+///
+/// Decimal values are already decoded; this only adapts the physical buffer width. Keep this in
+/// sync with the schema mapping in `arrow_schema_for_array`. Storage-to-Arrow narrowing is rejected
+/// instead of checked on-device to avoid a device-to-host synchronization point.
+async fn export_decimal(
+    decimal: DecimalArray,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<(ArrowArray, SyncEvent)> {
+    let len = decimal.len();
+    let DecimalDataParts {
+        decimal_dtype,
+        values,
+        values_type,
+        validity,
+    } = decimal.into_data_parts();
+
+    let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?;
+    let target_type = arrow_decimal_value_type(decimal_dtype);
+    let values = export_decimal_values(values, values_type, target_type, len, ctx).await?;
+
+    export_fixed_size(values, len, 0, validity_buffer, null_count, ctx)
+}
+
+/// Returns the Arrow physical value type for a decimal dtype.
+///
+/// Must match the shared Vortex-to-Arrow decimal mapping in
+/// `vortex-array/src/dtype/arrow.rs`.
+fn arrow_decimal_value_type(decimal_dtype: DecimalDType) -> DecimalType {
+    match decimal_dtype.precision() {
+        1..=9 => DecimalType::I32,
+        10..=18 => DecimalType::I64,
+        19..=38 => DecimalType::I128,
+        39..=76 => DecimalType::I256,
+        0 => unreachable!("precision must be greater than 0"),
+        p => unreachable!("precision larger than 76 is invalid found precision {p}"),
+    }
+}
+
+/// Ensure the values buffer is on-device and has the Arrow-required decimal width.
+///
+/// Storage wider than the precision-implied Arrow width is rejected. Callers that hit this
+/// should narrow the storage via a decimal cast before exporting.
+async fn export_decimal_values(
+    values: BufferHandle,
+    values_type: DecimalType,
+    target_type: DecimalType,
+    len: usize,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<BufferHandle> {
+    if values_type.byte_width() > target_type.byte_width() {
+        vortex_bail!(
+            "cannot export decimal values from {values_type} storage to Arrow {target_type}: narrowing would require a device-to-host overflow check",
+        );
+    }
+    let values = ctx.ensure_on_device(values).await?;
+    if values_type == target_type {
+        return Ok(values);
+    }
+
+    match_each_decimal_value_type!(values_type, |S| {
+        export_decimal_values_from::<S>(values, target_type, len, ctx).await
+    })
+}
+
+/// Dispatch from a concrete storage type `S` to the Arrow-required output width.
+async fn export_decimal_values_from<S>(
+    values: BufferHandle,
+    target_type: DecimalType,
+    len: usize,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<BufferHandle>
+where
+    S: NativeDecimalType + DeviceRepr,
+{
+    match target_type {
+        DecimalType::I32 => decimal_cast::<S, i32>(values, len, ctx).await,
+        DecimalType::I64 => decimal_cast::<S, i64>(values, len, ctx).await,
+        DecimalType::I128 => decimal_cast::<S, i128>(values, len, ctx).await,
+        DecimalType::I256 => decimal_cast::<S, i256>(values, len, ctx).await,
+        target_type => {
+            vortex_bail!("cannot export DecimalArray as Arrow decimal value type {target_type}")
+        }
+    }
+}
+
+/// Launches the CUDA kernel that casts from Vortex storage type `S` to Arrow output type `D`.
+///
+/// The caller must ensure this is not a narrowing cast.
+async fn decimal_cast<S, D>(
+    values: BufferHandle,
+    len: usize,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<BufferHandle>
+where
+    S: NativeDecimalType + DeviceRepr,
+    D: NativeDecimalType + DeviceRepr,
+{
+    if len == 0 {
+        return ctx
+            .ensure_on_device(BufferHandle::new_host(
+                Buffer::<D>::empty().into_byte_buffer(),
+            ))
+            .await;
+    }
+
+    let output_buffer = ctx.device_alloc::<D>(len)?;
+    let output_device = CudaDeviceBuffer::new(output_buffer);
+
+    let values_view = values.cuda_view::<S>()?;
+    let output_view = output_device.as_view::<D>();
+    let len_u64 = len as u64;
+    let cuda_function = ctx.load_function_with_suffixes(
+        "decimal_cast",
+        &[&S::DECIMAL_TYPE.to_string(), &D::DECIMAL_TYPE.to_string()],
+    )?;
+
+    ctx.launch_kernel(&cuda_function, len, |args| {
+        args.arg(&values_view).arg(&output_view).arg(&len_u64);
+    })?;
+
+    Ok(BufferHandle::new_device(Arc::new(output_device)))
+}
+
 /// Export Vortex validity as an Arrow validity byte buffer.
 ///
 /// Returns `None` for the buffer when Arrow can omit validity because all rows are valid.
@@ -537,16 +651,19 @@ mod tests {
     use vortex::array::arrays::VarBinViewArray;
     use vortex::array::arrays::primitive::PrimitiveArrayExt;
     use vortex::array::arrays::varbinview::BinaryView;
+    use vortex::array::buffer::BufferHandle;
     use vortex::array::validity::Validity;
     use vortex::buffer::Buffer;
     use vortex::buffer::ByteBuffer;
     use vortex::dtype::DType;
     use vortex::dtype::DecimalDType;
     use vortex::dtype::FieldNames;
+    use vortex::dtype::NativeDecimalType;
     use vortex::dtype::NativePType;
     use vortex::dtype::Nullability;
     use vortex::dtype::PType;
     use vortex::dtype::half::f16;
+    use vortex::dtype::i256;
     use vortex::error::VortexExpect;
     use vortex::error::VortexResult;
     use vortex::error::vortex_bail;
@@ -761,6 +878,14 @@ mod tests {
             .collect())
     }
 
+    fn assert_exported_decimal_values<T: NativeDecimalType>(
+        value_buffer: &BufferHandle,
+        expected: &[T],
+    ) {
+        let values = Buffer::<T>::from_byte_buffer(value_buffer.to_host_sync());
+        assert_eq!(values.as_slice(), expected);
+    }
+
     // Build a nested struct fixture with an out-of-line string-view value.
     fn nested_struct_array() -> ArrayRef {
         let nested = StructArray::new(
@@ -855,25 +980,197 @@ mod tests {
         Ok(())
     }
 
+    async fn assert_exported_decimal<T: NativeDecimalType>(
+        array: ArrayRef,
+        expected_data_type: DataType,
+        expected_values: Vec<T>,
+    ) -> VortexResult<()> {
+        let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        let mut exported = array.export_device_array_with_schema(&mut ctx).await?;
+
+        let field = Field::try_from(&exported.schema)?;
+        assert_eq!(field, Field::new("", expected_data_type, false));
+        assert_eq!(
+            exported.array.array.length,
+            i64::try_from(expected_values.len())?
+        );
+        assert_eq!(exported.array.array.null_count, 0);
+        assert_eq!(exported.array.array.n_buffers, 2);
+        assert_eq!(exported.array.array.n_children, 0);
+        assert!(exported.array.array.release.is_some());
+        assert_eq!(exported.array.device_type, ARROW_DEVICE_CUDA);
+
+        let private_data = unsafe { &*exported.array.array.private_data.cast::<PrivateData>() };
+        let value_buffer = private_data.buffers[1]
+            .as_ref()
+            .vortex_expect("value buffer should be present");
+        assert_eq!(value_buffer.len(), expected_values.len() * size_of::<T>());
+        assert_exported_decimal_values(value_buffer, &expected_values);
+
+        unsafe { release_exported_array(&raw mut exported.array.array) };
+        Ok(())
+    }
+
+    #[rstest]
+    #[case::i8(
+        DecimalArray::from_iter([1i8, -2, 3], DecimalDType::new(2, 1)).into_array(),
+        DataType::Decimal32(2, 1),
+        vec![1i32, -2, 3]
+    )]
+    #[case::i16(
+        DecimalArray::from_iter([100i16, -200, 300], DecimalDType::new(4, 2)).into_array(),
+        DataType::Decimal32(4, 2),
+        vec![100i32, -200, 300]
+    )]
+    #[case::i32(
+        DecimalArray::from_iter([10_000i32, -20_000, 30_000], DecimalDType::new(9, 2)).into_array(),
+        DataType::Decimal32(9, 2),
+        vec![10_000i32, -20_000, 30_000]
+    )]
     #[crate::test]
-    async fn test_export_decimal() -> VortexResult<()> {
+    async fn test_export_decimal32(
+        #[case] array: ArrayRef,
+        #[case] expected_data_type: DataType,
+        #[case] expected_values: Vec<i32>,
+    ) -> VortexResult<()> {
+        assert_exported_decimal(array, expected_data_type, expected_values).await
+    }
+
+    #[rstest]
+    #[case::i32(
+        DecimalArray::from_iter([1_000_000i32, -2_000_000, 3_000_000], DecimalDType::new(10, 2)).into_array(),
+        DataType::Decimal64(10, 2),
+        vec![1_000_000i64, -2_000_000, 3_000_000]
+    )]
+    #[case::i32_boundary(
+        DecimalArray::from_iter([i32::MIN, -1i32, 0, 1, i32::MAX], DecimalDType::new(10, 0)).into_array(),
+        DataType::Decimal64(10, 0),
+        vec![i32::MIN as i64, -1, 0, 1, i32::MAX as i64]
+    )]
+    #[case::i64(
+        DecimalArray::from_iter([1_000_000i64, -2_000_000, 3_000_000], DecimalDType::new(18, 2)).into_array(),
+        DataType::Decimal64(18, 2),
+        vec![1_000_000i64, -2_000_000, 3_000_000]
+    )]
+    #[crate::test]
+    async fn test_export_decimal64(
+        #[case] array: ArrayRef,
+        #[case] expected_data_type: DataType,
+        #[case] expected_values: Vec<i64>,
+    ) -> VortexResult<()> {
+        assert_exported_decimal(array, expected_data_type, expected_values).await
+    }
+
+    #[rstest]
+    #[case::i64_boundary(
+        DecimalArray::from_iter([i64::MIN, -1i64, 0, 1, i64::MAX], DecimalDType::new(19, 0)).into_array(),
+        DataType::Decimal128(19, 0),
+        vec![i64::MIN as i128, -1, 0, 1, i64::MAX as i128]
+    )]
+    #[case::i128(
+        DecimalArray::from_iter([1i128, -2, 3], DecimalDType::new(38, 2)).into_array(),
+        DataType::Decimal128(38, 2),
+        vec![1i128, -2, 3]
+    )]
+    #[crate::test]
+    async fn test_export_decimal128(
+        #[case] array: ArrayRef,
+        #[case] expected_data_type: DataType,
+        #[case] expected_values: Vec<i128>,
+    ) -> VortexResult<()> {
+        assert_exported_decimal(array, expected_data_type, expected_values).await
+    }
+
+    #[crate::test]
+    async fn test_export_empty_decimal() -> VortexResult<()> {
+        assert_exported_decimal(
+            DecimalArray::new(
+                Buffer::<i32>::empty(),
+                DecimalDType::new(9, 2),
+                Validity::NonNullable,
+            )
+            .into_array(),
+            DataType::Decimal32(9, 2),
+            Vec::<i32>::new(),
+        )
+        .await
+    }
+
+    #[crate::test]
+    async fn test_export_empty_decimal_widening() -> VortexResult<()> {
+        assert_exported_decimal(
+            DecimalArray::new(
+                Buffer::<i8>::empty(),
+                DecimalDType::new(9, 2),
+                Validity::NonNullable,
+            )
+            .into_array(),
+            DataType::Decimal32(9, 2),
+            Vec::<i32>::new(),
+        )
+        .await
+    }
+
+    #[crate::test]
+    async fn test_export_decimal_narrowing_errors() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
+        let array = DecimalArray::from_iter([i256::from_parts(0, 1)], DecimalDType::new(38, 0))
+            .into_array();
 
-        let array = DecimalArray::from_iter(0i128..5, DecimalDType::new(38, 2)).into_array();
-        let mut device_array = array.export_device_array(&mut ctx).await?;
+        let err = array
+            .export_device_array_with_schema(&mut ctx)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("narrowing would require"));
+        Ok(())
+    }
 
-        assert_eq!(device_array.array.length, 5);
-        assert_eq!(device_array.array.null_count, 0);
-        assert_eq!(device_array.array.n_buffers, 2);
-        assert_eq!(device_array.array.n_children, 0);
-        assert!(device_array.array.release.is_some());
-        assert_eq!(device_array.device_type, ARROW_DEVICE_CUDA);
+    #[crate::test]
+    async fn test_export_decimal_narrowing_from_arrow_import() -> VortexResult<()> {
+        let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+        let array = DecimalArray::from_iter([0i128, 1, -2], DecimalDType::new(10, 2)).into_array();
 
-        unsafe { release_exported_array(&raw mut device_array.array) };
+        let err = array
+            .export_device_array_with_schema(&mut ctx)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("narrowing would require"));
         Ok(())
     }
 
+    #[rstest]
+    #[case::i64(
+        DecimalArray::from_iter([i64::MIN, -1i64, 1, i64::MAX], DecimalDType::new(39, 0)).into_array(),
+        DataType::Decimal256(39, 0),
+        vec![i256::from_i128(i64::MIN as i128), i256::from_i128(-1), i256::from_i128(1), i256::from_i128(i64::MAX as i128)]
+    )]
+    #[case::i128(
+        DecimalArray::from_iter([1i128, -2, 3], DecimalDType::new(39, 2)).into_array(),
+        DataType::Decimal256(39, 2),
+        vec![i256::from_i128(1), i256::from_i128(-2), i256::from_i128(3)]
+    )]
+    #[case::i256(
+        DecimalArray::from_iter(
+            [i256::from_i128(10), i256::from_i128(-20), i256::from_i128(30)],
+            DecimalDType::new(76, 2),
+        )
+        .into_array(),
+        DataType::Decimal256(76, 2),
+        vec![i256::from_i128(10), i256::from_i128(-20), i256::from_i128(30)]
+    )]
+    #[crate::test]
+    async fn test_export_decimal256(
+        #[case] array: ArrayRef,
+        #[case] expected_data_type: DataType,
+        #[case] expected_values: Vec<i256>,
+    ) -> VortexResult<()> {
+        assert_exported_decimal(array, expected_data_type, expected_values).await
+    }
+
     #[crate::test]
     async fn test_export_temporal() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
@@ -1519,6 +1816,14 @@ mod tests {
             &mut ctx,
         )
         .await?;
+
+        let private_data = unsafe { &*decimal.array.private_data.cast::<PrivateData>() };
+        let value_buffer = private_data.buffers[1]
+            .as_ref()
+            .vortex_expect("value buffer should be present");
+        assert_eq!(value_buffer.len(), 3 * size_of::<i64>());
+        assert_exported_decimal_values(value_buffer, &[100i64, 0, 300]);
+
         unsafe { release_exported_array(&raw mut decimal.array) };
 
         Ok(())
@@ -1794,6 +2099,67 @@ mod tests {
         Ok(())
     }
 
+    #[crate::test]
+    async fn test_export_nested_struct_decimal_with_schema() -> VortexResult<()> {
+        let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        let nested = StructArray::new(
+            FieldNames::from_iter(["amount"]),
+            vec![
+                DecimalArray::from_iter([100i32, -200, 300], DecimalDType::new(9, 2)).into_array(),
+            ],
+            3,
+            Validity::NonNullable,
+        )
+        .into_array();
+        let array = StructArray::new(
+            FieldNames::from_iter(["nested"]),
+            vec![nested],
+            3,
+            Validity::NonNullable,
+        )
+        .into_array();
+        let mut exported = array.export_device_array_with_schema(&mut ctx).await?;
+
+        let schema = Schema::try_from(&exported.schema)?;
+        assert_eq!(
+            schema,
+            Schema::new(vec![Field::new(
+                "nested",
+                DataType::Struct(Fields::from(vec![Field::new(
+                    "amount",
+                    DataType::Decimal32(9, 2),
+                    false,
+                )])),
+                false,
+            )])
+        );
+
+        let children = unsafe {
+            std::slice::from_raw_parts(
+                exported.array.array.children,
+                usize::try_from(exported.array.array.n_children)?,
+            )
+        };
+        let nested_child = unsafe { &*children[0] };
+        let nested_children = unsafe {
+            std::slice::from_raw_parts(
+                nested_child.children,
+                usize::try_from(nested_child.n_children)?,
+            )
+        };
+        let decimal_child = unsafe { &*nested_children[0] };
+        let private_data = unsafe { &*decimal_child.private_data.cast::<PrivateData>() };
+        let value_buffer = private_data.buffers[1]
+            .as_ref()
+            .vortex_expect("value buffer should be present");
+        assert_eq!(value_buffer.len(), 3 * size_of::<i32>());
+
+        unsafe { release_exported_array(&raw mut exported.array.array) };
+        Ok(())
+    }
+
     #[crate::test]
     async fn test_export_primitive_with_schema_is_column_shaped() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs
index e21383d96e7..d72979d688c 100644
--- a/vortex-cuda/src/arrow/mod.rs
+++ b/vortex-cuda/src/arrow/mod.rs
@@ -159,6 +159,9 @@ pub trait DeviceArrayExt {
     ///
     /// The returned array owns any device buffers allocated during export. Call the embedded
     /// Arrow release callback when the consumer is done with the array.
+    ///
+    /// Arrow arrays are not self-describing, so callers that use this method directly must provide
+    /// a matching schema out-of-band.
     async fn export_device_array(
         self,
         ctx: &mut CudaExecutionCtx,
@@ -174,6 +177,13 @@ pub trait DeviceArrayExt {
     /// Top-level struct arrays are exported as table-like Arrow schemas and struct-shaped device
     /// arrays. Top-level non-struct arrays are exported as column-shaped field schemas and
     /// column-shaped device arrays; this method does not wrap them in a single-field struct.
+    ///
+    /// Decimal arrays are exported using Arrow's decimal physical widths by declared precision:
+    /// Decimal32 for precision 1-9, Decimal64 for 10-18, Decimal128 for 19-38, and Decimal256 for
+    /// 39-76. If a decimal array's Vortex storage is wider than the Arrow width implied by its
+    /// precision, export fails instead of narrowing the values. Checking whether such a narrowing is
+    /// safe would require synchronizing an overflow flag from device to host, which this export path
+    /// intentionally avoids.
     async fn export_device_array_with_schema(
         self,
         ctx: &mut CudaExecutionCtx,
@@ -200,11 +210,7 @@ impl DeviceArrayExt for ArrayRef {
     }
 }
 
-/// Build the Arrow C schema that describes the device array exported for `array`.
-///
-/// Top-level Vortex structs are represented as Arrow schemas, which is the shape expected for
-/// table-like consumers. Non-struct arrays are represented as a single Arrow field schema, matching
-/// the column-shaped [`ArrowDeviceArray`] returned by [`DeviceArrayExt::export_device_array`].
+/// Build the Arrow C schema that describes the exported device array.
 fn arrow_schema_for_array(
     array: &ArrayRef,
     ctx: &mut CudaExecutionCtx,
diff --git a/vortex-test/e2e-cuda/src/lib.rs b/vortex-test/e2e-cuda/src/lib.rs
index 11a540ff70e..7e16b2c0e56 100644
--- a/vortex-test/e2e-cuda/src/lib.rs
+++ b/vortex-test/e2e-cuda/src/lib.rs
@@ -14,7 +14,7 @@ use std::sync::LazyLock;
 use arrow_array::Array;
 use arrow_array::ArrayRef as ArrowArrayRef;
 use arrow_array::Date32Array;
-use arrow_array::Decimal128Array;
+use arrow_array::Decimal64Array;
 use arrow_array::StringArray;
 use arrow_array::cast::AsArray;
 use arrow_array::ffi::FFI_ArrowArray;
@@ -147,8 +147,8 @@ fn export_array_inner(schema_ptr: &mut FFI_ArrowSchema, array_ptr: &mut ArrowDev
         }
     };
     let decimal = DecimalArray::from_option_iter(
-        [Some(0i128), Some(1), None, Some(3), Some(4)],
-        DecimalDType::new(38, 2),
+        [Some(0i32), Some(1), None, Some(3), Some(4)],
+        DecimalDType::new(10, 2),
     );
     let strings = VarBinViewArray::from_iter_nullable_str([
         Some("one"),
@@ -244,8 +244,8 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
             &mut SESSION.create_execution_ctx(),
         )
         .expect("expected primitive Arrow array");
-    let decimal = Decimal128Array::from_iter([Some(0i128), Some(1), None, Some(3), Some(4)])
-        .with_precision_and_scale(38, 2)
+    let decimal = Decimal64Array::from_iter([Some(0i64), Some(1), None, Some(3), Some(4)])
+        .with_precision_and_scale(10, 2)
         .expect("with_precision_and_scale");
     let string = StringArray::from_iter([
         Some("one"),

From 3080790dec80655506e6a55dbf40600a63e72d62 Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Wed, 3 Jun 2026 14:06:05 +0000
Subject: [PATCH 3/6] wip

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-array/src/dtype/arrow.rs    | 22 +++-----
 vortex-cuda/src/arrow/canonical.rs | 15 ++++--
 vortex-cuda/src/arrow/mod.rs       | 84 ++++++++++++++++++++++++++++--
 3 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/vortex-array/src/dtype/arrow.rs b/vortex-array/src/dtype/arrow.rs
index d0dd1fc48ac..103c52daf93 100644
--- a/vortex-array/src/dtype/arrow.rs
+++ b/vortex-array/src/dtype/arrow.rs
@@ -298,12 +298,13 @@ pub(crate) fn to_data_type_naive(dtype: &DType) -> VortexResult<DataType> {
             let scale = dt.scale();
 
             match precision {
-                // DECIMAL32_MAX_PRECISION
-                0..=9 => DataType::Decimal32(precision, scale),
-                // DECIMAL64_MAX_PRECISION
-                10..=18 => DataType::Decimal64(precision, scale),
+                // This code is commented out until DataFusion improves its support for smaller decimals.
+                // // DECIMAL32_MAX_PRECISION
+                // 0..=9 => DataType::Decimal32(precision, scale),
+                // // DECIMAL64_MAX_PRECISION
+                // 10..=18 => DataType::Decimal64(precision, scale),
                 // DECIMAL128_MAX_PRECISION
-                19..=38 => DataType::Decimal128(precision, scale),
+                0..=38 => DataType::Decimal128(precision, scale),
                 // DECIMAL256_MAX_PRECISION
                 39.. => DataType::Decimal256(precision, scale),
             }
@@ -450,18 +451,11 @@ mod test {
     }
 
     #[rstest]
-    #[case(1, DataType::Decimal32(1, 0))]
-    #[case(9, DataType::Decimal32(9, 0))]
-    #[case(10, DataType::Decimal64(10, 0))]
-    #[case(18, DataType::Decimal64(18, 0))]
-    #[case(19, DataType::Decimal128(19, 0))]
+    #[case(1, DataType::Decimal128(1, 0))]
     #[case(38, DataType::Decimal128(38, 0))]
     #[case(39, DataType::Decimal256(39, 0))]
     #[case(76, DataType::Decimal256(76, 0))]
-    fn test_decimal_dtype_to_arrow_smallest_width(
-        #[case] precision: u8,
-        #[case] expected: DataType,
-    ) {
+    fn test_decimal_dtype_to_arrow(#[case] precision: u8, #[case] expected: DataType) {
         use crate::dtype::DecimalDType;
 
         let dtype = DType::Decimal(DecimalDType::new(precision, 0), Nullability::NonNullable);
diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
index fd6d6c75174..31d65f09796 100644
--- a/vortex-cuda/src/arrow/canonical.rs
+++ b/vortex-cuda/src/arrow/canonical.rs
@@ -32,8 +32,11 @@ use vortex::array::match_each_decimal_value_type;
 use vortex::array::validity::Validity;
 use vortex::buffer::Buffer;
 use vortex::buffer::ByteBuffer;
+<<<<<<< HEAD
 use vortex::dtype::DType;
 use vortex::dtype::DecimalDType;
+=======
+>>>>>>> dbce147e9 (wip)
 use vortex::dtype::DecimalType;
 use vortex::dtype::NativeDecimalType;
 use vortex::dtype::Nullability;
@@ -56,6 +59,7 @@ use crate::arrow::ArrowDeviceArray;
 use crate::arrow::ExportDeviceArray;
 use crate::arrow::PrivateData;
 use crate::arrow::SyncEvent;
+use crate::arrow::cuda_decimal_value_type;
 use crate::executor::CudaArrayExt;
 
 /// An implementation of `ExportDeviceArray` that exports Vortex arrays to `ArrowDeviceArray` by
@@ -234,9 +238,9 @@ fn export_canonical(
 
 /// Exports decimals with value buffers cast to Arrow's Decimal32/64/128/256 layout.
 ///
-/// Decimal values are already decoded; this only adapts the physical buffer width. Keep this in
-/// sync with the schema mapping in `arrow_schema_for_array`. Storage-to-Arrow narrowing is rejected
-/// instead of checked on-device to avoid a device-to-host synchronization point.
+/// Decimal values are already decoded; this only adapts the physical buffer width. Storage-to-Arrow
+/// narrowing is rejected instead of checked on-device to avoid a device-to-host synchronization
+/// point.
 async fn export_decimal(
     decimal: DecimalArray,
     ctx: &mut CudaExecutionCtx,
@@ -250,12 +254,13 @@ async fn export_decimal(
     } = decimal.into_data_parts();
 
     let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?;
-    let target_type = arrow_decimal_value_type(decimal_dtype);
+    let target_type = cuda_decimal_value_type(decimal_dtype);
     let values = export_decimal_values(values, values_type, target_type, len, ctx).await?;
 
     export_fixed_size(values, len, 0, validity_buffer, null_count, ctx)
 }
 
+<<<<<<< HEAD
 /// Returns the Arrow physical value type for a decimal dtype.
 ///
 /// Must match the shared Vortex-to-Arrow decimal mapping in
@@ -271,6 +276,8 @@ fn arrow_decimal_value_type(decimal_dtype: DecimalDType) -> DecimalType {
     }
 }
 
+=======
+>>>>>>> dbce147e9 (wip)
 /// Ensure the values buffer is on-device and has the Arrow-required decimal width.
 ///
 /// Storage wider than the precision-implied Arrow width is rejected. Callers that hit this
diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs
index d72979d688c..53479dfbad0 100644
--- a/vortex-cuda/src/arrow/mod.rs
+++ b/vortex-cuda/src/arrow/mod.rs
@@ -16,6 +16,9 @@ use std::fmt::Debug;
 use std::ptr;
 use std::sync::Arc;
 
+use arrow_schema::DataType;
+use arrow_schema::Field;
+use arrow_schema::Schema;
 use arrow_schema::ffi::FFI_ArrowSchema;
 use async_trait::async_trait;
 pub(crate) use canonical::CanonicalDeviceArrayExport;
@@ -26,6 +29,11 @@ use vortex::array::ArrayRef;
 use vortex::array::arrow::ArrowSessionExt;
 use vortex::array::buffer::BufferHandle;
 use vortex::dtype::DType;
+<<<<<<< HEAD
+=======
+use vortex::dtype::DecimalDType;
+use vortex::dtype::DecimalType;
+>>>>>>> dbce147e9 (wip)
 use vortex::dtype::StructFields;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
@@ -161,7 +169,8 @@ pub trait DeviceArrayExt {
     /// Arrow release callback when the consumer is done with the array.
     ///
     /// Arrow arrays are not self-describing, so callers that use this method directly must provide
-    /// a matching schema out-of-band.
+    /// a matching schema out-of-band. Prefer [`Self::export_device_array_with_schema`] unless a
+    /// consumer already has the CUDA export schema.
     async fn export_device_array(
         self,
         ctx: &mut CudaExecutionCtx,
@@ -170,9 +179,8 @@ pub trait DeviceArrayExt {
     /// Export this array as an Arrow C Device array together with its matching Arrow C schema.
     ///
     /// Arrow arrays are not self-describing: consumers need both the [`ArrowDeviceArray`] and an
-    /// Arrow schema to interpret the buffer layout. This helper derives the schema from the
-    /// Vortex dtype using the session's Arrow conversion rules and returns it alongside the device
-    /// array.
+    /// Arrow schema to interpret the buffer layout. This helper derives the schema that matches the
+    /// CUDA device export layout and returns it alongside the device array.
     ///
     /// Top-level struct arrays are exported as table-like Arrow schemas and struct-shaped device
     /// arrays. Top-level non-struct arrays are exported as column-shaped field schemas and
@@ -215,6 +223,7 @@ fn arrow_schema_for_array(
     array: &ArrayRef,
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<FFI_ArrowSchema> {
+<<<<<<< HEAD
     let arrow = ctx.execution_ctx().session().arrow();
     let dtype = arrow_device_export_dtype(array.dtype());
     match &dtype {
@@ -222,6 +231,73 @@ fn arrow_schema_for_array(
         _ => Ok(FFI_ArrowSchema::try_from(
             arrow.to_arrow_field("", &dtype)?,
         )?),
+=======
+    match array.dtype() {
+        DType::Struct(struct_dtype, _) => Ok(FFI_ArrowSchema::try_from(Schema::new(
+            cuda_arrow_struct_fields(struct_dtype, ctx)?,
+        ))?),
+        _ => Ok(FFI_ArrowSchema::try_from(cuda_arrow_field(
+            "",
+            array.dtype(),
+            ctx,
+        )?)?),
+    }
+}
+
+fn cuda_arrow_struct_fields(
+    struct_dtype: &StructFields,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<Vec<Field>> {
+    let mut fields = Vec::with_capacity(struct_dtype.nfields());
+    for (field_name, field_dtype) in struct_dtype.names().iter().zip(struct_dtype.fields()) {
+        fields.push(cuda_arrow_field(field_name.as_ref(), &field_dtype, ctx)?);
+    }
+    Ok(fields)
+}
+
+fn cuda_arrow_field(
+    name: impl AsRef<str>,
+    dtype: &DType,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<Field> {
+    let field = ctx
+        .execution_ctx()
+        .session()
+        .arrow()
+        .to_arrow_field(name.as_ref(), dtype)?;
+
+    let data_type = match dtype {
+        DType::Decimal(decimal_dtype, _) => cuda_arrow_decimal_data_type(*decimal_dtype),
+        DType::Struct(struct_dtype, _) => {
+            DataType::Struct(cuda_arrow_struct_fields(struct_dtype, ctx)?.into())
+        }
+        _ => return Ok(field),
+    };
+
+    Ok(
+        Field::new(field.name().clone(), data_type, field.is_nullable())
+            .with_metadata(field.metadata().clone()),
+    )
+}
+
+fn cuda_arrow_decimal_data_type(decimal_dtype: DecimalDType) -> DataType {
+    match cuda_decimal_value_type(decimal_dtype) {
+        DecimalType::I32 => DataType::Decimal32(decimal_dtype.precision(), decimal_dtype.scale()),
+        DecimalType::I64 => DataType::Decimal64(decimal_dtype.precision(), decimal_dtype.scale()),
+        DecimalType::I128 => DataType::Decimal128(decimal_dtype.precision(), decimal_dtype.scale()),
+        DecimalType::I256 => DataType::Decimal256(decimal_dtype.precision(), decimal_dtype.scale()),
+        decimal_type => unreachable!("unsupported decimal value type {decimal_type}"),
+    }
+}
+
+pub(crate) fn cuda_decimal_value_type(decimal_dtype: DecimalDType) -> DecimalType {
+    match decimal_dtype.precision() {
+        1..=9 => DecimalType::I32,
+        10..=18 => DecimalType::I64,
+        19..=38 => DecimalType::I128,
+        39..=76 => DecimalType::I256,
+        p => unreachable!("precision {p} is invalid for a DecimalDType"),
+>>>>>>> dbce147e9 (wip)
     }
 }
 

From ef7e89e1ffd038d688611f1a7dcded9fc372d3bf Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Wed, 3 Jun 2026 14:17:02 +0000
Subject: [PATCH 4/6] doc

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-cuda/src/arrow/canonical.rs | 22 ----------------------
 vortex-cuda/src/arrow/mod.rs       | 24 +++---------------------
 2 files changed, 3 insertions(+), 43 deletions(-)

diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
index 31d65f09796..af180400675 100644
--- a/vortex-cuda/src/arrow/canonical.rs
+++ b/vortex-cuda/src/arrow/canonical.rs
@@ -32,11 +32,7 @@ use vortex::array::match_each_decimal_value_type;
 use vortex::array::validity::Validity;
 use vortex::buffer::Buffer;
 use vortex::buffer::ByteBuffer;
-<<<<<<< HEAD
 use vortex::dtype::DType;
-use vortex::dtype::DecimalDType;
-=======
->>>>>>> dbce147e9 (wip)
 use vortex::dtype::DecimalType;
 use vortex::dtype::NativeDecimalType;
 use vortex::dtype::Nullability;
@@ -260,24 +256,6 @@ async fn export_decimal(
     export_fixed_size(values, len, 0, validity_buffer, null_count, ctx)
 }
 
-<<<<<<< HEAD
-/// Returns the Arrow physical value type for a decimal dtype.
-///
-/// Must match the shared Vortex-to-Arrow decimal mapping in
-/// `vortex-array/src/dtype/arrow.rs`.
-fn arrow_decimal_value_type(decimal_dtype: DecimalDType) -> DecimalType {
-    match decimal_dtype.precision() {
-        1..=9 => DecimalType::I32,
-        10..=18 => DecimalType::I64,
-        19..=38 => DecimalType::I128,
-        39..=76 => DecimalType::I256,
-        0 => unreachable!("precision must be greater than 0"),
-        p => unreachable!("precision larger than 76 is invalid found precision {p}"),
-    }
-}
-
-=======
->>>>>>> dbce147e9 (wip)
 /// Ensure the values buffer is on-device and has the Arrow-required decimal width.
 ///
 /// Storage wider than the precision-implied Arrow width is rejected. Callers that hit this
diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs
index 53479dfbad0..99fa14add3d 100644
--- a/vortex-cuda/src/arrow/mod.rs
+++ b/vortex-cuda/src/arrow/mod.rs
@@ -29,11 +29,8 @@ use vortex::array::ArrayRef;
 use vortex::array::arrow::ArrowSessionExt;
 use vortex::array::buffer::BufferHandle;
 use vortex::dtype::DType;
-<<<<<<< HEAD
-=======
 use vortex::dtype::DecimalDType;
 use vortex::dtype::DecimalType;
->>>>>>> dbce147e9 (wip)
 use vortex::dtype::StructFields;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
@@ -186,12 +183,8 @@ pub trait DeviceArrayExt {
     /// arrays. Top-level non-struct arrays are exported as column-shaped field schemas and
     /// column-shaped device arrays; this method does not wrap them in a single-field struct.
     ///
-    /// Decimal arrays are exported using Arrow's decimal physical widths by declared precision:
-    /// Decimal32 for precision 1-9, Decimal64 for 10-18, Decimal128 for 19-38, and Decimal256 for
-    /// 39-76. If a decimal array's Vortex storage is wider than the Arrow width implied by its
-    /// precision, export fails instead of narrowing the values. Checking whether such a narrowing is
-    /// safe would require synchronizing an overflow flag from device to host, which this export path
-    /// intentionally avoids.
+    /// Decimal exports use the Arrow decimal width implied by precision; storage wider than that
+    /// width is rejected rather than narrowed on device.
     async fn export_device_array_with_schema(
         self,
         ctx: &mut CudaExecutionCtx,
@@ -223,23 +216,13 @@ fn arrow_schema_for_array(
     array: &ArrayRef,
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<FFI_ArrowSchema> {
-<<<<<<< HEAD
-    let arrow = ctx.execution_ctx().session().arrow();
     let dtype = arrow_device_export_dtype(array.dtype());
     match &dtype {
-        DType::Struct(..) => Ok(FFI_ArrowSchema::try_from(arrow.to_arrow_schema(&dtype)?)?),
-        _ => Ok(FFI_ArrowSchema::try_from(
-            arrow.to_arrow_field("", &dtype)?,
-        )?),
-=======
-    match array.dtype() {
         DType::Struct(struct_dtype, _) => Ok(FFI_ArrowSchema::try_from(Schema::new(
             cuda_arrow_struct_fields(struct_dtype, ctx)?,
         ))?),
         _ => Ok(FFI_ArrowSchema::try_from(cuda_arrow_field(
-            "",
-            array.dtype(),
-            ctx,
+            "", &dtype, ctx,
         )?)?),
     }
 }
@@ -297,7 +280,6 @@ pub(crate) fn cuda_decimal_value_type(decimal_dtype: DecimalDType) -> DecimalTyp
         19..=38 => DecimalType::I128,
         39..=76 => DecimalType::I256,
         p => unreachable!("precision {p} is invalid for a DecimalDType"),
->>>>>>> dbce147e9 (wip)
     }
 }
 

From 4ffda61eabc4f9d730bf7e09aea2201a16208817 Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Wed, 3 Jun 2026 17:14:56 +0000
Subject: [PATCH 5/6] more test

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-test/e2e-cuda/src/lib.rs | 49 ++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/vortex-test/e2e-cuda/src/lib.rs b/vortex-test/e2e-cuda/src/lib.rs
index 7e16b2c0e56..56e9c11e0f9 100644
--- a/vortex-test/e2e-cuda/src/lib.rs
+++ b/vortex-test/e2e-cuda/src/lib.rs
@@ -14,7 +14,9 @@ use std::sync::LazyLock;
 use arrow_array::Array;
 use arrow_array::ArrayRef as ArrowArrayRef;
 use arrow_array::Date32Array;
+use arrow_array::Decimal32Array;
 use arrow_array::Decimal64Array;
+use arrow_array::Decimal128Array;
 use arrow_array::StringArray;
 use arrow_array::cast::AsArray;
 use arrow_array::ffi::FFI_ArrowArray;
@@ -146,10 +148,20 @@ fn export_array_inner(schema_ptr: &mut FFI_ArrowSchema, array_ptr: &mut ArrowDev
             return 1;
         }
     };
-    let decimal = DecimalArray::from_option_iter(
+    // cuDF supports Arrow decimal device imports through Decimal128. Decimal256 is intentionally
+    // not included here because cuDF has no DECIMAL256 type_id or Arrow interop mapping.
+    let decimal32 = DecimalArray::from_option_iter(
+        [Some(0i8), Some(1), None, Some(3), Some(4)],
+        DecimalDType::new(9, 2),
+    );
+    let decimal64 = DecimalArray::from_option_iter(
         [Some(0i32), Some(1), None, Some(3), Some(4)],
         DecimalDType::new(10, 2),
     );
+    let decimal128 = DecimalArray::from_option_iter(
+        [Some(0i64), Some(1), None, Some(3), Some(4)],
+        DecimalDType::new(19, 2),
+    );
     let strings = VarBinViewArray::from_iter_nullable_str([
         Some("one"),
         None,
@@ -166,7 +178,9 @@ fn export_array_inner(schema_ptr: &mut FFI_ArrowSchema, array_ptr: &mut ArrowDev
     let array = StructArray::new(
         FieldNames::from_iter([
             "prims",
-            "decimals",
+            "decimal32",
+            "decimal64",
+            "decimal128",
             "strings",
             "dates",
             "lists",
@@ -174,7 +188,9 @@ fn export_array_inner(schema_ptr: &mut FFI_ArrowSchema, array_ptr: &mut ArrowDev
         ]),
         vec![
             primitive,
-            decimal.into_array(),
+            decimal32.into_array(),
+            decimal64.into_array(),
+            decimal128.into_array(),
             strings.into_array(),
             dates.into_array(),
             list_array(),
@@ -244,8 +260,15 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
             &mut SESSION.create_execution_ctx(),
         )
         .expect("expected primitive Arrow array");
-    let decimal = Decimal64Array::from_iter([Some(0i64), Some(1), None, Some(3), Some(4)])
-        .with_precision_and_scale(10, 2)
+    let decimal32 = Decimal32Array::from_iter([Some(0i32), Some(1), None, Some(3), Some(4)])
+        // cuDF stores decimals using the maximum precision for the physical width and preserves scale.
+        .with_precision_and_scale(9, 2)
+        .expect("with_precision_and_scale");
+    let decimal64 = Decimal64Array::from_iter([Some(0i64), Some(1), None, Some(3), Some(4)])
+        .with_precision_and_scale(18, 2)
+        .expect("with_precision_and_scale");
+    let decimal128 = Decimal128Array::from_iter([Some(0i128), Some(1), None, Some(3), Some(4)])
+        .with_precision_and_scale(38, 2)
         .expect("with_precision_and_scale");
     let string = StringArray::from_iter([
         Some("one"),
@@ -270,7 +293,9 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
 
     let expected_fields = Fields::from_iter([
         Field::new("prims", primitive.data_type().clone(), true),
-        Field::new("decimals", decimal.data_type().clone(), true),
+        Field::new("decimal32", decimal32.data_type().clone(), true),
+        Field::new("decimal64", decimal64.data_type().clone(), true),
+        Field::new("decimal128", decimal128.data_type().clone(), true),
         Field::new("strings", string.data_type().clone(), true),
         Field::new("dates", date.data_type().clone(), true),
         cudf_list_field("lists"),
@@ -278,12 +303,16 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
     ]);
     if &expected_fields != struct_array.fields() {
         eprintln!("wrong fields for host array");
+        eprintln!("expected fields: {expected_fields:#?}");
+        eprintln!("actual fields: {:#?}", struct_array.fields());
         return 1;
     }
 
-    let expected_arrays: [ArrowArrayRef; 4] = [
+    let expected_arrays: [ArrowArrayRef; 6] = [
         primitive,
-        Arc::new(decimal),
+        Arc::new(decimal32),
+        Arc::new(decimal64),
+        Arc::new(decimal128),
         Arc::new(string),
         Arc::new(date),
     ];
@@ -299,11 +328,11 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
         }
     }
 
-    if !list_values_eq(list.as_ref(), struct_array.column(4).as_ref()) {
+    if !list_values_eq(list.as_ref(), struct_array.column(6).as_ref()) {
         eprintln!("wrong values for lists column");
         return 1;
     }
-    if !list_values_eq(fixed_size_list.as_ref(), struct_array.column(5).as_ref()) {
+    if !list_values_eq(fixed_size_list.as_ref(), struct_array.column(7).as_ref()) {
         eprintln!("wrong values for fixed_lists column");
         return 1;
     }

From ba179edd66144c80fde94757191bf2f957098a4a Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Wed, 3 Jun 2026 17:26:35 +0000
Subject: [PATCH 6/6] fix

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-test/e2e-cuda/src/lib.rs | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/vortex-test/e2e-cuda/src/lib.rs b/vortex-test/e2e-cuda/src/lib.rs
index 56e9c11e0f9..04b84efb03d 100644
--- a/vortex-test/e2e-cuda/src/lib.rs
+++ b/vortex-test/e2e-cuda/src/lib.rs
@@ -303,8 +303,8 @@ fn validate_array_inner(ffi_schema: &FFI_ArrowSchema, ffi_array: &mut FFI_ArrowA
     ]);
     if &expected_fields != struct_array.fields() {
         eprintln!("wrong fields for host array");
-        eprintln!("expected fields: {expected_fields:#?}");
-        eprintln!("actual fields: {:#?}", struct_array.fields());
+        eprintln!("expected fields: {}", format_fields(&expected_fields));
+        eprintln!("actual fields: {}", format_fields(struct_array.fields()));
         return 1;
     }
 
@@ -344,6 +344,21 @@ fn cudf_list_field(name: &str) -> Field {
     Field::new_list(name, Field::new("element", DataType::Int32, false), true)
 }
 
+fn format_fields(fields: &Fields) -> String {
+    fields
+        .iter()
+        .map(|field| {
+            format!(
+                "{}: {}{}",
+                field.name(),
+                field.data_type(),
+                if field.is_nullable() { "?" } else { "" }
+            )
+        })
+        .collect::<Vec<_>>()
+        .join(", ")
+}
+
 fn list_values_eq(expected: &dyn Array, actual: &dyn Array) -> bool {
     let expected = expected.as_list::<i32>();
     let actual = actual.as_list::<i32>();