From ab00ba06512dbad88f9f1cdc7f6a0c768988c0c5 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Fri, 15 May 2026 16:15:29 +0100 Subject: [PATCH] duckdb: flatten runend arrays on export if requested Signed-off-by: Mikhail Kot --- vortex-duckdb/src/exporter/canonical.rs | 46 ++++++++++ vortex-duckdb/src/exporter/list.rs | 114 +++++++++++++++++++----- vortex-duckdb/src/exporter/mod.rs | 26 +----- vortex-duckdb/src/exporter/run_end.rs | 11 ++- 4 files changed, 153 insertions(+), 44 deletions(-) create mode 100644 vortex-duckdb/src/exporter/canonical.rs diff --git a/vortex-duckdb/src/exporter/canonical.rs b/vortex-duckdb/src/exporter/canonical.rs new file mode 100644 index 00000000000..a46b22df6f9 --- /dev/null +++ b/vortex-duckdb/src/exporter/canonical.rs @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +use vortex::array::ArrayRef; +use vortex::array::Canonical; +use vortex::array::ExecutionCtx; +use vortex::array::arrays::TemporalArray; +use vortex::error::VortexResult; +use vortex::error::vortex_bail; + +use crate::exporter::ColumnExporter; +use crate::exporter::ConversionCache; +use crate::exporter::all_invalid; +use crate::exporter::bool; +use crate::exporter::decimal; +use crate::exporter::fixed_size_list; +use crate::exporter::list_view; +use crate::exporter::primitive; +use crate::exporter::struct_; +use crate::exporter::temporal; +use crate::exporter::varbinview; + +pub(crate) fn new_exporter( + array: ArrayRef, + cache: &ConversionCache, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + match array.execute::(ctx)? { + Canonical::Null(_) => Ok(all_invalid::new_exporter()), + Canonical::Bool(array) => bool::new_exporter(array, ctx), + Canonical::Primitive(array) => primitive::new_exporter(array, ctx), + Canonical::Decimal(array) => decimal::new_exporter(array, ctx), + Canonical::VarBinView(array) => varbinview::new_exporter(array, ctx), + Canonical::List(array) => list_view::new_exporter(array, cache, ctx), + Canonical::FixedSizeList(array) => fixed_size_list::new_exporter(array, cache, ctx), + Canonical::Struct(array) => struct_::new_exporter(array, cache, ctx), + Canonical::Extension(ext) => { + if let Ok(temporal_array) = TemporalArray::try_from(ext) { + return temporal::new_exporter(temporal_array, ctx); + } + vortex_bail!("no non-temporal extension exporter") + } + Canonical::Variant(_) => { + vortex_bail!("Variant arrays can't be exported to DuckDB") + } + } +} diff --git a/vortex-duckdb/src/exporter/list.rs b/vortex-duckdb/src/exporter/list.rs index f0a64846d5b..b002e66d546 100644 --- a/vortex-duckdb/src/exporter/list.rs +++ b/vortex-duckdb/src/exporter/list.rs @@ -150,7 +150,11 @@ mod tests { use vortex::array::validity::Validity; use vortex::buffer::Buffer; use vortex::buffer::buffer; + use vortex::dtype::DType; + use vortex::dtype::PType; + use vortex::encodings::runend::RunEnd; use vortex::error::VortexExpect; + use vortex::error::VortexResult; use super::*; use crate::SESSION; @@ -160,13 +164,12 @@ mod tests { #[test] fn test_export_empty_list() { - let list = unsafe { - ListArray::new_unchecked( - Buffer::::empty().into_array(), - Buffer::::empty().into_array(), - Validity::AllValid, - ) - } + let list = ListArray::try_new( + Buffer::::empty().into_array(), + buffer![0u32].into_array(), + Validity::AllValid, + ) + .vortex_expect("list creation should succeed") .into_array(); let list_type = LogicalType::list_type(LogicalType::uint32()) @@ -189,20 +192,91 @@ mod tests { } #[test] - fn test_export_non_empty_list_of_strings() { - let list = unsafe { - ListArray::new_unchecked( - >::from_iter([ - Some("abc"), - Some("def"), - None, - Some("ghi"), - ]) - .into_array(), - buffer![0u8, 1, 2, 3, 4].into_array(), - Validity::from_iter([true, true, false, true]), + fn test_export_u64_list() { + let list = ListArray::try_new( + buffer![1u64, 2, 3, 4, 5].into_array(), + buffer![0u8, 1, 2, 3, 4, 5].into_array(), + Validity::AllValid, + ) + .vortex_expect("list creation should succeed") + .into_array(); + assert_eq!( + list.dtype(), + &DType::List( + Arc::new(DType::Primitive(PType::U64, false.into())), + true.into() ) - } + ); + + let list_type = LogicalType::list_type(LogicalType::uint64()) + .vortex_expect("LogicalTypeRef creation should succeed for test data"); + let mut chunk = DataChunk::new([list_type]); + + let mut ctx = SESSION.create_execution_ctx(); + new_array_exporter(list, &ConversionCache::default(), &mut ctx) + .unwrap() + .export(0, 5, chunk.get_vector_mut(0), &mut ctx) + .unwrap(); + chunk.set_len(5); + + assert_eq!( + format!("{}", String::try_from(&*chunk).unwrap()), + r#"Chunk - [1 Columns] +- FLAT UBIGINT[]: 5 = [ [1], [2], [3], [4], [5]] +"# + ); + } + + // Ensure runend-compressed list is properly flattened + #[test] + fn test_export_list_with_runend_elements() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); + let elements = RunEnd::encode(buffer![100u32, 100, 200, 200, 200].into_array(), &mut ctx)?; + + let list = ListArray::try_new( + elements.into_array(), + buffer![0u32, 2, 5].into_array(), + Validity::AllValid, + ) + .vortex_expect("list creation should succeed") + .into_array(); + + let list_type = LogicalType::list_type(LogicalType::uint32()) + .vortex_expect("LogicalTypeRef creation should succeed for test data"); + let mut chunk = DataChunk::new([list_type]); + + new_array_exporter(list, &ConversionCache::default(), &mut ctx)?.export( + 0, + 2, + chunk.get_vector_mut(0), + &mut ctx, + )?; + chunk.set_len(2); + + assert_eq!( + format!("{}", String::try_from(&*chunk)?), + r#"Chunk - [1 Columns] +- FLAT UINTEGER[]: 2 = [ [100, 100], [200, 200, 200]] +"# + ); + + Ok(()) + } + + #[test] + fn test_export_non_empty_list_of_strings() { + let list = ListArray::try_new( + >::from_iter([ + Some("abc"), + Some("def"), + None, + Some("ghi"), + ]) + .into_array(), + buffer![0u8, 1, 2, 3, 4].into_array(), + Validity::from_iter([true, true, false, true]), + ) + .vortex_expect("list creation should succeed") .into_array(); let list_type = LogicalType::list_type(LogicalType::varchar()) diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 517776f5521..a438adc59d8 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -4,6 +4,7 @@ mod all_invalid; mod bool; mod cache; +mod canonical; mod constant; mod decimal; mod dict; @@ -22,13 +23,11 @@ mod vector; pub use cache::ConversionCache; pub use decimal::precision_to_duckdb_storage_size; use vortex::array::ArrayRef; -use vortex::array::Canonical; use vortex::array::ExecutionCtx; use vortex::array::arrays::Constant; use vortex::array::arrays::Dict; use vortex::array::arrays::List; use vortex::array::arrays::StructArray; -use vortex::array::arrays::TemporalArray; use vortex::array::arrays::struct_::StructArrayExt; use vortex::buffer::BitChunks; use vortex::encodings::runend::RunEnd; @@ -191,7 +190,7 @@ fn new_array_exporter_with_flatten( }; let array = match array.try_downcast::() { - Ok(array) => return run_end::new_exporter(array, cache, ctx), + Ok(array) => return run_end::new_exporter_with_flatten(array, cache, ctx, flatten), Err(array) => array, }; @@ -205,26 +204,7 @@ fn new_array_exporter_with_flatten( Err(array) => array, }; - // Otherwise, we fall back to canonical - match array.execute::(ctx)? { - Canonical::Null(_) => Ok(all_invalid::new_exporter()), - Canonical::Bool(array) => bool::new_exporter(array, ctx), - Canonical::Primitive(array) => primitive::new_exporter(array, ctx), - Canonical::Decimal(array) => decimal::new_exporter(array, ctx), - Canonical::VarBinView(array) => varbinview::new_exporter(array, ctx), - Canonical::List(array) => list_view::new_exporter(array, cache, ctx), - Canonical::FixedSizeList(array) => fixed_size_list::new_exporter(array, cache, ctx), - Canonical::Struct(array) => struct_::new_exporter(array, cache, ctx), - Canonical::Extension(ext) => { - if let Ok(temporal_array) = TemporalArray::try_from(ext) { - return temporal::new_exporter(temporal_array, ctx); - } - vortex_bail!("no non-temporal extension exporter") - } - Canonical::Variant(_) => { - vortex_bail!("Variant arrays can't be exported to DuckDB") - } - } + canonical::new_exporter(array, cache, ctx) } /// Copy the sliced bits from source into target, returning whether all copied bits are zero, diff --git a/vortex-duckdb/src/exporter/run_end.rs b/vortex-duckdb/src/exporter/run_end.rs index 442e612147c..de3c08c4759 100644 --- a/vortex-duckdb/src/exporter/run_end.rs +++ b/vortex-duckdb/src/exporter/run_end.rs @@ -5,6 +5,7 @@ use std::marker::PhantomData; use vortex::array::ArrayRef; use vortex::array::ExecutionCtx; +use vortex::array::IntoArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::match_each_integer_ptype; use vortex::array::search_sorted::SearchSorted; @@ -20,6 +21,7 @@ use crate::duckdb::SelectionVector; use crate::duckdb::VectorRef; use crate::exporter::ColumnExporter; use crate::exporter::cache::ConversionCache; +use crate::exporter::canonical; use crate::exporter::new_array_exporter; /// We export run-end arrays to a DuckDB dictionary vector, using a selection vector to @@ -32,11 +34,18 @@ struct RunEndExporter { run_end_offset: usize, } -pub(crate) fn new_exporter( +pub(crate) fn new_exporter_with_flatten( array: RunEndArray, cache: &ConversionCache, ctx: &mut ExecutionCtx, + flatten: bool, ) -> VortexResult> { + // Our canonicalization is faster than creating a dictionary vector and + // letting duckdb flatten it for us. + if flatten { + return canonical::new_exporter(array.into_array(), cache, ctx); + } + let offset = array.offset(); let ends = array.ends().clone(); let values = array.values().clone();