Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions vortex-duckdb/cpp/expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ extern "C" duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr ffi_expr
return static_cast<duckdb_vx_expr_class>(expr->GetExpressionClass());
}

extern "C" duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr ffi_expr) {
D_ASSERT(ffi_expr);
auto expr = reinterpret_cast<Expression *>(ffi_expr);
return reinterpret_cast<duckdb_logical_type>(&expr->return_type);
}

extern "C" const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr ffi_expr) {
if (!ffi_expr) {
return nullptr;
Expand Down
4 changes: 4 additions & 0 deletions vortex-duckdb/cpp/include/expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ typedef enum DUCKDB_VX_EXPR_TYPE {

duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr expr);

/// Return the (bound) return type of the expression. The logical type is borrowed from the
/// expression and must not be freed.
duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr expr);

const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr expr);

duckdb_value duckdb_vx_expr_bound_constant_get_value(duckdb_vx_expr expr);
Expand Down
68 changes: 68 additions & 0 deletions vortex-duckdb/src/convert/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use vortex::expr::get_item;
use vortex::expr::is_not_null;
use vortex::expr::is_null;
use vortex::expr::list_contains;
use vortex::expr::list_length;
use vortex::expr::lit;
use vortex::expr::not;
use vortex::expr::or_collect;
Expand All @@ -37,6 +38,7 @@ use vortex::scalar_fn::fns::like::LikeOptions;
use vortex::scalar_fn::fns::literal::Literal;
use vortex::scalar_fn::fns::operators::Operator;

use crate::cpp::DUCKDB_TYPE;
use crate::cpp::DUCKDB_VX_EXPR_TYPE;
use crate::duckdb;
use crate::duckdb::BoundFunction;
Expand All @@ -57,6 +59,20 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult<String> {
}
}

/// Whether the expression's return type is a `LIST` or fixed-size `ARRAY`.
fn returns_a_list(expr: &duckdb::ExpressionRef) -> bool {
matches!(
expr.return_type().as_type_id(),
DUCKDB_TYPE::DUCKDB_TYPE_LIST | DUCKDB_TYPE::DUCKDB_TYPE_ARRAY
)
}

/// Wrap `expr` in `list_length`. Since vortex `list_length` returns u64 but duckdb equivalents
/// return i64, we must cast as well.
fn build_list_length(expr: Expression, nullability: Nullability) -> Expression {
cast(list_length(expr), DType::Primitive(PType::I64, nullability))
}

fn try_from_bound_function(
func: &BoundFunction,
col_sub: Option<&Expression>,
Expand Down Expand Up @@ -115,6 +131,37 @@ fn try_from_bound_function(
};
Like.new_expr(LikeOptions::default(), [value, lit(pattern)])
}
"array_length" => {
let children = func.children().collect::<Vec<_>>();
// Only accept array_length(expr) rather than array_length(expr, dim).
if children.len() != 1 {
return Ok(None);
}
let Some(col) = try_from_expression_inner(children[0], col_sub)? else {
return Ok(None);
};

// We don't know the column's nullability here, so we set it to nullable.
build_list_length(col, Nullability::Nullable)
}
// len/length semantics depend on the return type of underlying expr.
"len" | "length" => {
let children: Vec<_> = func.children().collect();
vortex_ensure!(children.len() == 1);
let child = children[0];

if returns_a_list(child) {
let Some(col) = try_from_expression_inner(child, col_sub)? else {
return Ok(None);
};

// Same nullability rationale as in "array_length" branch.
let list_len_expr = build_list_length(col, Nullability::Nullable);
return Ok(Some(list_len_expr));
} else {
return Ok(None);
}
}
_ => {
debug!("bound function {}", func.scalar_function.name());
return Ok(None);
Expand All @@ -137,6 +184,11 @@ pub(super) fn try_from_bound_expression_with_col_sub(
try_from_expression_inner(value, Some(col_sub))
}

fn is_supported_length_alias(func: &BoundFunction) -> bool {
let children: Vec<_> = func.children().collect();
children.len() == 1 && returns_a_list(children[0])
}

// Called before pushdown_complex_filter or a table filter expression call.
// As we support complex filter pushdown, Duckdb pushes expressions to Vortex.
// However, it doesn't know what type of expressions we can handle. Here we list
Expand Down Expand Up @@ -173,6 +225,8 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
|| name == "~~"
|| name == "!~~"
|| name == "strlen"
|| name == "array_length"
|| (matches!(name, "len" | "length") && is_supported_length_alias(&func))
}
ExpressionClass::BoundOperator(op) => {
if !matches!(
Expand All @@ -190,6 +244,13 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
}
}

/// Applies `list_length` expression to a duckdb field
fn list_length_on_field(field: &DuckdbField) -> Expression {
let col = get_item(field.name.as_str(), root());

build_list_length(col, field.dtype.nullability())
}

pub fn try_from_projection_expression(
value: &duckdb::ExpressionRef,
field: &DuckdbField,
Expand All @@ -208,6 +269,13 @@ pub fn try_from_projection_expression(
let col = cast(col, dtype);
Some(col)
}
"array_length" => {
// Only accept array_length(expr) rather than array_length(expr, dim).
(func.children().count() == 1).then(|| list_length_on_field(field))
}
// len/length have different semantics depending on field dtype.
"len" | "length" => matches!(field.dtype, DType::List(..) | DType::FixedSizeList(..))
.then(|| list_length_on_field(field)),
_ => None,
})
}
Expand Down
7 changes: 7 additions & 0 deletions vortex-duckdb/src/duckdb/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use std::ptr;
use crate::cpp;
use crate::cpp::duckdb_vx_expr_class;
use crate::duckdb::DDBString;
use crate::duckdb::LogicalType;
use crate::duckdb::LogicalTypeRef;
use crate::duckdb::ScalarFunction;
use crate::duckdb::ScalarFunctionRef;
use crate::duckdb::Value;
Expand All @@ -33,6 +35,11 @@ impl ExpressionRef {
unsafe { cpp::duckdb_vx_expr_get_class(self.as_ptr()) }
}

/// The return type of this expression.
pub fn return_type(&self) -> &LogicalTypeRef {
unsafe { LogicalType::borrow(cpp::duckdb_vx_expr_get_return_type(self.as_ptr())) }
}

/// Match the subclass of the expression.
pub fn as_class(&self) -> Option<ExpressionClass<'_>> {
Some(
Expand Down
98 changes: 98 additions & 0 deletions vortex-duckdb/src/e2e_test/vortex_scan_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1014,3 +1014,101 @@ fn test_geometry() {
let area = vec.as_slice_with_len::<f64>(chunk.len().as_())[0];
assert_eq!(area, 1000.0);
}

/// `SELECT array_length(list)` / `len(list)` / `length(list)` should push the list-length
/// computation into the Vortex scan (computed from offsets, without materializing the list
/// elements) and return the per-row element counts.
#[test]
fn test_vortex_scan_list_length_projection() {
let file = RUNTIME.block_on(async {
let integers = PrimitiveArray::from_iter([
10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
]);
// Variable-length lists with 3, 4, 1, 5, 2 elements respectively.
let offsets = buffer![0i32, 3, 7, 8, 13, 15];
let list_array = ListArray::try_new(
integers.into_array(),
offsets.into_array(),
Validity::AllValid,
)
.unwrap();

write_single_column_vortex_file("int_list", list_array).await
});

let conn = database_connection();
let file_path = file.path().to_string_lossy();

// `len`/`length` bind to the same DuckDB function set as `array_length` for list arguments.
for func in ["array_length", "len", "length"] {
let result = conn
.query(&format!("SELECT {func}(int_list) FROM '{file_path}'"))
.unwrap();

let mut lengths = Vec::new();
for chunk in result {
let len = chunk.len().as_();
let vec = chunk.get_vector(0);
lengths.extend_from_slice(vec.as_slice_with_len::<i64>(len));
}

assert_eq!(lengths, vec![3, 4, 1, 5, 2], "{func}(int_list) mismatch");
}
}

/// `WHERE array_length(list) >= k` should push down as a complex filter.
#[test]
fn test_vortex_scan_list_length_filter() {
let file = RUNTIME.block_on(async {
let integers = PrimitiveArray::from_iter([
10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
]);
// Variable-length lists with 3, 4, 1, 5, 2 elements respectively.
let offsets = buffer![0i32, 3, 7, 8, 13, 15];
let list_array = ListArray::try_new(
integers.into_array(),
offsets.into_array(),
Validity::AllValid,
)
.unwrap();

write_single_column_vortex_file("int_list", list_array).await
});

// Lists with length >= 4: the 4-element and 5-element lists => 2 rows.
let count = scan_vortex_file_single_row::<i64, i64>(
file,
"SELECT COUNT(*) FROM ? WHERE array_length(int_list) >= 4",
0,
);
assert_eq!(count, 2);
}

/// `array_length`/`len`/`length` over a FixedSizeList column. The length is the fixed list size.
#[test]
fn test_vortex_scan_fixed_size_list_length_projection() {
let file = RUNTIME.block_on(async {
// 6 fixed-size lists of 4 i32 elements each.
let elements = (0..24i32).collect::<PrimitiveArray>();
let fsl = FixedSizeListArray::new(elements.into_array(), 4, Validity::AllValid, 6);
write_single_column_vortex_file("int_lists", fsl).await
});

let conn = database_connection();
let file_path = file.path().to_string_lossy();

for func in ["array_length", "len", "length"] {
let result = conn
.query(&format!("SELECT {func}(int_lists) FROM '{file_path}'"))
.unwrap();

let mut lengths = Vec::new();
for chunk in result {
let len = chunk.len().as_();
let vec = chunk.get_vector(0);
lengths.extend_from_slice(vec.as_slice_with_len::<i64>(len));
}

assert_eq!(lengths, vec![4i64; 6], "{func}(int_lists) mismatch");
}
}
Loading
Loading