From 72669136129a616c33c1e183e840d5d979e48ff0 Mon Sep 17 00:00:00 2001 From: Hu Shenggang Date: Thu, 18 Jun 2026 17:34:05 +0800 Subject: [PATCH 1/2] [fix](be) Fix nested lambda argument binding Issue Number: None Related PR: None Problem Summary: Nested lambda expressions can contain lambda argument ColumnRefs with the same column ids as outer lambda arguments. Binding lambda arguments only by column id makes BE unable to distinguish current-scope arguments from captured outer arguments, especially when inner lambdas shadow outer argument names. This change serializes per-lambda argument names through a dedicated Thrift field, uses scope-aware name binding in BE, lets inner lambda argument names shadow outer ones, and keeps a legacy single-layer fallback that infers lambda arguments from ColumnRef ids. For old FE nested lambda plans without lambda metadata, BE now returns a clear error instead of executing ambiguous bindings. Fix nested lambda argument binding for array_map. Nested lambda plans from old FE versions without lambda metadata now return an error. - Test: - Unit Test: ./run-fe-ut.sh --run org.apache.doris.analysis.ExprToThriftBehaviorTest - Unit Test: ./run-be-ut.sh --run --filter=ArrayMapFunctionTest.*:VColumnRefTest.* - Regression test: doris-local-regression all -d query_p0/sql_functions/array_functions -s test_nested_array_map -forceGenOut - Regression test: doris-local-regression start && doris-local-regression run -d query_p0/sql_functions/array_functions -s test_nested_array_map - Format: git diff --check; build-support/check-format.sh - Behavior changed: Yes. Nested lambda arguments are resolved by FE-provided argument names, and old FE nested lambda plans without metadata fail fast. - Does this need documentation: No --- .../lambda_execution_context.h | 116 ++ .../exprs/lambda_function/lambda_function.h | 3 +- .../lambda_function/varray_map_function.cpp | 246 ++-- .../lambda_function/varray_sort_function.cpp | 179 ++- be/src/exprs/vcolumn_ref.h | 66 +- be/src/exprs/vexpr_context.cpp | 11 + be/src/exprs/vexpr_context.h | 39 +- be/src/exprs/vlambda_function_call_expr.h | 2 +- be/src/exprs/vlambda_function_expr.h | 14 +- be/src/exprs/vslot_ref.h | 1 + .../array_map_function_test.cpp | 1152 +++++++++++++++++ be/test/exprs/vcolumn_ref_test.cpp | 148 +++ .../doris/analysis/ExprToThriftVisitor.java | 1 + .../analysis/ExprToThriftBehaviorTest.java | 16 + gensrc/thrift/Exprs.thrift | 4 + .../array_functions/test_nested_array_map.out | 10 + .../test_nested_array_map.groovy | 86 ++ 17 files changed, 1972 insertions(+), 122 deletions(-) create mode 100644 be/src/exprs/lambda_function/lambda_execution_context.h create mode 100644 be/test/exprs/lambda_function/array_map_function_test.cpp create mode 100644 be/test/exprs/vcolumn_ref_test.cpp create mode 100644 regression-test/data/query_p0/sql_functions/array_functions/test_nested_array_map.out create mode 100644 regression-test/suites/query_p0/sql_functions/array_functions/test_nested_array_map.groovy diff --git a/be/src/exprs/lambda_function/lambda_execution_context.h b/be/src/exprs/lambda_function/lambda_execution_context.h new file mode 100644 index 00000000000000..6068c3e47d1590 --- /dev/null +++ b/be/src/exprs/lambda_function/lambda_execution_context.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace doris { + +class LambdaExecutionContext { +public: + struct Binding { + std::string name; + int column_position = -1; + }; + + struct Frame { + bool bind_by_name = true; + bool parent_bindings_visible = true; + std::vector argument_bindings; + }; + + struct ResolveResult { + bool searched_named_scope = false; + bool found = false; + int column_position = -1; + }; + + class FrameGuard { + public: + FrameGuard(LambdaExecutionContext& context, Frame frame) : _context(&context) { + _context->push_frame(std::move(frame)); + } + + FrameGuard(FrameGuard&& other) = delete; + FrameGuard& operator=(FrameGuard&& other) = delete; + FrameGuard(const FrameGuard&) = delete; + FrameGuard& operator=(const FrameGuard&) = delete; + + ~FrameGuard() { release(); } + + private: + void release() { + if (_context != nullptr) { + _context->pop_frame(); + _context = nullptr; + } + } + + LambdaExecutionContext* _context; + }; + + void push_frame(Frame frame) { _frames.push_back(std::move(frame)); } + + void pop_frame() { + DCHECK(!_frames.empty()); + _frames.pop_back(); + } + + ResolveResult resolve_column_position(const std::string& name) const { + ResolveResult result; + for (const auto& frame : std::ranges::reverse_view(_frames)) { + result.searched_named_scope |= frame.bind_by_name; + for (const auto& argument_binding : + std::ranges::reverse_view(frame.argument_bindings)) { + if (argument_binding.name == name) { + result.found = true; + result.column_position = argument_binding.column_position; + return result; + } + } + if (!frame.parent_bindings_visible) { + break; + } + } + return result; + } + + void collect_visible_binding_column_positions(std::set& column_positions) const { + for (const auto& _frame : std::ranges::reverse_view(_frames)) { + for (const auto& binding : _frame.argument_bindings) { + if (binding.column_position >= 0) { + column_positions.insert(binding.column_position); + } + } + if (!_frame.parent_bindings_visible) { + break; + } + } + } + +private: + std::vector _frames; +}; + +} // namespace doris diff --git a/be/src/exprs/lambda_function/lambda_function.h b/be/src/exprs/lambda_function/lambda_function.h index 451b61569e9ca8..5658978591cb69 100644 --- a/be/src/exprs/lambda_function/lambda_function.h +++ b/be/src/exprs/lambda_function/lambda_function.h @@ -31,7 +31,8 @@ class LambdaFunction { virtual std::string get_name() const = 0; - virtual doris::Status prepare(RuntimeState* state) { + virtual doris::Status prepare(RuntimeState* state, const VExprSPtrs& children) { + static_cast(children); batch_size = state->batch_size(); return Status::OK(); } diff --git a/be/src/exprs/lambda_function/varray_map_function.cpp b/be/src/exprs/lambda_function/varray_map_function.cpp index db82c46500062c..5c279f15f434cd 100644 --- a/be/src/exprs/lambda_function/varray_map_function.cpp +++ b/be/src/exprs/lambda_function/varray_map_function.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -28,6 +29,7 @@ #include "core/block/columns_with_type_and_name.h" #include "core/column/column.h" #include "core/column/column_array.h" +#include "core/column/column_nothing.h" #include "core/column/column_nullable.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" @@ -36,18 +38,17 @@ #include "core/data_type/data_type_number.h" #include "exec/common/util.hpp" #include "exprs/aggregate/aggregate_function.h" +#include "exprs/lambda_function/lambda_execution_context.h" #include "exprs/lambda_function/lambda_function.h" #include "exprs/lambda_function/lambda_function_factory.h" #include "exprs/vcolumn_ref.h" -#include "exprs/vslot_ref.h" +#include "exprs/vexpr_context.h" +#include "exprs/vlambda_function_expr.h" namespace doris { -class VExprContext; // extend a block with all required parameters struct LambdaArgs { - // the lambda function need the column ids of all the slots - std::vector output_slot_ref_indexs; // which line is extended to the original block int64_t current_row_idx = 0; // when a block is filled, the array may be truncated, recording where it was truncated @@ -76,34 +77,18 @@ class ArrayMapFunction : public LambdaFunction { std::string get_name() const override { return name; } + Status prepare(RuntimeState* state, const VExprSPtrs& children) override { + RETURN_IF_ERROR(LambdaFunction::prepare(state, children)); + DCHECK_GE(children.size(), 2); + + return _prepare_lambda_argument_binding(children[0], children.size() - 1, + _lambda_argument_binding); + } + Status execute(VExprContext* context, const Block* block, const Selector* expr_selector, size_t count, ColumnPtr& result_column, const DataTypePtr& result_type, const VExprSPtrs& children) const override { LambdaArgs args_info; - // collect used slot ref in lambda function body - std::vector& output_slot_ref_indexs = args_info.output_slot_ref_indexs; - _collect_slot_ref_column_id(children[0], output_slot_ref_indexs); - - int gap = 0; - if (!output_slot_ref_indexs.empty()) { - auto max_id = std::ranges::max_element(output_slot_ref_indexs); - gap = *max_id + 1; - _set_column_ref_column_id(children[0], gap); - } - - std::vector names(gap); - DataTypes data_types(gap); - - for (int i = 0; i < gap; ++i) { - if (_contains_column_id(output_slot_ref_indexs, i)) { - names[i] = block->get_by_position(i).name; - data_types[i] = block->get_by_position(i).type; - } else { - // padding some mock data to hold the position, like call block#rows function need - names[i] = "temp"; - data_types[i] = std::make_shared(); - } - } ///* array_map(lambda,arg1,arg2,.....) */// //1. child[1:end]->execute(src_block) @@ -126,6 +111,7 @@ class ArrayMapFunction : public LambdaFunction { ColumnPtr first_array_offsets = nullptr; //2. get the result column from executed expr, and the needed is nested column of array std::vector lambda_datas(arguments.size()); + DataTypes lambda_argument_types(arguments.size()); for (int i = 0; i < arguments.size(); ++i) { const auto& array_column_type_name = arguments[i]; @@ -153,7 +139,6 @@ class ArrayMapFunction : public LambdaFunction { // here is the array column const auto& col_array = assert_cast(*column_array); - const auto& col_type = assert_cast(*type_array); if (i == 0) { nested_array_column_rows = col_array.get_data_ptr()->size(); @@ -180,15 +165,72 @@ class ArrayMapFunction : public LambdaFunction { } } lambda_datas[i] = col_array.get_data_ptr(); - names.push_back("R" + array_column_type_name.name); - data_types.push_back(col_type.get_nested_type()); + const auto& col_type = assert_cast(*type_array); + lambda_argument_types[i] = col_type.get_nested_type(); + } + std::set required_input_column_ids; + children[0]->collect_slot_column_ids(required_input_column_ids); + context->lambda_execution_context().collect_visible_binding_column_positions( + required_input_column_ids); + const int lambda_argument_base = + required_input_column_ids.empty() ? 0 : *required_input_column_ids.rbegin() + 1; + if (!_lambda_argument_binding.bind_by_name) { + RETURN_IF_ERROR( + _set_legacy_lambda_argument_gap(children[0]->get_child(0), lambda_argument_base, + _lambda_argument_binding.argument_size)); + } + std::vector names(lambda_argument_base); + DataTypes data_types(lambda_argument_base); + std::vector materialized_input_columns(lambda_argument_base, false); + names.reserve(lambda_argument_base + arguments.size()); + data_types.reserve(lambda_argument_base + arguments.size()); + for (int column_id : required_input_column_ids) { + if (column_id < 0 || block == nullptr || + static_cast(column_id) >= block->columns()) { + return Status::InternalError( + "array_map lambda input column id {} is outside input block, block={}", + column_id, block == nullptr ? "nullptr" : block->dump_structure()); + } + materialized_input_columns[column_id] = true; + names[column_id] = block->get_by_position(column_id).name; + data_types[column_id] = block->get_by_position(column_id).type; + } + for (int i = 0; i < lambda_argument_base; ++i) { + if (!materialized_input_columns[i]) { + // Keep sparse input positions stable for SlotRef/parent lambda bindings without + // materializing unrelated wide-table columns into every lambda batch. + names[i] = "temp"; + data_types[i] = std::make_shared(); + } + } + for (int i = 0; i < arguments.size(); ++i) { + const auto& array_column_type_name = arguments[i]; + if (_lambda_argument_binding.bind_by_name && + i < _lambda_argument_binding.names.size()) { + names.push_back(_lambda_argument_binding.names[i]); + } else { + names.push_back("R" + array_column_type_name.name); + } + data_types.push_back(lambda_argument_types[i]); + } + + LambdaExecutionContext::Frame lambda_frame; + lambda_frame.bind_by_name = _lambda_argument_binding.bind_by_name; + lambda_frame.parent_bindings_visible = true; + for (int i = 0; i < _lambda_argument_binding.argument_size; ++i) { + const int column_position = lambda_argument_base + i; + if (_lambda_argument_binding.bind_by_name) { + lambda_frame.argument_bindings.push_back( + {_lambda_argument_binding.names[i], column_position}); + } } + LambdaExecutionContext::FrameGuard lambda_frame_guard(context->lambda_execution_context(), + std::move(lambda_frame)); // if column_array is NULL, we know the array_data_column will not write any data, // so the column is empty. eg : (x) -> concat('|',x + "1"). if still execute the lambda function, will cause the bolck rows are not equal // the x column is empty, but "|" is const literal, size of column is 1, so the block rows is 1, but the x column is empty, will be coredump. - if (std::any_of(lambda_datas.begin(), lambda_datas.end(), - [](const auto& v) { return v->empty(); })) { + if (std::ranges::any_of(lambda_datas, [](const auto& v) { return v->empty(); })) { DataTypePtr nested_type; bool is_nullable = result_type->is_nullable(); if (is_nullable) { @@ -231,33 +273,26 @@ class ArrayMapFunction : public LambdaFunction { if (mem_reuse) { columns[i] = lambda_block.get_by_position(i).column->assert_mutable(); } else { - if (_contains_column_id(output_slot_ref_indexs, i) || i >= gap) { - // TODO: maybe could create const column, so not insert_many_from when extand data - // but now here handle batch_size of array nested data every time, so maybe have different rows - columns[i] = data_types[i]->create_column(); - } else { - columns[i] = data_types[i] - ->create_column_const_with_default_value(0) - ->assert_mutable(); - } + columns[i] = data_types[i]->create_column(); } } // batch_size of array nested data every time inorder to avoid memory overflow - while (columns[gap]->size() < batch_size) { - long max_step = batch_size - columns[gap]->size(); + while (columns[lambda_argument_base]->size() < batch_size) { + long max_step = batch_size - columns[lambda_argument_base]->size(); long current_step = std::min( max_step, (long)(args_info.cur_size - args_info.current_offset_in_array)); size_t pos = args_info.array_start + args_info.current_offset_in_array; for (int i = 0; i < arguments.size() && current_step > 0; ++i) { - columns[gap + i]->insert_range_from(*lambda_datas[i], pos, current_step); + columns[lambda_argument_base + i]->insert_range_from(*lambda_datas[i], pos, + current_step); } args_info.current_offset_in_array += current_step; args_info.current_repeat_times += current_step; if (args_info.current_offset_in_array >= args_info.cur_size) { args_info.current_row_eos = true; } - _extend_data(columns, block, args_info.current_repeat_times, gap, - args_info.current_row_idx, output_slot_ref_indexs); + _repeat_input_columns(columns, block, args_info.current_repeat_times, + materialized_input_columns, args_info.current_row_idx); args_info.current_repeat_times = 0; if (args_info.current_row_eos) { //current row is end of array, move to next row @@ -329,52 +364,105 @@ class ArrayMapFunction : public LambdaFunction { } private: - bool _contains_column_id(const std::vector& output_slot_ref_indexs, int id) const { - const auto it = std::find(output_slot_ref_indexs.begin(), output_slot_ref_indexs.end(), id); - return it != output_slot_ref_indexs.end(); + struct LambdaArgumentBinding { + bool bind_by_name = true; + size_t argument_size = 0; + std::vector names; + }; + + Status _prepare_lambda_argument_binding(const VExprSPtr& expr, size_t expected_argument_size, + LambdaArgumentBinding& argument_binding) const { + DORIS_CHECK_EQ(expr->node_type(), TExprNodeType::LAMBDA_FUNCTION_EXPR); + const auto* lambda_expr = assert_cast(expr.get()); + + argument_binding.argument_size = 0; + argument_binding.names.clear(); + argument_binding.bind_by_name = lambda_expr->has_argument_names(); + + if (!argument_binding.bind_by_name) { + if (_contains_nested_lambda_call(expr->get_child(0))) { + return Status::InternalError( + "Cannot resolve nested lambda argument without lambda metadata"); + } + argument_binding.argument_size = expected_argument_size; + argument_binding.names.resize(expected_argument_size); + return Status::OK(); + } + + argument_binding.names = lambda_expr->argument_names(); + if (argument_binding.names.size() > expected_argument_size) { + return Status::InternalError( + "lambda argument metadata size exceeds parameter size, maximum={}, actual={}", + expected_argument_size, argument_binding.names.size()); + } + argument_binding.argument_size = argument_binding.names.size(); + if (std::ranges::any_of(argument_binding.names, + [](const auto& argument_name) { return argument_name.empty(); })) { + return Status::InternalError("lambda argument metadata contains empty name"); + } + return Status::OK(); } - void _set_column_ref_column_id(VExprSPtr expr, int gap) const { - for (const auto& child : expr->children()) { - if (child->is_column_ref()) { - auto* ref = static_cast(child.get()); - ref->set_gap(gap); - } else { - _set_column_ref_column_id(child, gap); + Status _set_legacy_lambda_argument_gap(const VExprSPtr& expr, int lambda_argument_base, + size_t argument_size) const { + if (expr->is_column_ref()) { + auto* ref = static_cast(expr.get()); + if (ref->column_id() >= 0 && static_cast(ref->column_id()) < argument_size) { + const int argument_index = ref->column_id(); + ref->set_gap(lambda_argument_base + argument_index - ref->column_id()); } + return Status::OK(); } - } - void _collect_slot_ref_column_id(VExprSPtr expr, - std::vector& output_slot_ref_indexs) const { for (const auto& child : expr->children()) { - if (child->is_slot_ref()) { - const auto* ref = static_cast(child.get()); - output_slot_ref_indexs.push_back(ref->column_id()); - } else { - _collect_slot_ref_column_id(child, output_slot_ref_indexs); - } + RETURN_IF_ERROR( + _set_legacy_lambda_argument_gap(child, lambda_argument_base, argument_size)); } + return Status::OK(); } - void _extend_data(std::vector& columns, const Block* block, - int current_repeat_times, int size, int64_t current_row_idx, - const std::vector& output_slot_ref_indexs) const { - if (!current_repeat_times || !size) { + bool _is_lambda_call_with_lambda_expr(const VExprSPtr& expr) const { + return expr->node_type() == TExprNodeType::LAMBDA_FUNCTION_CALL_EXPR && + !expr->children().empty() && + expr->children()[0]->node_type() == TExprNodeType::LAMBDA_FUNCTION_EXPR; + } + + bool _contains_nested_lambda_call(const VExprSPtr& expr) const { + if (_is_lambda_call_with_lambda_expr(expr)) { + return true; + } + return std::ranges::any_of(expr->children(), [this](const auto& child) { + return _contains_nested_lambda_call(child); + }); + } + + void _repeat_input_columns(std::vector& columns, const Block* block, + int repeat_times, + const std::vector& materialized_input_columns, + int64_t row_idx) const { + if (!repeat_times || materialized_input_columns.empty()) { return; } - for (int i = 0; i < size; i++) { - if (_contains_column_id(output_slot_ref_indexs, i)) { - auto src_column = - block->get_by_position(i).column->convert_to_full_column_if_const(); - columns[i]->insert_many_from(*src_column, current_row_idx, current_repeat_times); - } else { - // must be column const - DCHECK(is_column_const(*columns[i])); - columns[i]->resize(columns[i]->size() + current_repeat_times); + for (size_t i = 0; i < materialized_input_columns.size(); i++) { + if (!materialized_input_columns[i]) { + columns[i]->resize(columns[i]->size() + repeat_times); + continue; } + DORIS_CHECK(block != nullptr); + auto src_column = block->get_by_position(i).column->convert_to_full_column_if_const(); + if (check_and_get_column(src_column.get())) { + // A ColumnNothing in the outer block is a placeholder for an unmaterialized + // virtual column. Keep it as a placeholder in the lambda block as well, so + // VirtualSlotRef can still materialize it lazily if the lambda body reads it. + if (!check_and_get_column(columns[i].get())) { + columns[i] = ColumnNothing::create(columns[i]->size()); + } + } + columns[i]->insert_many_from(*src_column, row_idx, repeat_times); } } + + LambdaArgumentBinding _lambda_argument_binding; }; void register_function_array_map(doris::LambdaFunctionFactory& factory) { diff --git a/be/src/exprs/lambda_function/varray_sort_function.cpp b/be/src/exprs/lambda_function/varray_sort_function.cpp index 6d14609400365c..dfd2ed1a8ae58e 100644 --- a/be/src/exprs/lambda_function/varray_sort_function.cpp +++ b/be/src/exprs/lambda_function/varray_sort_function.cpp @@ -17,6 +17,12 @@ #include +#include +#include +#include +#include +#include + #include "common/status.h" #include "core/assert_cast.h" #include "core/block/block.h" @@ -28,14 +34,16 @@ #include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "exec/common/util.hpp" +#include "exprs/lambda_function/lambda_execution_context.h" #include "exprs/lambda_function/lambda_function.h" #include "exprs/lambda_function/lambda_function_factory.h" +#include "exprs/vcolumn_ref.h" #include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" +#include "exprs/vlambda_function_expr.h" namespace doris { -class VExprContext; - using ConstColumnVariant = std::variantnode_type(), TExprNodeType::LAMBDA_FUNCTION_EXPR); + const auto* lambda_expr = assert_cast(children[0].get()); + const std::vector* argument_names = + lambda_expr->has_argument_names() ? &lambda_expr->argument_names() : nullptr; + RETURN_IF_ERROR(_set_comparator_argument_gap(children[0]->get_child(0), argument_names)); + return Status::OK(); + } + Status execute(VExprContext* context, const Block* block, const Selector* expr_selector, size_t count, ColumnPtr& result_column, const DataTypePtr& result_type, const VExprSPtrs& children) const override { @@ -129,9 +148,9 @@ class ArraySortFunction : public LambdaFunction { * 1 20 1 nullable(int) * 2 1/-1/0 ... tinyint * The size of a column is always 1; we only need to use it to store the specific values ​​in the array for comparison. - */ + */ Block lambda_block; - for (int i = 0; i <= 2; i++) { + for (int i = 0; i < 2; i++) { lambda_block.insert(ColumnWithTypeAndName(nested_nullable_column.clone_empty(), col_type.get_nested_type(), "temp")); } @@ -148,7 +167,14 @@ class ArraySortFunction : public LambdaFunction { temp_nullmap_data[i]->resize(1); }; - int lambda_res_id = 2; + // array_sort's comparator arguments are represented by ColumnRef column ids 0 and 1. + // They are position-based instead of name-based because FE may reuse the first + // argument name for the second comparator ColumnRef and distinguish them only by id. + LambdaExecutionContext::Frame lambda_frame; + lambda_frame.bind_by_name = false; + lambda_frame.parent_bindings_visible = false; + LambdaExecutionContext::FrameGuard lambda_frame_guard(context->lambda_execution_context(), + std::move(lambda_frame)); // 3. sort array by executing lambda function // During the sorting process, the parameter columns of lambda_block are first populated using prepare_lambda_input, @@ -175,12 +201,14 @@ class ArraySortFunction : public LambdaFunction { } }; + const int lambda_result_base = static_cast(lambda_block.columns()); for (int row = 0; row < input_rows; ++row) { auto start = off_data[row - 1]; auto end = off_data[row]; std::sort(&permutation[start], &permutation[end], [&](size_t i, size_t j) { prepare_lambda_input(i, 0); prepare_lambda_input(j, 1); + int lambda_res_id = lambda_result_base; auto status = children[0]->execute(context, &lambda_block, &lambda_res_id); if (!status.ok()) [[unlikely]] { @@ -197,6 +225,7 @@ class ArraySortFunction : public LambdaFunction { // only -1, 0, 1 long cmp = assert_cast(full_res_col.get()) ->get_data()[0]; + lambda_block.erase_tail(lambda_result_base); return cmp < 0; }); @@ -221,6 +250,146 @@ class ArraySortFunction : public LambdaFunction { return Status::OK(); } +private: + Status _set_comparator_argument_gap(const VExprSPtr& expr, + const std::vector* argument_names) const { + if (expr->is_column_ref()) { + auto* ref = static_cast(expr.get()); + RETURN_IF_ERROR(_validate_comparator_argument_ref(*ref, argument_names)); + ref->set_gap(0); + return Status::OK(); + } + + if (expr->is_slot_ref() || expr->is_virtual_slot_ref()) { + return Status::NotSupported( + "array_sort comparator only supports its own lambda arguments, but found " + "captured slot ref '{}'", + expr->expr_name()); + } + + if (_is_lambda_call_with_lambda_expr(expr)) { + // array_sort comparator arguments live in a position-based, comparator-local frame + // that is invisible to nested lambda frames. Reject unsupported nested captures during + // prepare, otherwise execution would later fail with an internal missing-column error. + // For example, array_sort((x, y) -> array_map(z -> z + x, nested_arr), arr) is + // rejected because the inner array_map lambda captures the comparator-local x; while + // array_sort((x, y) -> array_map(x -> x + 1, nested_arr), arr) is still valid because + // the inner x is array_map's own argument and shadows the comparator argument. + RETURN_IF_ERROR(_reject_nested_lambda_capture_of_comparator_argument( + assert_cast(expr->children()[0].get()), + argument_names)); + for (int i = 1; i < expr->children().size(); ++i) { + RETURN_IF_ERROR(_set_comparator_argument_gap(expr->children()[i], argument_names)); + } + return Status::OK(); + } + + for (const auto& child : expr->children()) { + RETURN_IF_ERROR(_set_comparator_argument_gap(child, argument_names)); + } + return Status::OK(); + } + + Status _reject_nested_lambda_capture_of_comparator_argument( + const VLambdaFunctionExpr* lambda_expr, + const std::vector* comparator_argument_names) const { + if (!lambda_expr->has_argument_names()) { + return Status::InternalError( + "Cannot validate nested lambda capture in array_sort comparator without lambda " + "metadata"); + } + return _reject_nested_lambda_capture_of_comparator_argument(lambda_expr->get_child(0), + comparator_argument_names, + lambda_expr->argument_names()); + } + + Status _reject_nested_lambda_capture_of_comparator_argument( + const VExprSPtr& expr, const std::vector* comparator_argument_names, + const std::vector& in_scope_lambda_argument_names) const { + // Names in in_scope_lambda_argument_names are declared by the nested lambda scopes that + // enclose expr. They can legally shadow array_sort comparator argument names, so a + // ColumnRef matching one of these names should be treated as a local nested-lambda + // argument instead of an unsupported capture from the array_sort comparator. + if (expr->is_column_ref()) { + if (std::ranges::find(in_scope_lambda_argument_names, expr->expr_name()) != + in_scope_lambda_argument_names.end()) { + return Status::OK(); + } + if (comparator_argument_names != nullptr && + std::ranges::find(*comparator_argument_names, expr->expr_name()) != + comparator_argument_names->end()) { + return Status::NotSupported( + "array_sort comparator does not support nested lambda capturing comparator " + "argument '{}'", + expr->expr_name()); + } + return Status::NotSupported( + "array_sort comparator only supports nested lambda arguments inside nested " + "lambda bodies, but found captured column ref '{}'", + expr->expr_name()); + } + + if (expr->is_slot_ref() || expr->is_virtual_slot_ref()) { + return Status::NotSupported( + "array_sort comparator only supports nested lambda arguments inside nested " + "lambda bodies, but found captured slot ref '{}'", + expr->expr_name()); + } + + if (_is_lambda_call_with_lambda_expr(expr)) { + const auto* lambda_expr = + assert_cast(expr->children()[0].get()); + if (!lambda_expr->has_argument_names()) { + return Status::InternalError( + "Cannot validate nested lambda capture in array_sort comparator without " + "lambda metadata"); + } + auto nested_in_scope_lambda_argument_names = in_scope_lambda_argument_names; + nested_in_scope_lambda_argument_names.insert( + nested_in_scope_lambda_argument_names.end(), + lambda_expr->argument_names().begin(), lambda_expr->argument_names().end()); + RETURN_IF_ERROR(_reject_nested_lambda_capture_of_comparator_argument( + lambda_expr->get_child(0), comparator_argument_names, + nested_in_scope_lambda_argument_names)); + for (int i = 1; i < expr->children().size(); ++i) { + RETURN_IF_ERROR(_reject_nested_lambda_capture_of_comparator_argument( + expr->children()[i], comparator_argument_names, + in_scope_lambda_argument_names)); + } + return Status::OK(); + } + + for (const auto& child : expr->children()) { + RETURN_IF_ERROR(_reject_nested_lambda_capture_of_comparator_argument( + child, comparator_argument_names, in_scope_lambda_argument_names)); + } + return Status::OK(); + } + + Status _validate_comparator_argument_ref(const VColumnRef& ref, + const std::vector* argument_names) const { + if (ref.column_id() < 0 || ref.column_id() >= 2) { + return Status::NotSupported( + "array_sort comparator only supports its own lambda arguments, but found " + "column ref '{}' with invalid column id {}", + ref.expr_name(), ref.column_id()); + } + if (argument_names != nullptr && + std::ranges::find(*argument_names, ref.expr_name()) == argument_names->end()) { + return Status::NotSupported( + "array_sort comparator only supports its own lambda arguments, but found " + "captured column ref '{}'", + ref.expr_name()); + } + return Status::OK(); + } + + bool _is_lambda_call_with_lambda_expr(const VExprSPtr& expr) const { + return expr->node_type() == TExprNodeType::LAMBDA_FUNCTION_CALL_EXPR && + !expr->children().empty() && + expr->children()[0]->node_type() == TExprNodeType::LAMBDA_FUNCTION_EXPR; + } + #define DISPATCH_PRIMITIVE_TYPE(TYPE, COLUMN_CLASS) \ case TYPE: \ column_variant = &assert_cast(column); \ diff --git a/be/src/exprs/vcolumn_ref.h b/be/src/exprs/vcolumn_ref.h index e4485e5815e02f..4609b1e0c29130 100644 --- a/be/src/exprs/vcolumn_ref.h +++ b/be/src/exprs/vcolumn_ref.h @@ -18,8 +18,11 @@ #pragma once #include +#include "common/exception.h" #include "exprs/function/function.h" +#include "exprs/lambda_function/lambda_execution_context.h" #include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" @@ -59,14 +62,27 @@ class VColumnRef final : public VExpr { Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, size_t count, ColumnPtr& result_column) const override { DCHECK(_open_finished || block == nullptr); - auto origin_column = block->get_by_position(_column_id + _gap).column; + const int column_position = _get_column_position(context, block); + if (column_position < 0 || column_position >= block->columns()) { + return Status::InternalError( + "input block not contain column ref {}, column_id={}, gap={}, block={}", + _column_name, _column_id, _gap.load(), block->dump_structure()); + } + auto origin_column = block->get_by_position(column_position).column; result_column = filter_column_with_selector(origin_column, selector, count); return Status::OK(); } DataTypePtr execute_type(const Block* block) const override { DCHECK(_open_finished || block == nullptr); - return block->get_by_position(_column_id + _gap).type; + const int column_position = _get_column_position_without_context(block); + if (column_position < 0 || column_position >= block->columns()) { + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, + "input block not contain column ref {}, column_id={}, gap={}, block={}", + _column_name, _column_id, _gap.load(), block->dump_structure()); + } + return block->get_by_position(column_position).type; } bool is_constant() const override { return false; } @@ -76,11 +92,14 @@ class VColumnRef final : public VExpr { const std::string& expr_name() const override { return _column_name; } void set_gap(int gap) { - if (_gap == 0) { - _gap = gap; - } + _gap = gap; + _gap_set = true; } + int get_gap() const { return _gap.load(); } + + bool has_gap() const { return _gap_set.load(); } + std::string debug_string() const override { std::stringstream out; out << "VColumnRef(slot_id: " << _column_id << ",column_name: " << _column_name @@ -91,8 +110,45 @@ class VColumnRef final : public VExpr { double execute_cost() const override { return 0.0; } private: + int _get_column_position(VExprContext* context, const Block* block) const { + if (context != nullptr) { + const auto resolve_result = + context->lambda_execution_context().resolve_column_position(_column_name); + if (resolve_result.found) { + return resolve_result.column_position; + } + if (resolve_result.searched_named_scope) { + return -1; + } + } + return _get_column_position_without_context(block); + } + + int _get_column_position_without_context(const Block* block) const { + if (!_gap_set.load()) { + const int position_by_name = _find_column_position_by_name(block); + if (position_by_name >= 0) { + return position_by_name; + } + } + return _column_id + _gap.load(); + } + + int _find_column_position_by_name(const Block* block) const { + if (block == nullptr) { + return -1; + } + for (int position = block->columns() - 1; position >= 0; --position) { + if (block->get_by_position(position).name == _column_name) { + return position; + } + } + return -1; + } + int _column_id; std::atomic _gap = 0; + std::atomic _gap_set = false; std::string _column_name; }; } // namespace doris diff --git a/be/src/exprs/vexpr_context.cpp b/be/src/exprs/vexpr_context.cpp index b9abce25588dd7..85cdaf29c58229 100644 --- a/be/src/exprs/vexpr_context.cpp +++ b/be/src/exprs/vexpr_context.cpp @@ -19,7 +19,9 @@ #include #include +#include #include +#include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/exception.h" @@ -31,6 +33,7 @@ #include "core/column/column_const.h" #include "exec/common/util.hpp" #include "exprs/function_context.h" +#include "exprs/lambda_function/lambda_execution_context.h" #include "exprs/vexpr.h" #include "runtime/runtime_state.h" #include "runtime/thread_context.h" @@ -44,6 +47,10 @@ class RowDescriptor; namespace doris { +VExprContext::VExprContext(VExprSPtr expr) + : _root(std::move(expr)), + _lambda_execution_context(std::make_unique()) {} + VExprContext::~VExprContext() { // In runtime filter, only create expr context to get expr root, will not call // prepare or open, so that it is not need to call close. And call close may core @@ -58,6 +65,10 @@ VExprContext::~VExprContext() { } } +LambdaExecutionContext& VExprContext::lambda_execution_context() { + return *_lambda_execution_context; +} + Status VExprContext::execute(Block* block, int* result_column_id) { Status st; RETURN_IF_CATCH_EXCEPTION({ diff --git a/be/src/exprs/vexpr_context.h b/be/src/exprs/vexpr_context.h index 72398c71d05fc5..655bfc226db59a 100644 --- a/be/src/exprs/vexpr_context.h +++ b/be/src/exprs/vexpr_context.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ class ColumnIterator; namespace doris { class ScoreRuntime; +class LambdaExecutionContext; using ScoreRuntimeSPtr = std::shared_ptr; class IndexExecContext { @@ -239,7 +241,7 @@ class VExprContext { ENABLE_FACTORY_CREATOR(VExprContext); public: - VExprContext(VExprSPtr expr) : _root(std::move(expr)) {} + VExprContext(VExprSPtr expr); ~VExprContext(); [[nodiscard]] Status prepare(RuntimeState* state, const RowDescriptor& row_desc); [[nodiscard]] Status open(RuntimeState* state); @@ -263,6 +265,8 @@ class VExprContext { std::shared_ptr get_index_context() const { return _index_context; } + LambdaExecutionContext& lambda_execution_context(); + /// Creates a FunctionContext, and returns the index that's passed to fn_context() to /// retrieve the created context. Exprs that need a FunctionContext should call this in /// Prepare() and save the returned index. 'varargs_buffer_size', if specified, is the @@ -340,36 +344,9 @@ class VExprContext { void clone_fn_contexts(VExprContext* other); - VExprContext& operator=(const VExprContext& other) { - if (this == &other) { - return *this; - } + VExprContext& operator=(const VExprContext& other) = delete; - _root = other._root; - _is_clone = other._is_clone; - _prepared = other._prepared; - _opened = other._opened; - - for (const auto& fn : other._fn_contexts) { - _fn_contexts.emplace_back(fn->clone()); - } - - _last_result_column_id = other._last_result_column_id; - _depth_num = other._depth_num; - return *this; - } - - VExprContext& operator=(VExprContext&& other) { - _root = other._root; - other._root = nullptr; - _is_clone = other._is_clone; - _prepared = other._prepared; - _opened = other._opened; - _fn_contexts = std::move(other._fn_contexts); - _last_result_column_id = other._last_result_column_id; - _depth_num = other._depth_num; - return *this; - } + VExprContext& operator=(VExprContext&& other) = delete; [[nodiscard]] static size_t get_memory_usage(const VExprContextSPtrs& contexts) { size_t usage = 0; @@ -427,6 +404,8 @@ class VExprContext { segment_v2::AnnRangeSearchRuntime _ann_range_search_runtime; bool _suitable_for_ann_index = true; + std::unique_ptr _lambda_execution_context; + std::unique_ptr _rf_selectivity = std::make_unique(); }; diff --git a/be/src/exprs/vlambda_function_call_expr.h b/be/src/exprs/vlambda_function_call_expr.h index 64b8118ed600c8..a3764fc13fdf05 100644 --- a/be/src/exprs/vlambda_function_call_expr.h +++ b/be/src/exprs/vlambda_function_call_expr.h @@ -50,7 +50,7 @@ class VLambdaFunctionCallExpr : public VExpr { return Status::InternalError("Lambda Function {} is not implemented.", _fn.name.function_name); } - RETURN_IF_ERROR(_lambda_function->prepare(state)); + RETURN_IF_ERROR(_lambda_function->prepare(state, _children)); _prepare_finished = true; return Status::OK(); } diff --git a/be/src/exprs/vlambda_function_expr.h b/be/src/exprs/vlambda_function_expr.h index 028489cb8821a7..9b8f7474287830 100644 --- a/be/src/exprs/vlambda_function_expr.h +++ b/be/src/exprs/vlambda_function_expr.h @@ -16,6 +16,9 @@ // under the License. #pragma once +#include +#include + #include "common/global_types.h" #include "exprs/function/function.h" #include "exprs/vexpr.h" @@ -25,7 +28,10 @@ class VLambdaFunctionExpr final : public VExpr { ENABLE_FACTORY_CREATOR(VLambdaFunctionExpr); public: - VLambdaFunctionExpr(const TExprNode& node) : VExpr(node) {} + VLambdaFunctionExpr(const TExprNode& node) + : VExpr(node), + _has_argument_names(node.__isset.lambda_argument_names), + _argument_names(node.lambda_argument_names) {} ~VLambdaFunctionExpr() override = default; Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override { @@ -58,9 +64,15 @@ class VLambdaFunctionExpr final : public VExpr { const std::string& expr_name() const override { return _expr_name; } + bool has_argument_names() const { return _has_argument_names; } + + const std::vector& argument_names() const { return _argument_names; } + uint64_t get_digest(uint64_t seed) const override { return 0; } private: const std::string _expr_name = "vlambda_function_expr"; + const bool _has_argument_names; + const std::vector _argument_names; }; } // namespace doris diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index ef61edc384c2f2..f3baeb15596337 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -40,6 +40,7 @@ class VSlotRef MOCK_REMOVE(final) : public VExpr { #ifdef BE_TEST VSlotRef() = default; void set_slot_id(int slot_id) { _slot_id = slot_id; } + void set_column_name(const std::string* column_name) { _column_name = column_name; } #endif void set_column_id(int column_id) { _column_id = column_id; } Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; diff --git a/be/test/exprs/lambda_function/array_map_function_test.cpp b/be/test/exprs/lambda_function/array_map_function_test.cpp new file mode 100644 index 00000000000000..6e79867e268fa6 --- /dev/null +++ b/be/test/exprs/lambda_function/array_map_function_test.cpp @@ -0,0 +1,1152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_array.h" +#include "core/column/column_const.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "exprs/vcolumn_ref.h" +#include "exprs/vexpr_context.h" +#include "exprs/vlambda_function_call_expr.h" +#include "exprs/vlambda_function_expr.h" +#include "exprs/vslot_ref.h" +#include "runtime/descriptors.h" +#include "runtime/runtime_state.h" + +namespace doris { + +class MockColumnExpr final : public VExpr { +public: + MockColumnExpr(ColumnPtr column, DataTypePtr type, std::string name) + : VExpr(type, false), + _column(std::move(column)), + _type(std::move(type)), + _name(std::move(name)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* /*context*/, const Block* /*block*/, + const Selector* /*selector*/, size_t /*count*/, + ColumnPtr& result_column) const override { + result_column = _column; + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + ColumnPtr _column; + DataTypePtr _type; + std::string _name; +}; + +class MockConstColumnExpr final : public VExpr { +public: + MockConstColumnExpr(ColumnPtr column, DataTypePtr type, std::string name) + : VExpr(type, false), + _column(std::move(column)), + _type(std::move(type)), + _name(std::move(name)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* /*context*/, const Block* /*block*/, + const Selector* /*selector*/, size_t count, + ColumnPtr& result_column) const override { + result_column = ColumnConst::create(_column, count); + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + ColumnPtr _column; + DataTypePtr _type; + std::string _name; +}; + +class MockBodyExpr final : public VExpr { +public: + MockBodyExpr(DataTypePtr type, std::string name) + : VExpr(type, false), _type(std::move(type)), _name(std::move(name)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* /*context*/, const Block* /*block*/, + const Selector* /*selector*/, size_t /*count*/, + ColumnPtr& /*result_column*/) const override { + return Status::InternalError("mock body should not be executed"); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + DataTypePtr _type; + std::string _name; +}; + +class MockSubtractExpr final : public VExpr { +public: + explicit MockSubtractExpr(DataTypePtr type) : VExpr(type, false), _type(std::move(type)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr left; + ColumnPtr right; + RETURN_IF_ERROR(get_child(0)->execute_column(context, block, selector, count, left)); + RETURN_IF_ERROR(get_child(1)->execute_column(context, block, selector, count, right)); + left = left->convert_to_full_column_if_const(); + right = right->convert_to_full_column_if_const(); + + const auto& left_data = _get_int_data(left); + const auto& right_data = _get_int_data(right); + auto result = ColumnInt32::create(); + for (size_t i = 0; i < count; ++i) { + result->insert_value(left_data.get_element(i) - right_data.get_element(i)); + } + result_column = std::move(result); + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + const ColumnInt32& _get_int_data(const ColumnPtr& column) const { + if (const auto* nullable = check_and_get_column(column.get())) { + return assert_cast(nullable->get_nested_column()); + } + return assert_cast(*column); + } + + DataTypePtr _type; + std::string _name = "mock_subtract"; +}; + +class MockAddExpr final : public VExpr { +public: + explicit MockAddExpr(DataTypePtr type) : VExpr(type, false), _type(std::move(type)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr left; + ColumnPtr right; + RETURN_IF_ERROR(get_child(0)->execute_column(context, block, selector, count, left)); + RETURN_IF_ERROR(get_child(1)->execute_column(context, block, selector, count, right)); + left = left->convert_to_full_column_if_const(); + right = right->convert_to_full_column_if_const(); + + const auto& left_data = _get_int_data(left); + const auto& right_data = _get_int_data(right); + auto result = ColumnInt32::create(); + for (size_t i = 0; i < count; ++i) { + result->insert_value(left_data.get_element(i) + right_data.get_element(i)); + } + result_column = std::move(result); + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + const ColumnInt32& _get_int_data(const ColumnPtr& column) const { + if (const auto* nullable = check_and_get_column(column.get())) { + return assert_cast(nullable->get_nested_column()); + } + return assert_cast(*column); + } + + DataTypePtr _type; + std::string _name = "mock_add"; +}; + +class MockMultiplyExpr final : public VExpr { +public: + explicit MockMultiplyExpr(DataTypePtr type) : VExpr(type, false), _type(std::move(type)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr left; + ColumnPtr right; + RETURN_IF_ERROR(get_child(0)->execute_column(context, block, selector, count, left)); + RETURN_IF_ERROR(get_child(1)->execute_column(context, block, selector, count, right)); + left = left->convert_to_full_column_if_const(); + right = right->convert_to_full_column_if_const(); + + const auto& left_data = _get_int_data(left); + const auto& right_data = _get_int_data(right); + auto result = ColumnInt32::create(); + for (size_t i = 0; i < count; ++i) { + result->insert_value(left_data.get_element(i) * right_data.get_element(i)); + } + result_column = std::move(result); + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + const ColumnInt32& _get_int_data(const ColumnPtr& column) const { + if (const auto* nullable = check_and_get_column(column.get())) { + return assert_cast(nullable->get_nested_column()); + } + return assert_cast(*column); + } + + DataTypePtr _type; + std::string _name = "mock_multiply"; +}; + +class MockCompareExpr final : public VExpr { +public: + explicit MockCompareExpr(DataTypePtr type) : VExpr(type, false), _type(std::move(type)) {} + + const std::string& expr_name() const override { return _name; } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr left; + ColumnPtr right; + RETURN_IF_ERROR(get_child(0)->execute_column(context, block, selector, count, left)); + RETURN_IF_ERROR(get_child(1)->execute_column(context, block, selector, count, right)); + left = left->convert_to_full_column_if_const(); + right = right->convert_to_full_column_if_const(); + + const auto& left_data = _get_int_data(left); + const auto& right_data = _get_int_data(right); + auto result = ColumnInt8::create(); + for (size_t i = 0; i < count; ++i) { + const auto left_value = left_data.get_element(i); + const auto right_value = right_data.get_element(i); + int8_t compare_result = 0; + if (left_value < right_value) { + compare_result = -1; + } else if (left_value > right_value) { + compare_result = 1; + } + result->insert_value(compare_result); + } + result_column = std::move(result); + return Status::OK(); + } + + DataTypePtr execute_type(const Block* /*block*/) const override { return _type; } + +private: + const ColumnInt32& _get_int_data(const ColumnPtr& column) const { + if (const auto* nullable = check_and_get_column(column.get())) { + return assert_cast(nullable->get_nested_column()); + } + return assert_cast(*column); + } + + DataTypePtr _type; + std::string _name = "mock_compare"; +}; + +static TExprNode make_lambda_call_node(const DataTypePtr& type, int num_children, + const std::string& function_name = "array_map") { + TExprNode node; + node.__set_node_type(TExprNodeType::LAMBDA_FUNCTION_CALL_EXPR); + node.__set_num_children(num_children); + node.__set_type(type->to_thrift()); + node.__set_is_nullable(type->is_nullable()); + + TFunction fn; + TFunctionName fn_name; + fn_name.__set_function_name(function_name); + fn.__set_name(fn_name); + node.__set_fn(fn); + return node; +} + +static TExprNode make_lambda_expr_node(const DataTypePtr& type, + const std::vector& argument_names, + bool set_argument_names = true) { + TExprNode node; + node.__set_node_type(TExprNodeType::LAMBDA_FUNCTION_EXPR); + node.__set_num_children(1); + node.__set_type(type->to_thrift()); + node.__set_is_nullable(type->is_nullable()); + if (set_argument_names) { + node.__set_lambda_argument_names(argument_names); + } + return node; +} + +static TExprNode make_column_ref_node(int column_id, const std::string& column_name, + const DataTypePtr& type) { + TExprNode node; + node.__set_node_type(TExprNodeType::COLUMN_REF); + node.__set_num_children(0); + node.__set_type(type->to_thrift()); + node.__set_is_nullable(type->is_nullable()); + + TColumnRef column_ref; + column_ref.__set_column_id(column_id); + column_ref.__set_column_name(column_name); + node.__set_column_ref(column_ref); + return node; +} + +static VExprSPtr make_slot_ref(int column_id, const std::string& column_name, + const DataTypePtr& type) { + static std::vector> column_names; + column_names.push_back(std::make_unique(column_name)); + + auto ref = VSlotRef::create_shared(); + ref->set_node_type(TExprNodeType::SLOT_REF); + ref->set_slot_id(-1); + ref->set_column_id(column_id); + ref->set_column_name(column_names.back().get()); + ref->data_type() = type; + return ref; +} + +static ColumnPtr make_int_column(const std::vector& values) { + auto column = ColumnInt32::create(); + for (auto value : values) { + column->insert_value(value); + } + return column; +} + +static ColumnPtr make_int_array_column(const std::vector>& rows) { + auto int_column = ColumnInt32::create(); + auto offsets = ColumnArray::ColumnOffsets::create(); + int64_t offset = 0; + for (const auto& row : rows) { + for (auto value : row) { + int_column->insert_value(value); + } + offset += row.size(); + offsets->insert_value(offset); + } + auto int_null_map = ColumnUInt8::create(int_column->size(), 0); + auto nullable_int_column = + ColumnNullable::create(std::move(int_column), std::move(int_null_map)); + return ColumnArray::create(std::move(nullable_int_column), std::move(offsets)); +} + +static ColumnPtr make_nested_int_array_column() { + // Two input rows: + // row 0: [[1, 2], [3]] + // row 1: [[4, 5]] + auto int_column = ColumnInt32::create(); + for (int32_t value : {1, 2, 3, 4, 5}) { + int_column->insert_value(value); + } + auto int_null_map = ColumnUInt8::create(int_column->size(), 0); + auto nullable_int_column = + ColumnNullable::create(std::move(int_column), std::move(int_null_map)); + + auto inner_offsets = ColumnArray::ColumnOffsets::create(); + for (int64_t offset : {2, 3, 5}) { + inner_offsets->insert_value(offset); + } + auto inner_array_column = + ColumnArray::create(std::move(nullable_int_column), std::move(inner_offsets)); + auto inner_array_null_map = ColumnUInt8::create(inner_array_column->size(), 0); + auto nullable_inner_array_column = + ColumnNullable::create(std::move(inner_array_column), std::move(inner_array_null_map)); + + auto outer_offsets = ColumnArray::ColumnOffsets::create(); + for (int64_t offset : {2, 3}) { + outer_offsets->insert_value(offset); + } + return ColumnArray::create(std::move(nullable_inner_array_column), std::move(outer_offsets)); +} + +static ColumnPtr make_nested_unsorted_int_array_column() { + // Two input rows: + // row 0: [[2, 1], [3]] + // row 1: [[5, 4]] + auto int_column = ColumnInt32::create(); + for (int32_t value : {2, 1, 3, 5, 4}) { + int_column->insert_value(value); + } + auto int_null_map = ColumnUInt8::create(int_column->size(), 0); + auto nullable_int_column = + ColumnNullable::create(std::move(int_column), std::move(int_null_map)); + + auto inner_offsets = ColumnArray::ColumnOffsets::create(); + for (int64_t offset : {2, 3, 5}) { + inner_offsets->insert_value(offset); + } + auto inner_array_column = + ColumnArray::create(std::move(nullable_int_column), std::move(inner_offsets)); + auto inner_array_null_map = ColumnUInt8::create(inner_array_column->size(), 0); + auto nullable_inner_array_column = + ColumnNullable::create(std::move(inner_array_column), std::move(inner_array_null_map)); + + auto outer_offsets = ColumnArray::ColumnOffsets::create(); + for (int64_t offset : {2, 3}) { + outer_offsets->insert_value(offset); + } + return ColumnArray::create(std::move(nullable_inner_array_column), std::move(outer_offsets)); +} + +static void open_expr(const VExprSPtr& expr, VExprContext* context) { + RuntimeState state; + RowDescriptor row_desc; + ASSERT_TRUE(expr->prepare(&state, row_desc, context).ok()); + ASSERT_TRUE(expr->open(&state, context, FunctionContext::THREAD_LOCAL).ok()); +} + +TEST(ArrayMapFunctionTest, NestedLambdaWithSameArgumentNameUsesInnerScope) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"x"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"x"})); + + // This mirrors array_map(x -> array_map(x -> x, x), nested_array). Both + // lambda arguments are named "x"; only their scopes and element types are + // different. The non-ordinal column ids cover FE plans where a lambda + // argument ColumnRef id is not the same as its argument position. + auto inner_x = VColumnRef::create_shared(make_column_ref_node(2, "x", int_type)); + auto outer_x = VColumnRef::create_shared(make_column_ref_node(1, "x", array_int_type)); + + inner_lambda->add_child(inner_x); + inner_call->add_child(inner_lambda); + inner_call->add_child(outer_x); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_nested_int_array_column(), + nested_array_int_type, "nested_array")); + + VExprContext context(root); + open_expr(root, &context); + + // Keep one ordinary input column before the array_map argument. The test + // verifies nested lambda gap calculation when lambda blocks need to preserve + // existing input columns as well as append current lambda arguments. + Block block; + block.insert({make_int_column({10, 20}), int_type, "ordinary_input"}); + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 2); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + ASSERT_EQ(outer_array.get_offsets()[1], 3); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + for (size_t i = 0; i < nullable_inner_arrays->size(); ++i) { + EXPECT_FALSE(nullable_inner_arrays->is_null_at(i)); + } + + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 3); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 3); + ASSERT_EQ(inner_arrays.get_offsets()[2], 5); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + for (size_t i = 0; i < nullable_values->size(); ++i) { + EXPECT_FALSE(nullable_values->is_null_at(i)); + } + + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 5); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(1), 2); + EXPECT_EQ(values.get_element(2), 3); + EXPECT_EQ(values.get_element(3), 4); + EXPECT_EQ(values.get_element(4), 5); +} + +TEST(ArrayMapFunctionTest, NamedLambdaWithFewerArgumentsThanArraysUsesDeclaredBindings) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 3)); + auto lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"x"})); + + // This mirrors a BE-compatible plan shape like array_map(x -> x, arr1, arr2). + // Only x is part of the lambda binding frame; the extra array input is still + // materialized for size/offset validation but must not require a lambda name. + auto x = VColumnRef::create_shared(make_column_ref_node(5, "x", int_type)); + + lambda->add_child(x); + root->add_child(lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}, {30}}), + array_int_type, "arr1")); + root->add_child(std::make_shared(make_int_array_column({{1, 2}, {3}}), + array_int_type, "arr2")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, 2, result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& result_array = assert_cast(*result); + ASSERT_EQ(result_array.size(), 2); + ASSERT_EQ(result_array.get_offsets()[0], 2); + ASSERT_EQ(result_array.get_offsets()[1], 3); + const auto* nullable_values = check_and_get_column(&result_array.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 3); + EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(1), 20); + EXPECT_EQ(values.get_element(2), 30); +} + +TEST(ArrayMapFunctionTest, NestedArraySortInsideArrayMapSkipsArrayMapArgumentInference) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nullable_array_int_type = std::make_shared(array_int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"a"})); + auto sort_call = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(nullable_array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"a", "b"})); + auto compare = std::make_shared(int8_type); + + // This mirrors array_map(a -> array_sort((a, b) -> a - b, a), nested_array). + // FE represents the second comparator argument by cloning the first + // ColumnRef and only changing column_id to 1, so both comparator ColumnRefs + // can have the same name. array_sort must keep its comparator arguments + // position-based instead of using array_map's name-based binding. + auto sort_a = VColumnRef::create_shared(make_column_ref_node(0, "a", int_type)); + auto sort_b = VColumnRef::create_shared(make_column_ref_node(1, "a", int_type)); + auto outer_a = VColumnRef::create_shared(make_column_ref_node(1, "a", array_int_type)); + + compare->add_child(sort_a); + compare->add_child(sort_b); + sort_lambda->add_child(compare); + sort_call->add_child(sort_lambda); + sort_call->add_child(outer_a); + outer_lambda->add_child(sort_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_nested_unsorted_int_array_column(), + nested_array_int_type, "nested_array")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + block.insert({make_int_column({10, 20}), int_type, "ordinary_input"}); + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 2); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + ASSERT_EQ(outer_array.get_offsets()[1], 3); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 3); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 3); + ASSERT_EQ(inner_arrays.get_offsets()[2], 5); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 5); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(1), 2); + EXPECT_EQ(values.get_element(2), 3); + EXPECT_EQ(values.get_element(3), 4); + EXPECT_EQ(values.get_element(4), 5); +} + +TEST(ArrayMapFunctionTest, NestedArraySortComparatorCapturingOuterArgumentReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"i"})); + auto sort_call = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_map(i -> array_sort((x, y) -> x + i, arr2), arr1) + // array_sort's comparator executes against a two-column temporary block that + // only contains x and y. Capturing i from the outer array_map would otherwise + // be silently resolved by column id as x. + auto sort_x = VColumnRef::create_shared(make_column_ref_node(0, "x", int_type)); + auto outer_i = VColumnRef::create_shared(make_column_ref_node(0, "i", int_type)); + + add->add_child(sort_x); + add->add_child(outer_i); + sort_lambda->add_child(add); + sort_call->add_child(sort_lambda); + sort_call->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr2")); + outer_lambda->add_child(sort_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}, {30}}), + array_int_type, "arr1")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE( + status.to_string().find("array_sort comparator only supports its own lambda arguments"), + std::string::npos); + EXPECT_NE(status.to_string().find("captured column ref 'i'"), std::string::npos); +} + +TEST(ArrayMapFunctionTest, + NestedArraySortComparatorNestedLambdaCapturingComparatorArgumentReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"z"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_sort((x, y) -> array_map(z -> z + x, [1, 2]), arr) + // The inner array_map's lambda frame cannot see array_sort comparator-local x/y because + // array_sort intentionally uses a position-based anonymous comparator frame. + auto inner_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto comparator_x = VColumnRef::create_shared(make_column_ref_node(0, "x", int_type)); + + add->add_child(inner_z); + add->add_child(comparator_x); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + sort_lambda->add_child(inner_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "array_sort comparator does not support nested lambda capturing comparator " + "argument 'x'"), + std::string::npos); +} + +TEST(ArrayMapFunctionTest, NestedLambdaCapturesOuterSlotRefFromInnerBody) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"x"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"y"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_map(x -> array_map(y -> y + k1, [1, 2]), arr) + // The outer array_map body does not reference k1 directly. The BE still + // needs to carry the full input block through the outer lambda block and + // inherit it into the inner lambda block. + auto inner_y = VColumnRef::create_shared(make_column_ref_node(0, "y", int_type)); + auto k1 = make_slot_ref(0, "k1", int_type); + + add->add_child(inner_y); + add->add_child(k1); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}, {30}}), + array_int_type, "outer_array")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + block.insert({make_int_column({100, 200}), int_type, "k1"}); + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 2); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + ASSERT_EQ(outer_array.get_offsets()[1], 3); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 3); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 4); + ASSERT_EQ(inner_arrays.get_offsets()[2], 6); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 6); + EXPECT_EQ(values.get_element(0), 101); + EXPECT_EQ(values.get_element(1), 102); + EXPECT_EQ(values.get_element(2), 101); + EXPECT_EQ(values.get_element(3), 102); + EXPECT_EQ(values.get_element(4), 201); + EXPECT_EQ(values.get_element(5), 202); +} + +TEST(ArrayMapFunctionTest, LegacySingleLambdaWithoutArgumentMetadataUsesColumnId) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto lambda = VLambdaFunctionExpr::create_shared( + make_lambda_expr_node(int_type, {"x"}, false /*set_argument_names*/)); + + // Simulate an old FE plan without lambda_argument_names. Single-layer + // lambda can still use the ColumnRef id to bind argument position 0. + auto x = VColumnRef::create_shared(make_column_ref_node(0, "legacy_x", int_type)); + lambda->add_child(x); + root->add_child(lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}}), + array_int_type, "array")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, 1, result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& result_array = assert_cast(*result); + ASSERT_EQ(result_array.size(), 1); + ASSERT_EQ(result_array.get_offsets()[0], 2); + const auto* nullable_values = check_and_get_column(&result_array.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 2); + EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(1), 20); +} + +TEST(ArrayMapFunctionTest, LegacyNestedLambdaWithoutArgumentMetadataReturnsError) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = VLambdaFunctionExpr::create_shared( + make_lambda_expr_node(array_int_type, {"x"}, false /*set_argument_names*/)); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared( + make_lambda_expr_node(int_type, {"y"}, false /*set_argument_names*/)); + auto subtract = std::make_shared(int_type); + + auto outer_x = VColumnRef::create_shared(make_column_ref_node(0, "x", int_type)); + auto inner_y = VColumnRef::create_shared(make_column_ref_node(0, "y", int_type)); + + subtract->add_child(outer_x); + subtract->add_child(inner_y); + inner_lambda->add_child(subtract); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}}), + array_int_type, "outer_array")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "Cannot resolve nested lambda argument without lambda metadata"), + std::string::npos); +} + +TEST(ArrayMapFunctionTest, NestedLambdaUsesOuterArgumentsAndInputSlotRefArray) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 3)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"z"})); + auto subtract = std::make_shared(int_type); + auto add = std::make_shared(int_type); + auto multiply = std::make_shared(int_type); + + // This mirrors: + // array_map((x, y) -> array_map(z -> (y - z) * (x + z), arr3), arr1, arr2) + // arr3 is a sparse ordinary input SlotRef. Each lambda block must keep required + // input positions before appending its own arguments, so the inner array_map can + // use arr3 while resolving x/y/z from the nested lambda context by name. + auto y_for_subtract = VColumnRef::create_shared(make_column_ref_node(10, "y", int_type)); + auto z_for_subtract = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto x_for_add = VColumnRef::create_shared(make_column_ref_node(9, "x", int_type)); + auto z_for_add = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto arr3 = make_slot_ref(4, "arr3", array_int_type); + + subtract->add_child(y_for_subtract); + subtract->add_child(z_for_subtract); + add->add_child(x_for_add); + add->add_child(z_for_add); + multiply->add_child(subtract); + multiply->add_child(add); + inner_lambda->add_child(multiply); + inner_call->add_child(inner_lambda); + inner_call->add_child(arr3); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}, {30}}), + array_int_type, "arr1")); + root->add_child(std::make_shared(make_int_array_column({{100, 200}, {300}}), + array_int_type, "arr2")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + block.insert({make_int_column({0, 0}), int_type, "unused0"}); + block.insert({make_int_column({1, 1}), int_type, "unused1"}); + block.insert({make_int_column({2, 2}), int_type, "unused2"}); + block.insert({make_int_column({3, 3}), int_type, "unused3"}); + block.insert({make_int_array_column({{1, 2}, {3}}), array_int_type, "arr3"}); + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 2); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + ASSERT_EQ(outer_array.get_offsets()[1], 3); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 3); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 4); + ASSERT_EQ(inner_arrays.get_offsets()[2], 5); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 5); + EXPECT_EQ(values.get_element(0), 1089); + EXPECT_EQ(values.get_element(1), 1176); + EXPECT_EQ(values.get_element(2), 4179); + EXPECT_EQ(values.get_element(3), 4356); + EXPECT_EQ(values.get_element(4), 9801); +} + +TEST(ArrayMapFunctionTest, ThreeLevelNestedLambdaCapturesAllOuterArguments) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + auto three_level_array_int_type = std::make_shared(nested_array_int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(three_level_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(nested_array_int_type, {"x"})); + auto middle_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto middle_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"z"})); + auto add_zy = std::make_shared(int_type); + auto add_zyx = std::make_shared(int_type); + + // This mirrors: + // array_map(x -> array_map(y -> array_map(z -> z + y + x, [1, 2]), + // [10, 20]), + // [100]) + // It verifies LambdaExecutionContext works as a stack: the innermost z + // shadows nothing, y is resolved from the middle frame, and x is resolved + // from the outer frame. + auto z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto y = VColumnRef::create_shared(make_column_ref_node(7, "y", int_type)); + auto x = VColumnRef::create_shared(make_column_ref_node(9, "x", int_type)); + + add_zy->add_child(z); + add_zy->add_child(y); + add_zyx->add_child(add_zy); + add_zyx->add_child(x); + inner_lambda->add_child(add_zyx); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + middle_lambda->add_child(inner_call); + middle_call->add_child(middle_lambda); + middle_call->add_child(std::make_shared(make_int_array_column({{10, 20}}), + array_int_type, "middle_array")); + outer_lambda->add_child(middle_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{100}}), array_int_type, + "outer_array")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, 1, result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& level3_arrays = assert_cast(*result); + ASSERT_EQ(level3_arrays.size(), 1); + ASSERT_EQ(level3_arrays.get_offsets()[0], 1); + + const auto* nullable_level2_arrays = + check_and_get_column(&level3_arrays.get_data()); + ASSERT_NE(nullable_level2_arrays, nullptr); + const auto& level2_arrays = + assert_cast(nullable_level2_arrays->get_nested_column()); + ASSERT_EQ(level2_arrays.size(), 1); + ASSERT_EQ(level2_arrays.get_offsets()[0], 2); + + const auto* nullable_level1_arrays = + check_and_get_column(&level2_arrays.get_data()); + ASSERT_NE(nullable_level1_arrays, nullptr); + const auto& level1_arrays = + assert_cast(nullable_level1_arrays->get_nested_column()); + ASSERT_EQ(level1_arrays.size(), 2); + ASSERT_EQ(level1_arrays.get_offsets()[0], 2); + ASSERT_EQ(level1_arrays.get_offsets()[1], 4); + + const auto* nullable_values = check_and_get_column(&level1_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 4); + EXPECT_EQ(values.get_element(0), 111); + EXPECT_EQ(values.get_element(1), 112); + EXPECT_EQ(values.get_element(2), 121); + EXPECT_EQ(values.get_element(3), 122); +} + +TEST(ArrayMapFunctionTest, NestedLambdaCapturesOuterArgumentWithSameElementType) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"x"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"y"})); + auto subtract = std::make_shared(int_type); + + // This mirrors array_map(x -> array_map(y -> y - x, [1, 2]), [10, 20]). + // The captured outer x and the inner y both use INT type, so only names + // plus lambda scope can disambiguate them. The non-ordinal id of outer x + // covers FE plans where lambda argument ColumnRef ids are not 0-based. + auto outer_x = VColumnRef::create_shared(make_column_ref_node(1, "x", int_type)); + auto inner_y = VColumnRef::create_shared(make_column_ref_node(0, "y", int_type)); + + subtract->add_child(inner_y); + subtract->add_child(outer_x); + inner_lambda->add_child(subtract); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}}), + array_int_type, "outer_array")); + + VExprContext context(root); + open_expr(root, &context); + + // No ordinary input column is needed by this expression. The outer lambda + // block only appends its own x argument, while the inner lambda resolves y + // from the nearest frame and x from the outer frame. + Block block; + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, 1, result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 1); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 2); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 4); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 4); + EXPECT_EQ(values.get_element(0), -9); + EXPECT_EQ(values.get_element(1), -8); + EXPECT_EQ(values.get_element(2), -19); + EXPECT_EQ(values.get_element(3), -18); +} + +TEST(ArrayMapFunctionTest, NestedLambdaCapturesOuterArgumentBeforeInnerArgument) { + auto int_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"x"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"y"})); + auto subtract = std::make_shared(int_type); + + // This mirrors array_map(x -> array_map(y -> x - y, [1, 2]), [10, 20]). + // The outer x and inner y both use INT type. FE-provided lambda argument + // names must make x bind to the outer lambda even when x is visited before + // y in the inner lambda body. The non-ordinal id of outer x covers FE plans + // where lambda argument ColumnRef ids are not 0-based. + auto outer_x = VColumnRef::create_shared(make_column_ref_node(1, "x", int_type)); + auto inner_y = VColumnRef::create_shared(make_column_ref_node(0, "y", int_type)); + + subtract->add_child(outer_x); + subtract->add_child(inner_y); + inner_lambda->add_child(subtract); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + root->add_child(outer_lambda); + root->add_child(std::make_shared(make_int_array_column({{10, 20}}), + array_int_type, "outer_array")); + + VExprContext context(root); + open_expr(root, &context); + + Block block; + + ColumnPtr result; + auto status = root->execute_column(&context, &block, nullptr, 1, result); + ASSERT_TRUE(status.ok()) << status.to_string(); + + const auto& outer_array = assert_cast(*result); + ASSERT_EQ(outer_array.size(), 1); + ASSERT_EQ(outer_array.get_offsets()[0], 2); + + const auto* nullable_inner_arrays = + check_and_get_column(&outer_array.get_data()); + ASSERT_NE(nullable_inner_arrays, nullptr); + const auto& inner_arrays = + assert_cast(nullable_inner_arrays->get_nested_column()); + ASSERT_EQ(inner_arrays.size(), 2); + ASSERT_EQ(inner_arrays.get_offsets()[0], 2); + ASSERT_EQ(inner_arrays.get_offsets()[1], 4); + + const auto* nullable_values = check_and_get_column(&inner_arrays.get_data()); + ASSERT_NE(nullable_values, nullptr); + const auto& values = assert_cast(nullable_values->get_nested_column()); + ASSERT_EQ(values.size(), 4); + EXPECT_EQ(values.get_element(0), 9); + EXPECT_EQ(values.get_element(1), 8); + EXPECT_EQ(values.get_element(2), 19); + EXPECT_EQ(values.get_element(3), 18); +} + +} // namespace doris diff --git a/be/test/exprs/vcolumn_ref_test.cpp b/be/test/exprs/vcolumn_ref_test.cpp new file mode 100644 index 00000000000000..c8eaa6094986fe --- /dev/null +++ b/be/test/exprs/vcolumn_ref_test.cpp @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/vcolumn_ref.h" + +#include +#include +#include + +#include +#include +#include + +#include "common/exception.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "exprs/vexpr_context.h" +#include "runtime/descriptors.h" +#include "runtime/runtime_state.h" + +namespace doris { + +static TTypeDesc make_int_type_desc() { + TTypeDesc type_desc; + TTypeNode type_node; + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + type_node.__set_type(TTypeNodeType::SCALAR); + type_node.__set_scalar_type(scalar_type); + type_desc.types.push_back(type_node); + return type_desc; +} + +static TExprNode make_column_ref_node(int column_id, const std::string& column_name) { + TExprNode node; + node.__set_node_type(TExprNodeType::COLUMN_REF); + node.__set_num_children(0); + node.__set_type(make_int_type_desc()); + node.__set_is_nullable(false); + + TColumnRef column_ref; + column_ref.__set_column_id(column_id); + column_ref.__set_column_name(column_name); + node.__set_column_ref(column_ref); + return node; +} + +static ColumnPtr make_int_column(const std::vector& values) { + auto column = ColumnInt32::create(); + for (auto value : values) { + column->insert_value(value); + } + return column; +} + +static Block make_int_block() { + auto int_type = std::make_shared(); + Block block; + block.insert({make_int_column({10, 11}), int_type, "c0"}); + block.insert({make_int_column({20, 21}), int_type, "c1"}); + block.insert({make_int_column({30, 31}), int_type, "c2"}); + return block; +} + +static void open_expr(const VExprSPtr& expr, VExprContext* context) { + RuntimeState state; + RowDescriptor row_desc; + ASSERT_TRUE(expr->prepare(&state, row_desc, context).ok()); + ASSERT_TRUE(expr->open(&state, context, FunctionContext::THREAD_LOCAL).ok()); +} + +static std::vector get_int_values(const ColumnPtr& column) { + const auto* int_column = assert_cast(column.get()); + std::vector values; + for (auto value : int_column->get_data()) { + values.push_back(value); + } + return values; +} + +TEST(VColumnRefTest, SetGapOverridesPreviousGap) { + auto ref = VColumnRef::create_shared(make_column_ref_node(0, "x")); + VExprContext context(ref); + open_expr(ref, &context); + + auto block = make_int_block(); + ColumnPtr result; + + EXPECT_FALSE(ref->has_gap()); + ref->set_gap(0); + EXPECT_TRUE(ref->has_gap()); + + ref->set_gap(1); + auto status = ref->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + EXPECT_EQ(get_int_values(result), std::vector({20, 21})); + + result.reset(); + ref->set_gap(2); + status = ref->execute_column(&context, &block, nullptr, block.rows(), result); + ASSERT_TRUE(status.ok()) << status.to_string(); + EXPECT_EQ(get_int_values(result), std::vector({30, 31})); +} + +TEST(VColumnRefTest, OutOfRangeColumnPositionReturnsError) { + auto ref = VColumnRef::create_shared(make_column_ref_node(1, "x")); + VExprContext context(ref); + open_expr(ref, &context); + + auto block = make_int_block(); + ref->set_gap(3); + + ColumnPtr result; + auto status = ref->execute_column(&context, &block, nullptr, block.rows(), result); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("input block not contain column ref x"), std::string::npos); + EXPECT_NE(status.to_string().find("column_id=1"), std::string::npos); + EXPECT_NE(status.to_string().find("gap=3"), std::string::npos); +} + +TEST(VColumnRefTest, OutOfRangeExecuteTypeThrowsException) { + auto ref = VColumnRef::create_shared(make_column_ref_node(1, "x")); + VExprContext context(ref); + open_expr(ref, &context); + + auto block = make_int_block(); + ref->set_gap(3); + + EXPECT_THROW({ static_cast(ref->execute_type(&block)); }, Exception); +} + +} // namespace doris diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java index 9d5fd94ed3e1d1..3df252e8fed56b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java @@ -497,6 +497,7 @@ public Void visitLambdaFunctionCallExpr(LambdaFunctionCallExpr expr, TExprNode m @Override public Void visitLambdaFunctionExpr(LambdaFunctionExpr expr, TExprNode msg) { msg.setNodeType(TExprNodeType.LAMBDA_FUNCTION_EXPR); + msg.setLambdaArgumentNames(expr.getNames()); return null; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/analysis/ExprToThriftBehaviorTest.java b/fe/fe-core/src/test/java/org/apache/doris/analysis/ExprToThriftBehaviorTest.java index 68e69cf3e27a0d..ac40a4f174f682 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/analysis/ExprToThriftBehaviorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/analysis/ExprToThriftBehaviorTest.java @@ -274,6 +274,22 @@ public void testTreesToThrift() { Assertions.assertEquals(TExprNodeType.STRING_LITERAL, result.get(2).getNodes().get(0).node_type); } + @Test + public void testLambdaFunctionExprSerializesArgumentNames() { + SlotRef slotX = new SlotRef(null, "x"); + SlotRef slotY = new SlotRef(null, "yy"); + LambdaFunctionExpr expr = new LambdaFunctionExpr( + new IntLiteral(1L), Lists.newArrayList("x", "yy"), + Lists.newArrayList(slotX, slotY), false); + + TExprNode node = firstNode(expr); + + Assertions.assertEquals(TExprNodeType.LAMBDA_FUNCTION_EXPR, node.node_type); + Assertions.assertFalse(node.isSetLabel()); + Assertions.assertTrue(node.isSetLambdaArgumentNames()); + Assertions.assertEquals(Lists.newArrayList("x", "yy"), node.getLambdaArgumentNames()); + } + // ======================== Helpers ======================== private static TExprNode firstNode(Expr expr) { diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index c17199d74edf91..131b580f16566f 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -325,6 +325,10 @@ struct TExprNode { 39: optional bool is_cast_nullable 40: optional TSearchParam search_param 41: optional bool short_circuit_evaluation + // Lambda argument names in the current lambda scope. It is used by BE to + // distinguish current-scope lambda arguments from captured outer lambda + // arguments when nested lambda expressions contain duplicated column ids. + 42: optional list lambda_argument_names } // A flattened representation of a tree of Expr nodes, obtained by depth-first diff --git a/regression-test/data/query_p0/sql_functions/array_functions/test_nested_array_map.out b/regression-test/data/query_p0/sql_functions/array_functions/test_nested_array_map.out new file mode 100644 index 00000000000000..9367e005c711ff --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/array_functions/test_nested_array_map.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +1 [5, 7, 9] +2 [11, 22, 3] + +-- !select2 -- +[[-9, -8], [-19, -18]] + +-- !select_same_name_shadow -- +[[2, 3], [4, 5]] diff --git a/regression-test/suites/query_p0/sql_functions/array_functions/test_nested_array_map.groovy b/regression-test/suites/query_p0/sql_functions/array_functions/test_nested_array_map.groovy new file mode 100644 index 00000000000000..307ca0373dc592 --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/array_functions/test_nested_array_map.groovy @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_nested_array_map") { + sql "DROP TABLE IF EXISTS test_nested_array_map_insert_src" + sql "DROP TABLE IF EXISTS test_nested_array_map_insert_dst" + + sql """ + CREATE TABLE test_nested_array_map_insert_src ( + id INT, + bucket_counts ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + CREATE TABLE test_nested_array_map_insert_dst ( + id INT, + bucket_counts ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + INSERT INTO test_nested_array_map_insert_src VALUES + (1, [1, 2, 3]), + (1, [4, 5, 6]), + (2, [10, 20]), + (2, [1, 2, 3]) + """ + + sql """ + INSERT INTO test_nested_array_map_insert_dst (id, bucket_counts) + WITH rollup_grouped AS ( + SELECT + id, + ARRAY_AGG(bucket_counts) AS bucket_count_arrays, + MAX(ARRAY_SIZE(bucket_counts)) AS max_bucket_len + FROM test_nested_array_map_insert_src + GROUP BY id + ) + SELECT + id, + ARRAY_MAP( + i -> ARRAY_SUM(ARRAY_MAP(a -> COALESCE(a[CAST(i AS INT)], 0), bucket_count_arrays)), + ARRAY_RANGE(1, max_bucket_len + 1) + ) AS bucket_counts + FROM rollup_grouped + """ + + order_qt_select """ + SELECT id, bucket_counts + FROM test_nested_array_map_insert_dst + ORDER BY id + """ + + qt_select2 """ + select array_map(x -> array_map(y -> y - x, [1, 2]), [10, 20]); + """ + + qt_select_same_name_shadow """ + select array_map(x -> array_map(x -> x + 1, x), [[1, 2], [3, 4]]); + """ +} From 84b6fc3edd9afd8b76d23c5d2c3023da6cfc39bb Mon Sep 17 00:00:00 2001 From: Hu Shenggang Date: Sat, 27 Jun 2026 18:47:16 +0800 Subject: [PATCH 2/2] [refactor](be) Move legacy lambda binding into execution frame Issue Number: None Related PR: None Problem Summary: Legacy lambda argument binding updated VColumnRef gap fields on shared expression nodes during execution. Since VExprContext clones share the same root expression tree, that mutable per-execution binding state can race across cloned contexts. This change moves legacy positional lambda bindings into LambdaExecutionContext::Frame, resolves VColumnRef by frame column id for positional frames, and removes the mutable gap fields from VColumnRef. Array sort comparator bindings are also represented as comparator-local positional frame entries. None - Test: - Unit Test: ./run-be-ut.sh --run --filter=VColumnRefTest.*:ArrayMapFunctionTest.* - Format: git diff --check; build-support/check-format.sh - Static Analysis: build-support/run-clang-tidy.sh --build-dir be/ut_build_ASAN (attempted; failed on existing/toolchain diagnostics including missing stddef.h, unmatched NOLINTEND in core/types.h, and pre-existing function-size/complexity warnings) - Behavior changed: No - Does this need documentation: No --- .../lambda_function/varray_map_function.cpp | 5 +- be/src/exprs/vcolumn_ref.h | 20 +- .../array_map_function_test.cpp | 257 ++++++++++++++++++ 3 files changed, 264 insertions(+), 18 deletions(-) diff --git a/be/src/exprs/lambda_function/varray_map_function.cpp b/be/src/exprs/lambda_function/varray_map_function.cpp index 5c279f15f434cd..4ada8469788302 100644 --- a/be/src/exprs/lambda_function/varray_map_function.cpp +++ b/be/src/exprs/lambda_function/varray_map_function.cpp @@ -185,11 +185,10 @@ class ArrayMapFunction : public LambdaFunction { names.reserve(lambda_argument_base + arguments.size()); data_types.reserve(lambda_argument_base + arguments.size()); for (int column_id : required_input_column_ids) { - if (column_id < 0 || block == nullptr || - static_cast(column_id) >= block->columns()) { + if (column_id < 0 || static_cast(column_id) >= block->columns()) { return Status::InternalError( "array_map lambda input column id {} is outside input block, block={}", - column_id, block == nullptr ? "nullptr" : block->dump_structure()); + column_id, block->dump_structure()); } materialized_input_columns[column_id] = true; names[column_id] = block->get_by_position(column_id).name; diff --git a/be/src/exprs/vcolumn_ref.h b/be/src/exprs/vcolumn_ref.h index 4609b1e0c29130..479ac5bfca6e9c 100644 --- a/be/src/exprs/vcolumn_ref.h +++ b/be/src/exprs/vcolumn_ref.h @@ -91,14 +91,11 @@ class VColumnRef final : public VExpr { const std::string& expr_name() const override { return _column_name; } - void set_gap(int gap) { - _gap = gap; - _gap_set = true; - } + void set_gap(int gap) { _gap = gap; } int get_gap() const { return _gap.load(); } - bool has_gap() const { return _gap_set.load(); } + bool has_gap() const { return _gap.load() >= 0; } std::string debug_string() const override { std::stringstream out; @@ -125,19 +122,13 @@ class VColumnRef final : public VExpr { } int _get_column_position_without_context(const Block* block) const { - if (!_gap_set.load()) { - const int position_by_name = _find_column_position_by_name(block); - if (position_by_name >= 0) { - return position_by_name; - } + if (_gap.load() == -1) { + return _find_column_position_by_name(block); } return _column_id + _gap.load(); } int _find_column_position_by_name(const Block* block) const { - if (block == nullptr) { - return -1; - } for (int position = block->columns() - 1; position >= 0; --position) { if (block->get_by_position(position).name == _column_name) { return position; @@ -147,8 +138,7 @@ class VColumnRef final : public VExpr { } int _column_id; - std::atomic _gap = 0; - std::atomic _gap_set = false; + std::atomic _gap = -1; std::string _column_name; }; } // namespace doris diff --git a/be/test/exprs/lambda_function/array_map_function_test.cpp b/be/test/exprs/lambda_function/array_map_function_test.cpp index 6e79867e268fa6..42469b52c3bc04 100644 --- a/be/test/exprs/lambda_function/array_map_function_test.cpp +++ b/be/test/exprs/lambda_function/array_map_function_test.cpp @@ -703,6 +703,263 @@ TEST(ArrayMapFunctionTest, std::string::npos); } +TEST(ArrayMapFunctionTest, NestedArraySortComparatorNestedLambdaCanShadowComparatorArgument) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"x"})); + + // This mirrors: + // array_sort((x, y) -> array_map(x -> x, [1, 2]), arr) + // The inner array_map argument named x shadows the array_sort comparator argument x. + auto inner_x = VColumnRef::create_shared(make_column_ref_node(0, "x", int_type)); + + inner_lambda->add_child(inner_x); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + sort_lambda->add_child(inner_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST(ArrayMapFunctionTest, + NestedArraySortComparatorNestedLambdaCapturingNonComparatorColumnReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"z"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_sort((x, y) -> array_map(z -> z + outer_col, [1, 2]), arr) + // Only the nested lambda's own arguments are visible in nested lambda bodies. + auto inner_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto outer_col = VColumnRef::create_shared(make_column_ref_node(2, "outer_col", int_type)); + + add->add_child(inner_z); + add->add_child(outer_col); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + sort_lambda->add_child(inner_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "array_sort comparator only supports nested lambda arguments inside nested " + "lambda bodies, but found captured column ref 'outer_col'"), + std::string::npos); +} + +TEST(ArrayMapFunctionTest, NestedArraySortComparatorNestedLambdaCapturingSlotRefReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"z"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_sort((x, y) -> array_map(z -> z + k1, [1, 2]), arr) + // SlotRefs from outside the nested lambda are not available inside array_sort's comparator. + auto inner_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + auto k1 = make_slot_ref(0, "k1", int_type); + + add->add_child(inner_z); + add->add_child(k1); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + sort_lambda->add_child(inner_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "array_sort comparator only supports nested lambda arguments inside nested " + "lambda bodies, but found captured slot ref 'k1'"), + std::string::npos); +} + +TEST(ArrayMapFunctionTest, + NestedArraySortComparatorNestedLambdaWithoutArgumentMetadataReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared( + make_lambda_expr_node(int_type, {"z"}, false /*set_argument_names*/)); + + // This mirrors an old FE plan where the nested array_map lambda has no argument metadata. + auto inner_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + + inner_lambda->add_child(inner_z); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + sort_lambda->add_child(inner_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "Cannot validate nested lambda capture in array_sort comparator without " + "lambda metadata"), + std::string::npos); +} + +TEST(ArrayMapFunctionTest, + NestedArraySortComparatorTwoLevelNestedLambdaCanCaptureOuterNestedArgument) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto outer_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"z"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int_type, {"q"})); + auto add = std::make_shared(int_type); + + // This mirrors: + // array_sort((x, y) -> array_map(z -> array_map(q -> q + z, [1, 2]), [3, 4]), arr) + // q is local to the inner array_map, while z is declared by the enclosing nested array_map. + auto inner_q = VColumnRef::create_shared(make_column_ref_node(0, "q", int_type)); + auto outer_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + + add->add_child(inner_q); + add->add_child(outer_z); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + outer_call->add_child(outer_lambda); + outer_call->add_child(std::make_shared(make_int_array_column({{3, 4}}), + array_int_type, "outer_array")); + sort_lambda->add_child(outer_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST(ArrayMapFunctionTest, + NestedArraySortComparatorNestedLambdaCallWithoutArgumentMetadataReturnsError) { + auto int_type = std::make_shared(); + auto int8_type = std::make_shared(); + auto array_int_type = std::make_shared(int_type); + auto nested_array_int_type = std::make_shared(array_int_type); + + auto root = VLambdaFunctionCallExpr::create_shared( + make_lambda_call_node(array_int_type, 2, "array_sort")); + auto sort_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(int8_type, {"x", "y"})); + auto outer_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(nested_array_int_type, 2)); + auto outer_lambda = + VLambdaFunctionExpr::create_shared(make_lambda_expr_node(array_int_type, {"z"})); + auto inner_call = + VLambdaFunctionCallExpr::create_shared(make_lambda_call_node(array_int_type, 2)); + auto inner_lambda = VLambdaFunctionExpr::create_shared( + make_lambda_expr_node(int_type, {"q"}, false /*set_argument_names*/)); + auto add = std::make_shared(int_type); + + // This mirrors an old FE plan where a nested lambda call under another nested lambda has no + // argument metadata. + auto inner_q = VColumnRef::create_shared(make_column_ref_node(0, "q", int_type)); + auto outer_z = VColumnRef::create_shared(make_column_ref_node(0, "z", int_type)); + + add->add_child(inner_q); + add->add_child(outer_z); + inner_lambda->add_child(add); + inner_call->add_child(inner_lambda); + inner_call->add_child(std::make_shared(make_int_array_column({{1, 2}}), + array_int_type, "inner_array")); + outer_lambda->add_child(inner_call); + outer_call->add_child(outer_lambda); + outer_call->add_child(std::make_shared(make_int_array_column({{3, 4}}), + array_int_type, "outer_array")); + sort_lambda->add_child(outer_call); + root->add_child(sort_lambda); + root->add_child(std::make_shared(make_int_array_column({{2, 1}, {4, 3}}), + array_int_type, "arr")); + + VExprContext context(root); + RuntimeState state; + RowDescriptor row_desc; + auto status = root->prepare(&state, row_desc, &context); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find( + "Cannot validate nested lambda capture in array_sort comparator without " + "lambda metadata"), + std::string::npos); +} + TEST(ArrayMapFunctionTest, NestedLambdaCapturesOuterSlotRefFromInnerBody) { auto int_type = std::make_shared(); auto array_int_type = std::make_shared(int_type);