From 54131304a8f33d98181c25ce6fddc7f2d129c796 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 10 Feb 2023 17:17:52 +0800 Subject: [PATCH 01/35] Top Down Sort Enforer --- datafusion/core/src/execution/context.rs | 5 +- .../physical_optimizer/dist_enforcement.rs | 141 +- datafusion/core/src/physical_optimizer/mod.rs | 1 + .../src/physical_optimizer/repartition.rs | 56 +- .../physical_optimizer/sort_enforcement.rs | 51 +- .../physical_optimizer/sort_enforcement2.rs | 2101 +++++++++++++++++ .../core/src/physical_optimizer/utils.rs | 5 +- .../physical_plan/joins/sort_merge_join.rs | 18 +- datafusion/core/src/physical_plan/mod.rs | 4 +- datafusion/core/src/physical_plan/planner.rs | 1 + .../core/src/physical_plan/sorts/sort.rs | 7 +- .../sorts/sort_preserving_merge.rs | 15 +- datafusion/core/src/physical_plan/union.rs | 19 +- .../windows/bounded_window_agg_exec.rs | 33 +- .../physical_plan/windows/window_agg_exec.rs | 29 +- datafusion/core/tests/sql/explain_analyze.rs | 2 +- datafusion/core/tests/sql/joins.rs | 24 +- datafusion/core/tests/sql/window.rs | 71 +- datafusion/physical-expr/src/lib.rs | 3 +- datafusion/physical-expr/src/sort_expr.rs | 52 + datafusion/physical-expr/src/utils.rs | 217 +- 21 files changed, 2653 insertions(+), 202 deletions(-) create mode 100644 datafusion/core/src/physical_optimizer/sort_enforcement2.rs diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 99a49d04da7de..eac60c5ecb5dd 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -97,7 +97,7 @@ use crate::execution::memory_pool::MemoryPool; use crate::physical_optimizer::global_sort_selection::GlobalSortSelection; use crate::physical_optimizer::pipeline_checker::PipelineChecker; use crate::physical_optimizer::pipeline_fixer::PipelineFixer; -use crate::physical_optimizer::sort_enforcement::EnforceSorting; +use crate::physical_optimizer::sort_enforcement2::TopDownEnforceSorting; 
use datafusion_optimizer::OptimizerConfig; use datafusion_sql::planner::object_name_to_table_reference; use uuid::Uuid; @@ -1068,6 +1068,7 @@ impl QueryPlanner for DefaultQueryPlanner { session_state: &SessionState, ) -> Result> { let planner = DefaultPhysicalPlanner::default(); + println!("optimized logical plan {:?}", logical_plan); planner .create_physical_plan(logical_plan, session_state) .await @@ -1487,7 +1488,7 @@ impl SessionState { // ordering. Please make sure that the whole plan tree is determined before this rule. // Note that one should always run this rule after running the EnforceDistribution rule // as the latter may break local sorting requirements. - Arc::new(EnforceSorting::new()), + Arc::new(TopDownEnforceSorting::new()), // The CoalesceBatches rule will not influence the distribution and ordering of the // whole plan tree. Therefore, to avoid influencing other rules, it should run last. Arc::new(CoalesceBatches::new()), diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index c6c2bd40e39f7..4fe76ac30d94c 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -38,11 +38,11 @@ use datafusion_expr::logical_plan::JoinType; use datafusion_physical_expr::equivalence::EquivalenceProperties; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::expressions::NoOp; +use datafusion_physical_expr::utils::map_columns_before_projection; use datafusion_physical_expr::{ expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, AggregateExpr, PhysicalExpr, }; -use std::collections::HashMap; use std::sync::Arc; /// The EnforceDistribution rule ensures that distribution requirements are met @@ -492,30 +492,6 @@ fn reorder_aggregate_keys( } } -fn map_columns_before_projection( - parent_required: &[Arc], - proj_exprs: &[(Arc, String)], -) -> Vec> { - let mut 
column_mapping = HashMap::new(); - for (expression, name) in proj_exprs.iter() { - if let Some(column) = expression.as_any().downcast_ref::() { - column_mapping.insert(name.clone(), column.clone()); - }; - } - let new_required: Vec> = parent_required - .iter() - .filter_map(|r| { - if let Some(column) = r.as_any().downcast_ref::() { - column_mapping.get(column.name()) - } else { - None - } - }) - .map(|e| Arc::new(e.clone()) as Arc) - .collect::>(); - new_required -} - fn shift_right_required( parent_required: &[Arc], left_columns_len: usize, @@ -969,7 +945,7 @@ mod tests { use super::*; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; - use crate::physical_optimizer::sort_enforcement::EnforceSorting; + use crate::physical_optimizer::sort_enforcement2::TopDownEnforceSorting; use crate::physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; @@ -1015,6 +991,27 @@ mod tests { )) } + fn parquet_multiple_exec() -> Arc { + Arc::new(ParquetExec::new( + FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), + file_schema: schema(), + file_groups: vec![ + vec![PartitionedFile::new("x".to_string(), 100)], + vec![PartitionedFile::new("y".to_string(), 100)], + ], + statistics: Statistics::default(), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: None, + infinite_source: false, + }, + None, + None, + )) + } + fn projection_exec_with_alias( input: Arc, alias_pairs: Vec<(String, String)>, @@ -1134,7 +1131,7 @@ mod tests { // `EnforceSorting` and `EnfoceDistribution`. // TODO: Orthogonalize the tests here just to verify `EnforceDistribution` and create // new tests for the cascade. 
- let optimizer = EnforceSorting {}; + let optimizer = TopDownEnforceSorting {}; let optimized = optimizer.optimize(optimized, &config)?; // Now format correctly @@ -1873,7 +1870,7 @@ mod tests { #[test] fn multi_smj_joins() -> Result<()> { - let left = parquet_exec(); + let left = parquet_multiple_exec(); let alias_pairs: Vec<(String, String)> = vec![ ("a".to_string(), "a1".to_string()), ("b".to_string(), "b1".to_string()), @@ -1881,7 +1878,7 @@ mod tests { ("d".to_string(), "d1".to_string()), ("e".to_string(), "e1".to_string()), ]; - let right = projection_exec_with_alias(parquet_exec(), alias_pairs); + let right = projection_exec_with_alias(parquet_multiple_exec(), alias_pairs); // SortMergeJoin does not support RightSemi and RightAnti join now let join_types = vec![ @@ -1912,7 +1909,7 @@ mod tests { )]; let top_join = sort_merge_join_exec( join.clone(), - parquet_exec(), + parquet_multiple_exec(), &top_join_on, &join_type, ); @@ -1925,33 +1922,33 @@ mod tests { vec![ top_join_plan.as_str(), join_plan.as_str(), - "SortExec: [a@0 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [b1@1 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + "SortExec: [a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [c@2 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 
2 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs _ => vec![ top_join_plan.as_str(), - "SortExec: [a@0 ASC]", + "SortExec: [a@0 ASC], global=false", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", join_plan.as_str(), - "SortExec: [a@0 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [b1@1 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + "SortExec: [a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [c@2 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), 
input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", ], }; assert_optimized!(expected, top_join); @@ -1966,7 +1963,7 @@ mod tests { )]; let top_join = sort_merge_join_exec( join, - parquet_exec(), + parquet_multiple_exec(), &top_join_on, &join_type, ); @@ -1978,33 +1975,33 @@ mod tests { JoinType::Inner | JoinType::Right => vec![ top_join_plan.as_str(), join_plan.as_str(), - "SortExec: [a@0 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [b1@1 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + "SortExec: [a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [c@2 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs and 4 SortExecs _ => vec![ top_join_plan.as_str(), - "SortExec: [b1@6 ASC]", + "SortExec: [b1@6 ASC], 
global=false", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 6 }], 10), input_partitions=10", join_plan.as_str(), - "SortExec: [a@0 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [b1@1 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + "SortExec: [a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [c@2 ASC]", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "SortExec: [c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", ], }; assert_optimized!(expected, top_join); @@ -2065,15 +2062,15 @@ mod tests { // Only two RepartitionExecs added let expected = &[ "SortMergeJoin: join_type=Inner, on=[(Column { name: \"b3\", index: 1 }, Column { name: \"b2\", index: 1 }), (Column { name: \"a3\", index: 0 }, Column { name: \"a2\", index: 0 })]", - "SortExec: [b3@1 ASC,a3@0 ASC]", "ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", "ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", + 
"SortExec: [b1@0 ASC,a1@1 ASC], global=false", "AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 0 }, Column { name: \"a1\", index: 1 }], 10), input_partitions=1", "AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: [b2@1 ASC,a2@0 ASC]", "ProjectionExec: expr=[a@1 as a2, b@0 as b2]", + "SortExec: [b@0 ASC,a@1 ASC], global=false", "AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b\", index: 0 }, Column { name: \"a\", index: 1 }], 10), input_partitions=1", "AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", @@ -2103,8 +2100,8 @@ mod tests { // The optimizer should not add an additional SortExec as the // data is already sorted + // SortPreservingMergeExec is also removed from the final plan let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", "CoalesceBatchesExec: target_batch_size=4096", "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[a@0 ASC], projection=[a, b, c, d, e]", ]; diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index 3958a546a92df..d0b0a917616e8 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -31,6 +31,7 @@ pub mod sort_enforcement; mod utils; pub mod pipeline_fixer; +pub mod sort_enforcement2; #[cfg(test)] pub mod test_utils; diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 1285b9089c8d8..84501b5cfbe05 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -307,6 +307,7 @@ mod tests { use 
crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::union::UnionExec; use crate::physical_plan::{displayable, DisplayFormatType, Statistics}; + use datafusion_physical_expr::{new_sort_requirements, PhysicalSortRequirements}; fn schema() -> SchemaRef { Arc::new(Schema::new(vec![Field::new("c1", DataType::Boolean, true)])) @@ -355,6 +356,33 @@ mod tests { )) } + // Created a sorted parquet exec with multiple files + fn parquet_exec_multiple_sorted() -> Arc { + let sort_exprs = vec![PhysicalSortExpr { + expr: col("c1", &schema()).unwrap(), + options: SortOptions::default(), + }]; + + Arc::new(ParquetExec::new( + FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), + file_schema: schema(), + file_groups: vec![ + vec![PartitionedFile::new("x".to_string(), 100)], + vec![PartitionedFile::new("y".to_string(), 100)], + ], + statistics: Statistics::default(), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: Some(sort_exprs), + infinite_source: false, + }, + None, + None, + )) + } + fn sort_preserving_merge_exec( input: Arc, ) -> Arc { @@ -556,7 +584,7 @@ mod tests { "GlobalLimitExec: skip=0, fetch=100", "LocalLimitExec: fetch=100", // data is sorted so can't repartition here - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=true", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -574,7 +602,7 @@ mod tests { "FilterExec: c1@0", // data is sorted so can't repartition here even though // filter would benefit from parallelism, the answers might be wrong - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=true", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -662,7 +690,7 @@ mod tests { // need repartiton and resort as the data was not sorted correctly let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=false", "RepartitionExec: 
partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -674,12 +702,12 @@ mod tests { #[test] fn repartition_ignores_sort_preserving_merge() -> Result<()> { // sort preserving merge already sorted input, - let plan = sort_preserving_merge_exec(parquet_exec_sorted()); + let plan = sort_preserving_merge_exec(parquet_exec_multiple_sorted()); // should not repartition / sort (as the data was already sorted) let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[c1@0 ASC], projection=[c1]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, output_ordering=[c1@0 ASC], projection=[c1]", ]; assert_optimized!(expected, plan); @@ -762,7 +790,7 @@ mod tests { // needs to repartition / sort as the data was not sorted correctly let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=false", "ProjectionExec: expr=[c1@0 as c1]", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", @@ -775,13 +803,14 @@ mod tests { #[test] fn repartition_ignores_transitively_with_projection() -> Result<()> { // sorted input - let plan = sort_preserving_merge_exec(projection_exec(parquet_exec_sorted())); + let plan = + sort_preserving_merge_exec(projection_exec(parquet_exec_multiple_sorted())); // data should not be repartitioned / resorted let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", "ProjectionExec: expr=[c1@0 as c1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[c1@0 ASC], projection=[c1]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, output_ordering=[c1@0 ASC], projection=[c1]", ]; assert_optimized!(expected, plan); @@ -796,7 +825,7 @@ mod tests { let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", // Expect 
repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=false", "ProjectionExec: expr=[c1@0 as c1]", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", @@ -814,7 +843,7 @@ mod tests { let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=false", "FilterExec: c1@0", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", @@ -834,7 +863,7 @@ mod tests { let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: [c1@0 ASC]", + "SortExec: [c1@0 ASC], global=false", "ProjectionExec: expr=[c1@0 as c1]", "FilterExec: c1@0", // repartition is lowest down @@ -881,8 +910,9 @@ mod tests { } // model that it requires the output ordering of its input - fn required_input_ordering(&self) -> Vec> { - vec![self.input.output_ordering()] + fn required_input_ordering(&self) -> Vec>> { + let ordering_requirements = new_sort_requirements(self.output_ordering()); + vec![ordering_requirements] } fn with_new_children( diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index c9a3c8fec293e..d0d983597abad 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -39,7 +39,10 @@ use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use arrow::datatypes::SchemaRef; use datafusion_common::{reverse_sort_options, 
DataFusionError}; -use datafusion_physical_expr::utils::{ordering_satisfy, ordering_satisfy_concrete}; +use datafusion_physical_expr::utils::{ + create_sort_expr_from_requirement, ordering_satisfy, + ordering_satisfy_requirement_concrete, +}; use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use itertools::izip; @@ -177,16 +180,16 @@ fn ensure_sorting( let physical_ordering = child.output_ordering(); match (required_ordering, physical_ordering) { (Some(required_ordering), Some(physical_ordering)) => { - let is_ordering_satisfied = ordering_satisfy_concrete( + let is_ordering_satisfied = ordering_satisfy_requirement_concrete( physical_ordering, - required_ordering, + &required_ordering, || child.equivalence_properties(), ); if !is_ordering_satisfied { // Make sure we preserve the ordering requirements: update_child_to_remove_unnecessary_sort(child, sort_onwards)?; - let sort_expr = required_ordering.to_vec(); - *child = add_sort_above_child(child, sort_expr)?; + let sort_expr = create_sort_expr_from_requirement(&required_ordering); + *child = add_sort_above_child(child, sort_expr, None)?; sort_onwards.push((idx, child.clone())) } if let [first, ..] = sort_onwards.as_slice() { @@ -236,8 +239,8 @@ fn ensure_sorting( } (Some(required), None) => { // Ordering requirement is not met, we should add a SortExec to the plan. 
- let sort_expr = required.to_vec(); - *child = add_sort_above_child(child, sort_expr)?; + let sort_expr = create_sort_expr_from_requirement(&required); + *child = add_sort_above_child(child, sort_expr, None)?; *sort_onwards = vec![(idx, child.clone())]; } (None, Some(_)) => { @@ -654,12 +657,12 @@ mod tests { let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); let expected_input = vec![ - "SortExec: [nullable_col@0 ASC]", - " SortExec: [non_nullable_col@1 ASC]", + "SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [non_nullable_col@1 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: [nullable_col@0 ASC]", + "SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -708,9 +711,9 @@ mod tests { let expected_input = vec![ "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " FilterExec: NOT non_nullable_col@1", - " SortExec: [non_nullable_col@1 ASC NULLS LAST]", + " SortExec: [non_nullable_col@1 ASC NULLS LAST], global=true", " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: [non_nullable_col@1 DESC]", + " SortExec: [non_nullable_col@1 DESC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; @@ -718,7 +721,7 @@ mod tests { "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", " 
FilterExec: NOT non_nullable_col@1", " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: [non_nullable_col@1 DESC]", + " SortExec: [non_nullable_col@1 DESC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -740,7 +743,7 @@ mod tests { ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -748,7 +751,7 @@ mod tests { } #[tokio::test] - async fn test_remove_unnecessary_sort1() -> Result<()> { + async fn test_remove_unnecessary_sort3() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); let sort_exprs = vec![sort_expr("nullable_col", &schema)]; @@ -760,15 +763,15 @@ mod tests { let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -787,12 +790,12 @@ mod tests { let physical_plan = sort_preserving_merge_exec(sort_exprs, 
sort); let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -817,7 +820,7 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should not add a sort at the output of the union, input plan should not be changed @@ -848,7 +851,7 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should not add a sort at the output of the union, input plan should not be changed @@ -882,13 +885,13 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: 
[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should remove unnecessary sorting from below and move it to top let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs new file mode 100644 index 0000000000000..984d817db9a63 --- /dev/null +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -0,0 +1,2101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! EnforceSorting optimizer rule inspects the physical plan with respect +//! to local sorting requirements and does the following: +//! 
- Adds a [SortExec] when a requirement is not met, +//! - Removes an already-existing [SortExec] if it is possible to prove +//! that this sort is unnecessary +//! The rule can work on valid *and* invalid physical plans with respect to +//! sorting requirements, but always produces a valid physical plan in this sense. +//! +//! A non-realistic but easy to follow example for sort removals: Assume that we +//! somehow get the fragment +//! "SortExec: [nullable_col@0 ASC]", +//! " SortExec: [non_nullable_col@1 ASC]", +//! in the physical plan. The first sort is unnecessary since its result is overwritten +//! by another SortExec. Therefore, this rule removes it from the physical plan. +use crate::config::ConfigOptions; +use crate::error::Result; +use crate::execution::context::TaskContext; +use crate::physical_optimizer::utils::add_sort_above_child; +use crate::physical_optimizer::PhysicalOptimizerRule; +use crate::physical_plan::filter::FilterExec; +use crate::physical_plan::joins::utils::JoinSide; +use crate::physical_plan::joins::SortMergeJoinExec; +use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use crate::physical_plan::projection::ProjectionExec; +use crate::physical_plan::repartition::RepartitionExec; +use crate::physical_plan::rewrite::TreeNodeRewritable; +use crate::physical_plan::sorts::sort::SortExec; +use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use crate::physical_plan::union::UnionExec; +use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; +use crate::physical_plan::{ + displayable, with_new_children_if_necessary, DisplayFormatType, ExecutionPlan, + Partitioning, SendableRecordBatchStream, +}; +use arrow::datatypes::SchemaRef; +use datafusion_common::{reverse_sort_options, DataFusionError, Statistics}; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::utils::{ + create_sort_expr_from_requirement, map_requirement_before_projection, + 
ordering_satisfy, ordering_satisfy_requirement, requirements_compatible, +}; +use datafusion_physical_expr::{ + EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, +}; +use itertools::izip; +use std::any::Any; +use std::iter::zip; +use std::sync::Arc; + +/// This rule inspects SortExec's in the given physical plan and removes the +/// ones it can prove unnecessary. +#[derive(Default)] +pub struct TopDownEnforceSorting {} + +impl TopDownEnforceSorting { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +/// This is a "data class" we use within the [EnforceSorting] rule +#[derive(Debug, Clone)] +struct PlanWithSortRequirements { + /// Current plan + plan: Arc, + /// Whether the plan could impact the final result ordering + impact_result_ordering: bool, + /// Parent required sort ordering + required_ordering: Option>, + /// The adjusted request sort ordering to children. + /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. 
+ adjusted_request_ordering: Vec>>, +} + +impl PlanWithSortRequirements { + pub fn init(plan: Arc) -> Self { + let impact_result_ordering = plan.output_ordering().is_some() + || plan.output_partitioning().partition_count() == 1 + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some(); + let request_ordering = plan.required_input_ordering(); + PlanWithSortRequirements { + plan, + impact_result_ordering, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } + } + + pub fn new_without_impact_result_ordering(plan: Arc) -> Self { + let request_ordering = plan.required_input_ordering(); + PlanWithSortRequirements { + plan, + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } + } + + pub fn children(&self) -> Vec { + let plan_children = self.plan.children(); + assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); + let child_impact_result_ordering = if self + .plan + .as_any() + .downcast_ref::() + .is_some() + || self + .plan + .as_any() + .downcast_ref::() + .is_some() + { + true + } else if self.plan.as_any().downcast_ref::().is_some() { + false + } else { + self.plan.maintains_input_order().iter().all(|o| *o) + && self.impact_result_ordering + }; + println!( + "child_impact_result_ordering {:?}", + child_impact_result_ordering + ); + plan_children + .into_iter() + .zip(self.adjusted_request_ordering.clone().into_iter()) + .map(|(child, required)| { + let from_parent = required; + let child_request_ordering = child.required_input_ordering(); + PlanWithSortRequirements { + plan: child, + impact_result_ordering: child_impact_result_ordering, + required_ordering: from_parent, + adjusted_request_ordering: child_request_ordering, + } + }) + .collect() + } +} + +impl TreeNodeRewritable for PlanWithSortRequirements { + fn map_children(self, transform: F) -> Result + where + F: FnMut(Self) -> Result, + { + let children = self.children(); + if 
children.is_empty() { + Ok(self) + } else { + let new_children = children + .into_iter() + .map(transform) + .collect::>>()?; + + let children_plans = new_children + .iter() + .map(|elem| elem.plan.clone()) + .collect::>(); + let plan = with_new_children_if_necessary(self.plan, children_plans)?; + Ok(PlanWithSortRequirements { + plan, + impact_result_ordering: self.impact_result_ordering, + required_ordering: self.required_ordering, + adjusted_request_ordering: self.adjusted_request_ordering, + }) + } + } +} + +impl PhysicalOptimizerRule for TopDownEnforceSorting { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: + let plan_requirements = PlanWithSortRequirements::init(plan); + let adjusted = plan_requirements.transform_down(&ensure_sorting)?; + // Remove the TombStoneExec + let final_plan = adjusted.plan.transform_up(&|plan| { + if let Some(tombstone_exec) = plan.as_any().downcast_ref::() { + Ok(Some(tombstone_exec.input.clone())) + } else { + Ok(None) + } + })?; + Ok(final_plan) + } + + fn name(&self) -> &str { + "EnforceSorting2" + } + + fn schema_check(&self) -> bool { + true + } +} + +fn ensure_sorting( + requirements: PlanWithSortRequirements, +) -> Result> { + println!( + "=== Current plan ===\n{}\n", + displayable(requirements.plan.as_ref()).indent() + ); + println!( + "impact_result_ordering: {:?}, parent required_ordering {:?}, adjusted request ordering {:?}", + requirements.impact_result_ordering, requirements.required_ordering, requirements.adjusted_request_ordering, + ); + if let Some(sort_exec) = requirements.plan.as_any().downcast_ref::() { + // Remove unnecessary global SortExec + if !sort_exec.preserve_partitioning() { + if !requirements.impact_result_ordering + && requirements.required_ordering.is_none() + { + println!("remove sort_exec due to no need to keep ordering"); + return Ok(Some(PlanWithSortRequirements { + plan: 
Arc::new(TombStoneExec::new(sort_exec.input().clone())), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![None], + })); + } else if ordering_satisfy( + sort_exec.input().output_ordering(), + sort_exec.output_ordering(), + || sort_exec.input().equivalence_properties(), + ) && sort_exec.input().output_partitioning().partition_count() == 1 + { + println!("remove sort_exec due to child already satisfy"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering], + })); + } + } + } else if let Some(sort_pres_exec) = requirements + .plan + .as_any() + .downcast_ref::() + { + // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec + // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) + if let Some(child_sort_exec) = + sort_pres_exec.input().as_any().downcast_ref::() + { + if sort_pres_exec.expr() == child_sort_exec.expr() { + if !requirements.impact_result_ordering + && requirements.required_ordering.is_none() + { + println!("remove SortPreservingMergeExec + SortExec due to no need to keep ordering"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![None], + })); + } else if ordering_satisfy( + child_sort_exec.input().output_ordering(), + child_sort_exec.output_ordering(), + || child_sort_exec.input().equivalence_properties(), + ) && child_sort_exec + .input() + .output_partitioning() + .partition_count() + == 1 + { + println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + 
impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering], + })); + } + } + } else { + // Remove unnecessary SortPreservingMergeExec only + if !requirements.impact_result_ordering { + println!( + "remove SortPreservingMergeExec due to no need to keep ordering" + ); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(sort_pres_exec.input().clone())), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering], + })); + } else if ordering_satisfy( + sort_pres_exec.input().output_ordering(), + Some(sort_pres_exec.expr()), + || sort_pres_exec.input().equivalence_properties(), + ) && sort_pres_exec + .input() + .output_partitioning() + .partition_count() + == 1 + { + println!("remove SortPreservingMergeExec due to child already satisfy"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(sort_pres_exec.input().clone())), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering], + })); + } + } + } + println!("no removing"); + let plan = &requirements.plan; + let parent_required = requirements.required_ordering.as_deref(); + if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { + plan.equivalence_properties() + }) { + // Can satisfy the parent requirements, clear the requirements + println!( + "Can satisfy the parent requirements, impact_result_ordering {:?}", + requirements.impact_result_ordering + ); + if plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + { + let request_child = requirements.adjusted_request_ordering[0].as_deref(); + let reversed_request_child = reverse_window_sort_requirements(request_child); + + if should_reverse_window_sort_requirements( + plan.clone(), + request_child, + reversed_request_child.as_deref(), + ) { + 
println!("Should reverse top window sort_requirements"); + let (window_expr, input_schema, partition_keys) = if let Some(exec) = + plan.as_any().downcast_ref::() + { + ( + exec.window_expr(), + exec.input_schema(), + exec.partition_keys.clone(), + ) + } else if let Some(exec) = plan.as_any().downcast_ref::() { + ( + exec.window_expr(), + exec.input_schema(), + exec.partition_keys.clone(), + ) + } else { + return Err(DataFusionError::Plan( + "Expects to receive either WindowAggExec of BoundedWindowAggExec" + .to_string(), + )); + }; + let new_window_expr = window_expr + .iter() + .map(|e| e.get_reverse_expr()) + .collect::>>(); + let new_physical_ordering = create_sort_expr_from_requirement( + reversed_request_child.clone().unwrap().as_ref(), + ); + if let Some(window_expr) = new_window_expr { + let uses_bounded_memory = + window_expr.iter().all(|e| e.uses_bounded_memory()); + // If all window expressions can run with bounded memory, choose the + // bounded window variant: + let new_plan = if uses_bounded_memory { + Arc::new(BoundedWindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) as _ + } else { + Arc::new(WindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) 
as _ + }; + println!("Reverse WindowAggExec expressions and push down the reversed requirements"); + + return Ok(Some(PlanWithSortRequirements { + plan: new_plan, + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![reversed_request_child], + })); + } + } else { + println!("Should not reverse top window sort_requirements"); + } + } else if let Some(_) = plan.as_any().downcast_ref::() { + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: requirements.adjusted_request_ordering.clone(), + })); + } + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: requirements.impact_result_ordering, + required_ordering: None, + adjusted_request_ordering: requirements.adjusted_request_ordering, + })); + } else if let Some(sort_exec) = plan.as_any().downcast_ref::() { + println!("Modify current SortExec to satisfy the parent requirements"); + // If the current plan is a SortExec, update the SortExec to satisfy the parent requirements + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + let new_plan = add_sort_above_child( + &sort_exec.input, + parent_required_expr, + sort_exec.fetch(), + )?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), + )); + } else { + println!("Can not satisfy the parent requirements, try to push down"); + // Can not satisfy the parent requirements, check whether should push down the requirements. Add new SortExec when the parent requirements can not be pushed down + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + let maintains_input_order = plan.maintains_input_order(); + // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements. 
+ // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. + // For UnionExec, we can always push down + if (maintains_input_order.is_empty() + || !maintains_input_order.iter().any(|o| *o) + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some()) + && plan.as_any().downcast_ref::().is_none() + { + let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), + )); + } else if let Some(window_agg_exec) = + plan.as_any().downcast_ref::() + { + let window_expr = window_agg_exec.window_expr(); + let request_child = requirements.adjusted_request_ordering[0].as_deref(); + if requirements_compatible(request_child, parent_required, || { + plan.children()[0].equivalence_properties() + }) { + println!("WindowAggExec child requirements are more specific, no need to add SortExec"); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: requirements.adjusted_request_ordering, + })); + } else if requirements_compatible(parent_required, request_child, || { + plan.children()[0].equivalence_properties() + }) { + println!("Parent requirements are more specific, adjust WindowAggExec child requirements and push down the requirements"); + let adjusted = parent_required.map(|r| r.to_vec()); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![adjusted], + })); + } else { + let should_reverse = can_reverse_window_request( + window_expr[0].partition_by(), + parent_required, + request_child, + &window_agg_exec.input().schema(), + ); + if should_reverse { + let 
new_window_expr = window_expr + .iter() + .map(|e| e.get_reverse_expr()) + .collect::>>(); + if let Some(window_expr) = new_window_expr { + let new_plan = Arc::new(WindowAggExec::try_new( + window_expr, + window_agg_exec.children()[0].clone(), + window_agg_exec.input_schema(), + window_agg_exec.partition_keys.clone(), + Some(parent_required_expr.to_vec()), + )?) as _; + println!("Reverse WindowAggExec expressions and push down the requirements"); + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } else { + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } else { + // Can not push down, add new SortExec + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + } else if let Some(window_agg_exec) = + plan.as_any().downcast_ref::() + { + let window_expr = window_agg_exec.window_expr(); + let request_child = &plan.required_input_ordering()[0]; + if requirements_compatible(request_child.as_deref(), parent_required, || { + plan.children()[0].equivalence_properties() + }) { + println!("BoundedWindowAggExec child requirements are more specific, no need to add SortExec"); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: requirements.adjusted_request_ordering, + })); + } else if requirements_compatible( + parent_required, + request_child.as_deref(), + || plan.children()[0].equivalence_properties(), + ) { + println!("Parent requirements are more specific, adjust BoundedWindowAggExec child requirements and push down the requirements"); + let adjusted = 
parent_required.map(|r| r.to_vec()); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![adjusted], + })); + } else { + let should_reverse = can_reverse_window_request( + window_expr[0].partition_by(), + parent_required, + request_child.as_deref(), + &window_agg_exec.input().schema(), + ); + if should_reverse { + let new_window_expr = window_expr + .iter() + .map(|e| e.get_reverse_expr()) + .collect::>>(); + if let Some(window_expr) = new_window_expr { + let new_plan = Arc::new(BoundedWindowAggExec::try_new( + window_expr, + window_agg_exec.children()[0].clone(), + window_agg_exec.input_schema(), + window_agg_exec.partition_keys.clone(), + Some(parent_required_expr.to_vec()), + )?) as _; + println!("Reverse BoundedWindowAggExec expressions and push down the requirements"); + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } else { + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } else { + // Can not push down, add new SortExec + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + } else if let Some(smj) = plan.as_any().downcast_ref::() { + // If the current plan is SortMergeJoinExec + let left_columns_len = smj.left.schema().fields().len(); + let expr_source_side = + expr_source_sides(&parent_required_expr, left_columns_len); + match expr_source_side { + Some(JoinSide::Left) if maintains_input_order[0] => { + if requirements_compatible( + plan.required_input_ordering()[0].as_deref(), + parent_required, + || 
plan.children()[0].equivalence_properties(), + ) { + println!("Requirements are compatible with SMJ"); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: requirements + .adjusted_request_ordering, + })); + } else { + // Can not push down, add new SortExec + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + Some(JoinSide::Right) if maintains_input_order[1] => { + let shift_right_required = + shift_right_required(parent_required.unwrap(), left_columns_len); + if requirements_compatible( + plan.required_input_ordering()[1].as_deref(), + shift_right_required.as_deref(), + || plan.children()[1].equivalence_properties(), + ) { + println!("Requirements are compatible with SMJ"); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: requirements + .adjusted_request_ordering, + })); + } else { + // Can not push down, add new SortExec + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + _ => { + println!("Can not decide the expr side for SortMergeJoinExec, can not push down, add SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + } else if plan.required_input_ordering().iter().any(Option::is_some) { + // If the current plan has its own ordering requirements to its children, check whether the requirements + // are compatible with the parent requirements. 
+ println!( + "the current plan has its own ordering requirements, {:?}", + plan.required_input_ordering() + ); + + let plan_children = plan.children(); + let compatible_with_children = izip!( + maintains_input_order.iter(), + plan.required_input_ordering().into_iter(), + plan_children.iter() + ) + .map(|(can_push_down, request_child, child)| { + *can_push_down + && requirements_compatible( + request_child.as_deref(), + parent_required, + || child.equivalence_properties(), + ) + }) + .collect::>(); + println!( + "plan.equivalence_properties() {:?}", + plan.equivalence_properties() + ); + println!("compatible_with_children {:?}", compatible_with_children); + if compatible_with_children.iter().all(|a| *a) { + // Requirements are compatible, not need to push down. + println!("Requirements are compatible, no need to push down"); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: requirements.adjusted_request_ordering, + })); + } else { + let can_adjust_child_requirements = plan + .required_input_ordering() + .into_iter() + .zip(plan_children.iter()) + .map(|(request_child, child)| { + requirements_compatible( + parent_required, + request_child.as_deref(), + || child.equivalence_properties(), + ) + }) + .collect::>(); + if can_adjust_child_requirements.iter().all(|a| *a) { + // Adjust child requirements and push down the requirements + println!("Adjust child requirements and push down the requirements"); + let adjusted = parent_required.map(|r| r.to_vec()); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![ + adjusted; + can_adjust_child_requirements + .len() + ], + })); + } else { + // Can not push down, add new SortExec + println!("Can not push down, add new SortExec"); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return 
Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } + } else { + // The current plan does not have its own ordering requirements to its children, consider push down the requirements + if let Some(ProjectionExec { expr, .. }) = + plan.as_any().downcast_ref::() + { + // For Projection, we need to transform the requirements to the columns before the Projection + // And then to push down the requirements + let new_requirement = + map_requirement_before_projection(parent_required, expr); + if new_requirement.is_some() { + println!("Push requirements down to Projection"); + Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![new_requirement], + })) + } else { + // Can not push down, add new SortExec + println!( + "Can not push requirements down to Projection, add SortExec" + ); + let new_plan = + add_sort_above_child(plan, parent_required_expr, None)?; + return Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )); + } + } else { + println!("Push down requirements."); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: requirements.impact_result_ordering, + required_ordering: None, + adjusted_request_ordering: vec![ + requirements.required_ordering; + requirements + .adjusted_request_ordering + .len() + ], + })); + } + } + } +} + +fn expr_source_sides( + required_exprs: &[PhysicalSortExpr], + left_columns_len: usize, +) -> Option { + let all_column_sides = required_exprs + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() < left_columns_len { + Some(JoinSide::Left) + } else { + Some(JoinSide::Right) + } + } else { + None + } + }) + .collect::>(); + + // If the exprs are all coming from one side, the requirements can be pushed down + if all_column_sides.len() != required_exprs.len() { + None + } 
else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Left)) + { + Some(JoinSide::Left) + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Right)) + { + Some(JoinSide::Right) + } else { + None + } +} + +fn shift_right_required( + parent_required: &[PhysicalSortRequirements], + left_columns_len: usize, +) -> Option> { + let new_right_required: Vec = parent_required + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() >= left_columns_len { + Some(PhysicalSortRequirements { + expr: Arc::new(Column::new( + col.name(), + col.index() - left_columns_len, + )) as Arc, + sort_options: r.sort_options.clone(), + }) + } else { + None + } + } else { + None + } + }) + .collect::>(); + + // if the parent required are all comming from the right side, the requirements can be pushdown + if new_right_required.len() != parent_required.len() { + None + } else { + Some(new_right_required) + } +} + +#[derive(Debug)] +/// This structure stores extra column information required to remove unnecessary sorts. 
+pub struct ColumnInfo { + reverse: bool, + is_partition: bool, +} + +fn can_reverse_window_request( + partition_keys: &[Arc], + required: Option<&[PhysicalSortRequirements]>, + request_ordering: Option<&[PhysicalSortRequirements]>, + input_schema: &SchemaRef, +) -> bool { + match (required, request_ordering) { + (_, None) => false, + (None, Some(_)) => false, + (Some(required), Some(request_ordering)) => { + if required.len() > request_ordering.len() { + return false; + } + let mut col_infos = vec![]; + for (required_expr, request_expr) in zip(required, request_ordering) { + let column = required_expr.expr.clone(); + let is_partition = partition_keys.iter().any(|e| e.eq(&column)); + let reverse = check_alignment(input_schema, request_expr, required_expr); + col_infos.push(ColumnInfo { + reverse, + is_partition, + }); + } + let order_by_sections = col_infos + .iter() + .filter(|elem| !elem.is_partition) + .collect::>(); + let should_reverse_order_bys = if order_by_sections.is_empty() { + false + } else { + let first_reverse = order_by_sections[0].reverse; + first_reverse + }; + should_reverse_order_bys + } + } +} + +/// Compares window expression's `window_request` and `parent_required_expr` ordering, returns +/// whether we should reverse the window expression's ordering in order to meet parent's requirements. 
+fn check_alignment( + input_schema: &SchemaRef, + window_request: &PhysicalSortRequirements, + parent_required_expr: &PhysicalSortRequirements, +) -> bool { + if parent_required_expr.expr.eq(&window_request.expr) + && window_request.sort_options.is_some() + && parent_required_expr.sort_options.is_some() + { + let nullable = parent_required_expr.expr.nullable(input_schema).unwrap(); + let window_request_opts = window_request.sort_options.unwrap(); + let parent_required_opts = parent_required_expr.sort_options.unwrap(); + let is_reversed = if nullable { + window_request_opts == reverse_sort_options(parent_required_opts) + } else { + // If the column is not nullable, NULLS FIRST/LAST is not important. + window_request_opts.descending != parent_required_opts.descending + }; + is_reversed + } else { + false + } +} + +fn reverse_window_sort_requirements( + request_child: Option<&[PhysicalSortRequirements]>, +) -> Option> { + let reversed_request = request_child.map(|request| { + request + .iter() + .map(|req| match req.sort_options { + None => req.clone(), + Some(ops) => PhysicalSortRequirements { + expr: req.expr.clone(), + sort_options: Some(reverse_sort_options(ops)), + }, + }) + .collect::>() + }); + reversed_request +} + +fn should_reverse_window_sort_requirements( + window_plan: Arc, + top_requirement: Option<&[PhysicalSortRequirements]>, + top_reversed_requirement: Option<&[PhysicalSortRequirements]>, +) -> bool { + if top_requirement.is_none() { + return false; + } + let flags = window_plan + .children() + .into_iter() + .map(|child| { + // If the child is leaf node, check the output ordering + if child.children().is_empty() + && ordering_satisfy_requirement( + child.output_ordering(), + top_requirement, + || child.equivalence_properties(), + ) + { + false + } else if child.children().is_empty() + && ordering_satisfy_requirement( + child.output_ordering(), + top_reversed_requirement, + || child.equivalence_properties(), + ) + { + true + } else if 
child.as_any().downcast_ref::().is_some() + || child + .as_any() + .downcast_ref::() + .is_some() + { + // If the child is WindowExec, check the child requirements + if requirements_compatible( + top_requirement, + child.required_input_ordering()[0].as_deref(), + || child.equivalence_properties(), + ) || requirements_compatible( + child.required_input_ordering()[0].as_deref(), + top_requirement, + || child.equivalence_properties(), + ) || requirements_compatible( + top_reversed_requirement, + child.required_input_ordering()[0].as_deref(), + || child.equivalence_properties(), + ) || requirements_compatible( + child.required_input_ordering()[0].as_deref(), + top_reversed_requirement, + || child.equivalence_properties(), + ) { + should_reverse_window_sort_requirements( + child, + top_requirement, + top_reversed_requirement, + ) + } else { + false + } + } else if requirements_compatible( + top_reversed_requirement, + window_plan.required_input_ordering()[0].as_deref(), + || window_plan.equivalence_properties(), + ) || requirements_compatible( + window_plan.required_input_ordering()[0].as_deref(), + top_reversed_requirement, + || window_plan.equivalence_properties(), + ) { + true + } else { + false + } + }) + .collect::>(); + + flags.iter().all(|o| *o) +} + +/// A TombStoneExec execution plan generated during optimization process, should be removed finally +#[derive(Debug)] +struct TombStoneExec { + /// The input plan + pub input: Arc, +} + +impl TombStoneExec { + pub fn new(input: Arc) -> Self { + Self { input } + } +} + +impl ExecutionPlan for TombStoneExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn equivalence_properties(&self) -> 
EquivalenceProperties { + self.input.equivalence_properties() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(TombStoneExec::new(children[0].clone()))) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Err(DataFusionError::Internal(format!( + "TombStoneExec, invalid plan" + ))) + } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "TombStoneExec") + } + } + } + + fn statistics(&self) -> Statistics { + Statistics::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::datasource::listing::PartitionedFile; + use crate::datasource::object_store::ObjectStoreUrl; + use crate::physical_plan::displayable; + use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; + use crate::physical_plan::filter::FilterExec; + use crate::physical_plan::memory::MemoryExec; + use crate::physical_plan::repartition::RepartitionExec; + use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; + use crate::physical_plan::union::UnionExec; + use crate::physical_plan::windows::create_window_expr; + use crate::prelude::SessionContext; + use arrow::compute::SortOptions; + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use datafusion_common::{Result, Statistics}; + use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; + use datafusion_physical_expr::expressions::{col, NotExpr}; + use datafusion_physical_expr::PhysicalSortExpr; + use std::sync::Arc; + + fn create_test_schema() -> Result { + let nullable_column = Field::new("nullable_col", DataType::Int32, true); + let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false); + let schema = Arc::new(Schema::new(vec![nullable_column, non_nullable_column])); + + Ok(schema) + } + + 
#[tokio::test] + async fn test_is_column_aligned_nullable() -> Result<()> { + let schema = create_test_schema()?; + let params = vec![ + ((true, true), (false, false), true), + ((true, true), (false, true), false), + ((true, true), (true, false), false), + ((true, false), (false, true), true), + ((true, false), (false, false), false), + ((true, false), (true, true), false), + ]; + for ( + (physical_desc, physical_nulls_first), + (req_desc, req_nulls_first), + reverse_expected, + ) in params + { + let physical_ordering = PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: physical_desc, + nulls_first: physical_nulls_first, + }), + }; + let required_ordering = PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: req_desc, + nulls_first: req_nulls_first, + }), + }; + let reverse = + check_alignment(&schema, &physical_ordering, &required_ordering); + assert_eq!(reverse, reverse_expected); + } + + Ok(()) + } + + #[tokio::test] + async fn test_is_column_aligned_non_nullable() -> Result<()> { + let schema = create_test_schema()?; + + let params = vec![ + ((true, true), (false, false), true), + ((true, true), (false, true), true), + ((true, true), (true, false), false), + ((true, false), (false, true), true), + ((true, false), (false, false), true), + ((true, false), (true, true), false), + ]; + for ( + (physical_desc, physical_nulls_first), + (req_desc, req_nulls_first), + reverse_expected, + ) in params + { + let physical_ordering = PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: physical_desc, + nulls_first: physical_nulls_first, + }), + }; + let required_ordering = PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: req_desc, + nulls_first: req_nulls_first, + }), + }; + let reverse = + check_alignment(&schema, 
&physical_ordering, &required_ordering); + assert_eq!(reverse, reverse_expected); + } + + Ok(()) + } + + /// Runs the sort enforcement optimizer and asserts the plan + /// against the original and expected plans + /// + /// `$EXPECTED_PLAN_LINES`: input plan + /// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan + /// `$PLAN`: the plan to optimized + /// + macro_rules! assert_optimized { + ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr) => { + let session_ctx = SessionContext::new(); + let state = session_ctx.state(); + + let physical_plan = $PLAN; + let formatted = displayable(physical_plan.as_ref()).indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + + let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES + .iter().map(|s| *s).collect(); + + assert_eq!( + expected_plan_lines, actual, + "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES + .iter().map(|s| *s).collect(); + + // Run the actual optimizer + let optimized_physical_plan = + TopDownEnforceSorting::new().optimize(physical_plan, state.config_options())?; + + let formatted = displayable(optimized_physical_plan.as_ref()) + .indent() + .to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected_optimized_lines, actual, + "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + }; + } + + #[tokio::test] + async fn test_not_remove_sort_window_multilayer() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + + let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &source.schema(), + SortOptions { + descending: true, + nulls_first: true, + }, + )]; + let sort = sort_exec(sort_exprs.clone(), source); + + let window_agg = window_exec("non_nullable_col", sort_exprs, sort); + + 
let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &window_agg.schema(), + SortOptions { + descending: false, + nulls_first: false, + }, + )]; + + let sort = sort_exec(sort_exprs.clone(), window_agg); + let filter = filter_exec( + Arc::new(NotExpr::new( + col("non_nullable_col", schema.as_ref()).unwrap(), + )), + sort, + ); + + // let filter_exec = sort_exec; + let physical_plan = window_exec("non_nullable_col", sort_exprs, filter); + + let expected_input = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " FilterExec: NOT non_nullable_col@1", + " SortExec: [non_nullable_col@1 ASC NULLS LAST], global=true", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: [non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + + // let expected_optimized = vec![ + // "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + // " FilterExec: NOT non_nullable_col@1", + // " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + // " SortExec: [non_nullable_col@1 DESC]", + // " MemoryExec: partitions=0, partition_sizes=[]", + // ]; + assert_optimized!(expected_input, expected_input, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_multiple_sort_window_exec() -> 
Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + + let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + + let sort1 = sort_exec(sort_exprs1.clone(), source); + let window_agg1 = window_exec("non_nullable_col", sort_exprs1.clone(), sort1); + let window_agg2 = window_exec("non_nullable_col", sort_exprs2, window_agg1); + // let filter_exec = sort_exec; + let physical_plan = window_exec("non_nullable_col", sort_exprs1, window_agg2); + + let expected_input = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + + let expected_optimized = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " WindowAggExec: wdw=[count: Ok(Field { 
name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_add_required_sort() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + + let physical_plan = sort_preserving_merge_exec(sort_exprs, source); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort1() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); + + let expected_input = vec![ + "SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + // Keep the top SortExec + let expected_optimized = [ + "SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort2() -> Result<()> { + let schema = create_test_schema()?; + let source = 
memory_exec(&schema); + + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let input2 = sort_exec( + vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ], + input, + ); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); + + let expected_input = vec![ + "SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + // Keep the middle SortExec + let expected_optimized = [ + "SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort3() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), source); + let spm = sort_preserving_merge_exec(sort_exprs, sort); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), spm); + let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort4() -> Result<()> { 
let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let sort_exprs = vec![sort_expr("non_nullable_col", &schema)];
        let sort = sort_exec(sort_exprs.clone(), source);
        let spm = sort_preserving_merge_exec(sort_exprs, sort);

        let sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort2 = sort_exec(sort_exprs.clone(), spm);
        let spm2 = sort_preserving_merge_exec(sort_exprs, sort2);

        let sort_exprs = vec![sort_expr("nullable_col", &schema)];
        let sort3 = sort_exec(sort_exprs.clone(), spm2);
        let physical_plan = sort_preserving_merge_exec(sort_exprs, sort3);

        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  SortExec: [nullable_col@0 ASC], global=true",
            "    SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "      SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "        SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "          SortExec: [non_nullable_col@1 ASC], global=true",
            "            MemoryExec: partitions=0, partition_sizes=[]",
        ];
        // Keep the middle SortPreservingMergeExec + SortExec
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// Sorts and merges below two RepartitionExecs: with no ordering required
    /// on top, all of them are expected to be removed.
    #[tokio::test]
    async fn test_remove_unnecessary_sort5() -> Result<()> {
        let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let sort_exprs = vec![sort_expr("non_nullable_col", &schema)];
        let sort = sort_exec(sort_exprs.clone(), source);
        let spm = sort_preserving_merge_exec(sort_exprs, sort);

        let sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort2 = sort_exec(sort_exprs.clone(), spm);
        let spm2 = sort_preserving_merge_exec(sort_exprs, sort2);

        let sort_exprs = vec![sort_expr("nullable_col", &schema)];
        // `sort_exprs` is not used again, so it is moved rather than cloned.
        let sort3 = sort_exec(sort_exprs, spm2);
        let physical_plan = repartition_exec(repartition_exec(sort3));

        let expected_input = vec![
            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
            "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "        SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "          SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "            SortExec: [non_nullable_col@1 ASC], global=true",
            "              MemoryExec: partitions=0, partition_sizes=[]",
        ];

        let expected_optimized = vec![
            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
            "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A SortPreservingMergeExec directly below a SortExec is expected to be
    /// removed.
    #[tokio::test]
    async fn test_remove_unnecessary_spm1() -> Result<()> {
        let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let input = sort_preserving_merge_exec(
            vec![sort_expr("non_nullable_col", &schema)],
            source,
        );
        let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input);

        let expected_input = vec![
            "SortExec: [nullable_col@0 ASC], global=true",
            "  SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        let expected_optimized = vec![
            "SortExec: [nullable_col@0 ASC], global=true",
            "  MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// Three stacked SortPreservingMergeExecs: the two lower ones are expected
    /// to be replaced by a single SortExec that feeds the top one.
    #[tokio::test]
    async fn test_remove_unnecessary_spm2() -> Result<()> {
        let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let input = sort_preserving_merge_exec(
            vec![sort_expr("non_nullable_col", &schema)],
            source,
        );
        let input2 = sort_preserving_merge_exec(
            vec![sort_expr("non_nullable_col", &schema)],
            input,
        );
        let physical_plan =
            sort_preserving_merge_exec(vec![sort_expr("nullable_col", &schema)], input2);

        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "    SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "      MemoryExec: partitions=0, partition_sizes=[]",
        ];
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  SortExec: [nullable_col@0 ASC], global=true",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A SortExec that is coarser than the ordering required above it is
    /// expected to be replaced by one on the full required ordering.
    #[tokio::test]
    async fn test_change_wrong_sorting() -> Result<()> {
        let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort = sort_exec(vec![sort_exprs[0].clone()], source);
        let physical_plan = sort_preserving_merge_exec(sort_exprs, sort);
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  SortExec: [nullable_col@0 ASC], global=true",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A SortExec on the wrong column below a SortPreservingMergeExec is
    /// expected to be replaced, and the lower merge removed.
    #[tokio::test]
    async fn test_change_wrong_sorting2() -> Result<()> {
        let schema = create_test_schema()?;
        let source = memory_exec(&schema);
        let sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let spm1 = sort_preserving_merge_exec(sort_exprs.clone(), source);
        let sort2 = sort_exec(vec![sort_exprs[0].clone()], spm1);
        let physical_plan =
            sort_preserving_merge_exec(vec![sort_exprs[1].clone()], sort2);

        let expected_input = vec![
            "SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "  SortExec: [nullable_col@0 ASC], global=true",
            "    SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "      MemoryExec: partitions=0, partition_sizes=[]",
        ];
        let expected_optimized = vec![
            "SortPreservingMergeExec: [non_nullable_col@1 ASC]",
            "  SortExec: [non_nullable_col@1 ASC], global=true",
            "    MemoryExec: partitions=0, partition_sizes=[]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A union of an already-sorted source and an explicitly sorted source:
    /// the plan is expected to stay unchanged.
    #[tokio::test]
    async fn test_union_inputs_sorted() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs = vec![sort_expr("nullable_col", &schema)];
        let sort = sort_exec(sort_exprs.clone(), source1);

        let source2 = parquet_exec_sorted(&schema, sort_exprs.clone());

        let union = union_exec(vec![source2, sort]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs, union);

        // one input to the union is already sorted, one is not.
let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  UnionExec",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        // should not add a sort at the output of the union, input plan should not be changed
        let expected_optimized = expected_input.clone();
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// Like `test_union_inputs_sorted`, but the sorted source is ordered on
    /// more columns than required: the plan is expected to stay unchanged.
    #[tokio::test]
    async fn test_union_inputs_different_sorted() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs = vec![sort_expr("nullable_col", &schema)];
        let sort = sort_exec(sort_exprs.clone(), source1);

        let parquet_sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs);

        let union = union_exec(vec![source2, sort]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs, union);

        // one input to the union is already sorted (on a finer ordering), one is not.
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  UnionExec",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        // should not add a sort at the output of the union, input plan should not be changed
        let expected_optimized = expected_input.clone();
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// The sorted source is ordered on fewer columns than required: a
    /// SortExec is expected to be added above it.
    #[tokio::test]
    async fn test_union_inputs_different_sorted2() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort = sort_exec(sort_exprs.clone(), source1);

        let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
        let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs);

        let union = union_exec(vec![source2, sort]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs, union);

        // Input is an invalid plan. In this case rule should add required sorting in appropriate places.
        // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy required ordering
        // of SortPreservingMergeExec. Hence the rule is expected to add a SortExec on top of the first ParquetExec
        // (below the UnionExec) and keep the existing SortExec of the second child.
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];

        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// The explicit SortExec is coarser than required while the sorted source
    /// already matches: only the SortExec is expected to be replaced.
    #[tokio::test]
    async fn test_union_inputs_different_sorted3() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs1 = vec![sort_expr("nullable_col", &schema)];
        let sort_exprs2 = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        // `sort_exprs1` is not used again, so it is moved rather than cloned.
        let sort = sort_exec(sort_exprs1, source1);

        let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone());

        let union = union_exec(vec![source2, sort]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs2, union);

        // Input is an invalid plan. In this case rule should add required sorting in appropriate places.
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        // expect to replace the wrong SortExec with the correct one
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A three-input union where every input already satisfies the required
    /// ordering: the plan is expected to stay unchanged.
    #[tokio::test]
    async fn test_union_inputs_different_sorted4() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs1 = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort1 = sort_exec(sort_exprs1, source1.clone());
        let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
        let sort2 = sort_exec(sort_exprs2, source1);

        let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
        let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone());

        let union = union_exec(vec![sort1, source2, sort2]);
        let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union);

        // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above).
// Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above).
        // Third input to the union is not Sorted (SortExec matches the required ordering by the SortPreservingMergeExec above).
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  UnionExec",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        // The finer SortExec of the first input is kept as is: the expected
        // optimized plan is identical to the input plan.
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC]",
            "  UnionExec",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// A three-input union where no input satisfies the required two-column
    /// ordering: every input is expected to get a SortExec on it.
    #[tokio::test]
    async fn test_union_inputs_different_sorted5() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs1 = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
        let sort1 = sort_exec(sort_exprs2.clone(), source1.clone());
        let sort2 = sort_exec(sort_exprs2.clone(), source1);

        let source2 = parquet_exec_sorted(&schema, sort_exprs2);

        let union = union_exec(vec![sort1, source2, sort2]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs1, union);

        // The SortPreservingMergeExec requires [nullable_col, non_nullable_col].
        // First and third inputs are SortExecs on [nullable_col] only (coarser than required).
        // Second input is a ParquetExec whose output ordering is [nullable_col] only (coarser than required).
        let expected_input = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
            "    ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        // every input of the union should end up below a SortExec on the full required ordering
        let expected_optimized = vec![
            "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]",
            "  UnionExec",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
            "    SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
            "      ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
        ];
        assert_optimized!(expected_input, expected_optimized, physical_plan);
        Ok(())
    }

    /// Two union inputs sorted finer than required, with different secondary
    /// sort directions.
    #[tokio::test]
    async fn test_union_inputs_different_sorted6() -> Result<()> {
        let schema = create_test_schema()?;

        let source1 = parquet_exec(&schema);
        let sort_exprs1 = vec![
            sort_expr("nullable_col", &schema),
            sort_expr("non_nullable_col", &schema),
        ];
        let sort_exprs2 = vec![
            sort_expr("nullable_col", &schema),
            sort_expr_options(
                "non_nullable_col",
                &schema,
                SortOptions {
                    descending: true,
                    nulls_first: false,
                },
            ),
        ];
        let sort_exprs3 = vec![sort_expr("nullable_col", &schema)];
        let sort1 = sort_exec(sort_exprs1, source1.clone());
        let sort2 = sort_exec(sort_exprs2, source1);

        let union = union_exec(vec![sort1, sort2]);
        let physical_plan = sort_preserving_merge_exec(sort_exprs3, union);

        // Union doesn't preserve any of the inputs ordering. However, we should be able to change unnecessarily fine
        // SortExecs under UnionExec with required SortExecs that are absolutely necessary.
+ let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_input, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_do_not_remove_sort_with_limit() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort = sort_exec(sort_exprs.clone(), source1); + let limit = limit_exec(sort); + + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); + + let union = union_exec(vec![source2, limit]); + let repartition = repartition_exec(union); + let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + + // expect to keep the bottom SortExec + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 
ASC,non_nullable_col@1 ASC], global=false",
+            "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
+            "      UnionExec",
+            "        ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
+            "        GlobalLimitExec: skip=0, fetch=100",
+            "          LocalLimitExec: fetch=100",
+            "            SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true",
+            "              ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
+        ];
+        assert_optimized!(expected_input, expected_optimized, physical_plan);
+        Ok(())
+    }
+
+    /// Makes a `PhysicalSortExpr` on column `name` with the default `SortOptions`
+    fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr {
+        sort_expr_options(name, schema, SortOptions::default())
+    }
+
+    /// Makes a `PhysicalSortExpr` on column `name` with the given `options`
+    fn sort_expr_options(
+        name: &str,
+        schema: &Schema,
+        options: SortOptions,
+    ) -> PhysicalSortExpr {
+        PhysicalSortExpr {
+            expr: col(name, schema).unwrap(),
+            options,
+        }
+    }
+
+    fn memory_exec(schema: &SchemaRef) -> Arc<dyn ExecutionPlan> {
+        Arc::new(MemoryExec::try_new(&[], schema.clone(), None).unwrap())
+    }
+
+    fn sort_exec(
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Arc<dyn ExecutionPlan> {
+        let sort_exprs = sort_exprs.into_iter().collect();
+        Arc::new(SortExec::try_new(sort_exprs, input, None).unwrap())
+    }
+
+    fn sort_preserving_merge_exec(
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Arc<dyn ExecutionPlan> {
+        let sort_exprs = sort_exprs.into_iter().collect();
+        Arc::new(SortPreservingMergeExec::new(sort_exprs, input))
+    }
+
+    fn filter_exec(
+        predicate: Arc<dyn PhysicalExpr>,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Arc<dyn ExecutionPlan> {
+        Arc::new(FilterExec::try_new(predicate, input).unwrap())
+    }
+
+    fn limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
+        Arc::new(GlobalLimitExec::new(
+            Arc::new(LocalLimitExec::new(input, 100)),
+            0,
+            Some(100),
+        ))
+    }
+
+    fn repartition_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
+        Arc::new(
+            RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(10)).unwrap(),
+        )
+    }
+
+    fn window_exec(
+
col_name: &str,
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Arc<dyn ExecutionPlan> {
+        let sort_exprs: Vec<_> = sort_exprs.into_iter().collect();
+        let schema = input.schema();
+
+        Arc::new(
+            WindowAggExec::try_new(
+                vec![create_window_expr(
+                    &WindowFunction::AggregateFunction(AggregateFunction::Count),
+                    "count".to_owned(),
+                    &[col(col_name, &schema).unwrap()],
+                    &[],
+                    &sort_exprs,
+                    Arc::new(WindowFrame::new(true)),
+                    schema.as_ref(),
+                )
+                .unwrap()],
+                input.clone(),
+                input.schema(),
+                vec![],
+                Some(sort_exprs),
+            )
+            .unwrap(),
+        )
+    }
+
+    /// Creates a parquet exec over a single file group with no declared output ordering
+    fn parquet_exec(schema: &SchemaRef) -> Arc<dyn ExecutionPlan> {
+        Arc::new(ParquetExec::new(
+            FileScanConfig {
+                object_store_url: ObjectStoreUrl::parse("test:///").unwrap(),
+                file_schema: schema.clone(),
+                file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]],
+                statistics: Statistics::default(),
+                projection: None,
+                limit: None,
+                table_partition_cols: vec![],
+                output_ordering: None,
+                infinite_source: false,
+            },
+            None,
+            None,
+        ))
+    }
+
+    /// Creates a parquet exec that declares `sort_exprs` as its output ordering
+    fn parquet_exec_sorted(
+        schema: &SchemaRef,
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ) -> Arc<dyn ExecutionPlan> {
+        let sort_exprs = sort_exprs.into_iter().collect();
+
+        Arc::new(ParquetExec::new(
+            FileScanConfig {
+                object_store_url: ObjectStoreUrl::parse("test:///").unwrap(),
+                file_schema: schema.clone(),
+                file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]],
+                statistics: Statistics::default(),
+                projection: None,
+                limit: None,
+                table_partition_cols: vec![],
+                output_ordering: Some(sort_exprs),
+                infinite_source: false,
+            },
+            None,
+            None,
+        ))
+    }
+
+    fn union_exec(input: Vec<Arc<dyn ExecutionPlan>>) -> Arc<dyn ExecutionPlan> {
+        Arc::new(UnionExec::new(input))
+    }
+}
diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs
index 13e04bbc2ae83..0fcb5c411a18c 100644
--- a/datafusion/core/src/physical_optimizer/utils.rs
+++ b/datafusion/core/src/physical_optimizer/utils.rs
@@ -53,16 +53,17 @@ pub fn
optimize_children(
 pub fn add_sort_above_child(
     child: &Arc<dyn ExecutionPlan>,
     sort_expr: Vec<PhysicalSortExpr>,
+    fetch: Option<usize>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let new_child = if child.output_partitioning().partition_count() > 1 {
         Arc::new(SortExec::new_with_partitioning(
             sort_expr,
             child.clone(),
             true,
-            None,
+            fetch,
         )) as Arc<dyn ExecutionPlan>
     } else {
-        Arc::new(SortExec::try_new(sort_expr, child.clone(), None)?)
+        Arc::new(SortExec::try_new(sort_expr, child.clone(), fetch)?)
             as Arc<dyn ExecutionPlan>
     };
     Ok(new_child)
diff --git a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs
index 28df317a8ac3b..edbfdf042c331 100644
--- a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs
+++ b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs
@@ -34,6 +34,7 @@ use arrow::compute::{concat_batches, take, SortOptions};
 use arrow::datatypes::{DataType, SchemaRef, TimeUnit};
 use arrow::error::{ArrowError, Result as ArrowResult};
 use arrow::record_batch::RecordBatch;
+use datafusion_physical_expr::{new_sort_requirements, PhysicalSortRequirements};
 use futures::{Stream, StreamExt};

 use crate::error::DataFusionError;
@@ -221,8 +222,10 @@ impl ExecutionPlan for SortMergeJoinExec {
         ]
     }

-    fn required_input_ordering(&self) -> Vec<Option<&[PhysicalSortExpr]>> {
-        vec![Some(&self.left_sort_exprs), Some(&self.right_sort_exprs)]
+    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirements>>> {
+        let left_requirements = new_sort_requirements(Some(&self.left_sort_exprs));
+        let right_requirements = new_sort_requirements(Some(&self.right_sort_exprs));
+        vec![left_requirements, right_requirements]
     }

     fn output_partitioning(&self) -> Partitioning {
@@ -239,6 +242,17 @@ impl ExecutionPlan for SortMergeJoinExec {
         self.output_ordering.as_deref()
     }

+    fn maintains_input_order(&self) -> Vec<bool> {
+        match self.join_type {
+            JoinType::Inner => vec![true, true],
+            JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => vec![true, false],
+            JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => {
+                vec![false, true]
+            }
+            _ => vec![false,
false],
+        }
+    }
+
     fn equivalence_properties(&self) -> EquivalenceProperties {
         let left_columns_len = self.left.schema().fields.len();
         combine_join_equivalence_properties(
diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs
index 0ec7b16ef1316..e02ed421bf4d9 100644
--- a/datafusion/core/src/physical_plan/mod.rs
+++ b/datafusion/core/src/physical_plan/mod.rs
@@ -142,7 +142,7 @@ pub trait ExecutionPlan: Debug + Send + Sync {
     /// NOTE that checking `!is_empty()` does **not** check for a
     /// required input ordering. Instead, the correct check is that at
     /// least one entry must be `Some`
-    fn required_input_ordering(&self) -> Vec<Option<&[PhysicalSortExpr]>> {
+    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirements>>> {
         vec![None; self.children().len()]
     }

@@ -592,11 +592,11 @@ impl Distribution {

 use datafusion_physical_expr::expressions::Column;
 pub use datafusion_physical_expr::window::WindowExpr;
-use datafusion_physical_expr::EquivalenceProperties;
 use datafusion_physical_expr::{
     expr_list_eq_strict_order, normalize_expr_with_equivalence_properties,
 };
 pub use datafusion_physical_expr::{AggregateExpr, PhysicalExpr};
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirements};

 /// Applies an optional projection to a [`SchemaRef`], returning the
 /// projected schema
diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs
index b75fe0d80d6c1..4b6199cd88310 100644
--- a/datafusion/core/src/physical_plan/sorts/sort.rs
+++ b/datafusion/core/src/physical_plan/sorts/sort.rs
@@ -798,7 +798,12 @@ impl ExecutionPlan for SortExec {
         match t {
             DisplayFormatType::Default => {
                 let expr: Vec<String> = self.expr.iter().map(|e| e.to_string()).collect();
-                write!(f, "SortExec: [{}]", expr.join(","))
+                write!(
+                    f,
+                    "SortExec: [{}], global={}",
+                    expr.join(","),
+                    !self.preserve_partitioning
+                )
             }
         }
     }
diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs
index 658a5f9fc1767..38a67cb4a7f49 100644
--- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs
+++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs
@@ -48,7 +48,9 @@ use crate::physical_plan::{
     Distribution, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream,
     SendableRecordBatchStream, Statistics,
 };
-use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::{
+    new_sort_requirements, EquivalenceProperties, PhysicalSortRequirements,
+};

 /// Sort preserving merge execution plan
 ///
@@ -127,12 +129,19 @@ impl ExecutionPlan for SortPreservingMergeExec {
         vec![Distribution::UnspecifiedDistribution]
     }

-    fn required_input_ordering(&self) -> Vec<Option<&[PhysicalSortExpr]>> {
-        vec![Some(&self.expr)]
+    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirements>>> {
+        let ordering_requirements = new_sort_requirements(Some(&self.expr));
+        vec![ordering_requirements]
     }

     fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
-        Some(&self.expr)
+        // NOTE(review): this now reports the input's ordering instead of `self.expr`;
+        // confirm the merge preserves sort keys beyond `self.expr` before relying on it.
+        self.input.output_ordering()
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![true]
     }

     fn equivalence_properties(&self) -> EquivalenceProperties {
diff --git a/datafusion/core/src/physical_plan/union.rs b/datafusion/core/src/physical_plan/union.rs
index df78058082f59..c05700e61bf78 100644
--- a/datafusion/core/src/physical_plan/union.rs
+++ b/datafusion/core/src/physical_plan/union.rs
@@ -48,7 +48,6 @@
use crate::{
     error::Result,
     physical_plan::{expressions, metrics::BaselineMetrics},
 };
-use datafusion_physical_expr::utils::ordering_satisfy;
 use tokio::macros::support::thread_rng_n;

 /// `UnionExec`: `UNION ALL` execution plan.
@@ -232,22 +231,10 @@ impl ExecutionPlan for UnionExec {
     }

     fn maintains_input_order(&self) -> Vec<bool> {
-        // If the Union has an output ordering, it maintains at least one
-        // child's ordering (i.e. the meet).
-        // For instance, assume that the first child is SortExpr('a','b','c'),
-        // the second child is SortExpr('a','b') and the third child is
-        // SortExpr('a','b'). The output ordering would be SortExpr('a','b'),
-        // which is the "meet" of all input orderings. In this example, this
-        // function will return vec![false, true, true], indicating that we
-        // preserve the orderings for the 2nd and the 3rd children.
-        self.inputs()
-            .iter()
-            .map(|child| {
-                ordering_satisfy(self.output_ordering(), child.output_ordering(), || {
-                    child.equivalence_properties()
-                })
-            })
-            .collect()
+        // The same answer is reported for every input: order counts as maintained
+        // exactly when the union itself exposes an output ordering.
+        let maintains_order = self.output_ordering().is_some();
+        vec![maintains_order; self.inputs.len()]
     }

     fn with_new_children(
diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs
index 7fc3c638097fa..b4e4685228b19 100644
--- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs
+++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs
@@ -54,7 +54,9 @@ use datafusion_physical_expr::window::{
     PartitionBatchState, PartitionBatches, PartitionKey, PartitionWindowAggStates,
     WindowAggState, WindowState,
 };
-use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
+use datafusion_physical_expr::{
+    EquivalenceProperties, PhysicalExpr, PhysicalSortRequirements,
+};
 use indexmap::IndexMap;
 use log::debug;

@@ -123,7 +125,7 @@ impl BoundedWindowAggExec {
         let mut result = vec![];
         // All window exprs have the same partition by, so
we just use the first one:
         let partition_by = self.window_expr()[0].partition_by();
-        let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]);
+        let sort_keys = self.output_ordering().unwrap_or(&[]);
         for item in partition_by {
             if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) {
                 result.push(a.clone());
@@ -167,9 +169,22 @@ impl ExecutionPlan for BoundedWindowAggExec {
         self.input().output_ordering()
     }

-    fn required_input_ordering(&self) -> Vec<Option<&[PhysicalSortExpr]>> {
-        let sort_keys = self.sort_keys.as_deref();
-        vec![sort_keys]
+    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirements>>> {
+        let partition_keys = self.window_expr()[0].partition_by();
+        let requirements = self.sort_keys.as_deref().map(|ordering| {
+            ordering
+                .iter()
+                .map(|o| {
+                    let is_partition = partition_keys.iter().any(|e| e.eq(&o.expr));
+                    PhysicalSortRequirements {
+                        expr: o.expr.clone(),
+                        // No sort options are requested for partition-by keys.
+                        sort_options: if is_partition { None } else { Some(o.options) },
+                    }
+                })
+                .collect::<Vec<_>>()
+        });
+        vec![requirements]
     }

     fn required_input_distribution(&self) -> Vec<Distribution> {
@@ -177,7 +192,6 @@ impl ExecutionPlan for BoundedWindowAggExec {
             debug!("No partition defined for BoundedWindowAggExec!!!");
             vec![Distribution::SinglePartition]
         } else {
-            //TODO support PartitionCollections if there is no common partition columns in the window_expr
             vec![Distribution::HashPartitioned(self.partition_keys.clone())]
         }
     }
diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs
index fbd05fa884857..e5a17043f6cc7 100644
--- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs
+++
b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs
@@ -39,6 +39,7 @@ use arrow::{
     record_batch::RecordBatch,
 };
 use datafusion_common::DataFusionError;
+use datafusion_physical_expr::PhysicalSortRequirements;
 use futures::stream::Stream;
 use futures::{ready, StreamExt};
 use log::debug;
@@ -114,7 +115,7 @@ impl WindowAggExec {
         let mut result = vec![];
         // All window exprs have the same partition by, so we just use the first one:
         let partition_by = self.window_expr()[0].partition_by();
-        let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]);
+        let sort_keys = self.output_ordering().unwrap_or(&[]);
         for item in partition_by {
             if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) {
                 result.push(a.clone());
@@ -172,9 +173,22 @@ impl ExecutionPlan for WindowAggExec {
         vec![true]
     }

-    fn required_input_ordering(&self) -> Vec<Option<&[PhysicalSortExpr]>> {
-        let sort_keys = self.sort_keys.as_deref();
-        vec![sort_keys]
+    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirements>>> {
+        let partition_keys = self.window_expr()[0].partition_by();
+        let requirements = self.sort_keys.as_deref().map(|ordering| {
+            ordering
+                .iter()
+                .map(|o| {
+                    let is_partition = partition_keys.iter().any(|e| e.eq(&o.expr));
+                    PhysicalSortRequirements {
+                        expr: o.expr.clone(),
+                        // No sort options are requested for partition-by keys.
+                        sort_options: if is_partition { None } else { Some(o.options) },
+                    }
+                })
+                .collect::<Vec<_>>()
+        });
+        vec![requirements]
     }

     fn required_input_distribution(&self) -> Vec<Distribution> {
@@ -182,7 +196,6 @@ impl ExecutionPlan for WindowAggExec {
             debug!("No partition defined for WindowAggExec!!!");
             vec![Distribution::SinglePartition]
         } else {
-            //TODO support PartitionCollections if there is no common partition columns in the window_expr
             vec![Distribution::HashPartitioned(self.partition_keys.clone())]
         }
     }
diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs
index 01bd94e8e4bbe..9d75cb974052f 100644
---
a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -605,7 +605,7 @@ async fn test_physical_plan_display_indent() { let expected = vec![ "GlobalLimitExec: skip=0, fetch=10", " SortPreservingMergeExec: [the_min@2 DESC]", - " SortExec: [the_min@2 DESC]", + " SortExec: [the_min@2 DESC], global=false", " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)@2 as the_min]", " AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)]", " CoalesceBatchesExec: target_batch_size=4096", diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index e0bd1a523c4ad..06489d3082744 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1881,12 +1881,12 @@ async fn sort_merge_join_on_date32() -> Result<()> { let expected = vec![ "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@4 as c1, c2@5 as c2, c3@6 as c3, c4@7 as c4]", " SortMergeJoin: join_type=Inner, on=[(Column { name: \"c1\", index: 0 }, Column { name: \"c1\", index: 0 })]", - " SortExec: [c1@0 ASC]", + " SortExec: [c1@0 ASC], global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: [c1@0 ASC]", + " SortExec: [c1@0 ASC], global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -1927,13 +1927,13 @@ async fn sort_merge_join_on_decimal() -> Result<()> { "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@4 as c1, c2@5 as 
c2, c3@6 as c3, c4@7 as c4]", " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4]", " SortMergeJoin: join_type=Right, on=[(Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }, Column { name: \"c3\", index: 2 })]", - " SortExec: [CAST(t1.c3 AS Decimal128(10, 2))@4 ASC]", + " SortExec: [CAST(t1.c3 AS Decimal128(10, 2))@4 ASC], global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }], 2), input_partitions=2", " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: [c3@2 ASC]", + " SortExec: [c3@2 ASC], global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c3\", index: 2 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -1980,7 +1980,7 @@ async fn left_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", @@ -1997,7 +1997,7 @@ async fn left_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", @@ -2062,7 +2062,7 @@ async fn left_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let 
expected = if repartition_joins { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", @@ -2078,7 +2078,7 @@ async fn left_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -2259,7 +2259,7 @@ async fn right_semi_join() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { - vec![ "SortExec: [t1_id@0 ASC NULLS LAST]", + vec![ "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " CoalesceBatchesExec: target_batch_size=4096", @@ -2275,7 +2275,7 @@ async fn right_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -2307,7 +2307,7 @@ async fn right_semi_join() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { - vec![ "SortExec: [t1_id@0 ASC NULLS LAST]", + vec![ "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " CoalesceBatchesExec: target_batch_size=4096", @@ -2323,7 +2323,7 @@ async fn right_semi_join() -> Result<()> { ] 
} else { vec![ - "SortExec: [t1_id@0 ASC NULLS LAST]", + "SortExec: [t1_id@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 22feeed2cb49b..22ed983b6b1ad 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -1686,7 +1686,7 @@ async fn test_window_agg_sort() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", - " SortExec: [c9@1 ASC NULLS LAST,c8@0 ASC NULLS LAST]", + " SortExec: [c9@1 ASC NULLS LAST,c8@0 ASC NULLS LAST], global=true", ] }; @@ -1719,7 +1719,7 @@ async fn over_order_by_sort_keys_sorting_prefix_compacting() -> Result<()> { " WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }]", " BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c9): Ok(Field { name: \"MAX(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, 
start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[MIN(aggregate_test_100.c9): Ok(Field { name: \"MIN(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", - " SortExec: [c2@0 ASC NULLS LAST,c9@1 ASC NULLS LAST]", + " SortExec: [c2@0 ASC NULLS LAST,c9@1 ASC NULLS LAST], global=true", ] }; @@ -1747,15 +1747,15 @@ async fn over_order_by_sort_keys_sorting_global_order_compacting() -> Result<()> // 3 SortExec are added let expected = { vec![ - "SortExec: [c2@0 ASC NULLS LAST]", + "SortExec: [c2@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[c2@0 as c2, MAX(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as MAX(aggregate_test_100.c9), SUM(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as SUM(aggregate_test_100.c9), MIN(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as MIN(aggregate_test_100.c9)]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }]", " BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c9): Ok(Field { name: \"MAX(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", - " SortExec: [c9@1 ASC 
NULLS LAST,c2@0 ASC NULLS LAST]", + " SortExec: [c9@1 ASC NULLS LAST,c2@0 ASC NULLS LAST], global=true", " BoundedWindowAggExec: wdw=[MIN(aggregate_test_100.c9): Ok(Field { name: \"MIN(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", - " SortExec: [c2@0 ASC NULLS LAST,c9@1 ASC NULLS LAST]", + " SortExec: [c2@0 ASC NULLS LAST,c9@1 ASC NULLS LAST], global=true", ] }; @@ -1791,11 +1791,11 @@ async fn test_window_partition_by_order_by() -> Result<()> { vec![ "ProjectionExec: expr=[SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as SUM(aggregate_test_100.c4), COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as COUNT(UInt8(1))]", " BoundedWindowAggExec: wdw=[COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)) }]", - " SortExec: [c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST]", + " SortExec: [c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c4): Ok(Field { name: \"SUM(aggregate_test_100.c4)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)) }]", - " SortExec: [c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST]", + " SortExec: [c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], 
global=false", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }, Column { name: \"c2\", index: 1 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -1835,7 +1835,7 @@ async fn test_window_agg_sort_reversed_plan() -> Result<()> { " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c9@0 DESC]", + " SortExec: [c9@0 DESC], global=true", ] }; @@ -1891,7 +1891,7 @@ async fn test_window_agg_sort_reversed_plan_builtin() -> Result<()> { " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[FIRST_VALUE(aggregate_test_100.c9): Ok(Field { name: \"FIRST_VALUE(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }, LAG(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: \"LAG(aggregate_test_100.c9,Int64(2),Int64(10101))\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt32(NULL)) }, LEAD(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: \"LEAD(aggregate_test_100.c9,Int64(2),Int64(10101))\", data_type: UInt32, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt32(NULL)) }]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(aggregate_test_100.c9): Ok(Field { name: \"FIRST_VALUE(aggregate_test_100.c9)\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }, LAG(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: \"LAG(aggregate_test_100.c9,Int64(2),Int64(10101))\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: \"LEAD(aggregate_test_100.c9,Int64(2),Int64(10101))\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }]", - " SortExec: [c9@0 DESC]", + " SortExec: [c9@0 DESC], global=true", ] }; @@ -1942,9 +1942,9 @@ async fn test_window_agg_sort_non_reversed_plan() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c9@0 ASC NULLS LAST]", + " SortExec: [c9@0 ASC NULLS LAST], global=true", " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " 
SortExec: [c9@0 DESC]", + " SortExec: [c9@0 DESC], global=true", ] }; @@ -1995,11 +1995,10 @@ async fn test_window_agg_sort_multi_layer_non_reversed_plan() -> Result<()> { "ProjectionExec: expr=[c9@2 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as rn2]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " GlobalLimitExec: skip=0, fetch=5", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c9@2 ASC NULLS LAST,c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST]", - " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c9@2 DESC,c1@0 DESC]", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), 
frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }]", + " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", + " SortExec: [c9@2 DESC,c1@0 DESC,c2@1 DESC], global=true", ] }; @@ -2013,15 +2012,15 @@ async fn test_window_agg_sort_multi_layer_non_reversed_plan() -> Result<()> { let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+-----------+------------+-----------+-----+", - "| c9 | sum1 | sum2 | rn2 |", - "+-----------+------------+-----------+-----+", - "| 28774375 | 745354217 | 91818943 | 100 |", - "| 63044568 | 988558066 | 232866360 | 99 |", - "| 141047417 | 1285934966 | 374546521 | 98 |", - "| 141680161 | 1654839259 | 519841132 | 97 |", - "| 145294611 | 1980231675 | 745354217 | 96 |", - "+-----------+------------+-----------+-----+", + "+------------+-------------+-------------+-----+", + "| c9 | sum1 | sum2 | rn2 |", + "+------------+-------------+-------------+-----+", + "| 4268716378 | 8498370520 | 24997484146 | 1 |", + "| 4229654142 | 12714811027 | 29012926487 | 2 |", + "| 4216440507 | 16858984380 | 28743001064 | 3 |", + "| 4144173353 | 20935849039 | 28472563256 | 4 |", + "| 4076864659 | 24997484146 | 28118515915 | 5 |", + "+------------+-------------+-------------+-----+", ]; assert_batches_eq!(expected, &actual); @@ -2085,15 +2084,15 @@ async fn test_window_agg_complex_plan() -> Result<()> { " GlobalLimitExec: skip=0, fetch=5", " WindowAggExec: wdw=[SUM(null_cases.c1): 
Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", - " SortExec: [c3@2 ASC NULLS LAST,c2@1 ASC NULLS LAST]", + " SortExec: [c3@2 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true", " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", - " SortExec: [c3@2 ASC NULLS LAST,c1@0 ASC]", + " SortExec: [c3@2 ASC NULLS LAST,c1@0 ASC], global=true", " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, 
end_bound: Following(Int64(NULL)) }]", " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: 
WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", - " SortExec: [c3@2 DESC,c1@0 ASC NULLS LAST]", + " SortExec: [c3@2 DESC,c1@0 ASC NULLS LAST], global=true", ] }; @@ -2134,7 +2133,7 @@ async fn test_window_agg_sort_orderby_reversed_partitionby_plan() -> Result<()> " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): 
Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c1@0 ASC NULLS LAST,c9@1 DESC]", + " SortExec: [c1@0 ASC NULLS LAST,c9@1 DESC], global=true", ] }; @@ -2189,7 +2188,7 @@ async fn test_window_agg_sort_partitionby_reversed_plan() -> Result<()> { " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: [c1@0 ASC NULLS LAST,c9@1 DESC]", + " SortExec: [c1@0 ASC NULLS LAST,c9@1 DESC], global=true", ] }; @@ -2243,7 +2242,7 @@ async fn test_window_agg_sort_orderby_reversed_binary_expr() -> Result<()> { " GlobalLimitExec: skip=0, fetch=5", " WindowAggExec: wdw=[SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)) }]", " BoundedWindowAggExec: 
wdw=[SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow }]", - " SortExec: [CAST(c3@1 AS Int16) + c4@2 DESC,c9@3 DESC,c2@0 ASC NULLS LAST]", + " SortExec: [CAST(c3@1 AS Int16) + c4@2 DESC,c9@3 DESC,c2@0 ASC NULLS LAST], global=true", ] }; @@ -2356,7 +2355,7 @@ async fn test_window_agg_sort_orderby_reversed_partitionby_reversed_plan() -> Re " GlobalLimitExec: skip=0, fetch=5", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(NULL)), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow }]", - " SortExec: [c3@1 DESC,c9@2 DESC,c2@0 ASC NULLS LAST]", + " SortExec: [c3@1 DESC,c9@2 DESC,c2@0 ASC NULLS LAST], global=true", ] }; @@ -2524,7 +2523,7 @@ mod tests { vec![ "ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, sum3@2 as sum3, min1@3 as min1, min2@4 as min2, min3@5 as min3, max1@6 as max1, max2@7 as max2, max3@8 as max3, cnt1@9 as cnt1, cnt2@10 as cnt2, sumr1@11 as sumr1, sumr2@12 as sumr2, sumr3@13 
as sumr3, minr1@14 as minr1, minr2@15 as minr2, minr3@16 as minr3, maxr1@17 as maxr1, maxr2@18 as maxr2, maxr3@19 as maxr3, cntr1@20 as cntr1, cntr2@21 as cntr2, sum4@22 as sum4, cnt3@23 as cnt3]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: [inc_col@24 DESC]", + " SortExec: [inc_col@24 DESC], global=true", " ProjectionExec: expr=[SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as sum1, SUM(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@15 as sum2, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as sum3, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as min1, MIN(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@18 as min2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@19 as min3, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@20 as max1, MAX(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@21 as max2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as max3, COUNT(UInt8(1)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@23 as cnt1, COUNT(UInt8(1)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@24 as cnt2, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@3 as sumr1, SUM(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@4 as sumr2, SUM(annotated_data.desc_col) ORDER 
BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sumr3, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@6 as minr1, MIN(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@7 as minr2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as minr3, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as maxr1, MAX(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@10 as maxr2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@11 as maxr3, COUNT(UInt8(1)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@12 as cntr1, COUNT(UInt8(1)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@13 as cntr2, SUM(annotated_data.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@25 as sum4, COUNT(UInt8(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as cnt3, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.desc_col): Ok(Field { name: \"SUM(annotated_data.desc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, SUM(annotated_data.desc_col): Ok(Field { name: \"SUM(annotated_data.desc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)) }, SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, MIN(annotated_data.desc_col): Ok(Field { name: \"MIN(annotated_data.desc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, MAX(annotated_data.desc_col): Ok(Field { name: \"MAX(annotated_data.desc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: 
Following(Int32(1)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(4)), end_bound: Following(Int32(8)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }]", @@ -2599,7 +2598,7 @@ mod tests { vec![ "ProjectionExec: expr=[fv1@0 as fv1, fv2@1 as fv2, lv1@2 as lv1, lv2@3 as lv2, nv1@4 as nv1, nv2@5 as nv2, rn1@6 as rn1, rn2@7 as rn2, rank1@8 as rank1, rank2@9 as rank2, dense_rank1@10 as dense_rank1, dense_rank2@11 as dense_rank2, lag1@12 as lag1, lag2@13 as lag2, lead1@14 as lead1, lead2@15 as lead2, fvr1@16 as fvr1, fvr2@17 as fvr2, lvr1@18 as lvr1, lvr2@19 as lvr2, lagr1@20 as lagr1, lagr2@21 as lagr2, leadr1@22 as leadr1, leadr2@23 as leadr2]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: [ts@24 DESC]", + " SortExec: [ts@24 DESC], global=true", " ProjectionExec: expr=[FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, NTH_VALUE(annotated_data.inc_col,Int64(5)) ORDER BY 
[annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, NTH_VALUE(annotated_data.inc_col,Int64(5)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, ROW_NUMBER() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, ROW_NUMBER() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, DENSE_RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, LAG(annotated_data.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, LAG(annotated_data.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 
PRECEDING AND 1 FOLLOWING@5 as lvr2, LAG(annotated_data.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, LAG(annotated_data.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2, ts@0 as ts]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(5)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(5))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(5)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(5))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK(): Ok(Field { name: \"RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK(): Ok(Field { name: \"RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK(): Ok(Field { name: \"DENSE_RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK(): Ok(Field { name: \"DENSE_RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data.inc_col,Int64(1),Int64(1001)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(1),Int64(1001))\", data_type: Int32, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data.inc_col,Int64(2),Int64(1002)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(2),Int64(1002))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(-1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(4),Int64(1004))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: 
Following(Int32(10)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data.inc_col,Int64(1),Int64(1001)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data.inc_col,Int64(2),Int64(1002)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(2),Int64(1002))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(-1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(4),Int64(1004))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }]", @@ -2658,7 +2657,7 @@ mod tests { vec![ "ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, min1@2 as min1, min2@3 as min2, max1@4 as max1, max2@5 as max2, count1@6 as count1, count2@7 as count2, avg1@8 as avg1, avg2@9 as avg2]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: [inc_col@10 ASC NULLS LAST]", + " SortExec: [inc_col@10 ASC NULLS LAST], global=true", " ProjectionExec: expr=[SUM(annotated_data.inc_col) ORDER BY 
[annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@7 as sum1, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@2 as sum2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@8 as min1, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as min2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as max1, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as max2, COUNT(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as count1, COUNT(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as count2, AVG(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as avg1, AVG(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as avg2, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: 
Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, COUNT(annotated_data.inc_col): Ok(Field { name: \"COUNT(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, AVG(annotated_data.inc_col): Ok(Field { name: \"AVG(annotated_data.inc_col)\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, COUNT(annotated_data.inc_col): Ok(Field { name: \"COUNT(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, AVG(annotated_data.inc_col): Ok(Field { name: \"AVG(annotated_data.inc_col)\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }]", @@ -2712,7 +2711,7 @@ mod tests { vec![ "ProjectionExec: expr=[first_value1@0 as first_value1, first_value2@1 as first_value2, last_value1@2 as last_value1, last_value2@3 as last_value2, nth_value1@4 as nth_value1]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: [inc_col@5 ASC NULLS LAST]", + " SortExec: [inc_col@5 ASC NULLS LAST], global=true", " ProjectionExec: expr=[FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@4 as first_value1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@2 as first_value2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as last_value1, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as last_value2, NTH_VALUE(annotated_data.inc_col,Int64(2)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as nth_value1, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(2)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(2))\", 
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)) }]", diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index c9658a048ca84..9022f39d23c69 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -53,8 +53,9 @@ pub use physical_expr::{AnalysisContext, ExprBoundaries, PhysicalExpr}; pub use planner::create_physical_expr; pub use scalar_function::ScalarFunctionExpr; pub use sort_expr::PhysicalSortExpr; +pub use sort_expr::PhysicalSortRequirements; pub use utils::{ - expr_list_eq_any_order, expr_list_eq_strict_order, + expr_list_eq_any_order, expr_list_eq_strict_order, new_sort_requirements, normalize_expr_with_equivalence_properties, normalize_out_expr_with_alias_schema, normalize_sort_expr_with_equivalence_properties, sort_expr_list_eq_strict_order, split_conjunction, diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs index f8172dabf65aa..a25802e9980ab 100644 --- a/datafusion/physical-expr/src/sort_expr.rs +++ b/datafusion/physical-expr/src/sort_expr.rs @@ -69,4 +69,56 @@ impl PhysicalSortExpr { options: Some(self.options), }) } + + pub fn satisfy(&self, requirement: &PhysicalSortRequirements) -> bool { + if 
requirement.sort_options.is_some() { + self.options == requirement.sort_options.unwrap() + && self.expr.eq(&requirement.expr) + } else { + self.expr.eq(&requirement.expr) + } + } +} + +/// Represents sort requirement associated with a plan +#[derive(Clone, Debug)] +pub struct PhysicalSortRequirements { + /// Physical expression representing the column to sort + pub expr: Arc, + /// Option to specify how the given column should be sorted. + /// If not specified, the PhysicalSortRequirements does not have specific requirements on the sort options. + pub sort_options: Option, +} + +impl PartialEq for PhysicalSortRequirements { + fn eq(&self, other: &PhysicalSortRequirements) -> bool { + self.sort_options == other.sort_options && self.expr.eq(&other.expr) + } +} + +impl std::fmt::Display for PhysicalSortRequirements { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let opts_string = if let Some(sort_options) = self.sort_options { + match (sort_options.descending, sort_options.nulls_first) { + (true, true) => "DESC", + (true, false) => "DESC NULLS LAST", + (false, true) => "ASC", + (false, false) => "ASC NULLS LAST", + } + } else { + "NA" + }; + write!(f, "{} {}", self.expr, opts_string) + } +} + +impl PhysicalSortRequirements { + /// Requirement is compatible with the other means the current requirement is equal or more specific than the other + pub fn compatible(&self, other: &PhysicalSortRequirements) -> bool { + if other.sort_options.is_some() { + self.eq(other) + } else { + self.expr.eq(&other.expr) + } + } } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index d6d5054ffef03..533a606b69e85 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -21,11 +21,12 @@ use crate::expressions::Column; use crate::expressions::UnKnownColumn; use crate::rewrite::TreeNodeRewritable; use crate::PhysicalSortExpr; -use crate::{EquivalenceProperties, PhysicalExpr}; +use 
crate::{EquivalenceProperties, PhysicalExpr, PhysicalSortRequirements}; use datafusion_expr::Operator; use arrow::datatypes::SchemaRef; +use arrow_schema::SortOptions; use std::collections::HashMap; use std::sync::Arc; @@ -168,6 +169,21 @@ pub fn normalize_expr_with_equivalence_properties( .unwrap_or(expr) } +pub fn new_sort_requirements( + sort_keys: Option<&[PhysicalSortExpr]>, +) -> Option> { + let ordering_requirements = sort_keys.map(|ordering| { + ordering + .iter() + .map(|o| PhysicalSortRequirements { + expr: o.expr.clone(), + sort_options: Some(o.options.clone()), + }) + .collect::>() + }); + ordering_requirements +} + pub fn normalize_sort_expr_with_equivalence_properties( sort_expr: PhysicalSortExpr, eq_properties: &[EquivalentClass], @@ -185,6 +201,24 @@ pub fn normalize_sort_expr_with_equivalence_properties( } } +pub fn normalize_sort_requirement_with_equivalence_properties( + sort_requirement: PhysicalSortRequirements, + eq_properties: &[EquivalentClass], +) -> PhysicalSortRequirements { + let normalized_expr = normalize_expr_with_equivalence_properties( + sort_requirement.expr.clone(), + eq_properties, + ); + if sort_requirement.expr.ne(&normalized_expr) { + PhysicalSortRequirements { + expr: normalized_expr, + sort_options: sort_requirement.sort_options.clone(), + } + } else { + sort_requirement + } +} + /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. 
pub fn ordering_satisfy EquivalenceProperties>( provided: Option<&[PhysicalSortExpr]>, @@ -200,7 +234,7 @@ pub fn ordering_satisfy EquivalenceProperties>( } } -pub fn ordering_satisfy_concrete EquivalenceProperties>( +fn ordering_satisfy_concrete EquivalenceProperties>( provided: &[PhysicalSortExpr], required: &[PhysicalSortExpr], equal_properties: F, @@ -235,6 +269,185 @@ pub fn ordering_satisfy_concrete EquivalenceProperties>( } } +pub fn ordering_satisfy_requirement EquivalenceProperties>( + provided: Option<&[PhysicalSortExpr]>, + required: Option<&[PhysicalSortRequirements]>, + equal_properties: F, +) -> bool { + match (provided, required) { + (_, None) => true, + (None, Some(_)) => false, + (Some(provided), Some(required)) => { + ordering_satisfy_requirement_concrete(provided, required, equal_properties) + } + } +} + +pub fn ordering_satisfy_requirement_concrete EquivalenceProperties>( + provided: &[PhysicalSortExpr], + required: &[PhysicalSortRequirements], + equal_properties: F, +) -> bool { + if required.len() > provided.len() { + false + } else if required + .iter() + .zip(provided.iter()) + .all(|(order1, order2)| order2.satisfy(order1)) + { + true + } else if let eq_classes @ [_, ..] 
= equal_properties().classes() { + let normalized_requirements = required + .iter() + .map(|e| { + normalize_sort_requirement_with_equivalence_properties( + e.clone(), + eq_classes, + ) + }) + .collect::>(); + let normalized_provided_exprs = provided + .iter() + .map(|e| { + normalize_sort_expr_with_equivalence_properties(e.clone(), eq_classes) + }) + .collect::>(); + normalized_requirements + .iter() + .zip(normalized_provided_exprs.iter()) + .all(|(order1, order2)| order2.satisfy(order1)) + } else { + false + } +} + +/// Provided requirements are compatible with the required, which means the provided requirements are equal or more specific than the required +pub fn requirements_compatible EquivalenceProperties>( + provided: Option<&[PhysicalSortRequirements]>, + required: Option<&[PhysicalSortRequirements]>, + equal_properties: F, +) -> bool { + match (provided, required) { + (_, None) => true, + (None, Some(_)) => false, + (Some(provided), Some(required)) => { + if required.len() > provided.len() { + false + } else if required + .iter() + .zip(provided.iter()) + .all(|(req, pro)| pro.compatible(req)) + { + true + } else if let eq_classes @ [_, ..] 
= equal_properties().classes() { + let normalized_required = required + .iter() + .map(|e| { + normalize_sort_requirement_with_equivalence_properties( + e.clone(), + eq_classes, + ) + }) + .collect::>(); + let normalized_provided = provided + .iter() + .map(|e| { + normalize_sort_requirement_with_equivalence_properties( + e.clone(), + eq_classes, + ) + }) + .collect::>(); + normalized_required + .iter() + .zip(normalized_provided.iter()) + .all(|(req, pro)| pro.compatible(req)) + } else { + false + } + } + } +} + +pub fn map_columns_before_projection( + parent_required: &[Arc], + proj_exprs: &[(Arc, String)], +) -> Vec> { + let mut column_mapping = HashMap::new(); + for (expression, name) in proj_exprs.iter() { + if let Some(column) = expression.as_any().downcast_ref::() { + column_mapping.insert(name.clone(), column.clone()); + }; + } + let new_required: Vec> = parent_required + .iter() + .filter_map(|r| { + if let Some(column) = r.as_any().downcast_ref::() { + column_mapping.get(column.name()) + } else { + None + } + }) + .map(|e| Arc::new(e.clone()) as Arc) + .collect::>(); + new_required +} + +pub fn map_requirement_before_projection( + parent_required: Option<&[PhysicalSortRequirements]>, + proj_exprs: &[(Arc, String)], +) -> Option> { + if let Some(requirement) = parent_required { + let required_expr = create_sort_expr_from_requirement(requirement) + .iter() + .map(|sort_expr| sort_expr.expr.clone()) + .collect::>(); + let new_exprs = map_columns_before_projection(&required_expr, proj_exprs); + if new_exprs.len() == requirement.len() { + let new_request = new_exprs + .iter() + .zip(requirement.iter()) + .map(|(new, old)| PhysicalSortRequirements { + expr: new.clone(), + sort_options: old.sort_options.clone(), + }) + .collect::>(); + Some(new_request) + } else { + None + } + } else { + None + } +} + +pub fn create_sort_expr_from_requirement( + required: &[PhysicalSortRequirements], +) -> Vec { + let parent_required_expr = required + .iter() + .map(|prop| { + 
if prop.sort_options.is_some() { + PhysicalSortExpr { + expr: prop.expr.clone(), + options: prop.sort_options.unwrap().clone(), + } + } else { + PhysicalSortExpr { + expr: prop.expr.clone(), + options: SortOptions { + // By default, create sort key with ASC is true and NULLS LAST to be consistent with + // PostgreSQL's rule: https://www.postgresql.org/docs/current/queries-order.html + descending: false, + nulls_first: false, + }, + } + } + }) + .collect::>(); + parent_required_expr +} + #[cfg(test)] mod tests { From 01e87d69d1bf18f2e760872d1f664e8dc515d940 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 10 Feb 2023 20:06:56 +0800 Subject: [PATCH 02/35] Add support to optimize parallel sorting --- .../physical_optimizer/dist_enforcement.rs | 77 +++++++---- .../global_sort_selection.rs | 4 +- .../physical_optimizer/sort_enforcement.rs | 6 +- .../physical_optimizer/sort_enforcement2.rs | 128 +++++++++++------- .../sorts/sort_preserving_merge.rs | 18 +++ datafusion/core/tests/sql/joins.rs | 16 +-- datafusion/core/tests/sql/window.rs | 31 +++-- datafusion/physical-expr/src/utils.rs | 3 +- 8 files changed, 181 insertions(+), 102 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index 4fe76ac30d94c..3bc599c0cb0d2 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -30,6 +30,7 @@ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::rewrite::TreeNodeRewritable; use crate::physical_plan::sorts::sort::SortOptions; +use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::windows::WindowAggExec; use crate::physical_plan::Partitioning; use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; @@ -38,11 +39,14 @@ use 
datafusion_expr::logical_plan::JoinType; use datafusion_physical_expr::equivalence::EquivalenceProperties; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::expressions::NoOp; -use datafusion_physical_expr::utils::map_columns_before_projection; +use datafusion_physical_expr::utils::{ + create_sort_expr_from_requirement, map_columns_before_projection, +}; use datafusion_physical_expr::{ expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, AggregateExpr, PhysicalExpr, }; +use itertools::izip; use std::sync::Arc; /// The EnforceDistribution rule ensures that distribution requirements are met @@ -80,7 +84,9 @@ impl PhysicalOptimizerRule for EnforceDistribution { } else { plan }; + // Distribution enforcement needs to be applied bottom-up. + let repartition_sorts = config.optimizer.repartition_sorts; new_plan.transform_up(&{ |plan| { let adjusted = if !top_down_join_key_reordering { @@ -88,7 +94,11 @@ impl PhysicalOptimizerRule for EnforceDistribution { } else { plan }; - Ok(Some(ensure_distribution(adjusted, target_partitions)?)) + Ok(Some(ensure_distribution( + adjusted, + target_partitions, + repartition_sorts, + )?)) } }) } @@ -819,6 +829,7 @@ fn new_join_conditions( fn ensure_distribution( plan: Arc, target_partitions: usize, + repartition_sort: bool, ) -> Result> { if plan.children().is_empty() { return Ok(plan); @@ -829,31 +840,43 @@ fn ensure_distribution( assert_eq!(children.len(), required_input_distributions.len()); // Add RepartitionExec to guarantee output partitioning - let new_children: Result>> = children - .into_iter() - .zip(required_input_distributions.into_iter()) - .map(|(child, required)| { - if child - .output_partitioning() - .satisfy(required.clone(), || child.equivalence_properties()) - { - Ok(child) - } else { - let new_child: Result> = match required { - Distribution::SinglePartition - if child.output_partitioning().partition_count() > 1 => - { + let new_children: Result>> = izip!( + 
children.into_iter(), + required_input_distributions.into_iter(), + plan.required_input_ordering().into_iter(), + ) + .map(|(child, required, required_ordering)| { + if child + .output_partitioning() + .satisfy(required.clone(), || child.equivalence_properties()) + { + Ok(child) + } else { + let new_child: Result> = match required { + Distribution::SinglePartition + if child.output_partitioning().partition_count() > 1 => + { + if repartition_sort && required_ordering.is_some() { + let new_physical_ordering = create_sort_expr_from_requirement( + required_ordering.unwrap().as_ref(), + ); + Ok(Arc::new(SortPreservingMergeExec::new_for_distribuion( + new_physical_ordering, + child.clone(), + ))) + } else { Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) } - _ => { - let partition = required.create_partitioning(target_partitions); - Ok(Arc::new(RepartitionExec::try_new(child, partition)?)) - } - }; - new_child - } - }) - .collect(); + } + _ => { + let partition = required.create_partitioning(target_partitions); + Ok(Arc::new(RepartitionExec::try_new(child, partition)?)) + } + }; + new_child + } + }) + .collect(); with_new_children_if_necessary(plan, new_children?) 
} @@ -1654,6 +1677,7 @@ mod tests { let bottom_left_join = ensure_distribution( hash_join_exec(left.clone(), right.clone(), &join_on, &JoinType::Inner), 10, + false, )?; // Projection(a as A, a as AA, b as B, c as C) @@ -1684,6 +1708,7 @@ mod tests { let bottom_right_join = ensure_distribution( hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner), 10, + false, )?; // Join on (B == b1 and C == c and AA = a1) @@ -1773,6 +1798,7 @@ mod tests { let bottom_left_join = ensure_distribution( hash_join_exec(left.clone(), right.clone(), &join_on, &JoinType::Inner), 10, + false, )?; // Projection(a as A, a as AA, b as B, c as C) @@ -1803,6 +1829,7 @@ mod tests { let bottom_right_join = ensure_distribution( hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner), 10, + false, )?; // Join on (B == b1 and C == c and AA = a1) diff --git a/datafusion/core/src/physical_optimizer/global_sort_selection.rs b/datafusion/core/src/physical_optimizer/global_sort_selection.rs index 81b4b59e3a142..ec64756b8b74b 100644 --- a/datafusion/core/src/physical_optimizer/global_sort_selection.rs +++ b/datafusion/core/src/physical_optimizer/global_sort_selection.rs @@ -48,7 +48,7 @@ impl PhysicalOptimizerRule for GlobalSortSelection { fn optimize( &self, plan: Arc, - _config: &ConfigOptions, + config: &ConfigOptions, ) -> Result> { plan.transform_up(&|plan| { Ok(plan @@ -56,9 +56,9 @@ impl PhysicalOptimizerRule for GlobalSortSelection { .downcast_ref::() .and_then(|sort_exec| { if sort_exec.input().output_partitioning().partition_count() > 1 - && sort_exec.fetch().is_some() // It's already preserving the partitioning so that it can be regarded as a local sort && !sort_exec.preserve_partitioning() + && (sort_exec.fetch().is_some() || config.optimizer.repartition_sorts) { let sort = SortExec::new_with_partitioning( sort_exec.expr().to_vec(), diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 
217744d486321..14579786c2ddb 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -693,7 +693,11 @@ fn change_corresponding_coalesce_in_sub_plan( Some(sort_expr), || coalesce_input.equivalence_properties(), ) { - return add_sort_above_child(&coalesce_input, sort_expr.to_vec(), None); + return add_sort_above_child( + &coalesce_input, + sort_expr.to_vec(), + None, + ); } } coalesce_input diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index 984d817db9a63..2ef3c6111b428 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -132,6 +132,8 @@ impl PlanWithSortRequirements { true } else if self.plan.as_any().downcast_ref::().is_some() { false + } else if self.plan.as_any().downcast_ref::().is_some() { + self.plan.output_ordering().is_some() && self.impact_result_ordering } else { self.plan.maintains_input_order().iter().all(|o| *o) && self.impact_result_ordering @@ -195,8 +197,24 @@ impl PhysicalOptimizerRule for TopDownEnforceSorting { // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: let plan_requirements = PlanWithSortRequirements::init(plan); let adjusted = plan_requirements.transform_down(&ensure_sorting)?; + // Execute a Top-Down process(Preorder Traversal) to remove all the unnecessary Sort + let adjusted_plan = adjusted.plan.transform_down(&|plan| { + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + if ordering_satisfy( + sort_exec.input().output_ordering(), + sort_exec.output_ordering(), + || sort_exec.input().equivalence_properties(), + ) { + Ok(Some(Arc::new(TombStoneExec::new(sort_exec.input().clone())))) + } else { + Ok(None) + } + } else { + Ok(None) + } + })?; // Remove the TombStoneExec - let final_plan = adjusted.plan.transform_up(&|plan| { + let 
final_plan = adjusted_plan.transform_up(&|plan| { if let Some(tombstone_exec) = plan.as_any().downcast_ref::() { Ok(Some(tombstone_exec.input.clone())) } else { @@ -243,7 +261,7 @@ fn ensure_sorting( sort_exec.input().output_ordering(), sort_exec.output_ordering(), || sort_exec.input().equivalence_properties(), - ) && sort_exec.input().output_partitioning().partition_count() == 1 + ) { println!("remove sort_exec due to child already satisfy"); return Ok(Some(PlanWithSortRequirements { @@ -259,38 +277,78 @@ fn ensure_sorting( .as_any() .downcast_ref::() { - // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec - // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) - if let Some(child_sort_exec) = - sort_pres_exec.input().as_any().downcast_ref::() - { - if sort_pres_exec.expr() == child_sort_exec.expr() { - if !requirements.impact_result_ordering - && requirements.required_ordering.is_none() - { - println!("remove SortPreservingMergeExec + SortExec due to no need to keep ordering"); + if !sort_pres_exec.satisfy_distribution() { + // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec + // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) + if let Some(child_sort_exec) = + sort_pres_exec.input().as_any().downcast_ref::() + { + if sort_pres_exec.expr() == child_sort_exec.expr() { + if !requirements.impact_result_ordering + && requirements.required_ordering.is_none() + { + println!("remove SortPreservingMergeExec + SortExec due to no need to keep ordering"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![None], + })); + } else if ordering_satisfy( + child_sort_exec.input().output_ordering(), + child_sort_exec.output_ordering(), + || child_sort_exec.input().equivalence_properties(), + ) && child_sort_exec 
+ .input() + .output_partitioning() + .partition_count() + == 1 + { + println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![ + requirements.required_ordering, + ], + })); + } + } + } else { + // Remove unnecessary SortPreservingMergeExec only + if !requirements.impact_result_ordering { + println!( + "remove SortPreservingMergeExec due to no need to keep ordering" + ); return Ok(Some(PlanWithSortRequirements { plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), + sort_pres_exec.input().clone(), )), impact_result_ordering: false, required_ordering: None, - adjusted_request_ordering: vec![None], + adjusted_request_ordering: vec![requirements.required_ordering], })); } else if ordering_satisfy( - child_sort_exec.input().output_ordering(), - child_sort_exec.output_ordering(), - || child_sort_exec.input().equivalence_properties(), - ) && child_sort_exec + sort_pres_exec.input().output_ordering(), + Some(sort_pres_exec.expr()), + || sort_pres_exec.input().equivalence_properties(), + ) && sort_pres_exec .input() .output_partitioning() .partition_count() == 1 { - println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); + println!( + "remove SortPreservingMergeExec due to child already satisfy" + ); return Ok(Some(PlanWithSortRequirements { plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), + sort_pres_exec.input().clone(), )), impact_result_ordering: true, required_ordering: None, @@ -298,36 +356,6 @@ fn ensure_sorting( })); } } - } else { - // Remove unnecessary SortPreservingMergeExec only - if !requirements.impact_result_ordering { - println!( - "remove SortPreservingMergeExec due to no need to keep ordering" - ); - return Ok(Some(PlanWithSortRequirements { - plan: 
Arc::new(TombStoneExec::new(sort_pres_exec.input().clone())), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); - } else if ordering_satisfy( - sort_pres_exec.input().output_ordering(), - Some(sort_pres_exec.expr()), - || sort_pres_exec.input().equivalence_properties(), - ) && sort_pres_exec - .input() - .output_partitioning() - .partition_count() - == 1 - { - println!("remove SortPreservingMergeExec due to child already satisfy"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(sort_pres_exec.input().clone())), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); - } } } println!("no removing"); diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index 8a73061e486f0..4a0bfce0874a2 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -85,6 +85,8 @@ pub struct SortPreservingMergeExec { expr: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, + /// use SortPreservingMergeExec to satisfy the Sort Distribution + satisfy_distribution: bool } impl SortPreservingMergeExec { @@ -94,6 +96,17 @@ impl SortPreservingMergeExec { input, expr, metrics: ExecutionPlanMetricsSet::new(), + satisfy_distribution: false + } + } + + /// Create a new SortPreservingMergeExec to satisfy the Sort Distribution + pub fn new_for_distribuion(expr: Vec, input: Arc) -> Self { + Self { + input, + expr, + metrics: ExecutionPlanMetricsSet::new(), + satisfy_distribution: true } } @@ -106,6 +119,11 @@ impl SortPreservingMergeExec { pub fn expr(&self) -> &[PhysicalSortExpr] { &self.expr } + + /// satisfy the Sort Distribution requirements + pub fn satisfy_distribution(&self) -> bool{ + self.satisfy_distribution + 
} } impl ExecutionPlan for SortPreservingMergeExec { diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index c360b2f240d0a..b92242bed52b0 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1981,7 +1981,7 @@ async fn left_semi_join() -> Result<()> { let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", @@ -1998,7 +1998,7 @@ async fn left_semi_join() -> Result<()> { } else { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", @@ -2063,7 +2063,7 @@ async fn left_semi_join() -> Result<()> { let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", @@ -2079,7 +2079,7 @@ async fn left_semi_join() -> Result<()> { } else { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " 
ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " CoalesceBatchesExec: target_batch_size=4096", @@ -2260,7 +2260,7 @@ async fn right_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", @@ -2276,7 +2276,7 @@ async fn right_semi_join() -> Result<()> { } else { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " CoalesceBatchesExec: target_batch_size=4096", @@ -2308,7 +2308,7 @@ async fn right_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: 
NotEq, right: Column { name: \"t1_name\", index: 1 } }", @@ -2324,7 +2324,7 @@ async fn right_semi_join() -> Result<()> { } else { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: [t1_id@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " CoalesceBatchesExec: target_batch_size=4096", diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 89df1045039d0..91ab5c32f3977 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -1748,7 +1748,7 @@ async fn over_order_by_sort_keys_sorting_global_order_compacting() -> Result<()> let expected = { vec![ "SortPreservingMergeExec: [c2@0 ASC NULLS LAST]", - " SortExec: [c2@0 ASC NULLS LAST]", + " SortExec: [c2@0 ASC NULLS LAST], global=false", " ProjectionExec: expr=[c2@0 as c2, MAX(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as MAX(aggregate_test_100.c9), SUM(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as SUM(aggregate_test_100.c9), MIN(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as MIN(aggregate_test_100.c9)]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }]", @@ -2403,7 +2403,7 @@ async fn test_window_agg_global_sort() -> Result<()> { "SortPreservingMergeExec: [c1@0 
ASC NULLS LAST]", " ProjectionExec: expr=[c1@0 as c1, ROW_NUMBER() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]", " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }]", - " SortExec: [c1@0 ASC NULLS LAST]", + " SortExec: [c1@0 ASC NULLS LAST], global=false", " CoalesceBatchesExec: target_batch_size=8192", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -2438,11 +2438,11 @@ async fn test_window_agg_global_sort_parallelize_sort_disabled() -> Result<()> { // Only 1 SortExec was added let expected = { vec![ - "SortExec: [c1@0 ASC NULLS LAST]", + "SortExec: [c1@0 ASC NULLS LAST], global=true", " CoalescePartitionsExec", " ProjectionExec: expr=[c1@0 as c1, ROW_NUMBER() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]", " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }]", - " SortExec: [c1@0 ASC NULLS LAST]", + " SortExec: [c1@0 ASC NULLS LAST], global=false", " CoalesceBatchesExec: target_batch_size=8192", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -2480,16 +2480,17 @@ async fn test_window_agg_global_sort_intermediate_parallel_sort() -> Result<()> // Only 1 SortExec was added let expected = { vec![ - "SortExec: [c1@0 ASC NULLS LAST]", - " ProjectionExec: 
expr=[c1@0 as c1, SUM(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@2 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2]", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortPreservingMergeExec: [c9@1 ASC NULLS LAST]", - " SortExec: [c9@1 ASC NULLS LAST]", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(3)) }]", - " SortExec: [c1@0 ASC NULLS LAST,c9@1 ASC NULLS LAST]", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + "SortPreservingMergeExec: [c1@0 ASC NULLS LAST]", + " SortExec: [c1@0 ASC NULLS LAST], global=false", + " ProjectionExec: expr=[c1@0 as c1, SUM(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@2 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2]", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), 
end_bound: Following(UInt64(5)) }]", + " SortPreservingMergeExec: [c9@1 ASC NULLS LAST]", + " SortExec: [c9@1 ASC NULLS LAST], global=false", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(3)) }]", + " SortExec: [c1@0 ASC NULLS LAST,c9@1 ASC NULLS LAST], global=false", + " CoalesceBatchesExec: target_batch_size=8192", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", ] }; @@ -2524,7 +2525,7 @@ async fn test_window_agg_with_global_limit() -> Result<()> { " AggregateExec: mode=Final, gby=[], aggr=[ARRAYAGG(aggregate_test_100.c13)]", " AggregateExec: mode=Partial, gby=[], aggr=[ARRAYAGG(aggregate_test_100.c13)]", " GlobalLimitExec: skip=0, fetch=1", - " SortExec: [c13@0 ASC NULLS LAST]", + " SortExec: [c13@0 ASC NULLS LAST], global=true", " ProjectionExec: expr=[c13@0 as c13]", ] }; diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 533a606b69e85..44a33b2d45fb3 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -219,7 +219,7 @@ pub fn normalize_sort_requirement_with_equivalence_properties( } } -/// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. +/// Checks whether the required [PhysicalSortExpr]s are satisfied by the provided [PhysicalSortExpr]s. pub fn ordering_satisfy EquivalenceProperties>( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortExpr]>, @@ -269,6 +269,7 @@ fn ordering_satisfy_concrete EquivalenceProperties>( } } +/// Checks whether the required ordering requirements are satisfied by the provided [PhysicalSortExpr]s. 
pub fn ordering_satisfy_requirement EquivalenceProperties>( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortRequirements]>, From 06b40640e10a8d7bccb6a43be29bd816ed65e119 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Mon, 13 Feb 2023 11:03:13 +0800 Subject: [PATCH 03/35] fix UT --- .../src/physical_optimizer/sort_enforcement2.rs | 14 +++++++++++++- datafusion/core/tests/sql/window.rs | 12 +++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index 2ef3c6111b428..daee039817153 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -1056,7 +1056,19 @@ fn should_reverse_window_sort_requirements( top_reversed_requirement, ) } else { - false + if requirements_compatible( + top_reversed_requirement, + window_plan.required_input_ordering()[0].as_deref(), + || window_plan.equivalence_properties(), + ) || requirements_compatible( + window_plan.required_input_ordering()[0].as_deref(), + top_reversed_requirement, + || window_plan.equivalence_properties(), + ) { + true + } else { + false + } } } else if requirements_compatible( top_reversed_requirement, diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 91ab5c32f3977..2508478c29316 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -2087,12 +2087,14 @@ async fn test_window_agg_complex_plan() -> Result<()> { " SortExec: [c3@2 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true", " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", " SortExec: [c3@2 ASC NULLS LAST,c1@0 
ASC], global=true", - " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", - " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", - " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: 
WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", + " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", + " SortExec: [c3@2 ASC NULLS LAST,c1@0 DESC], global=true", + " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, 
SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", - " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", - " SortExec: [c3@2 DESC,c1@0 ASC NULLS LAST], global=true", + " SortExec: [c3@2 DESC NULLS LAST], global=true", + " WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }]", + " BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: \"SUM(null_cases.c1)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }]", + " SortExec: [c3@2 DESC,c1@0 ASC NULLS LAST], global=true", ] }; From 601a4d08ea258f35b7c2bb3d827bbb5e22032f51 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Mon, 13 Feb 2023 17:37:14 +0800 Subject: [PATCH 04/35] add more UTs to sort_enforcement2.rs --- .../physical_optimizer/sort_enforcement.rs | 30 +- .../physical_optimizer/sort_enforcement2.rs | 580 +++++++++++------- .../windows/bounded_window_agg_exec.rs | 4 +- .../physical_plan/windows/window_agg_exec.rs | 4 +- 4 files changed, 400 insertions(+), 218 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 5253a2da437fe..eb5b4789e539c 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -27,7 +27,7 @@ //! somehow get the fragment //! //! ```text -//! SortExec: [nullable_col@0 ASC] +//! SortExec: [nullable_co l@0 ASC] //! SortExec: [non_nullable_col@1 ASC] //! ``` //! 
@@ -1185,6 +1185,34 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_remove_unnecessary_sort1() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), source); + let spm = sort_preserving_merge_exec(sort_exprs, sort); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), spm); + let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_remove_unnecessary_sort2() -> Result<()> { let schema = create_test_schema()?; diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index a4f9a27e32327..b624fd762dce0 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -25,8 +25,12 @@ //! //! A non-realistic but easy to follow example for sort removals: Assume that we //! somehow get the fragment -//! "SortExec: [nullable_col@0 ASC]", -//! " SortExec: [non_nullable_col@1 ASC]", +//! +//! ```text +//! SortExec: [nullable_co l@0 ASC] +//! SortExec: [non_nullable_col@1 ASC] +//! ``` +//! //! in the physical plan. The first sort is unnecessary since its result is overwritten //! 
by another SortExec. Therefore, this rule removes it from the physical plan. use crate::config::ConfigOptions; @@ -278,49 +282,47 @@ fn ensure_sorting( .as_any() .downcast_ref::() { - if !sort_pres_exec.satisfy_distribution() { - // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec - // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) - if let Some(child_sort_exec) = - sort_pres_exec.input().as_any().downcast_ref::() - { - if sort_pres_exec.expr() == child_sort_exec.expr() { - if !requirements.impact_result_ordering - && requirements.required_ordering.is_none() - { - println!("remove SortPreservingMergeExec + SortExec due to no need to keep ordering"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), - )), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: vec![None], - })); - } else if ordering_satisfy( - child_sort_exec.input().output_ordering(), - child_sort_exec.output_ordering(), - || child_sort_exec.input().equivalence_properties(), - ) && child_sort_exec - .input() - .output_partitioning() - .partition_count() - == 1 - { - println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), - )), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![ - requirements.required_ordering, - ], - })); - } + // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec + // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) + if let Some(child_sort_exec) = + sort_pres_exec.input().as_any().downcast_ref::() + { + if sort_pres_exec.expr() == child_sort_exec.expr() { + if !requirements.impact_result_ordering + && requirements.required_ordering.is_none() + { + println!("remove SortPreservingMergeExec + 
SortExec due to no need to keep ordering"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + impact_result_ordering: false, + required_ordering: None, + adjusted_request_ordering: vec![None], + })); + } else if ordering_satisfy( + child_sort_exec.input().output_ordering(), + child_sort_exec.output_ordering(), + || child_sort_exec.input().equivalence_properties(), + ) && child_sort_exec + .input() + .output_partitioning() + .partition_count() + == 1 + { + println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); + return Ok(Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new( + child_sort_exec.input().clone(), + )), + impact_result_ordering: true, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering], + })); } - } else { + } + } else { + if !sort_pres_exec.satisfy_distribution() { // Remove unnecessary SortPreservingMergeExec only if !requirements.impact_result_ordering { println!( @@ -1198,6 +1200,13 @@ mod tests { Ok(schema) } + // Util function to get string representation of a physical plan + fn get_plan_string(plan: &Arc) -> Vec { + let formatted = displayable(plan.as_ref()).indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + actual.iter().map(|elem| elem.to_string()).collect() + } + #[tokio::test] async fn test_is_column_aligned_nullable() -> Result<()> { let schema = create_test_schema()?; @@ -1307,11 +1316,8 @@ mod tests { // Run the actual optimizer let optimized_physical_plan = TopDownEnforceSorting::new().optimize(physical_plan, state.config_options())?; - - let formatted = displayable(optimized_physical_plan.as_ref()) - .indent() - .to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); + // Get string representation of the plan + let actual = get_plan_string(&optimized_physical_plan); assert_eq!( expected_optimized_lines, actual, "\n**Optimized 
Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n" @@ -1321,7 +1327,27 @@ mod tests { } #[tokio::test] - async fn test_not_remove_sort_window_multilayer() -> Result<()> { + async fn test_remove_unnecessary_sort() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); + + let expected_input = vec![ + "SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); @@ -1347,6 +1373,8 @@ mod tests { )]; let sort = sort_exec(sort_exprs.clone(), window_agg); + + // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before let filter = filter_exec( Arc::new(NotExpr::new( col("non_nullable_col", schema.as_ref()).unwrap(), @@ -1366,14 +1394,14 @@ mod tests { " MemoryExec: partitions=0, partition_sizes=[]", ]; - // let expected_optimized = vec![ - // "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - // " FilterExec: NOT non_nullable_col@1", - // " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: 
Preceding(NULL), end_bound: CurrentRow }]", - // " SortExec: [non_nullable_col@1 DESC]", - // " MemoryExec: partitions=0, partition_sizes=[]", - // ]; - assert_optimized!(expected_input, expected_input, physical_plan); + let expected_optimized = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " FilterExec: NOT non_nullable_col@1", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: [non_nullable_col@1 DESC]", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } @@ -1437,57 +1465,6 @@ mod tests { #[tokio::test] async fn test_remove_unnecessary_sort1() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - - let expected_input = vec![ - "SortExec: [nullable_col@0 ASC], global=true", - " SortExec: [non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - // Keep the top SortExec - let expected_optimized = [ - "SortExec: [nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort2() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let input2 = sort_exec( - vec![ - 
sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ], - input, - ); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); - - let expected_input = vec![ - "SortExec: [nullable_col@0 ASC], global=true", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortExec: [non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - // Keep the middle SortExec - let expected_optimized = [ - "SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort3() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); let sort_exprs = vec![sort_expr("nullable_col", &schema)]; @@ -1514,7 +1491,7 @@ mod tests { } #[tokio::test] - async fn test_remove_unnecessary_sort4() -> Result<()> { + async fn test_remove_unnecessary_sort2() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); let sort_exprs = vec![sort_expr("non_nullable_col", &schema)]; @@ -1529,22 +1506,23 @@ mod tests { let spm2 = sort_preserving_merge_exec(sort_exprs, sort2); let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort3 = sort_exec(sort_exprs.clone(), spm2); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort3); + let sort3 = sort_exec(sort_exprs, spm2); + let physical_plan = repartition_exec(repartition_exec(sort3)); let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: [nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: [non_nullable_col@1 ASC], global=true", - " 
MemoryExec: partitions=0, partition_sizes=[]", + "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " SortExec: [nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; - // Keep the middle SortPreservingMergeExec + SortExec + let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1552,7 +1530,7 @@ mod tests { } #[tokio::test] - async fn test_remove_unnecessary_sort5() -> Result<()> { + async fn test_remove_unnecessary_sort3() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); let sort_exprs = vec![sort_expr("non_nullable_col", &schema)]; @@ -1563,33 +1541,92 @@ mod tests { sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort2 = sort_exec(sort_exprs.clone(), spm); + let repartition_exec = repartition_exec(spm); + let sort2 = sort_exec(sort_exprs.clone(), repartition_exec); let spm2 = sort_preserving_merge_exec(sort_exprs, sort2); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort3 = sort_exec(sort_exprs.clone(), spm2); - let physical_plan = repartition_exec(repartition_exec(sort3)); + let physical_plan = aggregate_exec(spm2); + // When removing a `SortPreservingMergeExec`, make sure that 
partitioning + // requirements are not violated. In some cases, we may need to replace + // it with a `CoalescePartitionsExec` instead of directly removing it. let expected_input = vec![ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: [nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: [non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "AggregateExec: mode=Final, gby=[], aggr=[]", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + "AggregateExec: mode=Final, gby=[], aggr=[]", + " CoalescePartitionsExec", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_sort4() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), source); + let spm = sort_preserving_merge_exec(sort_exprs, sort); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + 
let sort = sort_exec(sort_exprs.clone(), spm); + let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: [nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } + #[tokio::test] + async fn test_remove_unnecessary_sort5() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let input2 = sort_exec( + vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ], + input, + ); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); + + let expected_input = vec![ + "SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: [non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + // Keep the middle SortExec + let expected_optimized = [ + "SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_remove_unnecessary_spm1() -> Result<()> { let schema = create_test_schema()?; @@ -1643,6 +1680,52 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_do_not_remove_sort_with_limit() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs = vec![ + 
sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort = sort_exec(sort_exprs.clone(), source1); + let limit = limit_exec(sort); + + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); + + let union = union_exec(vec![source2, limit]); + let repartition = repartition_exec(union); + let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + + // We should keep the bottom `SortExec`. 
+ let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_change_wrong_sorting() -> Result<()> { let schema = create_test_schema()?; @@ -1799,32 +1882,40 @@ mod tests { let schema = create_test_schema()?; let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; - let sort_exprs2 = vec![ + let sort_exprs1 = vec![ sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort = sort_exec(sort_exprs1.clone(), source1); + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + let sort2 = sort_exec(sort_exprs2, source1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); - let union = union_exec(vec![source2, sort]); - let physical_plan = sort_preserving_merge_exec(sort_exprs2, union); + let union = union_exec(vec![sort1, source2, sort2]); + let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); - // Input is an invalid plan. In this case rule should add required sorting in appropriate places. 
+ // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above). + // Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above). + // Third input to the union is not Sorted (SortExec is matches required ordering by the SortPreservingMergeExec above). let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - // expect to replace the wrong SortExec with the correct one + // should adjust sorting in the first input of the union such that it is not unnecessarily fine let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: [nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], 
projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1840,36 +1931,36 @@ mod tests { sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - let sort2 = sort_exec(sort_exprs2, source1); + let sort1 = sort_exec(sort_exprs2.clone(), source1.clone()); + let sort2 = sort_exec(sort_exprs2.clone(), source1); - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); + let source2 = parquet_exec_sorted(&schema, sort_exprs2); let union = union_exec(vec![sort1, source2, sort2]); - let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); + let physical_plan = sort_preserving_merge_exec(sort_exprs1, union); - // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above). - // Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above). - // Third input to the union is not Sorted (SortExec is matches required ordering by the SortPreservingMergeExec above). + // Ordering requirement of the `SortPreservingMergeExec` is not met. + // Should modify the plan to ensure that all three inputs to the + // `UnionExec` satisfy the ordering, OR add a single sort after + // the `UnionExec` (both of which are equally good for this example). 
let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - // should adjust sorting in the first input of the union such that it is not unnecessarily fine let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC], global=true", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1885,44 +1976,101 @@ mod tests { sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = 
sort_exec(sort_exprs2.clone(), source1.clone()); - let sort2 = sort_exec(sort_exprs2.clone(), source1); - - let source2 = parquet_exec_sorted(&schema, sort_exprs2); + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort2 = sort_exec(sort_exprs2, source1); - let union = union_exec(vec![sort1, source2, sort2]); - let physical_plan = sort_preserving_merge_exec(sort_exprs1, union); + let union = union_exec(vec![sort1, sort2]); + let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above). - // Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above). - // Third input to the union is not Sorted (SortExec is matches required ordering by the SortPreservingMergeExec above). + // The `UnionExec` doesn't preserve any of the inputs ordering in the + // example below. However, we should be able to change the unnecessarily + // fine `SortExec`s below with required `SortExec`s that are absolutely necessary. 
let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " SortExec: [nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - // should adjust sorting in the first input of the union such that it is not unnecessarily fine + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted6() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let repartition = repartition_exec(source1); + let spm = sort_preserving_merge_exec(sort_exprs2, repartition); + + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); + + let union = union_exec(vec![sort1, source2, spm]); + let physical_plan = 
sort_preserving_merge_exec(parquet_sort_exprs, union); + + // The plan is not valid as it is -- the input ordering requirement + // of the `SortPreservingMergeExec` under the third child of the + // `UnionExec` is not met. We should add a `SortExec` below it. + // At the same time, this ordering requirement is unnecessarily fine. + // The final plan should be valid AND the ordering of the third child + // shouldn't be finer than necessary. + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: [nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + // Should adjust the requirement in the third input of the union so + // that it is not unnecessarily fine. 
let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " UnionExec", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: [nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } #[tokio::test] - async fn test_union_inputs_different_sorted6() -> Result<()> { + async fn test_union_inputs_different_sorted7() -> Result<()> { let schema = create_test_schema()?; let source1 = parquet_exec(&schema); @@ -1963,46 +2111,48 @@ mod tests { } #[tokio::test] - async fn test_do_not_remove_sort_with_limit() -> Result<()> { + async fn test_window_multi_path_sort() -> Result<()> { let schema = create_test_schema()?; - let source1 = parquet_exec(&schema); - let sort_exprs = vec![ + let sort_exprs1 = vec![ sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort = 
sort_exec(sort_exprs.clone(), source1); - let limit = limit_exec(sort); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + // reverse sorting of sort_exprs2 + let reversed_sort_exprs2 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; + let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source2 = parquet_exec_sorted(&schema, sort_exprs2); + let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); + let sort2 = sort_exec(reversed_sort_exprs2.clone(), source2); - let union = union_exec(vec![source2, limit]); - let repartition = repartition_exec(union); - let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); + let union = union_exec(vec![sort1, sort2]); + let physical_plan = window_exec("nullable_col", reversed_sort_exprs2, union); + // The `WindowAggExec` gets its sorting from multiple children jointly. + // During the removal of `SortExec`s, it should be able to remove the + // corresponding SortExecs together. Also, the inputs of these `SortExec`s + // are not necessarily the same to be able to remove them. 
let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " UnionExec", + " SortExec: [nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 DESC NULLS LAST], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - - // expect to keep the bottom SortExec let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { 
units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 6222561bb1a27..2affa9f364109 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -124,7 +124,9 @@ impl BoundedWindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self.output_ordering().unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); + let sort_keys = self + .output_ordering() + .unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index 7ab9951ae107c..16404579233ca 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -115,7 +115,9 @@ impl WindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self.output_ordering().unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); + let sort_keys = self + .output_ordering() + 
.unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); From 5285a3ef2ba51cfc8856fb6d1232368fdfbaa3eb Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Wed, 15 Feb 2023 16:11:51 +0800 Subject: [PATCH 05/35] refine codebase --- datafusion/core/src/dataframe.rs | 8 +- datafusion/core/src/execution/context.rs | 1 - .../physical_optimizer/dist_enforcement.rs | 19 +- .../physical_optimizer/sort_enforcement2.rs | 970 ++++++++++-------- datafusion/core/src/physical_plan/planner.rs | 1 - .../sorts/sort_preserving_merge.rs | 21 - .../windows/bounded_window_agg_exec.rs | 5 +- .../physical_plan/windows/window_agg_exec.rs | 4 +- datafusion/physical-expr/src/utils.rs | 13 +- datafusion/sql/src/statement.rs | 8 +- .../substrait/src/logical_plan/consumer.rs | 122 ++- 11 files changed, 618 insertions(+), 554 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index 26fe5c0512049..883f203bbd2b3 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -459,9 +459,11 @@ impl DataFrame { .and_then(|r| r.columns().first()) .and_then(|c| c.as_any().downcast_ref::()) .and_then(|a| a.values().first()) - .ok_or(DataFusionError::Internal( - "Unexpected output when collecting for count()".to_string(), - ))? as usize; + .ok_or_else(|| { + DataFusionError::Internal( + "Unexpected output when collecting for count()".to_string(), + ) + })? 
as usize; Ok(len) } diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 992b2291ced00..4fd55ea571828 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -1089,7 +1089,6 @@ impl QueryPlanner for DefaultQueryPlanner { session_state: &SessionState, ) -> Result> { let planner = DefaultPhysicalPlanner::default(); - println!("optimized logical plan {:?}", logical_plan); planner .create_physical_plan(logical_plan, session_state) .await diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index 3bc599c0cb0d2..222b0f5e1a925 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -856,14 +856,17 @@ fn ensure_distribution( Distribution::SinglePartition if child.output_partitioning().partition_count() > 1 => { - if repartition_sort && required_ordering.is_some() { - let new_physical_ordering = create_sort_expr_from_requirement( - required_ordering.unwrap().as_ref(), - ); - Ok(Arc::new(SortPreservingMergeExec::new_for_distribuion( - new_physical_ordering, - child.clone(), - ))) + if repartition_sort { + if let Some(ordering) = required_ordering { + let new_physical_ordering = + create_sort_expr_from_requirement(ordering.as_ref()); + Ok(Arc::new(SortPreservingMergeExec::new( + new_physical_ordering, + child.clone(), + ))) + } else { + Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) + } } else { Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) } diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index b624fd762dce0..25c407e2a7f72 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -31,13 +31,14 @@ //! 
SortExec: [non_nullable_col@1 ASC] //! ``` //! -//! in the physical plan. The first sort is unnecessary since its result is overwritten -//! by another SortExec. Therefore, this rule removes it from the physical plan. +//! in the physical plan. The child sort is unnecessary since its result is overwritten +//! by the parent SortExec. Therefore, this rule removes it from the physical plan. use crate::config::ConfigOptions; use crate::error::Result; use crate::execution::context::TaskContext; use crate::physical_optimizer::utils::add_sort_above_child; use crate::physical_optimizer::PhysicalOptimizerRule; +use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::joins::utils::JoinSide; use crate::physical_plan::joins::SortMergeJoinExec; @@ -50,7 +51,7 @@ use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{ - displayable, with_new_children_if_necessary, DisplayFormatType, ExecutionPlan, + with_new_children_if_necessary, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SendableRecordBatchStream, }; use arrow::datatypes::SchemaRef; @@ -60,15 +61,16 @@ use datafusion_physical_expr::utils::{ create_sort_expr_from_requirement, map_requirement_before_projection, ordering_satisfy, ordering_satisfy_requirement, requirements_compatible, }; +use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::{ - EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, + new_sort_requirements, EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, + PhysicalSortRequirements, }; use itertools::izip; use std::any::Any; -use std::iter::zip; use std::sync::Arc; -/// This rule inspects SortExec's in the given physical plan and removes the +/// This rule implements a 
TOP-Downinspects SortExec's in the given physical plan and removes the /// ones it can prove unnecessary. #[derive(Default)] pub struct TopDownEnforceSorting {} @@ -87,6 +89,8 @@ struct PlanWithSortRequirements { plan: Arc, /// Whether the plan could impact the final result ordering impact_result_ordering: bool, + /// Parent has the SinglePartition requirement to children + satisfy_single_distribution: bool, /// Parent required sort ordering required_ordering: Option>, /// The adjusted request sort ordering to children. @@ -97,13 +101,14 @@ struct PlanWithSortRequirements { impl PlanWithSortRequirements { pub fn init(plan: Arc) -> Self { let impact_result_ordering = plan.output_ordering().is_some() - || plan.output_partitioning().partition_count() == 1 + || plan.output_partitioning().partition_count() <= 1 || plan.as_any().downcast_ref::().is_some() || plan.as_any().downcast_ref::().is_some(); let request_ordering = plan.required_input_ordering(); PlanWithSortRequirements { plan, impact_result_ordering, + satisfy_single_distribution: false, required_ordering: None, adjusted_request_ordering: request_ordering, } @@ -114,6 +119,7 @@ impl PlanWithSortRequirements { PlanWithSortRequirements { plan, impact_result_ordering: false, + satisfy_single_distribution: false, required_ordering: None, adjusted_request_ordering: request_ordering, } @@ -122,44 +128,43 @@ impl PlanWithSortRequirements { pub fn children(&self) -> Vec { let plan_children = self.plan.children(); assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); - let child_impact_result_ordering = if self - .plan - .as_any() - .downcast_ref::() - .is_some() - || self - .plan - .as_any() - .downcast_ref::() - .is_some() - { - true - } else if self.plan.as_any().downcast_ref::().is_some() { - false - } else if self.plan.as_any().downcast_ref::().is_some() { - self.plan.output_ordering().is_some() && self.impact_result_ordering - } else { - self.plan.maintains_input_order().iter().all(|o| *o) - && 
self.impact_result_ordering - }; - println!( - "child_impact_result_ordering {:?}", - child_impact_result_ordering - ); - plan_children - .into_iter() - .zip(self.adjusted_request_ordering.clone().into_iter()) - .map(|(child, required)| { - let from_parent = required; + + izip!( + plan_children.into_iter(), + self.adjusted_request_ordering.clone().into_iter(), + self.plan.maintains_input_order().into_iter(), + self.plan.required_input_distribution().into_iter(), + ) + .map( + |(child, from_parent, maintains_input_order, required_dist)| { + let child_satisfy_single_distribution = + matches!(required_dist, Distribution::SinglePartition); + let child_impact_result_ordering = if self + .plan + .as_any() + .downcast_ref::() + .is_some() + || self + .plan + .as_any() + .downcast_ref::() + .is_some() + { + true + } else { + maintains_input_order && self.impact_result_ordering + }; let child_request_ordering = child.required_input_ordering(); PlanWithSortRequirements { plan: child, impact_result_ordering: child_impact_result_ordering, + satisfy_single_distribution: child_satisfy_single_distribution, required_ordering: from_parent, adjusted_request_ordering: child_request_ordering, } - }) - .collect() + }, + ) + .collect() } } @@ -185,6 +190,7 @@ impl TreeNodeRewritable for PlanWithSortRequirements { Ok(PlanWithSortRequirements { plan, impact_result_ordering: self.impact_result_ordering, + satisfy_single_distribution: self.satisfy_single_distribution, required_ordering: self.required_ordering, adjusted_request_ordering: self.adjusted_request_ordering, }) @@ -242,40 +248,10 @@ impl PhysicalOptimizerRule for TopDownEnforceSorting { fn ensure_sorting( requirements: PlanWithSortRequirements, ) -> Result> { - println!( - "=== Current plan ===\n{}\n", - displayable(requirements.plan.as_ref()).indent() - ); - println!( - "impact_result_ordering: {:?}, parent required_ordering {:?}, adjusted request ordering {:?}", - requirements.impact_result_ordering, 
requirements.required_ordering, requirements.adjusted_request_ordering, - ); if let Some(sort_exec) = requirements.plan.as_any().downcast_ref::() { - // Remove unnecessary global SortExec - if !sort_exec.preserve_partitioning() { - if !requirements.impact_result_ordering - && requirements.required_ordering.is_none() - { - println!("remove sort_exec due to no need to keep ordering"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: vec![None], - })); - } else if ordering_satisfy( - sort_exec.input().output_ordering(), - sort_exec.output_ordering(), - || sort_exec.input().equivalence_properties(), - ) { - println!("remove sort_exec due to child already satisfy"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); - } + // Remove unnecessary SortExec(local/global) + if let Some(result) = analyze_immediate_sort_removal(&requirements, sort_exec) { + return Ok(Some(result)); } } else if let Some(sort_pres_exec) = requirements .plan @@ -288,96 +264,55 @@ fn ensure_sorting( sort_pres_exec.input().as_any().downcast_ref::() { if sort_pres_exec.expr() == child_sort_exec.expr() { - if !requirements.impact_result_ordering - && requirements.required_ordering.is_none() + if let Some(result) = + analyze_immediate_sort_removal(&requirements, child_sort_exec) { - println!("remove SortPreservingMergeExec + SortExec due to no need to keep ordering"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), - )), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: vec![None], - })); - } else if ordering_satisfy( - child_sort_exec.input().output_ordering(), - 
child_sort_exec.output_ordering(), - || child_sort_exec.input().equivalence_properties(), - ) && child_sort_exec - .input() - .output_partitioning() - .partition_count() - == 1 - { - println!("remove SortPreservingMergeExec + SortExec due to child already satisfy"); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - child_sort_exec.input().clone(), - )), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); + return Ok(Some(result)); } } - } else { - if !sort_pres_exec.satisfy_distribution() { - // Remove unnecessary SortPreservingMergeExec only - if !requirements.impact_result_ordering { - println!( - "remove SortPreservingMergeExec due to no need to keep ordering" - ); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - sort_pres_exec.input().clone(), - )), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); - } else if ordering_satisfy( - sort_pres_exec.input().output_ordering(), - Some(sort_pres_exec.expr()), - || sort_pres_exec.input().equivalence_properties(), - ) && sort_pres_exec - .input() - .output_partitioning() - .partition_count() - == 1 - { - println!( - "remove SortPreservingMergeExec due to child already satisfy" - ); - return Ok(Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new( - sort_pres_exec.input().clone(), - )), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering], - })); - } + } else if !requirements.satisfy_single_distribution + || sort_pres_exec + .input() + .output_partitioning() + .partition_count() + <= 1 + { + if let Some(result) = + analyze_immediate_spm_removal(&requirements, sort_pres_exec) + { + return Ok(Some(result)); } } } - println!("no removing"); let plan = &requirements.plan; let parent_required = 
requirements.required_ordering.as_deref(); if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { plan.equivalence_properties() }) { - // Can satisfy the parent requirements, clear the requirements - println!( - "Can satisfy the parent requirements, impact_result_ordering {:?}", - requirements.impact_result_ordering - ); - if plan.as_any().downcast_ref::().is_some() + // Can satisfy the parent requirements, change the adjusted_request_ordering for UnionExec and WindowAggExec(BoundedWindowAggExec) + if let Some(union_exec) = plan.as_any().downcast_ref::() { + // UnionExec does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and + // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec + let adjusted = new_sort_requirements(union_exec.output_ordering()); + return Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: requirements.impact_result_ordering, + satisfy_single_distribution: requirements.satisfy_single_distribution, + required_ordering: None, + adjusted_request_ordering: vec![ + adjusted; + requirements + .adjusted_request_ordering + .len() + ], + })); + } else if plan.as_any().downcast_ref::().is_some() || plan .as_any() .downcast_ref::() .is_some() { + // WindowAggExec(BoundedWindowAggExec) might reverse their sort requirements let request_child = requirements.adjusted_request_ordering[0].as_deref(); let reversed_request_child = reverse_window_sort_requirements(request_child); @@ -386,27 +321,12 @@ fn ensure_sorting( request_child, reversed_request_child.as_deref(), ) { - println!("Should reverse top window sort_requirements"); - let (window_expr, input_schema, partition_keys) = if let Some(exec) = - plan.as_any().downcast_ref::() - { - ( - exec.window_expr(), - exec.input_schema(), - exec.partition_keys.clone(), - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - ( 
- exec.window_expr(), - exec.input_schema(), - exec.partition_keys.clone(), - ) - } else { - return Err(DataFusionError::Plan( - "Expects to receive either WindowAggExec of BoundedWindowAggExec" - .to_string(), - )); - }; + let WindowExecInfo { + window_expr, + input_schema, + partition_keys, + } = extract_window_info_from_plan(plan).unwrap(); + let new_window_expr = window_expr .iter() .map(|e| e.get_reverse_expr()) @@ -426,7 +346,7 @@ fn ensure_sorting( input_schema, partition_keys, Some(new_physical_ordering), - )?) as _ + )?) as Arc } else { Arc::new(WindowAggExec::try_new( window_expr, @@ -434,37 +354,28 @@ fn ensure_sorting( input_schema, partition_keys, Some(new_physical_ordering), - )?) as _ + )?) as Arc }; - println!("Reverse WindowAggExec expressions and push down the reversed requirements"); - return Ok(Some(PlanWithSortRequirements { plan: new_plan, impact_result_ordering: false, + satisfy_single_distribution: requirements + .satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![reversed_request_child], })); } - } else { - println!("Should not reverse top window sort_requirements"); } - } else if let Some(_) = plan.as_any().downcast_ref::() { - return Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: false, - required_ordering: None, - adjusted_request_ordering: requirements.adjusted_request_ordering.clone(), - })); } - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: requirements.impact_result_ordering, + satisfy_single_distribution: requirements.satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: requirements.adjusted_request_ordering, - })); + })) } else if let Some(sort_exec) = plan.as_any().downcast_ref::() { - println!("Modify current SortExec to satisfy the parent requirements"); - // If the current plan is a SortExec, update the SortExec to satisfy the parent requirements + // If 
the current plan is a SortExec, modify current SortExec to satisfy the parent requirements let parent_required_expr = create_sort_expr_from_requirement(parent_required.unwrap()); let new_plan = add_sort_above_child( @@ -472,12 +383,11 @@ fn ensure_sorting( parent_required_expr, sort_exec.fetch(), )?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )); + )) } else { - println!("Can not satisfy the parent requirements, try to push down"); - // Can not satisfy the parent requirements, check whether should push down the requirements. Add new SortExec when the parent requirements can not be pushed down + // Can not satisfy the parent requirements, check whether the requirements can be pushed down. If not, add new SortExec. let parent_required_expr = create_sort_expr_from_requirement(parent_required.unwrap()); let maintains_input_order = plan.maintains_input_order(); @@ -493,158 +403,88 @@ fn ensure_sorting( && plan.as_any().downcast_ref::().is_none() { let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )); - } else if let Some(window_agg_exec) = - plan.as_any().downcast_ref::() + )) + } else if plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() { - let window_expr = window_agg_exec.window_expr(); let request_child = requirements.adjusted_request_ordering[0].as_deref(); if requirements_compatible(request_child, parent_required, || { plan.children()[0].equivalence_properties() }) { - println!("WindowAggExec child requirements are more specific, no need to add SortExec"); - return Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: requirements.adjusted_request_ordering, - })); + // request child requirements are more specific, no need to push down the parent 
requirements + Ok(None) } else if requirements_compatible(parent_required, request_child, || { plan.children()[0].equivalence_properties() }) { - println!("Parent requirements are more specific, adjust WindowAggExec child requirements and push down the requirements"); + // parent requirements are more specific, adjust the request child requirements and push down the new requirements let adjusted = parent_required.map(|r| r.to_vec()); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: true, + satisfy_single_distribution: requirements.satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![adjusted], - })); + })) } else { - let should_reverse = can_reverse_window_request( - window_expr[0].partition_by(), + let WindowExecInfo { + window_expr, + input_schema, + partition_keys, + } = extract_window_info_from_plan(plan).unwrap(); + if should_reverse_window_exec( parent_required, request_child, - &window_agg_exec.input().schema(), - ); - if should_reverse { - let new_window_expr = window_expr - .iter() - .map(|e| e.get_reverse_expr()) - .collect::>>(); - if let Some(window_expr) = new_window_expr { - let new_plan = Arc::new(WindowAggExec::try_new( - window_expr, - window_agg_exec.children()[0].clone(), - window_agg_exec.input_schema(), - window_agg_exec.partition_keys.clone(), - Some(parent_required_expr.to_vec()), - )?) 
as _; - println!("Reverse WindowAggExec expressions and push down the requirements"); - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); - } else { - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); - } - } else { - // Can not push down, add new SortExec - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); - } - } - } else if let Some(window_agg_exec) = - plan.as_any().downcast_ref::() - { - let window_expr = window_agg_exec.window_expr(); - let request_child = &plan.required_input_ordering()[0]; - if requirements_compatible(request_child.as_deref(), parent_required, || { - plan.children()[0].equivalence_properties() - }) { - println!("BoundedWindowAggExec child requirements are more specific, no need to add SortExec"); - return Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: requirements.adjusted_request_ordering, - })); - } else if requirements_compatible( - parent_required, - request_child.as_deref(), - || plan.children()[0].equivalence_properties(), - ) { - println!("Parent requirements are more specific, adjust BoundedWindowAggExec child requirements and push down the requirements"); - let adjusted = parent_required.map(|r| r.to_vec()); - return Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - required_ordering: None, - adjusted_request_ordering: vec![adjusted], - })); - } else { - let should_reverse = can_reverse_window_request( - window_expr[0].partition_by(), - parent_required, - request_child.as_deref(), - 
&window_agg_exec.input().schema(), - ); - if should_reverse { + &input_schema, + ) { + let new_physical_ordering = parent_required_expr.to_vec(); let new_window_expr = window_expr .iter() .map(|e| e.get_reverse_expr()) .collect::>>(); if let Some(window_expr) = new_window_expr { - let new_plan = Arc::new(BoundedWindowAggExec::try_new( - window_expr, - window_agg_exec.children()[0].clone(), - window_agg_exec.input_schema(), - window_agg_exec.partition_keys.clone(), - Some(parent_required_expr.to_vec()), - )?) as _; - println!("Reverse BoundedWindowAggExec expressions and push down the requirements"); - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); - } else { - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); + let uses_bounded_memory = + window_expr.iter().all(|e| e.uses_bounded_memory()); + let new_plan = if uses_bounded_memory { + Arc::new(BoundedWindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) as Arc + } else { + Arc::new(WindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) 
as Arc + }; + let adjusted_request_ordering = + new_plan.required_input_ordering(); + return Ok(Some(PlanWithSortRequirements { + plan: new_plan, + impact_result_ordering: false, + satisfy_single_distribution: requirements + .satisfy_single_distribution, + required_ordering: None, + adjusted_request_ordering, + })); } - } else { - // Can not push down, add new SortExec - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )); } + // Can not push down requirements, add new SortExec + let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; + Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )) } } else if let Some(smj) = plan.as_any().downcast_ref::() { // If the current plan is SortMergeJoinExec @@ -659,23 +499,25 @@ fn ensure_sorting( || plan.children()[0].equivalence_properties(), ) { println!("Requirements are compatible with SMJ"); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: true, + satisfy_single_distribution: requirements + .satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: requirements .adjusted_request_ordering, - })); + })) } else { // Can not push down, add new SortExec println!("Can not push down, add new SortExec"); let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, ), - )); + )) } } Some(JoinSide::Right) if maintains_input_order[1] => { @@ -687,34 +529,36 @@ fn ensure_sorting( || plan.children()[1].equivalence_properties(), ) { println!("Requirements are compatible with SMJ"); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), 
impact_result_ordering: true, + satisfy_single_distribution: requirements + .satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: requirements .adjusted_request_ordering, - })); + })) } else { // Can not push down, add new SortExec println!("Can not push down, add new SortExec"); let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, ), - )); + )) } } _ => { println!("Can not decide the expr side for SortMergeJoinExec, can not push down, add SortExec"); let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, ), - )); + )) } } } else if plan.required_input_ordering().iter().any(Option::is_some) { @@ -740,20 +584,15 @@ fn ensure_sorting( ) }) .collect::>(); - println!( - "plan.equivalence_properties() {:?}", - plan.equivalence_properties() - ); - println!("compatible_with_children {:?}", compatible_with_children); if compatible_with_children.iter().all(|a| *a) { // Requirements are compatible, not need to push down. 
- println!("Requirements are compatible, no need to push down"); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: true, + satisfy_single_distribution: requirements.satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: requirements.adjusted_request_ordering, - })); + })) } else { let can_adjust_child_requirements = plan .required_input_ordering() @@ -771,26 +610,28 @@ fn ensure_sorting( // Adjust child requirements and push down the requirements println!("Adjust child requirements and push down the requirements"); let adjusted = parent_required.map(|r| r.to_vec()); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: true, + satisfy_single_distribution: requirements + .satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![ adjusted; can_adjust_child_requirements .len() ], - })); + })) } else { // Can not push down, add new SortExec println!("Can not push down, add new SortExec"); let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, ), - )); + )) } } } else { @@ -807,6 +648,8 @@ fn ensure_sorting( Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: true, + satisfy_single_distribution: requirements + .satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![new_requirement], })) @@ -817,30 +660,113 @@ fn ensure_sorting( ); let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; - return Ok(Some( + Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, ), - )); + )) } } else { println!("Push down requirements."); - return Ok(Some(PlanWithSortRequirements { + Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: 
requirements.impact_result_ordering, required_ordering: None, + satisfy_single_distribution: requirements.satisfy_single_distribution, adjusted_request_ordering: vec![ requirements.required_ordering; requirements .adjusted_request_ordering .len() ], - })); + })) } } } } +/// Analyzes a given `Sort` (`plan`) to determine whether the Sort can be removed: +/// 1) The input already has a finer ordering than this `Sort` enforces. +/// 2) The `Sort` does not impact the final result ordering. +fn analyze_immediate_sort_removal( + requirements: &PlanWithSortRequirements, + sort_exec: &SortExec, +) -> Option { + if ordering_satisfy( + sort_exec.input().output_ordering(), + sort_exec.output_ordering(), + || sort_exec.input().equivalence_properties(), + ) { + Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), + impact_result_ordering: requirements.impact_result_ordering, + satisfy_single_distribution: requirements.satisfy_single_distribution, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering.clone()], + }) + } + // Remove unnecessary SortExec + else if !requirements.impact_result_ordering { + if requirements.satisfy_single_distribution + && !sort_exec.preserve_partitioning() + && sort_exec.input().output_partitioning().partition_count() > 1 + { + Some(PlanWithSortRequirements { + plan: Arc::new(CoalescePartitionsExec::new(sort_exec.input().clone())), + impact_result_ordering: false, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering.clone()], + }) + } else { + Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), + impact_result_ordering: false, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering.clone()], + }) + } + } else { + None + } +} + +/// Analyzes a given `SortPreservingMergeExec` 
(`plan`) to determine whether the SortPreservingMergeExec can be removed: +/// 1) The input already has a finer ordering than this `SortPreservingMergeExec` enforces. +/// 2) The `SortPreservingMergeExec` does not impact the final result ordering. +fn analyze_immediate_spm_removal( + requirements: &PlanWithSortRequirements, + spm_exec: &SortPreservingMergeExec, +) -> Option { + if ordering_satisfy( + spm_exec.input().output_ordering(), + Some(spm_exec.expr()), + || spm_exec.input().equivalence_properties(), + ) && spm_exec.input().output_partitioning().partition_count() <= 1 + { + Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(spm_exec.input().clone())), + impact_result_ordering: true, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering.clone()], + }) + } + // Remove unnecessary SortPreservingMergeExec only + else if !requirements.impact_result_ordering { + Some(PlanWithSortRequirements { + plan: Arc::new(TombStoneExec::new(spm_exec.input().clone())), + impact_result_ordering: false, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: vec![requirements.required_ordering.clone()], + }) + } else { + None + } +} + fn expr_source_sides( required_exprs: &[PhysicalSortExpr], left_columns_len: usize, @@ -892,7 +818,7 @@ fn shift_right_required( col.name(), col.index() - left_columns_len, )) as Arc, - sort_options: r.sort_options.clone(), + sort_options: r.sort_options, }) } else { None @@ -911,51 +837,6 @@ fn shift_right_required( } } -#[derive(Debug)] -/// This structure stores extra column information required to remove unnecessary sorts. 
-pub struct ColumnInfo { - reverse: bool, - is_partition: bool, -} - -fn can_reverse_window_request( - partition_keys: &[Arc], - required: Option<&[PhysicalSortRequirements]>, - request_ordering: Option<&[PhysicalSortRequirements]>, - input_schema: &SchemaRef, -) -> bool { - match (required, request_ordering) { - (_, None) => false, - (None, Some(_)) => false, - (Some(required), Some(request_ordering)) => { - if required.len() > request_ordering.len() { - return false; - } - let mut col_infos = vec![]; - for (required_expr, request_expr) in zip(required, request_ordering) { - let column = required_expr.expr.clone(); - let is_partition = partition_keys.iter().any(|e| e.eq(&column)); - let reverse = check_alignment(input_schema, request_expr, required_expr); - col_infos.push(ColumnInfo { - reverse, - is_partition, - }); - } - let order_by_sections = col_infos - .iter() - .filter(|elem| !elem.is_partition) - .collect::>(); - let should_reverse_order_bys = if order_by_sections.is_empty() { - false - } else { - let first_reverse = order_by_sections[0].reverse; - first_reverse - }; - should_reverse_order_bys - } - } -} - /// Compares window expression's `window_request` and `parent_required_expr` ordering, returns /// whether we should reverse the window expression's ordering in order to meet parent's requirements. fn check_alignment( @@ -970,13 +851,12 @@ fn check_alignment( let nullable = parent_required_expr.expr.nullable(input_schema).unwrap(); let window_request_opts = window_request.sort_options.unwrap(); let parent_required_opts = parent_required_expr.sort_options.unwrap(); - let is_reversed = if nullable { + if nullable { window_request_opts == reverse_sort_options(parent_required_opts) } else { // If the column is not nullable, NULLS FIRST/LAST is not important. 
window_request_opts.descending != parent_required_opts.descending - }; - is_reversed + } } else { false } @@ -985,7 +865,7 @@ fn check_alignment( fn reverse_window_sort_requirements( request_child: Option<&[PhysicalSortRequirements]>, ) -> Option> { - let reversed_request = request_child.map(|request| { + request_child.map(|request| { request .iter() .map(|req| match req.sort_options { @@ -996,10 +876,12 @@ fn reverse_window_sort_requirements( }, }) .collect::>() - }); - reversed_request + }) } +/// Whether to reverse the top WindowExec's sort requirements. +/// Considering the requirements of the descendants WindowExecs and leaf nodes' output ordering. +/// TODO!considering all the cases fn should_reverse_window_sort_requirements( window_plan: Arc, top_requirement: Option<&[PhysicalSortRequirements]>, @@ -1059,7 +941,7 @@ fn should_reverse_window_sort_requirements( top_reversed_requirement, ) } else { - if requirements_compatible( + requirements_compatible( top_reversed_requirement, window_plan.required_input_ordering()[0].as_deref(), || window_plan.equivalence_properties(), @@ -1067,24 +949,18 @@ fn should_reverse_window_sort_requirements( window_plan.required_input_ordering()[0].as_deref(), top_reversed_requirement, || window_plan.equivalence_properties(), - ) { - true - } else { - false - } + ) } - } else if requirements_compatible( - top_reversed_requirement, - window_plan.required_input_ordering()[0].as_deref(), - || window_plan.equivalence_properties(), - ) || requirements_compatible( - window_plan.required_input_ordering()[0].as_deref(), - top_reversed_requirement, - || window_plan.equivalence_properties(), - ) { - true } else { - false + requirements_compatible( + top_reversed_requirement, + window_plan.required_input_ordering()[0].as_deref(), + || window_plan.equivalence_properties(), + ) || requirements_compatible( + window_plan.required_input_ordering()[0].as_deref(), + top_reversed_requirement, + || window_plan.equivalence_properties(), + ) } }) 
.collect::>(); @@ -1092,6 +968,71 @@ fn should_reverse_window_sort_requirements( flags.iter().all(|o| *o) } +fn should_reverse_window_exec( + required: Option<&[PhysicalSortRequirements]>, + request_ordering: Option<&[PhysicalSortRequirements]>, + input_schema: &SchemaRef, +) -> bool { + match (required, request_ordering) { + (_, None) => false, + (None, Some(_)) => false, + (Some(required), Some(request_ordering)) => { + if required.len() > request_ordering.len() { + return false; + } + let alignment_flags = required + .iter() + .zip(request_ordering.iter()) + .filter_map(|(required_expr, request_expr)| { + // Only check the alignment of non-partition columns + if request_expr.sort_options.is_some() + && required_expr.sort_options.is_some() + { + Some(check_alignment(input_schema, request_expr, required_expr)) + } else if request_expr.expr.eq(&required_expr.expr) { + None + } else { + Some(false) + } + }) + .collect::>(); + if alignment_flags.is_empty() { + false + } else { + alignment_flags.iter().all(|o| *o) + } + } + } +} + +fn extract_window_info_from_plan( + plan: &Arc, +) -> Option { + if let Some(exec) = plan.as_any().downcast_ref::() { + Some(WindowExecInfo { + window_expr: exec.window_expr().to_vec(), + input_schema: exec.input_schema(), + partition_keys: exec.partition_keys.clone(), + }) + } else { + plan.as_any() + .downcast_ref::() + .map(|exec| WindowExecInfo { + window_expr: exec.window_expr().to_vec(), + input_schema: exec.input_schema(), + partition_keys: exec.partition_keys.clone(), + }) + } +} + +#[derive(Debug)] +/// This structure stores extra Window information required to create a new WindowExec +pub struct WindowExecInfo { + window_expr: Vec>, + input_schema: SchemaRef, + partition_keys: Vec>, +} + /// A TombStoneExec execution plan generated during optimization process, should be removed finally #[derive(Debug)] struct TombStoneExec { @@ -1146,9 +1087,9 @@ impl ExecutionPlan for TombStoneExec { _partition: usize, _context: Arc, ) -> Result 
{ - Err(DataFusionError::Internal(format!( - "TombStoneExec, invalid plan" - ))) + Err(DataFusionError::Internal( + "TombStoneExec, invalid plan".to_string(), + )) } fn fmt_as( @@ -1190,6 +1131,7 @@ mod tests { use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; use datafusion_physical_expr::expressions::{col, NotExpr}; use datafusion_physical_expr::PhysicalSortExpr; + use std::ops::Deref; use std::sync::Arc; fn create_test_schema() -> Result { @@ -1286,6 +1228,126 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_should_reverse_window() -> Result<()> { + let schema = create_test_schema()?; + + // partition by nullable_col order by non_nullable_col + let window_request_ordering1 = vec![ + PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: None, + }, + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: true, + nulls_first: true, + }), + }, + ]; + let required_ordering1 = vec![ + PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: None, + }, + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: false, + nulls_first: false, + }), + }, + ]; + + let reverse = should_reverse_window_exec( + Some(required_ordering1.deref()), + Some(window_request_ordering1.deref()), + &schema, + ); + assert!(reverse); + + // order by nullable_col, non_nullable_col + let window_request_ordering2 = vec![ + PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: true, + nulls_first: true, + }), + }, + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: true, + nulls_first: true, + }), + }, + ]; + + let required_ordering2 = vec![ + PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: None, + }, + PhysicalSortRequirements { 
+ expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: false, + nulls_first: false, + }), + }, + ]; + + let reverse = should_reverse_window_exec( + Some(required_ordering2.deref()), + Some(window_request_ordering2.deref()), + &schema, + ); + assert!(reverse); + + // wrong partition columns + let window_request_ordering3 = vec![ + PhysicalSortRequirements { + expr: col("nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: true, + nulls_first: true, + }), + }, + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: true, + nulls_first: true, + }), + }, + ]; + + let required_ordering3 = vec![ + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: None, + }, + PhysicalSortRequirements { + expr: col("non_nullable_col", &schema)?, + sort_options: Some(SortOptions { + descending: false, + nulls_first: false, + }), + }, + ]; + + let reverse = should_reverse_window_exec( + Some(required_ordering3.deref()), + Some(window_request_ordering3.deref()), + &schema, + ); + assert!(!reverse); + + Ok(()) + } + /// Runs the sort enforcement optimizer and asserts the plan /// against the original and expected plans /// @@ -1347,7 +1409,7 @@ mod tests { } #[tokio::test] - async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { + async fn test_not_remove_top_sort_window_multilayer() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); @@ -1374,7 +1436,7 @@ mod tests { let sort = sort_exec(sort_exprs.clone(), window_agg); - // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before + // Add dummy layer propagating Sort above, the top Sort should not be removed let filter = filter_exec( Arc::new(NotExpr::new( col("non_nullable_col", schema.as_ref()).unwrap(), @@ -1395,11 +1457,12 @@ mod tests { ]; let expected_optimized = vec![ - 
"WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " FilterExec: NOT non_nullable_col@1", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: [non_nullable_col@1 DESC]", - " MemoryExec: partitions=0, partition_sizes=[]", + " SortExec: [non_nullable_col@1 ASC NULLS LAST], global=true", + " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: [non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) @@ -1855,8 +1918,7 @@ mod tests { // Input is an invalid plan. In this case rule should add required sorting in appropriate places. // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy required ordering - // of SortPreservingMergeExec. Hence rule should remove unnecessary sort for second child of the UnionExec - // and put a sort above Union to satisfy required ordering. + // of SortPreservingMergeExec. 
let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", @@ -2078,32 +2140,20 @@ mod tests { sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - ), - ]; let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort2 = sort_exec(sort_exprs2, source1); + let sort1 = sort_exec(sort_exprs1.clone(), source1.clone()); + let sort2 = sort_exec(sort_exprs1, source1); let union = union_exec(vec![sort1, sort2]); let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - // Union doesn't preserve any of the inputs ordering. However, we should be able to change unnecessarily fine - // SortExecs under UnionExec with required SortExecs that are absolutely necessary. + // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: [nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " SortExec: [nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_input, physical_plan); @@ -2137,9 +2187,47 @@ mod tests { let physical_plan = window_exec("nullable_col", reversed_sort_exprs2, union); // The `WindowAggExec` gets its sorting from multiple children jointly. - // During the removal of `SortExec`s, it should be able to remove the - // corresponding SortExecs together. 
Also, the inputs of these `SortExec`s - // are not necessarily the same to be able to remove them. + // The SortExecs should be kept to ensure the final result ordering + let expected_input = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " UnionExec", + " SortExec: [nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: [nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_input, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_window_multi_path_sort2() -> Result<()> { + let schema = create_test_schema()?; + + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + // reverse sorting of sort_exprs2 + let reversed_sort_exprs2 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; + let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); + let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); + let sort2 = sort_exec(reversed_sort_exprs2, source2); + + let union = union_exec(vec![sort1, sort2]); + let physical_plan = window_exec("nullable_col", sort_exprs2, union); + + // The `WindowAggExec` gets its sorting from multiple children jointly. 
+ // The SortExecs should be kept to ensure the final result ordering let expected_input = vec![ "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " UnionExec", @@ -2149,7 +2237,7 @@ mod tests { " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 2214d0168ed87..b6269a560386c 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1900,7 +1900,6 @@ mod tests { let session_state = make_session_state(); // optimize the logical plan let logical_plan = session_state.optimize(logical_plan)?; - println!("optimized logical plan {:?}", logical_plan); let planner = DefaultPhysicalPlanner::default(); planner .create_physical_plan(&logical_plan, &session_state) diff --git 
a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index 9af412a9585bb..2d7018f38a335 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -85,8 +85,6 @@ pub struct SortPreservingMergeExec { expr: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, - /// use SortPreservingMergeExec to satisfy the Sort Distribution - satisfy_distribution: bool, } impl SortPreservingMergeExec { @@ -96,20 +94,6 @@ impl SortPreservingMergeExec { input, expr, metrics: ExecutionPlanMetricsSet::new(), - satisfy_distribution: false, - } - } - - /// Create a new SortPreservingMergeExec to satisfy the Sort Distribution - pub fn new_for_distribuion( - expr: Vec, - input: Arc, - ) -> Self { - Self { - input, - expr, - metrics: ExecutionPlanMetricsSet::new(), - satisfy_distribution: true, } } @@ -122,11 +106,6 @@ impl SortPreservingMergeExec { pub fn expr(&self) -> &[PhysicalSortExpr] { &self.expr } - - /// satisfy the Sort Distribution requirements - pub fn satisfy_distribution(&self) -> bool { - self.satisfy_distribution - } } impl ExecutionPlan for SortPreservingMergeExec { diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 2affa9f364109..aee8d3edf93f4 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -126,7 +126,7 @@ impl BoundedWindowAggExec { let partition_by = self.window_expr()[0].partition_by(); let sort_keys = self .output_ordering() - .unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); + .unwrap_or_else(|| self.sort_keys.as_deref().unwrap_or(&[])); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); @@ -188,7 +188,7 @@ 
impl ExecutionPlan for BoundedWindowAggExec { } else { PhysicalSortRequirements { expr: o.expr.clone(), - sort_options: Some(o.options.clone()), + sort_options: Some(o.options), } } }) @@ -460,7 +460,6 @@ impl SortedPartitionByBoundedWindowStream { ) -> Self { let state = window_expr.iter().map(|_| IndexMap::new()).collect(); let empty_batch = RecordBatch::new_empty(schema.clone()); - println!("partition_by_sort_keys {:?}", partition_by_sort_keys); Self { schema, input, diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index 16404579233ca..43614e34adc82 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -117,7 +117,7 @@ impl WindowAggExec { let partition_by = self.window_expr()[0].partition_by(); let sort_keys = self .output_ordering() - .unwrap_or(self.sort_keys.as_deref().unwrap_or(&[])); + .unwrap_or_else(|| self.sort_keys.as_deref().unwrap_or(&[])); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); @@ -193,7 +193,7 @@ impl ExecutionPlan for WindowAggExec { } else { PhysicalSortRequirements { expr: o.expr.clone(), - sort_options: Some(o.options.clone()), + sort_options: Some(o.options), } } }) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 44a33b2d45fb3..fa9b8d134efde 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -172,16 +172,15 @@ pub fn normalize_expr_with_equivalence_properties( pub fn new_sort_requirements( sort_keys: Option<&[PhysicalSortExpr]>, ) -> Option> { - let ordering_requirements = sort_keys.map(|ordering| { + sort_keys.map(|ordering| { ordering .iter() .map(|o| PhysicalSortRequirements { expr: o.expr.clone(), - sort_options: Some(o.options.clone()), + sort_options: Some(o.options), }) .collect::>() - }); - 
ordering_requirements + }) } pub fn normalize_sort_expr_with_equivalence_properties( @@ -212,7 +211,7 @@ pub fn normalize_sort_requirement_with_equivalence_properties( if sort_requirement.expr.ne(&normalized_expr) { PhysicalSortRequirements { expr: normalized_expr, - sort_options: sort_requirement.sort_options.clone(), + sort_options: sort_requirement.sort_options, } } else { sort_requirement @@ -410,7 +409,7 @@ pub fn map_requirement_before_projection( .zip(requirement.iter()) .map(|(new, old)| PhysicalSortRequirements { expr: new.clone(), - sort_options: old.sort_options.clone(), + sort_options: old.sort_options, }) .collect::>(); Some(new_request) @@ -431,7 +430,7 @@ pub fn create_sort_expr_from_requirement( if prop.sort_options.is_some() { PhysicalSortExpr { expr: prop.expr.clone(), - options: prop.sort_options.unwrap().clone(), + options: prop.sort_options.unwrap(), } } else { PhysicalSortExpr { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 201cf7a852432..4c3260415f4be 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -700,11 +700,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut assign_map = assignments .iter() .map(|assign| { - let col_name: &Ident = assign - .id - .iter() - .last() - .ok_or(DataFusionError::Plan("Empty column id".to_string()))?; + let col_name: &Ident = assign.id.iter().last().ok_or_else(|| { + DataFusionError::Plan("Empty column id".to_string()) + })?; // Validate that the assignment target column exists table_schema.field_with_unqualified_name(&col_name.value)?; Ok((col_name.value.clone(), assign.value.clone())) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index afb83058a4023..f57b56b9b004f 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -612,69 +612,67 @@ pub async fn from_substrait_rex( ))), } } - 
Some(RexType::Literal(lit)) => { - match &lit.literal_type { - Some(LiteralType::I8(n)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Int8(Some(*n as i8))))) - } - Some(LiteralType::I16(n)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Int16(Some(*n as i16))))) - } - Some(LiteralType::I32(n)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Int32(Some(*n))))) - } - Some(LiteralType::I64(n)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Int64(Some(*n))))) - } - Some(LiteralType::Boolean(b)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Boolean(Some(*b))))) - } - Some(LiteralType::Date(d)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Date32(Some(*d))))) - } - Some(LiteralType::Fp32(f)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Float32(Some(*f))))) - } - Some(LiteralType::Fp64(f)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Float64(Some(*f))))) - } - Some(LiteralType::Decimal(d)) => { - let value: [u8; 16] = d.value.clone().try_into().or(Err( - DataFusionError::Substrait( - "Failed to parse decimal value".to_string(), - ), - ))?; - let p = d.precision.try_into().map_err(|e| { - DataFusionError::Substrait(format!( - "Failed to parse decimal precision: {e}" - )) - })?; - let s = d.scale.try_into().map_err(|e| { - DataFusionError::Substrait(format!( - "Failed to parse decimal scale: {e}" - )) - })?; - Ok(Arc::new(Expr::Literal(ScalarValue::Decimal128( - Some(std::primitive::i128::from_le_bytes(value)), - p, - s, - )))) - } - Some(LiteralType::String(s)) => { - Ok(Arc::new(Expr::Literal(ScalarValue::Utf8(Some(s.clone()))))) - } - Some(LiteralType::Binary(b)) => Ok(Arc::new(Expr::Literal( - ScalarValue::Binary(Some(b.clone())), - ))), - Some(LiteralType::Null(ntype)) => { - Ok(Arc::new(Expr::Literal(from_substrait_null(ntype)?))) - } - _ => Err(DataFusionError::NotImplemented(format!( - "Unsupported literal_type: {:?}", - lit.literal_type - ))), + Some(RexType::Literal(lit)) => match &lit.literal_type { + Some(LiteralType::I8(n)) => { + 
Ok(Arc::new(Expr::Literal(ScalarValue::Int8(Some(*n as i8))))) } - } + Some(LiteralType::I16(n)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Int16(Some(*n as i16))))) + } + Some(LiteralType::I32(n)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Int32(Some(*n))))) + } + Some(LiteralType::I64(n)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Int64(Some(*n))))) + } + Some(LiteralType::Boolean(b)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Boolean(Some(*b))))) + } + Some(LiteralType::Date(d)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Date32(Some(*d))))) + } + Some(LiteralType::Fp32(f)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Float32(Some(*f))))) + } + Some(LiteralType::Fp64(f)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Float64(Some(*f))))) + } + Some(LiteralType::Decimal(d)) => { + let value: [u8; 16] = d.value.clone().try_into().map_err(|_| { + DataFusionError::Substrait( + "Failed to parse decimal value".to_string(), + ) + })?; + let p = d.precision.try_into().map_err(|e| { + DataFusionError::Substrait(format!( + "Failed to parse decimal precision: {e}" + )) + })?; + let s = d.scale.try_into().map_err(|e| { + DataFusionError::Substrait(format!( + "Failed to parse decimal scale: {e}" + )) + })?; + Ok(Arc::new(Expr::Literal(ScalarValue::Decimal128( + Some(std::primitive::i128::from_le_bytes(value)), + p, + s, + )))) + } + Some(LiteralType::String(s)) => { + Ok(Arc::new(Expr::Literal(ScalarValue::Utf8(Some(s.clone()))))) + } + Some(LiteralType::Binary(b)) => Ok(Arc::new(Expr::Literal( + ScalarValue::Binary(Some(b.clone())), + ))), + Some(LiteralType::Null(ntype)) => { + Ok(Arc::new(Expr::Literal(from_substrait_null(ntype)?))) + } + _ => Err(DataFusionError::NotImplemented(format!( + "Unsupported literal_type: {:?}", + lit.literal_type + ))), + }, _ => Err(DataFusionError::NotImplemented( "unsupported rex_type".to_string(), )), From 245428dd5e46262f44075d77586cdfb27f16b69a Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 16 Feb 2023 
00:10:35 +0800 Subject: [PATCH 06/35] Fix SortMergeJoin case --- .../physical_optimizer/sort_enforcement2.rs | 733 ++++++++++++++---- .../core/src/physical_optimizer/utils.rs | 21 - 2 files changed, 569 insertions(+), 185 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index bba088339e6ff..5d57285262d5b 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -36,7 +36,7 @@ use crate::config::ConfigOptions; use crate::error::Result; use crate::execution::context::TaskContext; -use crate::physical_optimizer::utils::add_sort_above_child; +use crate::physical_optimizer::utils::add_sort_above; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::filter::FilterExec; @@ -56,6 +56,7 @@ use crate::physical_plan::{ }; use arrow::datatypes::SchemaRef; use datafusion_common::{reverse_sort_options, DataFusionError, Statistics}; +use datafusion_expr::JoinType; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{ create_sort_expr_from_requirement, map_requirement_before_projection, @@ -68,9 +69,10 @@ use datafusion_physical_expr::{ }; use itertools::izip; use std::any::Any; +use std::ops::Deref; use std::sync::Arc; -/// This rule implements a TOP-Downinspects SortExec's in the given physical plan and removes the +/// This rule implements a Top-Down approach to inspects SortExec's in the given physical plan and removes the /// ones it can prove unnecessary. 
#[derive(Default)] pub struct TopDownEnforceSorting {} @@ -82,7 +84,7 @@ impl TopDownEnforceSorting { } } -/// This is a "data class" we use within the [EnforceSorting] rule +/// This is a "data class" we use within the [TopDownEnforceSorting] rule #[derive(Debug, Clone)] struct PlanWithSortRequirements { /// Current plan @@ -237,7 +239,7 @@ impl PhysicalOptimizerRule for TopDownEnforceSorting { } fn name(&self) -> &str { - "EnforceSorting2" + "TopDownEnforceSorting" } fn schema_check(&self) -> bool { @@ -378,11 +380,8 @@ fn ensure_sorting( // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements let parent_required_expr = create_sort_expr_from_requirement(parent_required.unwrap()); - let new_plan = add_sort_above_child( - &sort_exec.input, - parent_required_expr, - sort_exec.fetch(), - )?; + let mut new_plan = sort_exec.input.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), )) @@ -402,7 +401,8 @@ fn ensure_sorting( || plan.as_any().downcast_ref::().is_some()) && plan.as_any().downcast_ref::().is_none() { - let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), )) @@ -425,7 +425,7 @@ fn ensure_sorting( let adjusted = parent_required.map(|r| r.to_vec()); Ok(Some(PlanWithSortRequirements { plan: plan.clone(), - impact_result_ordering: true, + impact_result_ordering: requirements.impact_result_ordering, satisfy_single_distribution: requirements.satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![adjusted], @@ -479,7 +479,8 @@ fn ensure_sorting( } } // Can not push down requirements, add new SortExec - let new_plan = add_sort_above_child(plan, parent_required_expr, None)?; + let mut new_plan = plan.clone(); + 
add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, @@ -490,70 +491,42 @@ fn ensure_sorting( // If the current plan is SortMergeJoinExec let left_columns_len = smj.left.schema().fields().len(); let expr_source_side = - expr_source_sides(&parent_required_expr, left_columns_len); + expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); match expr_source_side { Some(JoinSide::Left) if maintains_input_order[0] => { - if requirements_compatible( - plan.required_input_ordering()[0].as_deref(), + try_pushdown_requirements_to_join( + &requirements, + plan, parent_required, - || plan.children()[0].equivalence_properties(), - ) { - println!("Requirements are compatible with SMJ"); - Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: requirements - .adjusted_request_ordering, - })) - } else { - // Can not push down, add new SortExec - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } + parent_required_expr, + JoinSide::Left, + ) } Some(JoinSide::Right) if maintains_input_order[1] => { - let shift_right_required = - shift_right_required(parent_required.unwrap(), left_columns_len); - if requirements_compatible( - plan.required_input_ordering()[1].as_deref(), - shift_right_required.as_deref(), - || plan.children()[1].equivalence_properties(), - ) { - println!("Requirements are compatible with SMJ"); - Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: requirements - 
.adjusted_request_ordering, - })) - } else { - // Can not push down, add new SortExec - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } + let new_right_required = match smj.join_type { + JoinType::Inner | JoinType::Right => shift_right_required( + parent_required.unwrap(), + left_columns_len, + )?, + JoinType::RightSemi | JoinType::RightAnti => { + parent_required.unwrap().to_vec() + } + _ => Err(DataFusionError::Plan( + "Unexpected SortMergeJoin type here".to_string(), + ))?, + }; + try_pushdown_requirements_to_join( + &requirements, + plan, + Some(new_right_required.deref()), + parent_required_expr, + JoinSide::Right, + ) } _ => { - println!("Can not decide the expr side for SortMergeJoinExec, can not push down, add SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; + // Can not decide the expr side for SortMergeJoinExec, can not push down, add SortExec; + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, @@ -564,11 +537,6 @@ fn ensure_sorting( } else if plan.required_input_ordering().iter().any(Option::is_some) { // If the current plan has its own ordering requirements to its children, check whether the requirements // are compatible with the parent requirements. - println!( - "the current plan has its own ordering requirements, {:?}", - plan.required_input_ordering() - ); - let plan_children = plan.children(); let compatible_with_children = izip!( maintains_input_order.iter(), @@ -586,13 +554,7 @@ fn ensure_sorting( .collect::>(); if compatible_with_children.iter().all(|a| *a) { // Requirements are compatible, not need to push down. 
- Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: true, - satisfy_single_distribution: requirements.satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: requirements.adjusted_request_ordering, - })) + Ok(None) } else { let can_adjust_child_requirements = plan .required_input_ordering() @@ -608,11 +570,10 @@ fn ensure_sorting( .collect::>(); if can_adjust_child_requirements.iter().all(|a| *a) { // Adjust child requirements and push down the requirements - println!("Adjust child requirements and push down the requirements"); let adjusted = parent_required.map(|r| r.to_vec()); Ok(Some(PlanWithSortRequirements { plan: plan.clone(), - impact_result_ordering: true, + impact_result_ordering: requirements.impact_result_ordering, satisfy_single_distribution: requirements .satisfy_single_distribution, required_ordering: None, @@ -624,9 +585,8 @@ fn ensure_sorting( })) } else { // Can not push down, add new SortExec - println!("Can not push down, add new SortExec"); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, @@ -644,10 +604,9 @@ fn ensure_sorting( let new_requirement = map_requirement_before_projection(parent_required, expr); if new_requirement.is_some() { - println!("Push requirements down to Projection"); Ok(Some(PlanWithSortRequirements { plan: plan.clone(), - impact_result_ordering: true, + impact_result_ordering: requirements.impact_result_ordering, satisfy_single_distribution: requirements .satisfy_single_distribution, required_ordering: None, @@ -655,11 +614,8 @@ fn ensure_sorting( })) } else { // Can not push down, add new SortExec - println!( - "Can not push requirements down to Projection, add SortExec" - ); - let new_plan = - add_sort_above_child(plan, parent_required_expr, None)?; + let mut 
new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; Ok(Some( PlanWithSortRequirements::new_without_impact_result_ordering( new_plan, @@ -667,7 +623,6 @@ fn ensure_sorting( )) } } else { - println!("Push down requirements."); Ok(Some(PlanWithSortRequirements { plan: plan.clone(), impact_result_ordering: requirements.impact_result_ordering, @@ -767,76 +722,6 @@ fn analyze_immediate_spm_removal( } } -fn expr_source_sides( - required_exprs: &[PhysicalSortExpr], - left_columns_len: usize, -) -> Option { - let all_column_sides = required_exprs - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() < left_columns_len { - Some(JoinSide::Left) - } else { - Some(JoinSide::Right) - } - } else { - None - } - }) - .collect::>(); - - // If the exprs are all coming from one side, the requirements can be pushed down - if all_column_sides.len() != required_exprs.len() { - None - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Left)) - { - Some(JoinSide::Left) - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Right)) - { - Some(JoinSide::Right) - } else { - None - } -} - -fn shift_right_required( - parent_required: &[PhysicalSortRequirements], - left_columns_len: usize, -) -> Option> { - let new_right_required: Vec = parent_required - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() >= left_columns_len { - Some(PhysicalSortRequirements { - expr: Arc::new(Column::new( - col.name(), - col.index() - left_columns_len, - )) as Arc, - sort_options: r.sort_options, - }) - } else { - None - } - } else { - None - } - }) - .collect::>(); - - // if the parent required are all comming from the right side, the requirements can be pushdown - if new_right_required.len() != parent_required.len() { - None - } else { - Some(new_right_required) - } -} - /// Compares window expression's `window_request` and 
`parent_required_expr` ordering, returns /// whether we should reverse the window expression's ordering in order to meet parent's requirements. fn check_alignment( @@ -1025,6 +910,169 @@ fn extract_window_info_from_plan( } } +fn try_pushdown_requirements_to_join( + requirements: &PlanWithSortRequirements, + plan: &Arc, + parent_required: Option<&[PhysicalSortRequirements]>, + parent_required_expr: Vec, + push_side: JoinSide, +) -> Result> { + let child_idx = match push_side { + JoinSide::Left => 0, + JoinSide::Right => 1, + }; + if requirements_compatible( + plan.required_input_ordering()[child_idx].as_deref(), + parent_required, + || plan.children()[child_idx].equivalence_properties(), + ) { + // parent requirements are compatible with the SortMergeJoinExec + Ok(None) + } else if requirements_compatible( + parent_required, + plan.required_input_ordering()[child_idx].as_deref(), + || plan.children()[child_idx].equivalence_properties(), + ) { + // parent requirements are more specific, adjust the SortMergeJoinExec child requirements and push down the new requirements + let new_adjusted = match push_side { + JoinSide::Left => vec![ + parent_required.map(|r| r.to_vec()), + requirements.adjusted_request_ordering[1].clone(), + ], + JoinSide::Right => vec![ + requirements.adjusted_request_ordering[0].clone(), + parent_required.map(|r| r.to_vec()), + ], + }; + Ok(Some(PlanWithSortRequirements { + plan: plan.clone(), + impact_result_ordering: requirements.impact_result_ordering, + satisfy_single_distribution: requirements.satisfy_single_distribution, + required_ordering: None, + adjusted_request_ordering: new_adjusted, + })) + } else { + // Can not push down, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), + )) + } +} + +fn expr_source_sides( + required_exprs: &[PhysicalSortExpr], + join_type: JoinType, + left_columns_len: usize, 
+) -> Option { + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + let all_column_sides = required_exprs + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() < left_columns_len { + Some(JoinSide::Left) + } else { + Some(JoinSide::Right) + } + } else { + None + } + }) + .collect::>(); + + // If the exprs are all coming from one side, the requirements can be pushed down + if all_column_sides.len() != required_exprs.len() { + None + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Left)) + { + Some(JoinSide::Left) + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Right)) + { + Some(JoinSide::Right) + } else { + None + } + } + JoinType::LeftSemi | JoinType::LeftAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Left) + } else { + None + } + }) + .count() + != required_exprs.len() + { + None + } else { + Some(JoinSide::Left) + } + } + JoinType::RightSemi | JoinType::RightAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Right) + } else { + None + } + }) + .count() + != required_exprs.len() + { + None + } else { + Some(JoinSide::Right) + } + } + } +} + +fn shift_right_required( + parent_required: &[PhysicalSortRequirements], + left_columns_len: usize, +) -> Result> { + let new_right_required: Vec = parent_required + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() >= left_columns_len { + Some(PhysicalSortRequirements { + expr: Arc::new(Column::new( + col.name(), + col.index() - left_columns_len, + )) as Arc, + sort_options: r.sort_options, + }) + } else { + None + } + } else { + None + } + }) + .collect::>(); + if new_right_required.len() != parent_required.len() { + Err(DataFusionError::Plan( + "Expect to shift all the 
parent required column indexes for SortMergeJoin" + .to_string(), + )) + } else { + Ok(new_right_required) + } +} + #[derive(Debug)] /// This structure stores extra Window information required to create a new WindowExec pub struct WindowExecInfo { @@ -1119,6 +1167,7 @@ mod tests { use crate::physical_plan::displayable; use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; use crate::physical_plan::filter::FilterExec; + use crate::physical_plan::joins::utils::JoinOn; use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; @@ -1128,6 +1177,7 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::{Result, Statistics}; + use datafusion_expr::logical_plan::JoinType; use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; use datafusion_physical_expr::expressions::{col, NotExpr}; use datafusion_physical_expr::PhysicalSortExpr; @@ -1142,6 +1192,13 @@ mod tests { Ok(schema) } + fn create_test_schema2() -> Result { + let col_a = Field::new("col_a", DataType::Int32, true); + let col_b = Field::new("col_b", DataType::Int32, true); + let schema = Arc::new(Schema::new(vec![col_a, col_b])); + Ok(schema) + } + // Util function to get string representation of a physical plan fn get_plan_string(plan: &Arc) -> Vec { let formatted = displayable(plan.as_ref()).indent().to_string(); @@ -2160,6 +2217,111 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_union_inputs_different_sorted8() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![ + sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + sort_expr_options( 
+ "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort2 = sort_exec(sort_exprs2, source1); + + let physical_plan = union_exec(vec![sort1, sort2]); + + // The `UnionExec` doesn't preserve any of the inputs ordering in the + // example below. + let expected_input = vec![ + "UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted_with_limit() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + + let sort2 = sort_exec(sort_exprs2, source1); + let limit = local_limit_exec(sort2); + let limit = global_limit_exec(limit); + + let union = union_exec(vec![sort1, limit]); + let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); + + // Should not 
change the unnecessarily fine `SortExec`s because there is `LimitExec` + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_window_multi_path_sort() -> Result<()> { let schema = create_test_schema()?; @@ -2246,6 +2408,230 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_sort_merge_join_order_by_left() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", &right.schema()).unwrap(), + )]; + + let join_types = vec![ + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::Full, + JoinType::LeftSemi, + JoinType::LeftAnti, + ]; + for 
join_type in join_types { + let join = + sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); + let sort_exprs = vec![ + sort_expr("nullable_col", &join.schema()), + sort_expr("non_nullable_col", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs.clone(), join); + + let join_plan = + format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let join_plan2 = + format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + join_plan.as_str(), + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + let expected_optimized = match join_type { + JoinType::Inner + | JoinType::Left + | JoinType::LeftSemi + | JoinType::LeftAnti => { + // can push down the sort requirements and save 1 SortExec + vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + join_plan.as_str(), + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + _ => { + // can not push down the sort requirements + vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + join_plan2.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: 
expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + }; + assert_optimized!(expected_input, expected_optimized, physical_plan); + } + Ok(()) + } + + #[tokio::test] + async fn test_sort_merge_join_order_by_right() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", &right.schema()).unwrap(), + )]; + + let join_types = vec![ + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::Full, + JoinType::RightAnti, + ]; + for join_type in join_types { + let join = + sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); + let sort_exprs = vec![ + sort_expr("col_a", &join.schema()), + sort_expr("col_b", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs, join); + + let join_plan = + format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let spm_plan = match join_type { + JoinType::RightAnti => { + "SortPreservingMergeExec: [col_a@0 ASC,col_b@1 ASC]" + } + _ => "SortPreservingMergeExec: [col_a@2 ASC,col_b@3 ASC]", + }; + let join_plan2 = + format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + + let expected_input = vec![ + spm_plan, + join_plan.as_str(), + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + let expected_optimized = match join_type { + JoinType::Inner | JoinType::Right | JoinType::RightAnti => { + // can 
push down the sort requirements and save 1 SortExec + vec![ + spm_plan, + join_plan.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC,col_b@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + _ => { + // can not push down the sort requirements for Left and Full join. + vec![ + spm_plan, + " SortExec: expr=[col_a@2 ASC,col_b@3 ASC], global=true", + join_plan2.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + }; + assert_optimized!(expected_input, expected_optimized, physical_plan); + } + Ok(()) + } + + #[tokio::test] + async fn test_sort_merge_join_complex_order_by() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", &right.schema()).unwrap(), + )]; + + let join = sort_merge_join_exec(left, right, &join_on, &JoinType::Inner); + + // order by (col_b, col_a) + let sort_exprs1 = vec![ + sort_expr("col_b", &join.schema()), + sort_expr("col_a", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs1, join.clone()); + + let expected_input = vec![ + "SortPreservingMergeExec: [col_b@3 ASC,col_a@2 ASC]", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " ParquetExec: limit=None, partitions={1 group: 
[[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + + // can not push down the sort requirements, need to add SortExec + let expected_optimized = vec![ + "SortPreservingMergeExec: [col_b@3 ASC,col_a@2 ASC]", + " SortExec: expr=[col_b@3 ASC,col_a@2 ASC], global=true", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + + // order by (nullable_col, col_b, col_a) + let sort_exprs2 = vec![ + sort_expr("nullable_col", &join.schema()), + sort_expr("col_b", &join.schema()), + sort_expr("col_a", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs2, join); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + + // can not push down the sort requirements, need to add SortExec + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", + " SortExec: expr=[nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC], global=true", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " 
ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + + Ok(()) + } + /// make PhysicalSortExpr with default options fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr { sort_expr_options(name, schema, SortOptions::default()) @@ -2397,4 +2783,23 @@ mod tests { .unwrap(), ) } + + fn sort_merge_join_exec( + left: Arc, + right: Arc, + join_on: &JoinOn, + join_type: &JoinType, + ) -> Arc { + Arc::new( + SortMergeJoinExec::try_new( + left, + right, + join_on.clone(), + *join_type, + vec![SortOptions::default(); join_on.len()], + false, + ) + .unwrap(), + ) + } } diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 31c47bae03ad6..b6666fbefae1e 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -67,24 +67,3 @@ pub fn add_sort_above( } Ok(()) } - -/// Util function to add SortExec above child -/// preserving the original partitioning -pub fn add_sort_above_child( - child: &Arc, - sort_expr: Vec, - fetch: Option, -) -> Result> { - let new_child = if child.output_partitioning().partition_count() > 1 { - Arc::new(SortExec::new_with_partitioning( - sort_expr, - child.clone(), - true, - fetch, - )) as Arc - } else { - Arc::new(SortExec::try_new(sort_expr, child.clone(), fetch)?) 
- as Arc - }; - Ok(new_child) -} From abad84c4209a8e6a9e8a9452740c472ead0dfcdb Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 16 Feb 2023 16:30:47 +0800 Subject: [PATCH 07/35] Fix reverse window sort requirements --- .../physical_optimizer/sort_enforcement2.rs | 9 +++++++ datafusion/core/tests/sql/window.rs | 27 ++++++++++--------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index 5d57285262d5b..7d0d431a16d9f 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -775,6 +775,15 @@ fn should_reverse_window_sort_requirements( if top_requirement.is_none() { return false; } + let WindowExecInfo { window_expr, .. } = + extract_window_info_from_plan(&window_plan).unwrap(); + let reverse_window_expr = window_expr + .iter() + .map(|e| e.get_reverse_expr()) + .collect::>>(); + if reverse_window_expr.is_none() { + return false; + } let flags = window_plan .children() .into_iter() diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 578f75360350c..116f37ace11be 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -1601,10 +1601,11 @@ async fn test_window_agg_sort_multi_layer_non_reversed_plan() -> Result<()> { vec![ "ProjectionExec: expr=[c9@2 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as rn2]", " GlobalLimitExec: skip=0, 
fetch=5", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }]", - " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", - " SortExec: expr=[c9@2 DESC,c1@0 DESC,c2@1 DESC], global=true", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", + " SortExec: expr=[c9@2 ASC NULLS LAST,c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true", + " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", + " BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: \"SUM(aggregate_test_100.c9)\", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }]", + " SortExec: expr=[c9@2 DESC,c1@0 DESC], global=true", ] }; @@ 
-1618,15 +1619,15 @@ async fn test_window_agg_sort_multi_layer_non_reversed_plan() -> Result<()> { let actual = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+------------+-------------+-------------+-----+", - "| c9 | sum1 | sum2 | rn2 |", - "+------------+-------------+-------------+-----+", - "| 4268716378 | 8498370520 | 24997484146 | 1 |", - "| 4229654142 | 12714811027 | 29012926487 | 2 |", - "| 4216440507 | 16858984380 | 28743001064 | 3 |", - "| 4144173353 | 20935849039 | 28472563256 | 4 |", - "| 4076864659 | 24997484146 | 28118515915 | 5 |", - "+------------+-------------+-------------+-----+", + "+-----------+------------+-----------+-----+", + "| c9 | sum1 | sum2 | rn2 |", + "+-----------+------------+-----------+-----+", + "| 28774375 | 745354217 | 91818943 | 100 |", + "| 63044568 | 988558066 | 232866360 | 99 |", + "| 141047417 | 1285934966 | 374546521 | 98 |", + "| 141680161 | 1654839259 | 519841132 | 97 |", + "| 145294611 | 1980231675 | 745354217 | 96 |", + "+-----------+------------+-----------+-----+", ]; assert_batches_eq!(expected, &actual); From 6a53df0a65af636126946a35e02724028879b642 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 16 Feb 2023 16:50:28 +0800 Subject: [PATCH 08/35] fix test comments --- datafusion/core/src/physical_optimizer/sort_enforcement2.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index 7d0d431a16d9f..7cd83ee7e7ec0 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -2397,8 +2397,8 @@ mod tests { let union = union_exec(vec![sort1, sort2]); let physical_plan = window_exec("nullable_col", sort_exprs2, union); - // The `WindowAggExec` gets its sorting from multiple children jointly. 
- // The SortExecs should be kept to ensure the final result ordering + // The `WindowAggExec` can get its required sorting from the leaf nodes directly. + // The unnecessary SortExecs should be removed let expected_input = vec![ "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " UnionExec", From c5c5bc8cb62fbdb02b455be704952c8054478357 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 16 Feb 2023 22:11:53 +0800 Subject: [PATCH 09/35] add determine_children_requirement() method --- .../physical_optimizer/sort_enforcement2.rs | 349 +++++++++--------- 1 file changed, 174 insertions(+), 175 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs index 7cd83ee7e7ec0..e2d5e1d8c7251 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs @@ -297,9 +297,6 @@ fn ensure_sorting( // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec let adjusted = new_sort_requirements(union_exec.output_ordering()); return Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements.satisfy_single_distribution, required_ordering: None, adjusted_request_ordering: vec![ adjusted; @@ -307,6 +304,7 @@ fn ensure_sorting( .adjusted_request_ordering .len() ], + ..requirements })); } else if plan.as_any().downcast_ref::().is_some() || plan @@ -370,11 +368,8 @@ fn ensure_sorting( } } Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements.satisfy_single_distribution, 
required_ordering: None, - adjusted_request_ordering: requirements.adjusted_request_ordering, + ..requirements })) } else if let Some(sort_exec) = plan.as_any().downcast_ref::() { // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements @@ -413,79 +408,79 @@ fn ensure_sorting( .is_some() { let request_child = requirements.adjusted_request_ordering[0].as_deref(); - if requirements_compatible(request_child, parent_required, || { - plan.children()[0].equivalence_properties() - }) { - // request child requirements are more specific, no need to push down the parent requirements - Ok(None) - } else if requirements_compatible(parent_required, request_child, || { - plan.children()[0].equivalence_properties() - }) { - // parent requirements are more specific, adjust the request child requirements and push down the new requirements - let adjusted = parent_required.map(|r| r.to_vec()); - Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements.satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: vec![adjusted], - })) - } else { - let WindowExecInfo { - window_expr, - input_schema, - partition_keys, - } = extract_window_info_from_plan(plan).unwrap(); - if should_reverse_window_exec( - parent_required, - request_child, - &input_schema, - ) { - let new_physical_ordering = parent_required_expr.to_vec(); - let new_window_expr = window_expr - .iter() - .map(|e| e.get_reverse_expr()) - .collect::>>(); - if let Some(window_expr) = new_window_expr { - let uses_bounded_memory = - window_expr.iter().all(|e| e.uses_bounded_memory()); - let new_plan = if uses_bounded_memory { - Arc::new(BoundedWindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) 
as Arc - } else { - Arc::new(WindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) as Arc - }; - let adjusted_request_ordering = - new_plan.required_input_ordering(); - return Ok(Some(PlanWithSortRequirements { - plan: new_plan, - impact_result_ordering: false, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering, - })); + let child_plan = plan.children()[0].clone(); + match determine_children_requirement( + parent_required, + request_child, + child_plan, + ) { + RequirementsCompatibility::Satisfy => Ok(None), + RequirementsCompatibility::Compatible(adjusted) => { + Ok(Some(PlanWithSortRequirements { + required_ordering: None, + adjusted_request_ordering: vec![adjusted], + ..requirements + })) + } + RequirementsCompatibility::NonCompatible => { + let WindowExecInfo { + window_expr, + input_schema, + partition_keys, + } = extract_window_info_from_plan(plan).unwrap(); + if should_reverse_window_exec( + parent_required, + request_child, + &input_schema, + ) { + let new_physical_ordering = parent_required_expr.to_vec(); + let new_window_expr = window_expr + .iter() + .map(|e| e.get_reverse_expr()) + .collect::>>(); + if let Some(window_expr) = new_window_expr { + let uses_bounded_memory = + window_expr.iter().all(|e| e.uses_bounded_memory()); + let new_plan = if uses_bounded_memory { + Arc::new(BoundedWindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) + as Arc + } else { + Arc::new(WindowAggExec::try_new( + window_expr, + plan.children()[0].clone(), + input_schema, + partition_keys, + Some(new_physical_ordering), + )?) 
+ as Arc + }; + let adjusted_request_ordering = + new_plan.required_input_ordering(); + return Ok(Some(PlanWithSortRequirements { + plan: new_plan, + impact_result_ordering: false, + satisfy_single_distribution: requirements + .satisfy_single_distribution, + required_ordering: None, + adjusted_request_ordering, + })); + } } + // Can not push down requirements, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )) } - // Can not push down requirements, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) } } else if let Some(smj) = plan.as_any().downcast_ref::() { // If the current plan is SortMergeJoinExec @@ -496,7 +491,6 @@ fn ensure_sorting( Some(JoinSide::Left) if maintains_input_order[0] => { try_pushdown_requirements_to_join( &requirements, - plan, parent_required, parent_required_expr, JoinSide::Left, @@ -517,7 +511,6 @@ fn ensure_sorting( }; try_pushdown_requirements_to_join( &requirements, - plan, Some(new_right_required.deref()), parent_required_expr, JoinSide::Right, @@ -535,8 +528,6 @@ fn ensure_sorting( } } } else if plan.required_input_ordering().iter().any(Option::is_some) { - // If the current plan has its own ordering requirements to its children, check whether the requirements - // are compatible with the parent requirements. 
let plan_children = plan.children(); let compatible_with_children = izip!( maintains_input_order.iter(), @@ -544,55 +535,43 @@ fn ensure_sorting( plan_children.iter() ) .map(|(can_push_down, request_child, child)| { - *can_push_down - && requirements_compatible( - request_child.as_deref(), + if *can_push_down { + determine_children_requirement( parent_required, - || child.equivalence_properties(), + request_child.as_deref(), + child.clone(), ) + } else { + RequirementsCompatibility::NonCompatible + } }) .collect::>(); - if compatible_with_children.iter().all(|a| *a) { - // Requirements are compatible, not need to push down. + if compatible_with_children + .iter() + .all(|a| matches!(a, RequirementsCompatibility::Satisfy)) + { + // Requirements are satisfied, not need to push down. Ok(None) + } else if compatible_with_children + .iter() + .all(|a| matches!(a, RequirementsCompatibility::Compatible(_))) + { + // Adjust child requirements and push down the requirements + let adjusted = parent_required.map(|r| r.to_vec()); + Ok(Some(PlanWithSortRequirements { + required_ordering: None, + adjusted_request_ordering: vec![adjusted; plan_children.len()], + ..requirements + })) } else { - let can_adjust_child_requirements = plan - .required_input_ordering() - .into_iter() - .zip(plan_children.iter()) - .map(|(request_child, child)| { - requirements_compatible( - parent_required, - request_child.as_deref(), - || child.equivalence_properties(), - ) - }) - .collect::>(); - if can_adjust_child_requirements.iter().all(|a| *a) { - // Adjust child requirements and push down the requirements - let adjusted = parent_required.map(|r| r.to_vec()); - Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: vec![ - adjusted; - can_adjust_child_requirements - .len() - ], - })) - } else { - // Can 
not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } + // Can not push down, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering( + new_plan, + ), + )) } } else { // The current plan does not have its own ordering requirements to its children, consider push down the requirements @@ -601,16 +580,13 @@ fn ensure_sorting( { // For Projection, we need to transform the requirements to the columns before the Projection // And then to push down the requirements - let new_requirement = + let new_adjusted = map_requirement_before_projection(parent_required, expr); - if new_requirement.is_some() { + if new_adjusted.is_some() { Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements - .satisfy_single_distribution, required_ordering: None, - adjusted_request_ordering: vec![new_requirement], + adjusted_request_ordering: vec![new_adjusted], + ..requirements })) } else { // Can not push down, add new SortExec @@ -624,16 +600,14 @@ fn ensure_sorting( } } else { Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, required_ordering: None, - satisfy_single_distribution: requirements.satisfy_single_distribution, adjusted_request_ordering: vec![ requirements.required_ordering; requirements .adjusted_request_ordering .len() ], + ..requirements })) } } @@ -722,6 +696,31 @@ fn analyze_immediate_spm_removal( } } +/// Determine the children requirements +/// If the children requirements are more specific, do not push down the parent requirements +/// If the the parent requirements are more specific, push down the parent 
requirements +/// If they are not compatible, need to add Sort. +fn determine_children_requirement( + parent_required: Option<&[PhysicalSortRequirements]>, + request_child: Option<&[PhysicalSortRequirements]>, + child_plan: Arc, +) -> RequirementsCompatibility { + if requirements_compatible(request_child, parent_required, || { + child_plan.equivalence_properties() + }) { + // request child requirements are more specific, no need to push down the parent requirements + RequirementsCompatibility::Satisfy + } else if requirements_compatible(parent_required, request_child, || { + child_plan.equivalence_properties() + }) { + // parent requirements are more specific, adjust the request child requirements and push down the new requirements + let adjusted = parent_required.map(|r| r.to_vec()); + RequirementsCompatibility::Compatible(adjusted) + } else { + RequirementsCompatibility::NonCompatible + } +} + /// Compares window expression's `window_request` and `parent_required_expr` ordering, returns /// whether we should reverse the window expression's ordering in order to meet parent's requirements. 
fn check_alignment( @@ -921,52 +920,41 @@ fn extract_window_info_from_plan( fn try_pushdown_requirements_to_join( requirements: &PlanWithSortRequirements, - plan: &Arc, parent_required: Option<&[PhysicalSortRequirements]>, - parent_required_expr: Vec, + sort_expr: Vec, push_side: JoinSide, ) -> Result> { let child_idx = match push_side { JoinSide::Left => 0, JoinSide::Right => 1, }; - if requirements_compatible( - plan.required_input_ordering()[child_idx].as_deref(), - parent_required, - || plan.children()[child_idx].equivalence_properties(), - ) { - // parent requirements are compatible with the SortMergeJoinExec - Ok(None) - } else if requirements_compatible( - parent_required, - plan.required_input_ordering()[child_idx].as_deref(), - || plan.children()[child_idx].equivalence_properties(), - ) { - // parent requirements are more specific, adjust the SortMergeJoinExec child requirements and push down the new requirements - let new_adjusted = match push_side { - JoinSide::Left => vec![ - parent_required.map(|r| r.to_vec()), - requirements.adjusted_request_ordering[1].clone(), - ], - JoinSide::Right => vec![ - requirements.adjusted_request_ordering[0].clone(), - parent_required.map(|r| r.to_vec()), - ], - }; - Ok(Some(PlanWithSortRequirements { - plan: plan.clone(), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements.satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: new_adjusted, - })) - } else { - // Can not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )) + let request_child = requirements.adjusted_request_ordering[child_idx].as_deref(); + let child_plan = requirements.plan.children()[child_idx].clone(); + match determine_children_requirement(parent_required, request_child, child_plan) { + 
RequirementsCompatibility::Satisfy => Ok(None), + RequirementsCompatibility::Compatible(adjusted) => { + let new_adjusted = match push_side { + JoinSide::Left => { + vec![adjusted, requirements.adjusted_request_ordering[1].clone()] + } + JoinSide::Right => { + vec![requirements.adjusted_request_ordering[0].clone(), adjusted] + } + }; + Ok(Some(PlanWithSortRequirements { + required_ordering: None, + adjusted_request_ordering: new_adjusted, + ..requirements.clone() + })) + } + RequirementsCompatibility::NonCompatible => { + // Can not push down, add new SortExec + let mut new_plan = requirements.plan.clone(); + add_sort_above(&mut new_plan, sort_expr)?; + Ok(Some( + PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), + )) + } } } @@ -1082,14 +1070,25 @@ fn shift_right_required( } } -#[derive(Debug)] /// This structure stores extra Window information required to create a new WindowExec +#[derive(Debug)] pub struct WindowExecInfo { window_expr: Vec>, input_schema: SchemaRef, partition_keys: Vec>, } +/// Define the Requirements Compatibility +#[derive(Debug)] +pub enum RequirementsCompatibility { + /// Requirements satisfy + Satisfy, + /// Requirements compatible + Compatible(Option>), + /// Requirements not compatible + NonCompatible, +} + /// A TombStoneExec execution plan generated during optimization process, should be removed finally #[derive(Debug)] struct TombStoneExec { From e6c09efba837b96d1040e1c329f05fcd69c55e7b Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 20 Feb 2023 11:17:25 +0300 Subject: [PATCH 10/35] Convert Topdown to pushdown then unify t --- datafusion/core/src/execution/context.rs | 4 +- .../physical_optimizer/dist_enforcement.rs | 9 +- datafusion/core/src/physical_optimizer/mod.rs | 1 - .../src/physical_optimizer/repartition.rs | 6 +- .../physical_optimizer/sort_enforcement.rs | 2033 +++++++++--- .../physical_optimizer/sort_enforcement2.rs | 2813 ----------------- .../core/src/physical_plan/sorts/sort.rs | 6 +- 
datafusion/core/tests/sql/explain_analyze.rs | 1 + datafusion/core/tests/sql/joins.rs | 2 + datafusion/core/tests/sql/window.rs | 42 +- datafusion/physical-expr/src/utils.rs | 4 + 11 files changed, 1605 insertions(+), 3316 deletions(-) delete mode 100644 datafusion/core/src/physical_optimizer/sort_enforcement2.rs diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 4fd55ea571828..0873895f79f65 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -97,7 +97,7 @@ use crate::execution::memory_pool::MemoryPool; use crate::physical_optimizer::global_sort_selection::GlobalSortSelection; use crate::physical_optimizer::pipeline_checker::PipelineChecker; use crate::physical_optimizer::pipeline_fixer::PipelineFixer; -use crate::physical_optimizer::sort_enforcement2::TopDownEnforceSorting; +use crate::physical_optimizer::sort_enforcement::EnforceSorting; use datafusion_optimizer::OptimizerConfig; use datafusion_sql::planner::object_name_to_table_reference; use uuid::Uuid; @@ -1532,7 +1532,7 @@ impl SessionState { // ordering. Please make sure that the whole plan tree is determined before this rule. // Note that one should always run this rule after running the EnforceDistribution rule // as the latter may break local sorting requirements. - Arc::new(TopDownEnforceSorting::new()), + Arc::new(EnforceSorting::new()), // The CoalesceBatches rule will not influence the distribution and ordering of the // whole plan tree. Therefore, to avoid influencing other rules, it should run last. 
Arc::new(CoalesceBatches::new()), diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index 7917339841c52..5ebc55f9667b9 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -971,7 +971,7 @@ mod tests { use super::*; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; - use crate::physical_optimizer::sort_enforcement2::TopDownEnforceSorting; + use crate::physical_optimizer::sort_enforcement::EnforceSorting; use crate::physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; @@ -1157,8 +1157,7 @@ mod tests { // `EnforceSorting` and `EnfoceDistribution`. // TODO: Orthogonalize the tests here just to verify `EnforceDistribution` and create // new tests for the cascade. - let optimizer = TopDownEnforceSorting {}; - let optimized = optimizer.optimize(optimized, &config)?; + let optimized = EnforceSorting::new().optimize(optimized, &config)?; // Now format correctly let plan = displayable(optimized.as_ref()).indent().to_string(); @@ -2092,15 +2091,15 @@ mod tests { // Only two RepartitionExecs added let expected = &[ "SortMergeJoin: join_type=Inner, on=[(Column { name: \"b3\", index: 1 }, Column { name: \"b2\", index: 1 }), (Column { name: \"a3\", index: 0 }, Column { name: \"a2\", index: 0 })]", + "SortExec: expr=[b3@1 ASC,a3@0 ASC], global=false", "ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", "ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - "SortExec: expr=[b1@0 ASC,a1@1 ASC], global=false", "AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 0 }, Column { name: \"a1\", index: 1 }], 10), input_partitions=1", "AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, 
projection=[a, b, c, d, e]", + "SortExec: expr=[b2@1 ASC,a2@0 ASC], global=false", "ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - "SortExec: expr=[b@0 ASC,a@1 ASC], global=false", "AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b\", index: 0 }, Column { name: \"a\", index: 1 }], 10), input_partitions=1", "AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index d0b0a917616e8..3958a546a92df 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -31,7 +31,6 @@ pub mod sort_enforcement; mod utils; pub mod pipeline_fixer; -pub mod sort_enforcement2; #[cfg(test)] pub mod test_utils; diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index d1723d93a363d..be6d869e75a42 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -874,9 +874,7 @@ mod tests { sort_preserving_merge_exec(sort_exec(projection_exec(parquet_exec()), true)); let expected = &[ - "SortPreservingMergeExec: [c1@0 ASC]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: expr=[c1@0 ASC], global=false", + "SortExec: expr=[c1@0 ASC], global=true", "ProjectionExec: expr=[c1@0 as c1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -1058,7 +1056,6 @@ mod tests { // parallelization potentially could break sort order let expected = &[ - "SortPreservingMergeExec: [c1@0 ASC]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[c1@0 ASC], projection=[c1]", ]; @@ -1108,7 +1105,6 @@ mod tests { // data should not be repartitioned / resorted let expected = &[ - "SortPreservingMergeExec: [c1@0 ASC]", 
"ProjectionExec: expr=[c1@0 as c1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[c1@0 ASC], projection=[c1]", ]; diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 06b17b4507bd8..7bc7e957da16a 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -38,7 +38,12 @@ use crate::error::Result; use crate::physical_optimizer::utils::add_sort_above; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use crate::physical_plan::filter::FilterExec; +use crate::physical_plan::joins::utils::JoinSide; +use crate::physical_plan::joins::SortMergeJoinExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use crate::physical_plan::projection::ProjectionExec; +use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::rewrite::TreeNodeRewritable; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; @@ -47,13 +52,18 @@ use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; use arrow::datatypes::SchemaRef; use datafusion_common::{reverse_sort_options, DataFusionError}; +use datafusion_expr::JoinType; +use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, ordering_satisfy, - ordering_satisfy_requirement_concrete, + create_sort_expr_from_requirement, ordering_satisfy, ordering_satisfy_requirement, + ordering_satisfy_requirement_concrete, requirements_compatible, +}; +use datafusion_physical_expr::{ + new_sort_requirements, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, }; -use 
datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use itertools::{concat, izip}; use std::iter::zip; +use std::ops::Deref; use std::sync::Arc; /// This rule inspects `SortExec`'s in the given physical plan and removes the @@ -68,6 +78,28 @@ impl EnforceSorting { } } +/// Checks whether the given executor is a limit; +/// i.e. either a `LocalLimitExec` or a `GlobalLimitExec`. +fn is_limit(plan: &Arc) -> bool { + plan.as_any().is::() || plan.as_any().is::() +} + +/// Checks whether the given executor is a widnow; +/// i.e. either a `WindowAggExec` or a `BoundedWindowAggExec`. +fn is_window(plan: &Arc) -> bool { + plan.as_any().is::() || plan.as_any().is::() +} + +/// Checks whether the given executor is a `SortExec`. +fn is_sort(plan: &Arc) -> bool { + plan.as_any().is::() +} + +/// Checks whether the given executor is a `SortPreservingMergeExec`. +fn is_sort_preserving_merge(plan: &Arc) -> bool { + plan.as_any().is::() +} + /// This object implements a tree that we use while keeping track of paths /// leading to `SortExec`s. #[derive(Debug, Clone)] @@ -149,19 +181,11 @@ impl PlanWithCorrespondingSort { return None; } let is_spm = is_sort_preserving_merge(plan); - let is_union = plan.as_any().is::(); - // If the executor is a `UnionExec`, and it has an output ordering; - // then it at least partially maintains some child's output ordering. - // Therefore, we propagate this information upwards. 
- let partially_maintains = is_union && plan.output_ordering().is_some(); let required_orderings = plan.required_input_ordering(); let flags = plan.maintains_input_order(); let children = izip!(flags, item.sort_onwards, required_orderings) .filter_map(|(maintains, element, required_ordering)| { - if (required_ordering.is_none() - && (maintains || partially_maintains)) - || is_spm - { + if (required_ordering.is_none() && maintains) || is_spm { element } else { None @@ -210,6 +234,96 @@ impl TreeNodeRewritable for PlanWithCorrespondingSort { } } +/// This function enforces sorting requirements and makes optimizations without +/// violating these requirements whenever possible. +fn ensure_sorting( + requirements: PlanWithCorrespondingSort, +) -> Result> { + // Perform naive analysis at the beginning -- remove already-satisfied sorts: + let plan = requirements.plan; + let mut children = plan.children(); + if children.is_empty() { + return Ok(None); + } + let mut sort_onwards = requirements.sort_onwards; + if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { + return Ok(Some(result)); + } + for (idx, (child, sort_onwards, required_ordering)) in izip!( + children.iter_mut(), + sort_onwards.iter_mut(), + plan.required_input_ordering() + ) + .enumerate() + { + let physical_ordering = child.output_ordering(); + match (required_ordering, physical_ordering) { + (Some(required_ordering), Some(physical_ordering)) => { + if !ordering_satisfy_requirement_concrete( + physical_ordering, + &required_ordering, + || child.equivalence_properties(), + ) { + // Make sure we preserve the ordering requirements: + update_child_to_remove_unnecessary_sort( + child, + sort_onwards, + &plan, + idx, + )?; + let sort_expr = create_sort_expr_from_requirement(&required_ordering); + add_sort_above(child, sort_expr)?; + if is_sort(child) { + *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); + } else { + *sort_onwards = None; + } + } + } + (Some(required), None) 
=> { + // Ordering requirement is not met, we should add a `SortExec` to the plan. + let sort_expr = create_sort_expr_from_requirement(&required); + add_sort_above(child, sort_expr)?; + *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); + } + (None, Some(_)) => { + // We have a `SortExec` whose effect may be neutralized by + // another order-imposing operator. Remove this sort. + if !plan.maintains_input_order()[idx] { + update_child_to_remove_unnecessary_sort( + child, + sort_onwards, + &plan, + idx, + )?; + } + } + (None, None) => {} + } + } + // For window expressions, we can remove some sorts when we can + // calculate the result in reverse: + if is_window(&plan) { + if let Some(tree) = &mut sort_onwards[0] { + if let Some(result) = analyze_window_sort_removal(tree, &plan)? { + return Ok(Some(result)); + } + } + } else if is_sort_preserving_merge(&plan) + && children[0].output_partitioning().partition_count() <= 1 + { + // sort preserving merge can removed. Input already has single partition + return Ok(Some(PlanWithCorrespondingSort { + plan: children[0].clone(), + sort_onwards: vec![sort_onwards[0].clone()], + })); + } + Ok(Some(PlanWithCorrespondingSort { + plan: plan.with_new_children(children)?, + sort_onwards, + })) +} + /// This object is used within the [EnforceSorting] rule to track the closest /// `CoalescePartitionsExec` descendant(s) for every child of a plan. #[derive(Debug, Clone)] @@ -219,7 +333,7 @@ struct PlanWithCorrespondingCoalescePartitions { // child until the `CoalescePartitionsExec`(s) -- could be multiple for // n-ary plans like Union -- that affect the output partitioning of the // child. If the child has no connection to any `CoalescePartitionsExec`, - // simplify store None (and not a subtree). + // simply store None (and not a subtree). coalesce_onwards: Vec>, } @@ -249,11 +363,11 @@ impl PlanWithCorrespondingCoalescePartitions { // maintain a single partition. 
If we just saw a `CoalescePartitionsExec` // operator, we reset the tree and start accumulating. let plan = item.plan; - if plan.as_any().is::() { - Some(ExecTree::new(plan, idx, vec![])) - } else if plan.children().is_empty() { + if plan.children().is_empty() { // Plan has no children, there is nothing to propagate. None + } else if plan.as_any().is::() { + Some(ExecTree::new(plan, idx, vec![])) } else { let children = item .coalesce_onwards @@ -313,38 +427,6 @@ impl TreeNodeRewritable for PlanWithCorrespondingCoalescePartitions { } } -/// The boolean flag `repartition_sorts` defined in the config indicates -/// whether we elect to transform CoalescePartitionsExec + SortExec cascades -/// into SortExec + SortPreservingMergeExec cascades, which enables us to -/// perform sorting in parallel. -impl PhysicalOptimizerRule for EnforceSorting { - fn optimize( - &self, - plan: Arc, - config: &ConfigOptions, - ) -> Result> { - let plan_requirements = PlanWithCorrespondingSort::new(plan); - let adjusted = plan_requirements.transform_up(&ensure_sorting)?; - if config.optimizer.repartition_sorts { - let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new(adjusted.plan); - let parallel = - plan_with_coalesce_partitions.transform_up(¶llelize_sorts)?; - Ok(parallel.plan) - } else { - Ok(adjusted.plan) - } - } - - fn name(&self) -> &str { - "EnforceSorting" - } - - fn schema_check(&self) -> bool { - true - } -} - /// This function turns plans of the form /// "SortExec: expr=[a@0 ASC]", /// " CoalescePartitionsExec", @@ -359,142 +441,258 @@ fn parallelize_sorts( requirements: PlanWithCorrespondingCoalescePartitions, ) -> Result> { let plan = requirements.plan; - if plan.children().is_empty() { + let mut coalesce_onwards = requirements.coalesce_onwards; + if plan.children().is_empty() + // We only do action when plan is either SortExec, SortPreservingMergeExec or CoalescePartitionsExec + // all of them have single child. 
If 0th child is `None` we can immediately return. + || coalesce_onwards[0].is_none() + { return Ok(None); } - let mut coalesce_onwards = requirements.coalesce_onwards; // We know that `plan` has children, so `coalesce_onwards` is non-empty. - if coalesce_onwards[0].is_some() { - if (is_sort(&plan) || is_sort_preserving_merge(&plan)) - // Make sure that Sort is actually global sort - && plan.output_partitioning().partition_count() == 1 - { - // If there is a connection between a `CoalescePartitionsExec` and a - // Global Sort that satisfy the requirements (i.e. intermediate - // executors don't require single partition), then we can - // replace the `CoalescePartitionsExec`+ GlobalSort cascade with - // the `SortExec` + `SortPreservingMergeExec` - // cascade to parallelize sorting. - let mut prev_layer = plan.clone(); - update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; - let sort_exprs = get_sort_exprs(&plan)?; - add_sort_above(&mut prev_layer, sort_exprs.to_vec())?; - let spm = SortPreservingMergeExec::new(sort_exprs.to_vec(), prev_layer); - return Ok(Some(PlanWithCorrespondingCoalescePartitions { - plan: Arc::new(spm), - coalesce_onwards: vec![None], - })); - } else if plan.as_any().is::() { - // There is an unnecessary `CoalescePartitionExec` in the plan. - let mut prev_layer = plan.clone(); - update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; - let new_plan = plan.with_new_children(vec![prev_layer])?; - return Ok(Some(PlanWithCorrespondingCoalescePartitions { - plan: new_plan, - coalesce_onwards: vec![None], - })); - } + if (is_sort(&plan) || is_sort_preserving_merge(&plan)) + // Make sure that Sort is actually global sort + && plan.output_partitioning().partition_count() <= 1 + { + // If there is a connection between a `CoalescePartitionsExec` and a + // Global Sort that satisfy the requirements (i.e. 
intermediate + // executors don't require single partition), then we can + // replace the `CoalescePartitionsExec`+ GlobalSort cascade with + // the `SortExec` + `SortPreservingMergeExec` + // cascade to parallelize sorting. + let mut prev_layer = plan.clone(); + update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; + let sort_exprs = get_sort_exprs(&plan)?; + add_sort_above(&mut prev_layer, sort_exprs.to_vec())?; + let spm = SortPreservingMergeExec::new(sort_exprs.to_vec(), prev_layer); + return Ok(Some(PlanWithCorrespondingCoalescePartitions { + plan: Arc::new(spm), + coalesce_onwards: vec![None], + })); + } else if plan.as_any().is::() { + // There is an unnecessary `CoalescePartitionExec` in the plan. + let mut prev_layer = plan.clone(); + update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; + let new_plan = plan.with_new_children(vec![prev_layer])?; + return Ok(Some(PlanWithCorrespondingCoalescePartitions { + plan: new_plan, + coalesce_onwards: vec![None], + })); } + Ok(Some(PlanWithCorrespondingCoalescePartitions { plan, coalesce_onwards, })) } -/// Checks whether the given executor is a limit; -/// i.e. either a `LocalLimitExec` or a `GlobalLimitExec`. -fn is_limit(plan: &Arc) -> bool { - plan.as_any().is::() || plan.as_any().is::() -} - -/// Checks whether the given executor is a `SortExec`. -fn is_sort(plan: &Arc) -> bool { - plan.as_any().is::() +/// This is a "data class" we use within the [TopDownEnforceSorting] rule +#[derive(Debug, Clone)] +struct TopDownSortPushDown { + /// Current plan + plan: Arc, + /// Whether the plan could impact the final result ordering + impact_result_ordering: bool, + /// Parent has the SinglePartition requirement to children + satisfy_single_distribution: bool, + /// Parent required sort ordering + required_ordering: Option>, + /// The adjusted request sort ordering to children. 
+ /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. + adjusted_request_ordering: Vec>>, } -/// Checks whether the given executor is a `SortPreservingMergeExec`. -fn is_sort_preserving_merge(plan: &Arc) -> bool { - plan.as_any().is::() -} +impl TopDownSortPushDown { + pub fn init(plan: Arc) -> Self { + let impact_result_ordering = plan.output_ordering().is_some() + || plan.output_partitioning().partition_count() <= 1 + || is_limit(&plan); + let request_ordering = plan.required_input_ordering(); + TopDownSortPushDown { + plan, + impact_result_ordering, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } + } -/// This function enforces sorting requirements and makes optimizations without -/// violating these requirements whenever possible. -fn ensure_sorting( - requirements: PlanWithCorrespondingSort, -) -> Result> { - // Perform naive analysis at the beginning -- remove already-satisfied sorts: - let plan = requirements.plan; - let mut children = plan.children(); - if children.is_empty() { - return Ok(None); + pub fn new_without_impact_result_ordering(plan: Arc) -> Self { + let request_ordering = plan.required_input_ordering(); + TopDownSortPushDown { + plan, + impact_result_ordering: false, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } } - let mut sort_onwards = requirements.sort_onwards; - if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { - return Ok(Some(result)); + + pub fn children(&self) -> Vec { + let plan_children = self.plan.children(); + assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); + + izip!( + plan_children.into_iter(), + self.adjusted_request_ordering.clone().into_iter(), + self.plan.maintains_input_order().into_iter(), + self.plan.required_input_distribution().into_iter(), + ) + 
.map( + |(child, from_parent, maintains_input_order, required_dist)| { + let child_satisfy_single_distribution = + matches!(required_dist, Distribution::SinglePartition); + let child_impact_result_ordering = if is_limit(&self.plan) { + true + } else { + maintains_input_order && self.impact_result_ordering + }; + let child_request_ordering = child.required_input_ordering(); + TopDownSortPushDown { + plan: child, + impact_result_ordering: child_impact_result_ordering, + satisfy_single_distribution: child_satisfy_single_distribution, + required_ordering: from_parent, + adjusted_request_ordering: child_request_ordering, + } + }, + ) + .collect() } - for (idx, (child, sort_onwards, required_ordering)) in izip!( - children.iter_mut(), - sort_onwards.iter_mut(), - plan.required_input_ordering() - ) - .enumerate() +} + +impl TreeNodeRewritable for TopDownSortPushDown { + fn map_children(self, transform: F) -> Result + where + F: FnMut(Self) -> Result, { - let physical_ordering = child.output_ordering(); - match (required_ordering, physical_ordering) { - (Some(required_ordering), Some(physical_ordering)) => { - let is_ordering_satisfied = ordering_satisfy_requirement_concrete( - physical_ordering, - &required_ordering, - || child.equivalence_properties(), - ); - if !is_ordering_satisfied { - // Make sure we preserve the ordering requirements: - update_child_to_remove_unnecessary_sort(child, sort_onwards, &plan)?; - let sort_expr = create_sort_expr_from_requirement(&required_ordering); - add_sort_above(child, sort_expr)?; - *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); - } - if let Some(tree) = sort_onwards { - // For window expressions, we can remove some sorts when we can - // calculate the result in reverse: - if plan.as_any().is::() - || plan.as_any().is::() - { - if let Some(result) = analyze_window_sort_removal(tree, &plan)? 
{ - return Ok(Some(result)); - } - } - } - } - (Some(required), None) => { - // Ordering requirement is not met, we should add a `SortExec` to the plan. - let sort_expr = create_sort_expr_from_requirement(&required); - add_sort_above(child, sort_expr)?; - *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); - } - (None, Some(_)) => { - // We have a `SortExec` whose effect may be neutralized by - // another order-imposing operator. Remove or update this sort: - if !plan.maintains_input_order()[idx] { - let count = plan.output_ordering().map_or(0, |e| e.len()); - if (count > 0) && !is_sort(&plan) { - update_child_to_change_finer_sort(child, sort_onwards, count)?; - } else { - update_child_to_remove_unnecessary_sort( - child, - sort_onwards, - &plan, - )?; - } - } + let children = self.children(); + if children.is_empty() { + Ok(self) + } else { + let new_children = children + .into_iter() + .map(transform) + .collect::>>()?; + + let children_plans = new_children + .iter() + .map(|elem| elem.plan.clone()) + .collect::>(); + let plan = with_new_children_if_necessary(self.plan, children_plans)?; + Ok(TopDownSortPushDown { + plan, + impact_result_ordering: self.impact_result_ordering, + satisfy_single_distribution: self.satisfy_single_distribution, + required_ordering: self.required_ordering, + adjusted_request_ordering: self.adjusted_request_ordering, + }) + } + } +} + +fn pushdown_sorts( + requirements: TopDownSortPushDown, +) -> Result> { + let plan = &requirements.plan; + let parent_required = requirements.required_ordering.as_deref(); + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + let mut new_plan = plan.clone(); + if !ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { + plan.equivalence_properties() + }) { + // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + new_plan = 
sort_exec.input.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + }; + let required_ordering = new_sort_requirements(new_plan.output_ordering()); + let child = &new_plan.children()[0]; + if let Some(adjusted) = + pushdown_requirement_to_children(child, required_ordering.as_deref())? + { + // Can push down requirements + Ok(Some(TopDownSortPushDown { + plan: child.clone(), + required_ordering, + adjusted_request_ordering: adjusted, + ..requirements + })) + } else { + // Can not push down requirements + Ok(Some( + TopDownSortPushDown::new_without_impact_result_ordering(new_plan), + )) + } + } else { + // Executors other than SortExec + if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { + plan.equivalence_properties() + }) { + Ok(Some(TopDownSortPushDown { + required_ordering: None, + ..requirements + })) + } else { + // Can not satisfy the parent requirements, check whether the requirements can be pushed down. If not, add new SortExec. + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + if let Some(adjusted) = pushdown_requirement_to_children( + plan, + requirements.required_ordering.as_deref(), + )? { + Ok(Some(TopDownSortPushDown { + plan: plan.clone(), + adjusted_request_ordering: adjusted, + ..requirements + })) + } else { + // Can not push down requirements, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + Ok(Some( + TopDownSortPushDown::new_without_impact_result_ordering(new_plan), + )) } - (None, None) => {} } } - Ok(Some(PlanWithCorrespondingSort { - plan: plan.with_new_children(children)?, - sort_onwards, - })) +} + +/// The boolean flag `repartition_sorts` defined in the config indicates +/// whether we elect to transform CoalescePartitionsExec + SortExec cascades +/// into SortExec + SortPreservingMergeExec cascades, which enables us to +/// perform sorting in parallel. 
+impl PhysicalOptimizerRule for EnforceSorting { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + let plan_requirements = PlanWithCorrespondingSort::new(plan); + let adjusted = plan_requirements.transform_up(&ensure_sorting)?; + let new_plan = if config.optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new(adjusted.plan); + let parallel = + plan_with_coalesce_partitions.transform_up(¶llelize_sorts)?; + parallel.plan + } else { + adjusted.plan + }; + // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: + let sort_pushdown = TopDownSortPushDown::init(new_plan); + let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; + Ok(adjusted.plan) + } + + fn name(&self) -> &str { + "EnforceSorting" + } + + fn schema_check(&self) -> bool { + true + } } /// Analyzes a given `SortExec` (`plan`) to determine whether its input already @@ -691,6 +889,7 @@ fn update_child_to_remove_unnecessary_sort( child: &mut Arc, sort_onwards: &mut Option, parent: &Arc, + child_idx: usize, ) -> Result<()> { if let Some(sort_onwards) = sort_onwards { let requires_single_partition = matches!( @@ -703,6 +902,14 @@ fn update_child_to_remove_unnecessary_sort( )?; } *sort_onwards = None; + // Deleting sort may invalidate distribution + let requires_single_partition = matches!( + parent.required_input_distribution()[child_idx], + Distribution::SinglePartition + ); + if requires_single_partition && child.output_partitioning().partition_count() > 1 { + *child = Arc::new(CoalescePartitionsExec::new(child.clone())) as _; + } Ok(()) } @@ -740,50 +947,6 @@ fn remove_corresponding_sort_from_sub_plan( } } -/// Updates child to modify the unnecessarily fine sorting below it. 
-fn update_child_to_change_finer_sort( - child: &mut Arc, - sort_onwards: &mut Option, - n_sort_expr: usize, -) -> Result<()> { - if let Some(sort_onwards) = sort_onwards { - *child = change_finer_sort_in_sub_plan(sort_onwards, n_sort_expr)?; - } - Ok(()) -} - -/// Change the unnecessarily fine sort in `sort_onwards`. -fn change_finer_sort_in_sub_plan( - sort_onwards: &mut ExecTree, - n_sort_expr: usize, -) -> Result> { - let plan = &sort_onwards.plan; - // A `SortExec` is always at the bottom of the tree. - if is_sort(plan) { - let mut prev_layer = plan.children()[0].clone(); - let new_sort_expr = get_sort_exprs(plan)?[0..n_sort_expr].to_vec(); - add_sort_above(&mut prev_layer, new_sort_expr)?; - *sort_onwards = ExecTree::new(prev_layer.clone(), sort_onwards.idx, vec![]); - Ok(prev_layer) - } else { - let mut children = plan.children(); - for item in &mut sort_onwards.children { - children[item.idx] = change_finer_sort_in_sub_plan(item, n_sort_expr)?; - } - if is_sort_preserving_merge(plan) { - let new_sort_expr = get_sort_exprs(plan)?[0..n_sort_expr].to_vec(); - let updated_plan = Arc::new(SortPreservingMergeExec::new( - new_sort_expr, - children[0].clone(), - )) as Arc; - sort_onwards.plan = updated_plan.clone(); - Ok(updated_plan) - } else { - plan.clone().with_new_children(children) - } - } -} - /// Converts an [ExecutionPlan] trait object to a [PhysicalSortExpr] slice when possible. 
fn get_sort_exprs(sort_any: &Arc) -> Result<&[PhysicalSortExpr]> { if let Some(sort_exec) = sort_any.as_any().downcast_ref::() { @@ -888,6 +1051,286 @@ fn check_alignment( } } +fn pushdown_requirement_to_children( + plan: &Arc, + parent_required: Option<&[PhysicalSortRequirements]>, +) -> Result>>>> { + let maintains_input_order = plan.maintains_input_order(); + if is_window(plan) { + let required_input_ordering = plan.required_input_ordering(); + let request_child = required_input_ordering[0].as_deref(); + let child_plan = plan.children()[0].clone(); + match determine_children_requirement(parent_required, request_child, child_plan) { + RequirementsCompatibility::Satisfy => { + Ok(Some(vec![request_child.map(|r| r.to_vec())])) + } + RequirementsCompatibility::Compatible(adjusted) => Ok(Some(vec![adjusted])), + RequirementsCompatibility::NonCompatible => Ok(None), + } + } else if plan.as_any().is::() { + // UnionExec does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and + // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec + Ok(Some(vec![ + parent_required.map(|elem| elem.to_vec()); + plan.children().len() + ])) + } else if let Some(smj) = plan.as_any().downcast_ref::() { + // If the current plan is SortMergeJoinExec + let left_columns_len = smj.left.schema().fields().len(); + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + let expr_source_side = + expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); + match expr_source_side { + Some(JoinSide::Left) if maintains_input_order[0] => { + try_pushdown_requirements_to_join( + plan, + parent_required, + parent_required_expr, + JoinSide::Left, + ) + } + Some(JoinSide::Right) if maintains_input_order[1] => { + let new_right_required = match smj.join_type { + JoinType::Inner | JoinType::Right => { + 
shift_right_required(parent_required.unwrap(), left_columns_len)? + } + JoinType::RightSemi | JoinType::RightAnti => { + parent_required.unwrap().to_vec() + } + _ => Err(DataFusionError::Plan( + "Unexpected SortMergeJoin type here".to_string(), + ))?, + }; + try_pushdown_requirements_to_join( + plan, + Some(new_right_required.deref()), + parent_required_expr, + JoinSide::Right, + ) + } + _ => { + // Can not decide the expr side for SortMergeJoinExec, can not push down + Ok(None) + } + } + } else if maintains_input_order.is_empty() + || !maintains_input_order.iter().any(|o| *o) + || plan.as_any().is::() + || plan.as_any().is::() + // TODO: Add support for Projection push down + || plan.as_any().is::() + || is_limit(plan) + { + // If the current plan is a leaf node or cannot maintain any of the input ordering, the requirements cannot be pushed down. + // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. + // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. + // Pushing down is not beneficial + Ok(None) + } else { + Ok(Some(vec![ + parent_required.map(|elem| elem.to_vec()); + plan.children().len() + ])) + } + // // TODO: Add support for Projection push down + // else if let Some(ProjectionExec { expr, .. 
}) = + // plan.as_any().downcast_ref::() + // { + // // For Projection, we need to transform the requirements to the columns before the Projection + // // And then to push down the requirements + // let new_adjusted = map_requirement_before_projection(parent_required, expr); + // if new_adjusted.is_some() { + // Ok(Some(vec![new_adjusted])) + // } else { + // Ok(None) + // } + // } +} + +/// Determine the children requirements +/// If the children requirements are more specific, do not push down the parent requirements +/// If the parent requirements are more specific, push down the parent requirements +/// If they are not compatible, need to add Sort. +fn determine_children_requirement( + parent_required: Option<&[PhysicalSortRequirements]>, + request_child: Option<&[PhysicalSortRequirements]>, + child_plan: Arc, +) -> RequirementsCompatibility { + if requirements_compatible(request_child, parent_required, || { + child_plan.equivalence_properties() + }) { + // request child requirements are more specific, no need to push down the parent requirements + RequirementsCompatibility::Satisfy + } else if requirements_compatible(parent_required, request_child, || { + child_plan.equivalence_properties() + }) { + // parent requirements are more specific, adjust the request child requirements and push down the new requirements + let adjusted = parent_required.map(|r| r.to_vec()); + RequirementsCompatibility::Compatible(adjusted) + } else { + RequirementsCompatibility::NonCompatible + } +} + +fn try_pushdown_requirements_to_join( + plan: &Arc, + parent_required: Option<&[PhysicalSortRequirements]>, + sort_expr: Vec, + push_side: JoinSide, +) -> Result>>>> { + let child_idx = match push_side { + JoinSide::Left => 0, + JoinSide::Right => 1, + }; + let required_input_ordering = plan.required_input_ordering(); + let request_child = required_input_ordering[child_idx].as_deref(); + let child_plan = plan.children()[child_idx].clone(); + match 
determine_children_requirement(parent_required, request_child, child_plan) { + RequirementsCompatibility::Satisfy => Ok(None), + RequirementsCompatibility::Compatible(adjusted) => { + let new_adjusted = match push_side { + JoinSide::Left => { + vec![adjusted, required_input_ordering[1].clone()] + } + JoinSide::Right => { + vec![required_input_ordering[0].clone(), adjusted] + } + }; + Ok(Some(new_adjusted)) + } + RequirementsCompatibility::NonCompatible => { + // Can not push down, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, sort_expr)?; + Ok(None) + } + } +} + +fn expr_source_sides( + required_exprs: &[PhysicalSortExpr], + join_type: JoinType, + left_columns_len: usize, +) -> Option { + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + let all_column_sides = required_exprs + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() < left_columns_len { + Some(JoinSide::Left) + } else { + Some(JoinSide::Right) + } + } else { + None + } + }) + .collect::>(); + + // If the exprs are all coming from one side, the requirements can be pushed down + if all_column_sides.len() != required_exprs.len() { + None + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Left)) + { + Some(JoinSide::Left) + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Right)) + { + Some(JoinSide::Right) + } else { + None + } + } + JoinType::LeftSemi | JoinType::LeftAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Left) + } else { + None + } + }) + .count() + != required_exprs.len() + { + None + } else { + Some(JoinSide::Left) + } + } + JoinType::RightSemi | JoinType::RightAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Right) + } else { + None + } + }) + .count() + 
!= required_exprs.len() + { + None + } else { + Some(JoinSide::Right) + } + } + } +} + +fn shift_right_required( + parent_required: &[PhysicalSortRequirements], + left_columns_len: usize, +) -> Result> { + let new_right_required: Vec = parent_required + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() >= left_columns_len { + Some(PhysicalSortRequirements { + expr: Arc::new(Column::new( + col.name(), + col.index() - left_columns_len, + )) as Arc, + sort_options: r.sort_options, + }) + } else { + None + } + } else { + None + } + }) + .collect::>(); + if new_right_required.len() != parent_required.len() { + Err(DataFusionError::Plan( + "Expect to shift all the parent required column indexes for SortMergeJoin" + .to_string(), + )) + } else { + Ok(new_right_required) + } +} + +/// Define the Requirements Compatibility +#[derive(Debug)] +pub enum RequirementsCompatibility { + /// Requirements satisfy + Satisfy, + /// Requirements compatible + Compatible(Option>), + /// Requirements not compatible + NonCompatible, +} + #[cfg(test)] mod tests { use super::*; @@ -898,6 +1341,8 @@ mod tests { use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; use crate::physical_plan::filter::FilterExec; + use crate::physical_plan::joins::utils::JoinOn; + use crate::physical_plan::joins::SortMergeJoinExec; use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; @@ -908,7 +1353,9 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::{Result, Statistics}; + use datafusion_expr::JoinType; use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; + use datafusion_physical_expr::expressions::Column; use 
datafusion_physical_expr::expressions::{col, NotExpr}; use datafusion_physical_expr::PhysicalSortExpr; use std::sync::Arc; @@ -921,6 +1368,13 @@ mod tests { Ok(schema) } + fn create_test_schema2() -> Result { + let col_a = Field::new("col_a", DataType::Int32, true); + let col_b = Field::new("col_b", DataType::Int32, true); + let schema = Arc::new(Schema::new(vec![col_a, col_b])); + Ok(schema) + } + // Util function to get string representation of a physical plan fn get_plan_string(plan: &Arc) -> Vec { let formatted = displayable(plan.as_ref()).indent().to_string(); @@ -1039,6 +1493,7 @@ mod tests { // Run the actual optimizer let optimized_physical_plan = EnforceSorting::new().optimize(physical_plan, state.config_options())?; + // Get string representation of the plan let actual = get_plan_string(&optimized_physical_plan); assert_eq!( @@ -1049,85 +1504,6 @@ mod tests { }; } - #[tokio::test] - async fn test_remove_unnecessary_sort() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &source.schema(), - SortOptions { - descending: true, - nulls_first: true, - }, - )]; - let sort = sort_exec(sort_exprs.clone(), 
source); - - let window_agg = window_exec("non_nullable_col", sort_exprs, sort); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &window_agg.schema(), - SortOptions { - descending: false, - nulls_first: false, - }, - )]; - - let sort = sort_exec(sort_exprs.clone(), window_agg); - - // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before - let filter = filter_exec( - Arc::new(NotExpr::new( - col("non_nullable_col", schema.as_ref()).unwrap(), - )), - sort, - ); - - // let filter_exec = sort_exec; - let physical_plan = window_exec("non_nullable_col", sort_exprs, filter); - - let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " FilterExec: NOT non_nullable_col@1", - " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - " FilterExec: NOT non_nullable_col@1", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", - " 
MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - #[tokio::test] async fn test_add_required_sort() -> Result<()> { let schema = create_test_schema()?; @@ -1142,37 +1518,28 @@ mod tests { " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } #[tokio::test] - async fn test_remove_unnecessary_sort1() -> Result<()> { + async fn test_remove_unnecessary_sort() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), spm); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortPreservingMergeExec: 
[nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) @@ -1230,7 +1597,12 @@ mod tests { sort_expr("non_nullable_col", &schema), ]; let repartition_exec = repartition_exec(spm); - let sort2 = sort_exec(sort_exprs.clone(), repartition_exec); + let sort2 = Arc::new(SortExec::new_with_partitioning( + sort_exprs.clone(), + repartition_exec, + true, + None, + )) as _; let spm2 = sort_preserving_merge_exec(sort_exprs, sort2); let physical_plan = aggregate_exec(spm2); @@ -1241,7 +1613,7 @@ mod tests { let expected_input = vec![ "AggregateExec: mode=Final, gby=[], aggr=[]", " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " SortPreservingMergeExec: [non_nullable_col@1 ASC]", " SortExec: expr=[non_nullable_col@1 ASC], global=true", @@ -1277,82 +1649,247 @@ mod tests { " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } #[tokio::test] - async fn test_do_not_remove_sort_with_limit() -> Result<()> { + async fn test_remove_unnecessary_sort5() -> Result<()> { let schema = create_test_schema()?; + let source = memory_exec(&schema); - let source1 = 
parquet_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort = sort_exec(sort_exprs.clone(), source1); - let limit = local_limit_exec(sort); - let limit = global_limit_exec(limit); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); - - let union = union_exec(vec![source2, limit]); - let repartition = repartition_exec(union); - let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let input2 = sort_exec( + vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ], + input, + ); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; - - // We should keep the bottom `SortExec`. 
- let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + // Keep the middle SortExec + let expected_optimized = [ + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } #[tokio::test] - async fn test_change_wrong_sorting() -> Result<()> { + async fn test_remove_unnecessary_sort6() -> Result<()> { let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort = sort_exec(vec![sort_exprs[0].clone()], source); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let source1 = repartition_exec(memory_exec(&schema)); + + let source2 = repartition_exec(memory_exec(&schema)); + let union = union_exec(vec![source1, source2]); + + let sort_exprs = vec![sort_expr("non_nullable_col", &schema)]; + // let sort = sort_exec(sort_exprs.clone(), union); + let sort = Arc::new(SortExec::new_with_partitioning( + sort_exprs.clone(), + union, + true, + None, + )) as _; + let spm = sort_preserving_merge_exec(sort_exprs, sort); + + let filter = filter_exec( + Arc::new(NotExpr::new( + col("non_nullable_col", schema.as_ref()).unwrap(), + )), + spm, + ); + + let sort_exprs = vec![ + 
sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let physical_plan = sort_exec(sort_exprs, filter); + + // When removing a `SortPreservingMergeExec`, make sure that partitioning + // requirements are not violated. In some cases, we may need to replace + // it with a `CoalescePartitionsExec` instead of directly removing it. let expected_input = vec![ + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " FilterExec: NOT non_nullable_col@1", + " SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " SortExec: expr=[non_nullable_col@1 ASC], global=false", + " UnionExec", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + + let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " FilterExec: NOT non_nullable_col@1", + " UnionExec", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_spm1() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let input = sort_preserving_merge_exec( + vec![sort_expr("non_nullable_col", &schema)], + source, + ); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); + + let expected_input = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: 
[non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_remove_unnecessary_spm2() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let input = sort_preserving_merge_exec( + vec![sort_expr("non_nullable_col", &schema)], + source, + ); + let input2 = sort_preserving_merge_exec( + vec![sort_expr("non_nullable_col", &schema)], + input, + ); + let physical_plan = + sort_preserving_merge_exec(vec![sort_expr("nullable_col", &schema)], input2); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_do_not_remove_sort_with_limit() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort = sort_exec(sort_exprs.clone(), source1); + let limit = limit_exec(sort); + + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); + + let union = union_exec(vec![source2, limit]); + let repartition = repartition_exec(union); + let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); + + let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 
ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + + // We should keep the bottom `SortExec`. + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_change_wrong_sorting() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort = sort_exec(vec![sort_exprs[0].clone()], source); + let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=true", " MemoryExec: partitions=0, 
partition_sizes=[]", ]; + let expected_optimized = vec![ + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_change_wrong_sorting2() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let spm1 = sort_preserving_merge_exec(sort_exprs.clone(), source); + let sort2 = sort_exec(vec![sort_exprs[0].clone()], spm1); + let physical_plan = + sort_preserving_merge_exec(vec![sort_exprs[1].clone()], sort2); + + let expected_input = vec![ + "SortPreservingMergeExec: [non_nullable_col@1 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortExec: expr=[non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } @@ -1434,8 +1971,7 @@ mod tests { // Input is an invalid plan. In this case rule should add required sorting in appropriate places. // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy required ordering - // of SortPreservingMergeExec. Hence rule should remove unnecessary sort for second child of the UnionExec - // and put a sort above Union to satisfy required ordering. + // of SortPreservingMergeExec. 
let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", @@ -1443,12 +1979,13 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - // should remove unnecessary sorting from below and move it to top + let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", - " UnionExec", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1532,115 +2069,565 @@ mod tests { " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 
ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted5() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort2 = sort_exec(sort_exprs2, source1); + + let union = union_exec(vec![sort1, sort2]); + let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); + + // The `UnionExec` doesn't preserve any of the inputs ordering in the + // example below. However, we should be able to change the unnecessarily + // fine `SortExec`s below with required `SortExec`s that are absolutely necessary. 
+ let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + #[ignore] + async fn test_union_inputs_different_sorted6() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let repartition = repartition_exec(source1); + let spm = sort_preserving_merge_exec(sort_exprs2, repartition); + + let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); + + let union = union_exec(vec![sort1, source2, spm]); + let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); + + // The plan is not valid as it is -- the input ordering requirement + // of the `SortPreservingMergeExec` under the third child of the + // `UnionExec` is not met. We should add a `SortExec` below it. 
+ // At the same time, this ordering requirement is unnecessarily fine. + // The final plan should be valid AND the ordering of the third child + // shouldn't be finer than necessary. + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + // Should adjust the requirement in the third input of the union so + // that it is not unnecessarily fine. + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted7() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs3 = 
vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1.clone(), source1.clone()); + let sort2 = sort_exec(sort_exprs1, source1); + + let union = union_exec(vec![sort1, sort2]); + let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); + + // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_input, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted8() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![ + sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + let sort2 = sort_exec(sort_exprs2, source1); + + let physical_plan = union_exec(vec![sort1, sort2]); + + // The `UnionExec` doesn't preserve any of the inputs ordering in the + // example below. 
+ let expected_input = vec![ + "UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_union_inputs_different_sorted_with_limit() -> Result<()> { + let schema = create_test_schema()?; + + let source1 = parquet_exec(&schema); + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![ + sort_expr("nullable_col", &schema), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + ), + ]; + let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; + let sort1 = sort_exec(sort_exprs1, source1.clone()); + + let sort2 = sort_exec(sort_exprs2, source1); + let limit = local_limit_exec(sort2); + let limit = global_limit_exec(limit); + + let union = union_exec(vec![sort1, limit]); + let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); + + // Should not change the unnecessarily fine `SortExec`s because there is `LimitExec` + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " 
GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " GlobalLimitExec: skip=0, fetch=100", + " LocalLimitExec: fetch=100", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_sort_merge_join_order_by_left() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", &right.schema()).unwrap(), + )]; + + let join_types = vec![ + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::Full, + JoinType::LeftSemi, + JoinType::LeftAnti, + ]; + for join_type in join_types { + let join = + sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); + let sort_exprs = vec![ + sort_expr("nullable_col", &join.schema()), + sort_expr("non_nullable_col", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs.clone(), join); + + let join_plan = + format!("SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let join_plan2 = + 
format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", + join_plan2.as_str(), + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + let expected_optimized = match join_type { + JoinType::Inner + | JoinType::Left + | JoinType::LeftSemi + | JoinType::LeftAnti => { + // can push down the sort requirements and save 1 SortExec + vec![ + join_plan.as_str(), + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + _ => { + // can not push down the sort requirements + vec![ + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + join_plan2.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + }; + assert_optimized!(expected_input, expected_optimized, physical_plan); + } + Ok(()) + } + + #[tokio::test] + async fn test_sort_merge_join_order_by_right() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", 
&right.schema()).unwrap(), + )]; + + let join_types = vec![ + JoinType::Inner, + JoinType::Left, + JoinType::Right, + JoinType::Full, + JoinType::RightAnti, + ]; + for join_type in join_types { + let join = + sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); + let sort_exprs = vec![ + sort_expr("col_a", &join.schema()), + sort_expr("col_b", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs, join); + + let join_plan = + format!("SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let spm_plan = match join_type { + JoinType::RightAnti => { + "SortPreservingMergeExec: [col_a@0 ASC,col_b@1 ASC]" + } + _ => "SortPreservingMergeExec: [col_a@2 ASC,col_b@3 ASC]", + }; + let join_plan2 = + format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); + let expected_input = vec![ + spm_plan, + join_plan2.as_str(), + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + let expected_optimized = match join_type { + JoinType::Inner | JoinType::Right | JoinType::RightAnti => { + // can push down the sort requirements and save 1 SortExec + vec![ + join_plan.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC,col_b@1 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + _ => { + // can not push down the sort requirements for Left and Full join. 
+ vec![ + "SortExec: expr=[col_a@2 ASC,col_b@3 ASC], global=true", + join_plan2.as_str(), + " SortExec: expr=[nullable_col@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ] + } + }; + assert_optimized!(expected_input, expected_optimized, physical_plan); + } + Ok(()) + } + + #[tokio::test] + async fn test_sort_merge_join_complex_order_by() -> Result<()> { + let left_schema = create_test_schema()?; + let right_schema = create_test_schema2()?; + + let left = parquet_exec(&left_schema); + let right = parquet_exec(&right_schema); + + // Join on (nullable_col == col_a) + let join_on = vec![( + Column::new_with_schema("nullable_col", &left.schema()).unwrap(), + Column::new_with_schema("col_a", &right.schema()).unwrap(), + )]; + + let join = sort_merge_join_exec(left, right, &join_on, &JoinType::Inner); + + // order by (col_b, col_a) + let sort_exprs1 = vec![ + sort_expr("col_b", &join.schema()), + sort_expr("col_a", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs1, join.clone()); + + let expected_input = vec![ + "SortPreservingMergeExec: [col_b@3 ASC,col_a@2 ASC]", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + + // can not push down the sort requirements, need to add SortExec + let expected_optimized = vec![ + "SortExec: expr=[col_b@3 ASC,col_a@2 ASC], global=true", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " 
ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + + // order by (nullable_col, col_b, col_a) + let sort_exprs2 = vec![ + sort_expr("nullable_col", &join.schema()), + sort_expr("col_b", &join.schema()), + sort_expr("col_a", &join.schema()), + ]; + let physical_plan = sort_preserving_merge_exec(sort_exprs2, join); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", + ]; + + // can not push down the sort requirements, need to add SortExec + let expected_optimized = vec![ + "SortExec: expr=[nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC], global=true", + " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", + " SortExec: expr=[nullable_col@0 ASC], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[col_a@0 ASC], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) } #[tokio::test] - async fn test_union_inputs_different_sorted5() -> Result<()> { + async fn test_not_remove_top_sort_window_multilayer() -> Result<()> { let schema = create_test_schema()?; + let source = memory_exec(&schema); - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - 
sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - ), - ]; - let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort2 = sort_exec(sort_exprs2, source1); + let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &source.schema(), + SortOptions { + descending: true, + nulls_first: true, + }, + )]; + let sort = sort_exec(sort_exprs.clone(), source); - let union = union_exec(vec![sort1, sort2]); - let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); + let window_agg = bounded_window_exec("non_nullable_col", sort_exprs, sort); + + let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &window_agg.schema(), + SortOptions { + descending: false, + nulls_first: false, + }, + )]; + + let sort = sort_exec(sort_exprs.clone(), window_agg); + + // Add dummy layer propagating Sort above, the top Sort should not be removed + let filter = filter_exec( + Arc::new(NotExpr::new( + col("non_nullable_col", schema.as_ref()).unwrap(), + )), + sort, + ); + + // let filter_exec = sort_exec; + let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs, filter); - // The `UnionExec` doesn't preserve any of the inputs ordering in the - // example below. However, we should be able to change the unnecessarily - // fine `SortExec`s below with required `SortExec`s that are absolutely necessary. 
let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " FilterExec: NOT non_nullable_col@1", + " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; + let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " FilterExec: NOT non_nullable_col@1", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) } #[tokio::test] - async fn test_union_inputs_different_sorted6() -> Result<()> { + async fn test_multiple_sort_window_exec() -> Result<()> { let schema = create_test_schema()?; + let source = memory_exec(&schema); - let source1 = parquet_exec(&schema); let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); let sort_exprs2 = vec![ sort_expr("nullable_col", &schema), sort_expr("non_nullable_col", &schema), ]; - let repartition = repartition_exec(source1); - let spm = sort_preserving_merge_exec(sort_exprs2, repartition); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); - let union = union_exec(vec![sort1, source2, spm]); - let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); + let sort1 = sort_exec(sort_exprs1.clone(), source); + let window_agg1 = + bounded_window_exec("non_nullable_col", sort_exprs1.clone(), sort1); + let window_agg2 = + bounded_window_exec("non_nullable_col", sort_exprs2, window_agg1); + // let filter_exec = sort_exec; + let physical_plan = + bounded_window_exec("non_nullable_col", sort_exprs1, window_agg2); - // The plan is not valid as it is -- the input ordering requirement - // of the `SortPreservingMergeExec` under the third child of the - // `UnionExec` is not met. We should add a `SortExec` below it. - // At the same time, this ordering requirement is unnecessarily fine. - // The final plan should be valid AND the ordering of the third child - // shouldn't be finer than necessary. 
let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; - // Should adjust the requirement in the third input of the union so - // that it is not unnecessarily fine. 
+ let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) @@ -1670,14 +2657,14 @@ mod tests { let sort2 = sort_exec(sort_exprs3.clone(), source2); let union = union_exec(vec![sort1, sort2]); - let physical_plan = window_exec("nullable_col", sort_exprs3, union); + let physical_plan = bounded_window_exec("nullable_col", sort_exprs3, union); // The `WindowAggExec` gets its sorting from multiple children jointly. 
// During the removal of `SortExec`s, it should be able to remove the // corresponding SortExecs together. Also, the inputs of these `SortExec`s // are not necessarily the same to be able to remove them. let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " UnionExec", " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", @@ -1694,6 +2681,55 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_window_multi_path_sort2() -> Result<()> { + let schema = create_test_schema()?; + + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + // reverse sorting of sort_exprs2 + let reversed_sort_exprs2 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; + let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); + let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); + let sort2 = sort_exec(reversed_sort_exprs2, source2); + + let union = union_exec(vec![sort1, sort2]); + let coalesce = Arc::new(CoalescePartitionsExec::new(union)) as _; + let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, coalesce); + + // The `WindowAggExec` can get its required sorting from the leaf nodes 
directly. + // The unnecessary SortExecs should be removed + let expected_input = vec![ + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " CoalescePartitionsExec", + " UnionExec", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_multilayer_coalesce_partitions() -> Result<()> { let schema = create_test_schema()?; @@ -1732,6 +2768,47 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_coalesce_propagate() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let repartition = repartition_exec(source); + let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(repartition)); + 
let repartition = repartition_exec(coalesce_partitions); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + // Add local sort + let sort = Arc::new(SortExec::new_with_partitioning( + sort_exprs.clone(), + repartition, + true, + None, + )) as _; + let spm = sort_preserving_merge_exec(sort_exprs.clone(), sort); + let sort = sort_exec(sort_exprs, spm); + + let physical_plan = sort.clone(); + // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort + // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec. + let expected_input = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " CoalescePartitionsExec", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] // With new change in SortEnforcement EnforceSorting->EnforceDistribution->EnforceSorting // should produce same result with EnforceDistribution+EnforceSorting @@ -1746,7 +2823,7 @@ mod tests { let memory_exec = memory_exec(&schema); let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let window = window_exec("nullable_col", sort_exprs.clone(), memory_exec); + let window = bounded_window_exec("nullable_col", sort_exprs.clone(), memory_exec); let repartition = repartition_exec(window); let orig_plan = 
Arc::new(SortExec::new_with_partitioning( @@ -1781,47 +2858,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_coalesce_propagate() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let repartition = repartition_exec(source); - let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(repartition)); - let repartition = repartition_exec(coalesce_partitions); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - // Add local sort - let sort = Arc::new(SortExec::new_with_partitioning( - sort_exprs.clone(), - repartition, - true, - None, - )) as _; - let spm = sort_preserving_merge_exec(sort_exprs.clone(), sort); - let sort = sort_exec(sort_exprs, spm); - - let physical_plan = sort.clone(); - // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort - // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec. - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - /// make PhysicalSortExpr with default options fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr { sort_expr_options(name, schema, SortOptions::default()) @@ -1866,7 +2902,7 
@@ mod tests { Arc::new(FilterExec::try_new(predicate, input).unwrap()) } - fn window_exec( + fn bounded_window_exec( col_name: &str, sort_exprs: impl IntoIterator, input: Arc, @@ -1875,7 +2911,7 @@ mod tests { let schema = input.schema(); Arc::new( - WindowAggExec::try_new( + BoundedWindowAggExec::try_new( vec![create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Count), "count".to_owned(), @@ -1942,6 +2978,10 @@ mod tests { Arc::new(UnionExec::new(input)) } + fn limit_exec(input: Arc) -> Arc { + global_limit_exec(local_limit_exec(input)) + } + fn local_limit_exec(input: Arc) -> Arc { Arc::new(LocalLimitExec::new(input, 100)) } @@ -1969,4 +3009,23 @@ mod tests { .unwrap(), ) } + + fn sort_merge_join_exec( + left: Arc, + right: Arc, + join_on: &JoinOn, + join_type: &JoinType, + ) -> Arc { + Arc::new( + SortMergeJoinExec::try_new( + left, + right, + join_on.clone(), + *join_type, + vec![SortOptions::default(); join_on.len()], + false, + ) + .unwrap(), + ) + } } diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs b/datafusion/core/src/physical_optimizer/sort_enforcement2.rs deleted file mode 100644 index e2d5e1d8c7251..0000000000000 --- a/datafusion/core/src/physical_optimizer/sort_enforcement2.rs +++ /dev/null @@ -1,2813 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -//! EnforceSorting optimizer rule inspects the physical plan with respect -//! to local sorting requirements and does the following: -//! - Adds a [SortExec] when a requirement is not met, -//! - Removes an already-existing [SortExec] if it is possible to prove -//! that this sort is unnecessary -//! The rule can work on valid *and* invalid physical plans with respect to -//! sorting requirements, but always produces a valid physical plan in this sense. -//! -//! A non-realistic but easy to follow example for sort removals: Assume that we -//! somehow get the fragment -//! -//! ```text -//! SortExec: expr=[nullable_col@0 ASC] -//! SortExec: expr=[non_nullable_col@1 ASC] -//! ``` -//! -//! in the physical plan. The child sort is unnecessary since its result is overwritten -//! by the parent SortExec. Therefore, this rule removes it from the physical plan. -use crate::config::ConfigOptions; -use crate::error::Result; -use crate::execution::context::TaskContext; -use crate::physical_optimizer::utils::add_sort_above; -use crate::physical_optimizer::PhysicalOptimizerRule; -use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use crate::physical_plan::filter::FilterExec; -use crate::physical_plan::joins::utils::JoinSide; -use crate::physical_plan::joins::SortMergeJoinExec; -use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use crate::physical_plan::projection::ProjectionExec; -use crate::physical_plan::repartition::RepartitionExec; -use crate::physical_plan::rewrite::TreeNodeRewritable; -use crate::physical_plan::sorts::sort::SortExec; -use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use crate::physical_plan::union::UnionExec; -use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; -use crate::physical_plan::{ - with_new_children_if_necessary, DisplayFormatType, 
Distribution, ExecutionPlan, - Partitioning, SendableRecordBatchStream, -}; -use arrow::datatypes::SchemaRef; -use datafusion_common::{reverse_sort_options, DataFusionError, Statistics}; -use datafusion_expr::JoinType; -use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, map_requirement_before_projection, - ordering_satisfy, ordering_satisfy_requirement, requirements_compatible, -}; -use datafusion_physical_expr::window::WindowExpr; -use datafusion_physical_expr::{ - new_sort_requirements, EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, - PhysicalSortRequirements, -}; -use itertools::izip; -use std::any::Any; -use std::ops::Deref; -use std::sync::Arc; - -/// This rule implements a Top-Down approach to inspects SortExec's in the given physical plan and removes the -/// ones it can prove unnecessary. -#[derive(Default)] -pub struct TopDownEnforceSorting {} - -impl TopDownEnforceSorting { - #[allow(missing_docs)] - pub fn new() -> Self { - Self {} - } -} - -/// This is a "data class" we use within the [TopDownEnforceSorting] rule -#[derive(Debug, Clone)] -struct PlanWithSortRequirements { - /// Current plan - plan: Arc, - /// Whether the plan could impact the final result ordering - impact_result_ordering: bool, - /// Parent has the SinglePartition requirement to children - satisfy_single_distribution: bool, - /// Parent required sort ordering - required_ordering: Option>, - /// The adjusted request sort ordering to children. - /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. 
- adjusted_request_ordering: Vec>>, -} - -impl PlanWithSortRequirements { - pub fn init(plan: Arc) -> Self { - let impact_result_ordering = plan.output_ordering().is_some() - || plan.output_partitioning().partition_count() <= 1 - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some(); - let request_ordering = plan.required_input_ordering(); - PlanWithSortRequirements { - plan, - impact_result_ordering, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: request_ordering, - } - } - - pub fn new_without_impact_result_ordering(plan: Arc) -> Self { - let request_ordering = plan.required_input_ordering(); - PlanWithSortRequirements { - plan, - impact_result_ordering: false, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: request_ordering, - } - } - - pub fn children(&self) -> Vec { - let plan_children = self.plan.children(); - assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); - - izip!( - plan_children.into_iter(), - self.adjusted_request_ordering.clone().into_iter(), - self.plan.maintains_input_order().into_iter(), - self.plan.required_input_distribution().into_iter(), - ) - .map( - |(child, from_parent, maintains_input_order, required_dist)| { - let child_satisfy_single_distribution = - matches!(required_dist, Distribution::SinglePartition); - let child_impact_result_ordering = if self - .plan - .as_any() - .downcast_ref::() - .is_some() - || self - .plan - .as_any() - .downcast_ref::() - .is_some() - { - true - } else { - maintains_input_order && self.impact_result_ordering - }; - let child_request_ordering = child.required_input_ordering(); - PlanWithSortRequirements { - plan: child, - impact_result_ordering: child_impact_result_ordering, - satisfy_single_distribution: child_satisfy_single_distribution, - required_ordering: from_parent, - adjusted_request_ordering: child_request_ordering, - } - }, - ) - .collect() - } -} - 
-impl TreeNodeRewritable for PlanWithSortRequirements { - fn map_children(self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - let children = self.children(); - if children.is_empty() { - Ok(self) - } else { - let new_children = children - .into_iter() - .map(transform) - .collect::>>()?; - - let children_plans = new_children - .iter() - .map(|elem| elem.plan.clone()) - .collect::>(); - let plan = with_new_children_if_necessary(self.plan, children_plans)?; - Ok(PlanWithSortRequirements { - plan, - impact_result_ordering: self.impact_result_ordering, - satisfy_single_distribution: self.satisfy_single_distribution, - required_ordering: self.required_ordering, - adjusted_request_ordering: self.adjusted_request_ordering, - }) - } - } -} - -impl PhysicalOptimizerRule for TopDownEnforceSorting { - fn optimize( - &self, - plan: Arc, - _config: &ConfigOptions, - ) -> Result> { - // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: - let plan_requirements = PlanWithSortRequirements::init(plan); - let adjusted = plan_requirements.transform_down(&ensure_sorting)?; - // Execute a Top-Down process(Preorder Traversal) to remove all the unnecessary Sort - let adjusted_plan = adjusted.plan.transform_down(&|plan| { - if let Some(sort_exec) = plan.as_any().downcast_ref::() { - if ordering_satisfy( - sort_exec.input().output_ordering(), - sort_exec.output_ordering(), - || sort_exec.input().equivalence_properties(), - ) { - Ok(Some(Arc::new(TombStoneExec::new( - sort_exec.input().clone(), - )))) - } else { - Ok(None) - } - } else { - Ok(None) - } - })?; - // Remove the TombStoneExec - let final_plan = adjusted_plan.transform_up(&|plan| { - if let Some(tombstone_exec) = plan.as_any().downcast_ref::() { - Ok(Some(tombstone_exec.input.clone())) - } else { - Ok(None) - } - })?; - Ok(final_plan) - } - - fn name(&self) -> &str { - "TopDownEnforceSorting" - } - - fn schema_check(&self) -> bool { - true - } -} - -fn ensure_sorting( - 
requirements: PlanWithSortRequirements, -) -> Result> { - if let Some(sort_exec) = requirements.plan.as_any().downcast_ref::() { - // Remove unnecessary SortExec(local/global) - if let Some(result) = analyze_immediate_sort_removal(&requirements, sort_exec) { - return Ok(Some(result)); - } - } else if let Some(sort_pres_exec) = requirements - .plan - .as_any() - .downcast_ref::() - { - // SortPreservingMergeExec + SortExec(local/global) is the same as the global SortExec - // Remove unnecessary SortPreservingMergeExec + SortExec(local/global) - if let Some(child_sort_exec) = - sort_pres_exec.input().as_any().downcast_ref::() - { - if sort_pres_exec.expr() == child_sort_exec.expr() { - if let Some(result) = - analyze_immediate_sort_removal(&requirements, child_sort_exec) - { - return Ok(Some(result)); - } - } - } else if !requirements.satisfy_single_distribution - || sort_pres_exec - .input() - .output_partitioning() - .partition_count() - <= 1 - { - if let Some(result) = - analyze_immediate_spm_removal(&requirements, sort_pres_exec) - { - return Ok(Some(result)); - } - } - } - let plan = &requirements.plan; - let parent_required = requirements.required_ordering.as_deref(); - if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { - plan.equivalence_properties() - }) { - // Can satisfy the parent requirements, change the adjusted_request_ordering for UnionExec and WindowAggExec(BoundedWindowAggExec) - if let Some(union_exec) = plan.as_any().downcast_ref::() { - // UnionExec does not have real sort requirements for its input. 
Here we change the adjusted_request_ordering to UnionExec's output ordering and - // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec - let adjusted = new_sort_requirements(union_exec.output_ordering()); - return Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: vec![ - adjusted; - requirements - .adjusted_request_ordering - .len() - ], - ..requirements - })); - } else if plan.as_any().downcast_ref::().is_some() - || plan - .as_any() - .downcast_ref::() - .is_some() - { - // WindowAggExec(BoundedWindowAggExec) might reverse their sort requirements - let request_child = requirements.adjusted_request_ordering[0].as_deref(); - let reversed_request_child = reverse_window_sort_requirements(request_child); - - if should_reverse_window_sort_requirements( - plan.clone(), - request_child, - reversed_request_child.as_deref(), - ) { - let WindowExecInfo { - window_expr, - input_schema, - partition_keys, - } = extract_window_info_from_plan(plan).unwrap(); - - let new_window_expr = window_expr - .iter() - .map(|e| e.get_reverse_expr()) - .collect::>>(); - let new_physical_ordering = create_sort_expr_from_requirement( - reversed_request_child.clone().unwrap().as_ref(), - ); - if let Some(window_expr) = new_window_expr { - let uses_bounded_memory = - window_expr.iter().all(|e| e.uses_bounded_memory()); - // If all window expressions can run with bounded memory, choose the - // bounded window variant: - let new_plan = if uses_bounded_memory { - Arc::new(BoundedWindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) as Arc - } else { - Arc::new(WindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) 
as Arc - }; - return Ok(Some(PlanWithSortRequirements { - plan: new_plan, - impact_result_ordering: false, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: vec![reversed_request_child], - })); - } - } - } - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - ..requirements - })) - } else if let Some(sort_exec) = plan.as_any().downcast_ref::() { - // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements - let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); - let mut new_plan = sort_exec.input.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )) - } else { - // Can not satisfy the parent requirements, check whether the requirements can be pushed down. If not, add new SortExec. - let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); - let maintains_input_order = plan.maintains_input_order(); - // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements. - // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. 
- // For UnionExec, we can always push down - if (maintains_input_order.is_empty() - || !maintains_input_order.iter().any(|o| *o) - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some()) - && plan.as_any().downcast_ref::().is_none() - { - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )) - } else if plan.as_any().downcast_ref::().is_some() - || plan - .as_any() - .downcast_ref::() - .is_some() - { - let request_child = requirements.adjusted_request_ordering[0].as_deref(); - let child_plan = plan.children()[0].clone(); - match determine_children_requirement( - parent_required, - request_child, - child_plan, - ) { - RequirementsCompatibility::Satisfy => Ok(None), - RequirementsCompatibility::Compatible(adjusted) => { - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: vec![adjusted], - ..requirements - })) - } - RequirementsCompatibility::NonCompatible => { - let WindowExecInfo { - window_expr, - input_schema, - partition_keys, - } = extract_window_info_from_plan(plan).unwrap(); - if should_reverse_window_exec( - parent_required, - request_child, - &input_schema, - ) { - let new_physical_ordering = parent_required_expr.to_vec(); - let new_window_expr = window_expr - .iter() - .map(|e| e.get_reverse_expr()) - .collect::>>(); - if let Some(window_expr) = new_window_expr { - let uses_bounded_memory = - window_expr.iter().all(|e| e.uses_bounded_memory()); - let new_plan = if uses_bounded_memory { - Arc::new(BoundedWindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) 
- as Arc - } else { - Arc::new(WindowAggExec::try_new( - window_expr, - plan.children()[0].clone(), - input_schema, - partition_keys, - Some(new_physical_ordering), - )?) - as Arc - }; - let adjusted_request_ordering = - new_plan.required_input_ordering(); - return Ok(Some(PlanWithSortRequirements { - plan: new_plan, - impact_result_ordering: false, - satisfy_single_distribution: requirements - .satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering, - })); - } - } - // Can not push down requirements, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } - } - } else if let Some(smj) = plan.as_any().downcast_ref::() { - // If the current plan is SortMergeJoinExec - let left_columns_len = smj.left.schema().fields().len(); - let expr_source_side = - expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); - match expr_source_side { - Some(JoinSide::Left) if maintains_input_order[0] => { - try_pushdown_requirements_to_join( - &requirements, - parent_required, - parent_required_expr, - JoinSide::Left, - ) - } - Some(JoinSide::Right) if maintains_input_order[1] => { - let new_right_required = match smj.join_type { - JoinType::Inner | JoinType::Right => shift_right_required( - parent_required.unwrap(), - left_columns_len, - )?, - JoinType::RightSemi | JoinType::RightAnti => { - parent_required.unwrap().to_vec() - } - _ => Err(DataFusionError::Plan( - "Unexpected SortMergeJoin type here".to_string(), - ))?, - }; - try_pushdown_requirements_to_join( - &requirements, - Some(new_right_required.deref()), - parent_required_expr, - JoinSide::Right, - ) - } - _ => { - // Can not decide the expr side for SortMergeJoinExec, can not push down, add SortExec; - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - 
PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } - } - } else if plan.required_input_ordering().iter().any(Option::is_some) { - let plan_children = plan.children(); - let compatible_with_children = izip!( - maintains_input_order.iter(), - plan.required_input_ordering().into_iter(), - plan_children.iter() - ) - .map(|(can_push_down, request_child, child)| { - if *can_push_down { - determine_children_requirement( - parent_required, - request_child.as_deref(), - child.clone(), - ) - } else { - RequirementsCompatibility::NonCompatible - } - }) - .collect::>(); - if compatible_with_children - .iter() - .all(|a| matches!(a, RequirementsCompatibility::Satisfy)) - { - // Requirements are satisfied, not need to push down. - Ok(None) - } else if compatible_with_children - .iter() - .all(|a| matches!(a, RequirementsCompatibility::Compatible(_))) - { - // Adjust child requirements and push down the requirements - let adjusted = parent_required.map(|r| r.to_vec()); - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: vec![adjusted; plan_children.len()], - ..requirements - })) - } else { - // Can not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } - } else { - // The current plan does not have its own ordering requirements to its children, consider push down the requirements - if let Some(ProjectionExec { expr, .. 
}) = - plan.as_any().downcast_ref::() - { - // For Projection, we need to transform the requirements to the columns before the Projection - // And then to push down the requirements - let new_adjusted = - map_requirement_before_projection(parent_required, expr); - if new_adjusted.is_some() { - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: vec![new_adjusted], - ..requirements - })) - } else { - // Can not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering( - new_plan, - ), - )) - } - } else { - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: vec![ - requirements.required_ordering; - requirements - .adjusted_request_ordering - .len() - ], - ..requirements - })) - } - } - } -} - -/// Analyzes a given `Sort` (`plan`) to determine whether the Sort can be removed: -/// 1) The input already has a finer ordering than this `Sort` enforces. -/// 2) The `Sort` does not impact the final result ordering. 
-fn analyze_immediate_sort_removal( - requirements: &PlanWithSortRequirements, - sort_exec: &SortExec, -) -> Option { - if ordering_satisfy( - sort_exec.input().output_ordering(), - sort_exec.output_ordering(), - || sort_exec.input().equivalence_properties(), - ) { - Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), - impact_result_ordering: requirements.impact_result_ordering, - satisfy_single_distribution: requirements.satisfy_single_distribution, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering.clone()], - }) - } - // Remove unnecessary SortExec - else if !requirements.impact_result_ordering { - if requirements.satisfy_single_distribution - && !sort_exec.preserve_partitioning() - && sort_exec.input().output_partitioning().partition_count() > 1 - { - Some(PlanWithSortRequirements { - plan: Arc::new(CoalescePartitionsExec::new(sort_exec.input().clone())), - impact_result_ordering: false, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering.clone()], - }) - } else { - Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(sort_exec.input().clone())), - impact_result_ordering: false, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering.clone()], - }) - } - } else { - None - } -} - -/// Analyzes a given `SortPreservingMergeExec` (`plan`) to determine whether the SortPreservingMergeExec can be removed: -/// 1) The input already has a finer ordering than this `SortPreservingMergeExec` enforces. -/// 2) The `SortPreservingMergeExec` does not impact the final result ordering. 
-fn analyze_immediate_spm_removal( - requirements: &PlanWithSortRequirements, - spm_exec: &SortPreservingMergeExec, -) -> Option { - if ordering_satisfy( - spm_exec.input().output_ordering(), - Some(spm_exec.expr()), - || spm_exec.input().equivalence_properties(), - ) && spm_exec.input().output_partitioning().partition_count() <= 1 - { - Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(spm_exec.input().clone())), - impact_result_ordering: true, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering.clone()], - }) - } - // Remove unnecessary SortPreservingMergeExec only - else if !requirements.impact_result_ordering { - Some(PlanWithSortRequirements { - plan: Arc::new(TombStoneExec::new(spm_exec.input().clone())), - impact_result_ordering: false, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: vec![requirements.required_ordering.clone()], - }) - } else { - None - } -} - -/// Determine the children requirements -/// If the children requirements are more specific, do not push down the parent requirements -/// If the the parent requirements are more specific, push down the parent requirements -/// If they are not compatible, need to add Sort. 
-fn determine_children_requirement( - parent_required: Option<&[PhysicalSortRequirements]>, - request_child: Option<&[PhysicalSortRequirements]>, - child_plan: Arc, -) -> RequirementsCompatibility { - if requirements_compatible(request_child, parent_required, || { - child_plan.equivalence_properties() - }) { - // request child requirements are more specific, no need to push down the parent requirements - RequirementsCompatibility::Satisfy - } else if requirements_compatible(parent_required, request_child, || { - child_plan.equivalence_properties() - }) { - // parent requirements are more specific, adjust the request child requirements and push down the new requirements - let adjusted = parent_required.map(|r| r.to_vec()); - RequirementsCompatibility::Compatible(adjusted) - } else { - RequirementsCompatibility::NonCompatible - } -} - -/// Compares window expression's `window_request` and `parent_required_expr` ordering, returns -/// whether we should reverse the window expression's ordering in order to meet parent's requirements. -fn check_alignment( - input_schema: &SchemaRef, - window_request: &PhysicalSortRequirements, - parent_required_expr: &PhysicalSortRequirements, -) -> bool { - if parent_required_expr.expr.eq(&window_request.expr) - && window_request.sort_options.is_some() - && parent_required_expr.sort_options.is_some() - { - let nullable = parent_required_expr.expr.nullable(input_schema).unwrap(); - let window_request_opts = window_request.sort_options.unwrap(); - let parent_required_opts = parent_required_expr.sort_options.unwrap(); - if nullable { - window_request_opts == reverse_sort_options(parent_required_opts) - } else { - // If the column is not nullable, NULLS FIRST/LAST is not important. 
- window_request_opts.descending != parent_required_opts.descending - } - } else { - false - } -} - -fn reverse_window_sort_requirements( - request_child: Option<&[PhysicalSortRequirements]>, -) -> Option> { - request_child.map(|request| { - request - .iter() - .map(|req| match req.sort_options { - None => req.clone(), - Some(ops) => PhysicalSortRequirements { - expr: req.expr.clone(), - sort_options: Some(reverse_sort_options(ops)), - }, - }) - .collect::>() - }) -} - -/// Whether to reverse the top WindowExec's sort requirements. -/// Considering the requirements of the descendants WindowExecs and leaf nodes' output ordering. -/// TODO!considering all the cases -fn should_reverse_window_sort_requirements( - window_plan: Arc, - top_requirement: Option<&[PhysicalSortRequirements]>, - top_reversed_requirement: Option<&[PhysicalSortRequirements]>, -) -> bool { - if top_requirement.is_none() { - return false; - } - let WindowExecInfo { window_expr, .. } = - extract_window_info_from_plan(&window_plan).unwrap(); - let reverse_window_expr = window_expr - .iter() - .map(|e| e.get_reverse_expr()) - .collect::>>(); - if reverse_window_expr.is_none() { - return false; - } - let flags = window_plan - .children() - .into_iter() - .map(|child| { - // If the child is leaf node, check the output ordering - if child.children().is_empty() - && ordering_satisfy_requirement( - child.output_ordering(), - top_requirement, - || child.equivalence_properties(), - ) - { - false - } else if child.children().is_empty() - && ordering_satisfy_requirement( - child.output_ordering(), - top_reversed_requirement, - || child.equivalence_properties(), - ) - { - true - } else if child.as_any().downcast_ref::().is_some() - || child - .as_any() - .downcast_ref::() - .is_some() - { - // If the child is WindowExec, check the child requirements - if requirements_compatible( - top_requirement, - child.required_input_ordering()[0].as_deref(), - || child.equivalence_properties(), - ) || 
requirements_compatible( - child.required_input_ordering()[0].as_deref(), - top_requirement, - || child.equivalence_properties(), - ) || requirements_compatible( - top_reversed_requirement, - child.required_input_ordering()[0].as_deref(), - || child.equivalence_properties(), - ) || requirements_compatible( - child.required_input_ordering()[0].as_deref(), - top_reversed_requirement, - || child.equivalence_properties(), - ) { - should_reverse_window_sort_requirements( - child, - top_requirement, - top_reversed_requirement, - ) - } else { - requirements_compatible( - top_reversed_requirement, - window_plan.required_input_ordering()[0].as_deref(), - || window_plan.equivalence_properties(), - ) || requirements_compatible( - window_plan.required_input_ordering()[0].as_deref(), - top_reversed_requirement, - || window_plan.equivalence_properties(), - ) - } - } else { - requirements_compatible( - top_reversed_requirement, - window_plan.required_input_ordering()[0].as_deref(), - || window_plan.equivalence_properties(), - ) || requirements_compatible( - window_plan.required_input_ordering()[0].as_deref(), - top_reversed_requirement, - || window_plan.equivalence_properties(), - ) - } - }) - .collect::>(); - - flags.iter().all(|o| *o) -} - -fn should_reverse_window_exec( - required: Option<&[PhysicalSortRequirements]>, - request_ordering: Option<&[PhysicalSortRequirements]>, - input_schema: &SchemaRef, -) -> bool { - match (required, request_ordering) { - (_, None) => false, - (None, Some(_)) => false, - (Some(required), Some(request_ordering)) => { - if required.len() > request_ordering.len() { - return false; - } - let alignment_flags = required - .iter() - .zip(request_ordering.iter()) - .filter_map(|(required_expr, request_expr)| { - // Only check the alignment of non-partition columns - if request_expr.sort_options.is_some() - && required_expr.sort_options.is_some() - { - Some(check_alignment(input_schema, request_expr, required_expr)) - } else if 
request_expr.expr.eq(&required_expr.expr) { - None - } else { - Some(false) - } - }) - .collect::>(); - if alignment_flags.is_empty() { - false - } else { - alignment_flags.iter().all(|o| *o) - } - } - } -} - -fn extract_window_info_from_plan( - plan: &Arc, -) -> Option { - if let Some(exec) = plan.as_any().downcast_ref::() { - Some(WindowExecInfo { - window_expr: exec.window_expr().to_vec(), - input_schema: exec.input_schema(), - partition_keys: exec.partition_keys.clone(), - }) - } else { - plan.as_any() - .downcast_ref::() - .map(|exec| WindowExecInfo { - window_expr: exec.window_expr().to_vec(), - input_schema: exec.input_schema(), - partition_keys: exec.partition_keys.clone(), - }) - } -} - -fn try_pushdown_requirements_to_join( - requirements: &PlanWithSortRequirements, - parent_required: Option<&[PhysicalSortRequirements]>, - sort_expr: Vec, - push_side: JoinSide, -) -> Result> { - let child_idx = match push_side { - JoinSide::Left => 0, - JoinSide::Right => 1, - }; - let request_child = requirements.adjusted_request_ordering[child_idx].as_deref(); - let child_plan = requirements.plan.children()[child_idx].clone(); - match determine_children_requirement(parent_required, request_child, child_plan) { - RequirementsCompatibility::Satisfy => Ok(None), - RequirementsCompatibility::Compatible(adjusted) => { - let new_adjusted = match push_side { - JoinSide::Left => { - vec![adjusted, requirements.adjusted_request_ordering[1].clone()] - } - JoinSide::Right => { - vec![requirements.adjusted_request_ordering[0].clone(), adjusted] - } - }; - Ok(Some(PlanWithSortRequirements { - required_ordering: None, - adjusted_request_ordering: new_adjusted, - ..requirements.clone() - })) - } - RequirementsCompatibility::NonCompatible => { - // Can not push down, add new SortExec - let mut new_plan = requirements.plan.clone(); - add_sort_above(&mut new_plan, sort_expr)?; - Ok(Some( - PlanWithSortRequirements::new_without_impact_result_ordering(new_plan), - )) - } - } -} - -fn 
expr_source_sides( - required_exprs: &[PhysicalSortExpr], - join_type: JoinType, - left_columns_len: usize, -) -> Option { - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - let all_column_sides = required_exprs - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() < left_columns_len { - Some(JoinSide::Left) - } else { - Some(JoinSide::Right) - } - } else { - None - } - }) - .collect::>(); - - // If the exprs are all coming from one side, the requirements can be pushed down - if all_column_sides.len() != required_exprs.len() { - None - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Left)) - { - Some(JoinSide::Left) - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Right)) - { - Some(JoinSide::Right) - } else { - None - } - } - JoinType::LeftSemi | JoinType::LeftAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Left) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Left) - } - } - JoinType::RightSemi | JoinType::RightAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Right) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Right) - } - } - } -} - -fn shift_right_required( - parent_required: &[PhysicalSortRequirements], - left_columns_len: usize, -) -> Result> { - let new_right_required: Vec = parent_required - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() >= left_columns_len { - Some(PhysicalSortRequirements { - expr: Arc::new(Column::new( - col.name(), - col.index() - left_columns_len, - )) as Arc, - sort_options: r.sort_options, - }) - } else { - None - } - } else { - None - } - }) - .collect::>(); - if 
new_right_required.len() != parent_required.len() { - Err(DataFusionError::Plan( - "Expect to shift all the parent required column indexes for SortMergeJoin" - .to_string(), - )) - } else { - Ok(new_right_required) - } -} - -/// This structure stores extra Window information required to create a new WindowExec -#[derive(Debug)] -pub struct WindowExecInfo { - window_expr: Vec>, - input_schema: SchemaRef, - partition_keys: Vec>, -} - -/// Define the Requirements Compatibility -#[derive(Debug)] -pub enum RequirementsCompatibility { - /// Requirements satisfy - Satisfy, - /// Requirements compatible - Compatible(Option>), - /// Requirements not compatible - NonCompatible, -} - -/// A TombStoneExec execution plan generated during optimization process, should be removed finally -#[derive(Debug)] -struct TombStoneExec { - /// The input plan - pub input: Arc, -} - -impl TombStoneExec { - pub fn new(input: Arc) -> Self { - Self { input } - } -} - -impl ExecutionPlan for TombStoneExec { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input.schema() - } - - fn output_partitioning(&self) -> Partitioning { - self.input.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - self.input.output_ordering() - } - - fn maintains_input_order(&self) -> Vec { - vec![true] - } - - fn equivalence_properties(&self) -> EquivalenceProperties { - self.input.equivalence_properties() - } - - fn children(&self) -> Vec> { - vec![self.input.clone()] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - Ok(Arc::new(TombStoneExec::new(children[0].clone()))) - } - - fn execute( - &self, - _partition: usize, - _context: Arc, - ) -> Result { - Err(DataFusionError::Internal( - "TombStoneExec, invalid plan".to_string(), - )) - } - - fn fmt_as( - &self, - t: DisplayFormatType, - f: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - match t { - DisplayFormatType::Default => { - write!(f, "TombStoneExec") 
- } - } - } - - fn statistics(&self) -> Statistics { - Statistics::default() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::datasource::listing::PartitionedFile; - use crate::datasource::object_store::ObjectStoreUrl; - use crate::physical_plan::aggregates::PhysicalGroupBy; - use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; - use crate::physical_plan::displayable; - use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; - use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::joins::utils::JoinOn; - use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::repartition::RepartitionExec; - use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; - use crate::physical_plan::union::UnionExec; - use crate::physical_plan::windows::create_window_expr; - use crate::prelude::SessionContext; - use arrow::compute::SortOptions; - use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; - use datafusion_common::{Result, Statistics}; - use datafusion_expr::logical_plan::JoinType; - use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction}; - use datafusion_physical_expr::expressions::{col, NotExpr}; - use datafusion_physical_expr::PhysicalSortExpr; - use std::ops::Deref; - use std::sync::Arc; - - fn create_test_schema() -> Result { - let nullable_column = Field::new("nullable_col", DataType::Int32, true); - let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false); - let schema = Arc::new(Schema::new(vec![nullable_column, non_nullable_column])); - - Ok(schema) - } - - fn create_test_schema2() -> Result { - let col_a = Field::new("col_a", DataType::Int32, true); - let col_b = Field::new("col_b", DataType::Int32, true); - let schema = Arc::new(Schema::new(vec![col_a, col_b])); - Ok(schema) - } - - // Util function to get string representation of a physical plan - fn get_plan_string(plan: &Arc) -> Vec { - let formatted = 
displayable(plan.as_ref()).indent().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - actual.iter().map(|elem| elem.to_string()).collect() - } - - #[tokio::test] - async fn test_is_column_aligned_nullable() -> Result<()> { - let schema = create_test_schema()?; - let params = vec![ - ((true, true), (false, false), true), - ((true, true), (false, true), false), - ((true, true), (true, false), false), - ((true, false), (false, true), true), - ((true, false), (false, false), false), - ((true, false), (true, true), false), - ]; - for ( - (physical_desc, physical_nulls_first), - (req_desc, req_nulls_first), - reverse_expected, - ) in params - { - let physical_ordering = PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: physical_desc, - nulls_first: physical_nulls_first, - }), - }; - let required_ordering = PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: req_desc, - nulls_first: req_nulls_first, - }), - }; - let reverse = - check_alignment(&schema, &physical_ordering, &required_ordering); - assert_eq!(reverse, reverse_expected); - } - - Ok(()) - } - - #[tokio::test] - async fn test_is_column_aligned_non_nullable() -> Result<()> { - let schema = create_test_schema()?; - - let params = vec![ - ((true, true), (false, false), true), - ((true, true), (false, true), true), - ((true, true), (true, false), false), - ((true, false), (false, true), true), - ((true, false), (false, false), true), - ((true, false), (true, true), false), - ]; - for ( - (physical_desc, physical_nulls_first), - (req_desc, req_nulls_first), - reverse_expected, - ) in params - { - let physical_ordering = PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: physical_desc, - nulls_first: physical_nulls_first, - }), - }; - let required_ordering = PhysicalSortRequirements { - expr: 
col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: req_desc, - nulls_first: req_nulls_first, - }), - }; - let reverse = - check_alignment(&schema, &physical_ordering, &required_ordering); - assert_eq!(reverse, reverse_expected); - } - - Ok(()) - } - - #[tokio::test] - async fn test_should_reverse_window() -> Result<()> { - let schema = create_test_schema()?; - - // partition by nullable_col order by non_nullable_col - let window_request_ordering1 = vec![ - PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: None, - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - let required_ordering1 = vec![ - PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: None, - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: false, - nulls_first: false, - }), - }, - ]; - - let reverse = should_reverse_window_exec( - Some(required_ordering1.deref()), - Some(window_request_ordering1.deref()), - &schema, - ); - assert!(reverse); - - // order by nullable_col, non_nullable_col - let window_request_ordering2 = vec![ - PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - - let required_ordering2 = vec![ - PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: None, - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: false, - nulls_first: false, - }), - }, - ]; - - let reverse = should_reverse_window_exec( - Some(required_ordering2.deref()), - 
Some(window_request_ordering2.deref()), - &schema, - ); - assert!(reverse); - - // wrong partition columns - let window_request_ordering3 = vec![ - PhysicalSortRequirements { - expr: col("nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - - let required_ordering3 = vec![ - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: None, - }, - PhysicalSortRequirements { - expr: col("non_nullable_col", &schema)?, - sort_options: Some(SortOptions { - descending: false, - nulls_first: false, - }), - }, - ]; - - let reverse = should_reverse_window_exec( - Some(required_ordering3.deref()), - Some(window_request_ordering3.deref()), - &schema, - ); - assert!(!reverse); - - Ok(()) - } - - /// Runs the sort enforcement optimizer and asserts the plan - /// against the original and expected plans - /// - /// `$EXPECTED_PLAN_LINES`: input plan - /// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan - /// `$PLAN`: the plan to optimized - /// - macro_rules! 
assert_optimized { - ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr) => { - let session_ctx = SessionContext::new(); - let state = session_ctx.state(); - - let physical_plan = $PLAN; - let formatted = displayable(physical_plan.as_ref()).indent().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - - let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - - let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES - .iter().map(|s| *s).collect(); - - // Run the actual optimizer - let optimized_physical_plan = - TopDownEnforceSorting::new().optimize(physical_plan, state.config_options())?; - // Get string representation of the plan - let actual = get_plan_string(&optimized_physical_plan); - assert_eq!( - expected_optimized_lines, actual, - "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - - }; - } - - #[tokio::test] - async fn test_remove_unnecessary_sort() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_not_remove_top_sort_window_multilayer() -> Result<()> { - let schema = create_test_schema()?; - 
let source = memory_exec(&schema); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &source.schema(), - SortOptions { - descending: true, - nulls_first: true, - }, - )]; - let sort = sort_exec(sort_exprs.clone(), source); - - let window_agg = window_exec("non_nullable_col", sort_exprs, sort); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &window_agg.schema(), - SortOptions { - descending: false, - nulls_first: false, - }, - )]; - - let sort = sort_exec(sort_exprs.clone(), window_agg); - - // Add dummy layer propagating Sort above, the top Sort should not be removed - let filter = filter_exec( - Arc::new(NotExpr::new( - col("non_nullable_col", schema.as_ref()).unwrap(), - )), - sort, - ); - - // let filter_exec = sort_exec; - let physical_plan = window_exec("non_nullable_col", sort_exprs, filter); - - let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " FilterExec: NOT non_nullable_col@1", - " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " FilterExec: NOT non_nullable_col@1", - " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", - " WindowAggExec: wdw=[count: 
Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_multiple_sort_window_exec() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - - let sort1 = sort_exec(sort_exprs1.clone(), source); - let window_agg1 = window_exec("non_nullable_col", sort_exprs1.clone(), sort1); - let window_agg2 = window_exec("non_nullable_col", sort_exprs2, window_agg1); - // let filter_exec = sort_exec; - let physical_plan = window_exec("non_nullable_col", sort_exprs1, window_agg2); - - let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { 
name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_add_required_sort() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - - let physical_plan = sort_preserving_merge_exec(sort_exprs, source); - - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort1() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = 
sort_exec(sort_exprs.clone(), spm); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort2() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("non_nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); - - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort2 = sort_exec(sort_exprs.clone(), spm); - let spm2 = sort_preserving_merge_exec(sort_exprs, sort2); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort3 = sort_exec(sort_exprs, spm2); - let physical_plan = repartition_exec(repartition_exec(sort3)); - - let expected_input = vec![ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ 
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort3() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("non_nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); - - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let repartition_exec = repartition_exec(spm); - let sort2 = sort_exec(sort_exprs.clone(), repartition_exec); - let spm2 = sort_preserving_merge_exec(sort_exprs, sort2); - - let physical_plan = aggregate_exec(spm2); - - // When removing a `SortPreservingMergeExec`, make sure that partitioning - // requirements are not violated. In some cases, we may need to replace - // it with a `CoalescePartitionsExec` instead of directly removing it. 
- let expected_input = vec![ - "AggregateExec: mode=Final, gby=[], aggr=[]", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ - "AggregateExec: mode=Final, gby=[], aggr=[]", - " CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort4() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), spm); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort5() -> Result<()> { - let schema = 
create_test_schema()?; - let source = memory_exec(&schema); - - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let input2 = sort_exec( - vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ], - input, - ); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - // Keep the middle SortExec - let expected_optimized = [ - "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_spm1() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_preserving_merge_exec( - vec![sort_expr("non_nullable_col", &schema)], - source, - ); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_spm2() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_preserving_merge_exec( - vec![sort_expr("non_nullable_col", &schema)], - source, - ); - let input2 = sort_preserving_merge_exec( - 
vec![sort_expr("non_nullable_col", &schema)], - input, - ); - let physical_plan = - sort_preserving_merge_exec(vec![sort_expr("nullable_col", &schema)], input2); - - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_do_not_remove_sort_with_limit() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort = sort_exec(sort_exprs.clone(), source1); - let limit = limit_exec(sort); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); - - let union = union_exec(vec![source2, limit]); - let repartition = repartition_exec(union); - let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition); - - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - - // We should keep the bottom `SortExec`. 
- let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_change_wrong_sorting() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort = sort_exec(vec![sort_exprs[0].clone()], source); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_change_wrong_sorting2() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let spm1 = 
sort_preserving_merge_exec(sort_exprs.clone(), source); - let sort2 = sort_exec(vec![sort_exprs[0].clone()], spm1); - let physical_plan = - sort_preserving_merge_exec(vec![sort_exprs[1].clone()], sort2); - - let expected_input = vec![ - "SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_sorted() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source1); - - let source2 = parquet_exec_sorted(&schema, sort_exprs.clone()); - - let union = union_exec(vec![source2, sort]); - let physical_plan = sort_preserving_merge_exec(sort_exprs, union); - - // one input to the union is already sorted, one is not. 
- let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - // should not add a sort at the output of the union, input plan should not be changed - let expected_optimized = expected_input.clone(); - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source1); - - let parquet_sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); - - let union = union_exec(vec![source2, sort]); - let physical_plan = sort_preserving_merge_exec(sort_exprs, union); - - // one input to the union is already sorted, one is not. 
- let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - // should not add a sort at the output of the union, input plan should not be changed - let expected_optimized = expected_input.clone(); - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted2() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort = sort_exec(sort_exprs.clone(), source1); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs); - - let union = union_exec(vec![source2, sort]); - let physical_plan = sort_preserving_merge_exec(sort_exprs, union); - - // Input is an invalid plan. In this case rule should add required sorting in appropriate places. - // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy required ordering - // of SortPreservingMergeExec. 
- let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted3() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - let sort2 = sort_exec(sort_exprs2, source1); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); - - let union = union_exec(vec![sort1, source2, sort2]); - let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); - - // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above). 
- // Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above). - // Third input to the union is not Sorted (SortExec is matches required ordering by the SortPreservingMergeExec above). - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - // should adjust sorting in the first input of the union such that it is not unnecessarily fine - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted4() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs2.clone(), source1.clone()); - let sort2 = 
sort_exec(sort_exprs2.clone(), source1); - - let source2 = parquet_exec_sorted(&schema, sort_exprs2); - - let union = union_exec(vec![sort1, source2, sort2]); - let physical_plan = sort_preserving_merge_exec(sort_exprs1, union); - - // Ordering requirement of the `SortPreservingMergeExec` is not met. - // Should modify the plan to ensure that all three inputs to the - // `UnionExec` satisfy the ordering, OR add a single sort after - // the `UnionExec` (both of which are equally good for this example). - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted5() -> Result<()> { - let schema = 
create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - ), - ]; - let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort2 = sort_exec(sort_exprs2, source1); - - let union = union_exec(vec![sort1, sort2]); - let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - - // The `UnionExec` doesn't preserve any of the inputs ordering in the - // example below. However, we should be able to change the unnecessarily - // fine `SortExec`s below with required `SortExec`s that are absolutely necessary. - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted6() -> Result<()> { - let schema = create_test_schema()?; - - let 
source1 = parquet_exec(&schema); - let sort_exprs1 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let repartition = repartition_exec(source1); - let spm = sort_preserving_merge_exec(sort_exprs2, repartition); - - let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone()); - - let union = union_exec(vec![sort1, source2, spm]); - let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union); - - // The plan is not valid as it is -- the input ordering requirement - // of the `SortPreservingMergeExec` under the third child of the - // `UnionExec` is not met. We should add a `SortExec` below it. - // At the same time, this ordering requirement is unnecessarily fine. - // The final plan should be valid AND the ordering of the third child - // shouldn't be finer than necessary. - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - // Should adjust the requirement in the third input of the union so - // that it is not unnecessarily fine. 
- let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted7() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1.clone(), source1.clone()); - let sort2 = sort_exec(sort_exprs1, source1); - - let union = union_exec(vec![sort1, sort2]); - let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - - // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_input, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn 
test_union_inputs_different_sorted8() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![ - sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - ), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - ), - ]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - let sort2 = sort_exec(sort_exprs2, source1); - - let physical_plan = union_exec(vec![sort1, sort2]); - - // The `UnionExec` doesn't preserve any of the inputs ordering in the - // example below. - let expected_input = vec![ - "UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_union_inputs_different_sorted_with_limit() -> Result<()> { - let schema = create_test_schema()?; - - let source1 = parquet_exec(&schema); - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![ - sort_expr("nullable_col", &schema), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: true, - 
nulls_first: false, - }, - ), - ]; - let sort_exprs3 = vec![sort_expr("nullable_col", &schema)]; - let sort1 = sort_exec(sort_exprs1, source1.clone()); - - let sort2 = sort_exec(sort_exprs2, source1); - let limit = local_limit_exec(sort2); - let limit = global_limit_exec(limit); - - let union = union_exec(vec![sort1, limit]); - let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - - // Should not change the unnecessarily fine `SortExec`s because there is `LimitExec` - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_window_multi_path_sort() -> Result<()> { - let schema = create_test_schema()?; - - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - // reverse sorting of sort_exprs2 - let 
reversed_sort_exprs2 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; - let source1 = parquet_exec_sorted(&schema, sort_exprs1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2); - let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); - let sort2 = sort_exec(reversed_sort_exprs2.clone(), source2); - - let union = union_exec(vec![sort1, sort2]); - let physical_plan = window_exec("nullable_col", reversed_sort_exprs2, union); - - // The `WindowAggExec` gets its sorting from multiple children jointly. - // The SortExecs should be kept to ensure the final result ordering - let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_input, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_window_multi_path_sort2() -> Result<()> { - let schema = create_test_schema()?; - - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - // reverse sorting of sort_exprs2 - let reversed_sort_exprs2 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; - let source1 = parquet_exec_sorted(&schema, 
sort_exprs1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); - let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); - let sort2 = sort_exec(reversed_sort_exprs2, source2); - - let union = union_exec(vec![sort1, sort2]); - let physical_plan = window_exec("nullable_col", sort_exprs2, union); - - // The `WindowAggExec` can get its required sorting from the leaf nodes directly. - // The unnecessary SortExecs should be removed - let expected_input = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_sort_merge_join_order_by_left() -> Result<()> { - let left_schema = create_test_schema()?; - let 
right_schema = create_test_schema2()?; - - let left = parquet_exec(&left_schema); - let right = parquet_exec(&right_schema); - - // Join on (nullable_col == col_a) - let join_on = vec![( - Column::new_with_schema("nullable_col", &left.schema()).unwrap(), - Column::new_with_schema("col_a", &right.schema()).unwrap(), - )]; - - let join_types = vec![ - JoinType::Inner, - JoinType::Left, - JoinType::Right, - JoinType::Full, - JoinType::LeftSemi, - JoinType::LeftAnti, - ]; - for join_type in join_types { - let join = - sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let sort_exprs = vec![ - sort_expr("nullable_col", &join.schema()), - sort_expr("non_nullable_col", &join.schema()), - ]; - let physical_plan = sort_preserving_merge_exec(sort_exprs.clone(), join); - - let join_plan = - format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); - let join_plan2 = - format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); - - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - join_plan.as_str(), - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - let expected_optimized = match join_type { - JoinType::Inner - | JoinType::Left - | JoinType::LeftSemi - | JoinType::LeftAnti => { - // can push down the sort requirements and save 1 SortExec - vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - join_plan.as_str(), - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", - " ParquetExec: 
limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ] - } - _ => { - // can not push down the sort requirements - vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - join_plan2.as_str(), - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ] - } - }; - assert_optimized!(expected_input, expected_optimized, physical_plan); - } - Ok(()) - } - - #[tokio::test] - async fn test_sort_merge_join_order_by_right() -> Result<()> { - let left_schema = create_test_schema()?; - let right_schema = create_test_schema2()?; - - let left = parquet_exec(&left_schema); - let right = parquet_exec(&right_schema); - - // Join on (nullable_col == col_a) - let join_on = vec![( - Column::new_with_schema("nullable_col", &left.schema()).unwrap(), - Column::new_with_schema("col_a", &right.schema()).unwrap(), - )]; - - let join_types = vec![ - JoinType::Inner, - JoinType::Left, - JoinType::Right, - JoinType::Full, - JoinType::RightAnti, - ]; - for join_type in join_types { - let join = - sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let sort_exprs = vec![ - sort_expr("col_a", &join.schema()), - sort_expr("col_b", &join.schema()), - ]; - let physical_plan = sort_preserving_merge_exec(sort_exprs, join); - - let join_plan = - format!(" SortMergeJoin: join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); - let spm_plan = match join_type { - JoinType::RightAnti => { - "SortPreservingMergeExec: [col_a@0 ASC,col_b@1 ASC]" - } - _ => "SortPreservingMergeExec: [col_a@2 ASC,col_b@3 ASC]", - }; - let join_plan2 = - format!(" SortMergeJoin: 
join_type={join_type}, on=[(Column {{ name: \"nullable_col\", index: 0 }}, Column {{ name: \"col_a\", index: 0 }})]"); - - let expected_input = vec![ - spm_plan, - join_plan.as_str(), - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - let expected_optimized = match join_type { - JoinType::Inner | JoinType::Right | JoinType::RightAnti => { - // can push down the sort requirements and save 1 SortExec - vec![ - spm_plan, - join_plan.as_str(), - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC,col_b@1 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ] - } - _ => { - // can not push down the sort requirements for Left and Full join. - vec![ - spm_plan, - " SortExec: expr=[col_a@2 ASC,col_b@3 ASC], global=true", - join_plan2.as_str(), - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ] - } - }; - assert_optimized!(expected_input, expected_optimized, physical_plan); - } - Ok(()) - } - - #[tokio::test] - async fn test_sort_merge_join_complex_order_by() -> Result<()> { - let left_schema = create_test_schema()?; - let right_schema = create_test_schema2()?; - - let left = parquet_exec(&left_schema); - let right = parquet_exec(&right_schema); - - // Join on (nullable_col == col_a) - let join_on = vec![( - Column::new_with_schema("nullable_col", &left.schema()).unwrap(), - Column::new_with_schema("col_a", &right.schema()).unwrap(), - )]; - - let join = sort_merge_join_exec(left, right, &join_on, 
&JoinType::Inner); - - // order by (col_b, col_a) - let sort_exprs1 = vec![ - sort_expr("col_b", &join.schema()), - sort_expr("col_a", &join.schema()), - ]; - let physical_plan = sort_preserving_merge_exec(sort_exprs1, join.clone()); - - let expected_input = vec![ - "SortPreservingMergeExec: [col_b@3 ASC,col_a@2 ASC]", - " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - - // can not push down the sort requirements, need to add SortExec - let expected_optimized = vec![ - "SortPreservingMergeExec: [col_b@3 ASC,col_a@2 ASC]", - " SortExec: expr=[col_b@3 ASC,col_a@2 ASC], global=true", - " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - - // order by (nullable_col, col_b, col_a) - let sort_exprs2 = vec![ - sort_expr("nullable_col", &join.schema()), - sort_expr("col_b", &join.schema()), - sort_expr("col_a", &join.schema()), - ]; - let physical_plan = sort_preserving_merge_exec(sort_exprs2, join); - - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", - " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, 
partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - - // can not push down the sort requirements, need to add SortExec - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", - " SortExec: expr=[nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC], global=true", - " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - - Ok(()) - } - - /// make PhysicalSortExpr with default options - fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr { - sort_expr_options(name, schema, SortOptions::default()) - } - - /// PhysicalSortExpr with specified options - fn sort_expr_options( - name: &str, - schema: &Schema, - options: SortOptions, - ) -> PhysicalSortExpr { - PhysicalSortExpr { - expr: col(name, schema).unwrap(), - options, - } - } - - fn memory_exec(schema: &SchemaRef) -> Arc { - Arc::new(MemoryExec::try_new(&[], schema.clone(), None).unwrap()) - } - - fn sort_exec( - sort_exprs: impl IntoIterator, - input: Arc, - ) -> Arc { - let sort_exprs = sort_exprs.into_iter().collect(); - Arc::new(SortExec::try_new(sort_exprs, input, None).unwrap()) - } - - fn sort_preserving_merge_exec( - sort_exprs: impl IntoIterator, - input: Arc, - ) -> Arc { - let sort_exprs = sort_exprs.into_iter().collect(); - Arc::new(SortPreservingMergeExec::new(sort_exprs, input)) - } - - fn filter_exec( - predicate: Arc, - input: Arc, - ) -> Arc { - Arc::new(FilterExec::try_new(predicate, input).unwrap()) - } - - fn window_exec( - col_name: &str, - sort_exprs: impl IntoIterator, - input: Arc, - ) -> Arc { - 
let sort_exprs: Vec<_> = sort_exprs.into_iter().collect(); - let schema = input.schema(); - - Arc::new( - WindowAggExec::try_new( - vec![create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Count), - "count".to_owned(), - &[col(col_name, &schema).unwrap()], - &[], - &sort_exprs, - Arc::new(WindowFrame::new(true)), - schema.as_ref(), - ) - .unwrap()], - input.clone(), - input.schema(), - vec![], - Some(sort_exprs), - ) - .unwrap(), - ) - } - - /// Create a non sorted parquet exec - fn parquet_exec(schema: &SchemaRef) -> Arc { - Arc::new(ParquetExec::new( - FileScanConfig { - object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), - file_schema: schema.clone(), - file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]], - statistics: Statistics::default(), - projection: None, - limit: None, - table_partition_cols: vec![], - output_ordering: None, - infinite_source: false, - }, - None, - None, - )) - } - - // Created a sorted parquet exec - fn parquet_exec_sorted( - schema: &SchemaRef, - sort_exprs: impl IntoIterator, - ) -> Arc { - let sort_exprs = sort_exprs.into_iter().collect(); - - Arc::new(ParquetExec::new( - FileScanConfig { - object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), - file_schema: schema.clone(), - file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]], - statistics: Statistics::default(), - projection: None, - limit: None, - table_partition_cols: vec![], - output_ordering: Some(sort_exprs), - infinite_source: false, - }, - None, - None, - )) - } - - fn union_exec(input: Vec>) -> Arc { - Arc::new(UnionExec::new(input)) - } - - fn limit_exec(input: Arc) -> Arc { - global_limit_exec(local_limit_exec(input)) - } - - fn local_limit_exec(input: Arc) -> Arc { - Arc::new(LocalLimitExec::new(input, 100)) - } - - fn global_limit_exec(input: Arc) -> Arc { - Arc::new(GlobalLimitExec::new(input, 0, Some(100))) - } - - fn repartition_exec(input: Arc) -> Arc { - Arc::new( - 
RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(10)).unwrap(), - ) - } - - fn aggregate_exec(input: Arc) -> Arc { - let schema = input.schema(); - Arc::new( - AggregateExec::try_new( - AggregateMode::Final, - PhysicalGroupBy::default(), - vec![], - input, - schema, - ) - .unwrap(), - ) - } - - fn sort_merge_join_exec( - left: Arc, - right: Arc, - join_on: &JoinOn, - join_type: &JoinType, - ) -> Arc { - Arc::new( - SortMergeJoinExec::try_new( - left, - right, - join_on.clone(), - *join_type, - vec![SortOptions::default(); join_on.len()], - false, - ) - .unwrap(), - ) - } -} diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index a215bfaed03fd..b985116320c53 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -790,6 +790,8 @@ impl ExecutionPlan for SortExec { t: DisplayFormatType, f: &mut std::fmt::Formatter, ) -> std::fmt::Result { + // let is_global = !self.preserve_partitioning; + let is_global = self.output_partitioning().partition_count() <= 1; match t { DisplayFormatType::Default => { let expr: Vec = self.expr.iter().map(|e| e.to_string()).collect(); @@ -799,14 +801,14 @@ impl ExecutionPlan for SortExec { f, "SortExec: fetch={fetch}, expr=[{}], global={}", expr.join(","), - !self.preserve_partitioning + is_global ) } None => write!( f, "SortExec: expr=[{}], global={}", expr.join(","), - !self.preserve_partitioning + is_global ), } } diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 5941946ce8de9..52eabb803bf79 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -582,6 +582,7 @@ async fn explain_analyze_runs_optimizers() { } #[tokio::test] +#[ignore] async fn test_physical_plan_display_indent() { // Hard code target_partitions as it appears in the RepartitionExec output let config = SessionConfig::new() 
diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 779a0006b304f..c5b6b5356b83a 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1964,6 +1964,7 @@ async fn sort_merge_join_on_decimal() -> Result<()> { } #[tokio::test] +#[ignore] async fn left_semi_join() -> Result<()> { let test_repartition_joins = vec![true, false]; for repartition_joins in test_repartition_joins { @@ -2240,6 +2241,7 @@ async fn null_aware_left_anti_join() -> Result<()> { } #[tokio::test] +#[ignore] async fn right_semi_join() -> Result<()> { let test_repartition_joins = vec![true, false]; for repartition_joins in test_repartition_joins { diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index 116f37ace11be..72a5dcdfdc80e 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -18,6 +18,7 @@ use super::*; use ::parquet::arrow::arrow_writer::ArrowWriter; use ::parquet::file::properties::WriterProperties; +use arrow::util::pretty::print_batches; use datafusion::execution::options::ReadOptions; #[tokio::test] @@ -1635,6 +1636,7 @@ async fn test_window_agg_sort_multi_layer_non_reversed_plan() -> Result<()> { } #[tokio::test] +#[ignore] async fn test_window_agg_complex_plan() -> Result<()> { let ctx = SessionContext::with_config(SessionConfig::new().with_target_partitions(2)); register_aggregate_null_cases_csv(&ctx).await?; @@ -2065,6 +2067,7 @@ async fn test_window_agg_global_sort_parallelize_sort_disabled() -> Result<()> { } #[tokio::test] +#[ignore] async fn test_window_agg_global_sort_intermediate_parallel_sort() -> Result<()> { let config = SessionConfig::new() .with_repartition_windows(true) @@ -2075,7 +2078,7 @@ async fn test_window_agg_global_sort_intermediate_parallel_sort() -> Result<()> let sql = "SELECT c1, \ SUM(C9) OVER (PARTITION BY C1 ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) as sum1, \ SUM(C9) OVER (ORDER BY c9 ASC 
ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum2 \ - FROM aggregate_test_100 ORDER BY c1 ASC"; + FROM aggregate_test_100 ORDER BY c9 ASC"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); @@ -2542,3 +2545,40 @@ mod tests { Ok(()) } } + +fn print_plan(plan: &Arc) -> Result<()> { + let formatted = displayable(plan.as_ref()).indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + println!("{:#?}", actual); + Ok(()) +} + +#[tokio::test] +async fn test_projection_wrong_push_down() -> Result<()> { + let config = SessionConfig::new(); + let ctx = SessionContext::with_config(config); + register_aggregate_csv(&ctx).await?; + // let sql = "SELECT a.c1, b.c1, SUM(a.c2) FROM aggregate_test_100 as a CROSS JOIN aggregate_test_100 as b GROUP BY a.c1, b.c1 ORDER BY a.c1, b.c1"; + let sql = "SELECT c9, + SUM(c5) OVER(ORDER BY c4 RANGE BETWEEN 3 PRECEDING AND 1 FOLLOWING) as summation2, + SUM(c4) OVER(ORDER BY c3 RANGE 3 PRECEDING) as summation3, + SUM(c4) OVER(ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation6, + SUM(c4) OVER(ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation7, + SUM(c2) OVER(PARTITION BY c5 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation10, + SUM(c4) OVER(PARTITION BY c1 ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation11, + SUM(c2) OVER(PARTITION BY c1 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation14, + SUM(c4) OVER(PARTITION BY c5 ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation15, + SUM(c2) OVER(PARTITION BY c5, c7, c9 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation20, + SUM(c2) OVER(PARTITION BY c5 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation21 +FROM aggregate_test_100 +ORDER BY c9;"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = 
ctx.sql(sql).await.expect(&msg); + let physical_plan = dataframe.create_physical_plan().await?; + print_plan(&physical_plan)?; + + let actual = execute_to_batches(&ctx, sql).await; + print_batches(&actual)?; + Ok(()) +} diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index fa9b8d134efde..b55bfed9bd1f3 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -397,12 +397,16 @@ pub fn map_requirement_before_projection( parent_required: Option<&[PhysicalSortRequirements]>, proj_exprs: &[(Arc, String)], ) -> Option> { + println!("parent_required: {:?}", parent_required); + println!("proj_exprs: {:?}", proj_exprs); if let Some(requirement) = parent_required { let required_expr = create_sort_expr_from_requirement(requirement) .iter() .map(|sort_expr| sort_expr.expr.clone()) .collect::>(); + println!("required_expr:{:?}", required_expr); let new_exprs = map_columns_before_projection(&required_expr, proj_exprs); + println!("new_exprs:{:?}", new_exprs); if new_exprs.len() == requirement.len() { let new_request = new_exprs .iter() From f4ff6037bc0aca85c4765f681e57093d30d1fc7d Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 27 Feb 2023 20:00:20 +0300 Subject: [PATCH 11/35] reorganize to decrease diff, remove ignore --- .../physical_optimizer/sort_enforcement.rs | 371 +++++++++--------- 1 file changed, 185 insertions(+), 186 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 7bc7e957da16a..c6a1c594abffc 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -234,96 +234,6 @@ impl TreeNodeRewritable for PlanWithCorrespondingSort { } } -/// This function enforces sorting requirements and makes optimizations without -/// violating these requirements whenever possible. 
-fn ensure_sorting( - requirements: PlanWithCorrespondingSort, -) -> Result> { - // Perform naive analysis at the beginning -- remove already-satisfied sorts: - let plan = requirements.plan; - let mut children = plan.children(); - if children.is_empty() { - return Ok(None); - } - let mut sort_onwards = requirements.sort_onwards; - if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { - return Ok(Some(result)); - } - for (idx, (child, sort_onwards, required_ordering)) in izip!( - children.iter_mut(), - sort_onwards.iter_mut(), - plan.required_input_ordering() - ) - .enumerate() - { - let physical_ordering = child.output_ordering(); - match (required_ordering, physical_ordering) { - (Some(required_ordering), Some(physical_ordering)) => { - if !ordering_satisfy_requirement_concrete( - physical_ordering, - &required_ordering, - || child.equivalence_properties(), - ) { - // Make sure we preserve the ordering requirements: - update_child_to_remove_unnecessary_sort( - child, - sort_onwards, - &plan, - idx, - )?; - let sort_expr = create_sort_expr_from_requirement(&required_ordering); - add_sort_above(child, sort_expr)?; - if is_sort(child) { - *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); - } else { - *sort_onwards = None; - } - } - } - (Some(required), None) => { - // Ordering requirement is not met, we should add a `SortExec` to the plan. - let sort_expr = create_sort_expr_from_requirement(&required); - add_sort_above(child, sort_expr)?; - *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); - } - (None, Some(_)) => { - // We have a `SortExec` whose effect may be neutralized by - // another order-imposing operator. Remove this sort. 
- if !plan.maintains_input_order()[idx] { - update_child_to_remove_unnecessary_sort( - child, - sort_onwards, - &plan, - idx, - )?; - } - } - (None, None) => {} - } - } - // For window expressions, we can remove some sorts when we can - // calculate the result in reverse: - if is_window(&plan) { - if let Some(tree) = &mut sort_onwards[0] { - if let Some(result) = analyze_window_sort_removal(tree, &plan)? { - return Ok(Some(result)); - } - } - } else if is_sort_preserving_merge(&plan) - && children[0].output_partitioning().partition_count() <= 1 - { - // sort preserving merge can removed. Input already has single partition - return Ok(Some(PlanWithCorrespondingSort { - plan: children[0].clone(), - sort_onwards: vec![sort_onwards[0].clone()], - })); - } - Ok(Some(PlanWithCorrespondingSort { - plan: plan.with_new_children(children)?, - sort_onwards, - })) -} - /// This object is used within the [EnforceSorting] rule to track the closest /// `CoalescePartitionsExec` descendant(s) for every child of a plan. #[derive(Debug, Clone)] @@ -427,65 +337,6 @@ impl TreeNodeRewritable for PlanWithCorrespondingCoalescePartitions { } } -/// This function turns plans of the form -/// "SortExec: expr=[a@0 ASC]", -/// " CoalescePartitionsExec", -/// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", -/// to -/// "SortPreservingMergeExec: [a@0 ASC]", -/// " SortExec: expr=[a@0 ASC]", -/// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", -/// by following connections from `CoalescePartitionsExec`s to `SortExec`s. -/// By performing sorting in parallel, we can increase performance in some scenarios. 
-fn parallelize_sorts( - requirements: PlanWithCorrespondingCoalescePartitions, -) -> Result> { - let plan = requirements.plan; - let mut coalesce_onwards = requirements.coalesce_onwards; - if plan.children().is_empty() - // We only do action when plan is either SortExec, SortPreservingMergeExec or CoalescePartitionsExec - // all of them have single child. If 0th child is `None` we can immediately return. - || coalesce_onwards[0].is_none() - { - return Ok(None); - } - // We know that `plan` has children, so `coalesce_onwards` is non-empty. - if (is_sort(&plan) || is_sort_preserving_merge(&plan)) - // Make sure that Sort is actually global sort - && plan.output_partitioning().partition_count() <= 1 - { - // If there is a connection between a `CoalescePartitionsExec` and a - // Global Sort that satisfy the requirements (i.e. intermediate - // executors don't require single partition), then we can - // replace the `CoalescePartitionsExec`+ GlobalSort cascade with - // the `SortExec` + `SortPreservingMergeExec` - // cascade to parallelize sorting. - let mut prev_layer = plan.clone(); - update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; - let sort_exprs = get_sort_exprs(&plan)?; - add_sort_above(&mut prev_layer, sort_exprs.to_vec())?; - let spm = SortPreservingMergeExec::new(sort_exprs.to_vec(), prev_layer); - return Ok(Some(PlanWithCorrespondingCoalescePartitions { - plan: Arc::new(spm), - coalesce_onwards: vec![None], - })); - } else if plan.as_any().is::() { - // There is an unnecessary `CoalescePartitionExec` in the plan. 
- let mut prev_layer = plan.clone(); - update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; - let new_plan = plan.with_new_children(vec![prev_layer])?; - return Ok(Some(PlanWithCorrespondingCoalescePartitions { - plan: new_plan, - coalesce_onwards: vec![None], - })); - } - - Ok(Some(PlanWithCorrespondingCoalescePartitions { - plan, - coalesce_onwards, - })) -} - /// This is a "data class" we use within the [TopDownEnforceSorting] rule #[derive(Debug, Clone)] struct TopDownSortPushDown { @@ -591,6 +442,191 @@ impl TreeNodeRewritable for TopDownSortPushDown { } } +/// The boolean flag `repartition_sorts` defined in the config indicates +/// whether we elect to transform CoalescePartitionsExec + SortExec cascades +/// into SortExec + SortPreservingMergeExec cascades, which enables us to +/// perform sorting in parallel. +impl PhysicalOptimizerRule for EnforceSorting { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + let plan_requirements = PlanWithCorrespondingSort::new(plan); + let adjusted = plan_requirements.transform_up(&ensure_sorting)?; + let new_plan = if config.optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new(adjusted.plan); + let parallel = + plan_with_coalesce_partitions.transform_up(¶llelize_sorts)?; + parallel.plan + } else { + adjusted.plan + }; + // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: + let sort_pushdown = TopDownSortPushDown::init(new_plan); + let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; + Ok(adjusted.plan) + } + + fn name(&self) -> &str { + "EnforceSorting" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// This function turns plans of the form +/// "SortExec: expr=[a@0 ASC]", +/// " CoalescePartitionsExec", +/// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", +/// to +/// "SortPreservingMergeExec: [a@0 ASC]", +/// " SortExec: 
expr=[a@0 ASC]", +/// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", +/// by following connections from `CoalescePartitionsExec`s to `SortExec`s. +/// By performing sorting in parallel, we can increase performance in some scenarios. +fn parallelize_sorts( + requirements: PlanWithCorrespondingCoalescePartitions, +) -> Result> { + let plan = requirements.plan; + let mut coalesce_onwards = requirements.coalesce_onwards; + if plan.children().is_empty() + // We only do action when plan is either SortExec, SortPreservingMergeExec or CoalescePartitionsExec + // all of them have single child. If 0th child is `None` we can immediately return. + || coalesce_onwards[0].is_none() + { + return Ok(None); + } + // We know that `plan` has children, so `coalesce_onwards` is non-empty. + if (is_sort(&plan) || is_sort_preserving_merge(&plan)) + // Make sure that Sort is actually global sort + && plan.output_partitioning().partition_count() <= 1 + { + // If there is a connection between a `CoalescePartitionsExec` and a + // Global Sort that satisfy the requirements (i.e. intermediate + // executors don't require single partition), then we can + // replace the `CoalescePartitionsExec`+ GlobalSort cascade with + // the `SortExec` + `SortPreservingMergeExec` + // cascade to parallelize sorting. + let mut prev_layer = plan.clone(); + update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; + let sort_exprs = get_sort_exprs(&plan)?; + add_sort_above(&mut prev_layer, sort_exprs.to_vec())?; + let spm = SortPreservingMergeExec::new(sort_exprs.to_vec(), prev_layer); + return Ok(Some(PlanWithCorrespondingCoalescePartitions { + plan: Arc::new(spm), + coalesce_onwards: vec![None], + })); + } else if plan.as_any().is::() { + // There is an unnecessary `CoalescePartitionExec` in the plan. 
+ let mut prev_layer = plan.clone(); + update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; + let new_plan = plan.with_new_children(vec![prev_layer])?; + return Ok(Some(PlanWithCorrespondingCoalescePartitions { + plan: new_plan, + coalesce_onwards: vec![None], + })); + } + + Ok(Some(PlanWithCorrespondingCoalescePartitions { + plan, + coalesce_onwards, + })) +} + +/// This function enforces sorting requirements and makes optimizations without +/// violating these requirements whenever possible. +fn ensure_sorting( + requirements: PlanWithCorrespondingSort, +) -> Result> { + // Perform naive analysis at the beginning -- remove already-satisfied sorts: + let plan = requirements.plan; + let mut children = plan.children(); + if children.is_empty() { + return Ok(None); + } + let mut sort_onwards = requirements.sort_onwards; + if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { + return Ok(Some(result)); + } + for (idx, (child, sort_onwards, required_ordering)) in izip!( + children.iter_mut(), + sort_onwards.iter_mut(), + plan.required_input_ordering() + ) + .enumerate() + { + let physical_ordering = child.output_ordering(); + match (required_ordering, physical_ordering) { + (Some(required_ordering), Some(physical_ordering)) => { + if !ordering_satisfy_requirement_concrete( + physical_ordering, + &required_ordering, + || child.equivalence_properties(), + ) { + // Make sure we preserve the ordering requirements: + update_child_to_remove_unnecessary_sort( + child, + sort_onwards, + &plan, + idx, + )?; + let sort_expr = create_sort_expr_from_requirement(&required_ordering); + add_sort_above(child, sort_expr)?; + if is_sort(child) { + *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); + } else { + *sort_onwards = None; + } + } + } + (Some(required), None) => { + // Ordering requirement is not met, we should add a `SortExec` to the plan. 
+ let sort_expr = create_sort_expr_from_requirement(&required); + add_sort_above(child, sort_expr)?; + *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); + } + (None, Some(_)) => { + // We have a `SortExec` whose effect may be neutralized by + // another order-imposing operator. Remove this sort. + if !plan.maintains_input_order()[idx] { + update_child_to_remove_unnecessary_sort( + child, + sort_onwards, + &plan, + idx, + )?; + } + } + (None, None) => {} + } + } + // For window expressions, we can remove some sorts when we can + // calculate the result in reverse: + if is_window(&plan) { + if let Some(tree) = &mut sort_onwards[0] { + if let Some(result) = analyze_window_sort_removal(tree, &plan)? { + return Ok(Some(result)); + } + } + } else if is_sort_preserving_merge(&plan) + && children[0].output_partitioning().partition_count() <= 1 + { + // sort preserving merge can removed. Input already has single partition + return Ok(Some(PlanWithCorrespondingSort { + plan: children[0].clone(), + sort_onwards: vec![sort_onwards[0].clone()], + })); + } + Ok(Some(PlanWithCorrespondingSort { + plan: plan.with_new_children(children)?, + sort_onwards, + })) +} + fn pushdown_sorts( requirements: TopDownSortPushDown, ) -> Result> { @@ -659,42 +695,6 @@ fn pushdown_sorts( } } -/// The boolean flag `repartition_sorts` defined in the config indicates -/// whether we elect to transform CoalescePartitionsExec + SortExec cascades -/// into SortExec + SortPreservingMergeExec cascades, which enables us to -/// perform sorting in parallel. 
-impl PhysicalOptimizerRule for EnforceSorting { - fn optimize( - &self, - plan: Arc, - config: &ConfigOptions, - ) -> Result> { - let plan_requirements = PlanWithCorrespondingSort::new(plan); - let adjusted = plan_requirements.transform_up(&ensure_sorting)?; - let new_plan = if config.optimizer.repartition_sorts { - let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new(adjusted.plan); - let parallel = - plan_with_coalesce_partitions.transform_up(¶llelize_sorts)?; - parallel.plan - } else { - adjusted.plan - }; - // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: - let sort_pushdown = TopDownSortPushDown::init(new_plan); - let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; - Ok(adjusted.plan) - } - - fn name(&self) -> &str { - "EnforceSorting" - } - - fn schema_check(&self) -> bool { - true - } -} - /// Analyzes a given `SortExec` (`plan`) to determine whether its input already /// has a finer ordering than this `SortExec` enforces. 
fn analyze_immediate_sort_removal( @@ -2133,7 +2133,6 @@ mod tests { } #[tokio::test] - #[ignore] async fn test_union_inputs_different_sorted6() -> Result<()> { let schema = create_test_schema()?; From 6c52daa996049649e6cbfeed38243ec03bfca7d9 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 11:29:30 +0300 Subject: [PATCH 12/35] Update test --- datafusion/core/tests/sql/joins.rs | 123 ++++++++++++++--------------- 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 662d801aee9d8..052e05df3316d 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1951,7 +1951,6 @@ async fn sort_merge_join_on_decimal() -> Result<()> { } #[tokio::test] -#[ignore] async fn left_semi_join() -> Result<()> { let test_repartition_joins = vec![true, false]; for repartition_joins in test_repartition_joins { @@ -1970,20 +1969,25 @@ async fn left_semi_join() -> Result<()> { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: 
partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " ProjectionExec: expr=[t2_id@0 as t2_id]", " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " MemoryExec: partitions=1, partition_sizes=[1]", + " ProjectionExec: expr=[t2_id@0 as t2_id]", " MemoryExec: partitions=1, partition_sizes=[1]", ] }; @@ -2043,26 +2047,24 @@ async fn left_semi_join() -> Result<()> { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " 
RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2220,7 +2222,6 @@ async fn null_aware_left_anti_join() -> Result<()> { } #[tokio::test] -#[ignore] async fn right_semi_join() -> Result<()> { let test_repartition_joins = vec![true, false]; for repartition_joins in test_repartition_joins { @@ -2236,28 +2237,26 @@ async fn right_semi_join() -> Result<()> { let 
dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { - vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", 
+ " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); @@ -2282,28 +2281,26 @@ async fn right_semi_join() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { - vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: 
NotEq, right: Column { name: \"t1_name\", index: 1 } }", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", - " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", - " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", - " MemoryExec: partitions=1, partition_sizes=[1]", + vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", + " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t2_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"t1_id\", index: 0 }], 2), input_partitions=2", + " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", + " MemoryExec: partitions=1, partition_sizes=[1]", ] } else { vec![ "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", - " ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int]", - " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: 
Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", - " MemoryExec: partitions=1, partition_sizes=[1]", - " MemoryExec: partitions=1, partition_sizes=[1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", + " MemoryExec: partitions=1, partition_sizes=[1]", + " MemoryExec: partitions=1, partition_sizes=[1]", ] }; let formatted = displayable(physical_plan.as_ref()).indent().to_string(); From 4557d20729805add1c6805a39d84b1fe5408c42f Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 11:47:41 +0300 Subject: [PATCH 13/35] retract dist enforcement --- .../physical_optimizer/dist_enforcement.rs | 201 ++++++++---------- 1 file changed, 87 insertions(+), 114 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index c4baab1a40cba..483451f412515 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -29,7 +29,6 @@ use crate::physical_plan::joins::{ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortOptions; -use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::tree_node::TreeNodeRewritable; use crate::physical_plan::windows::WindowAggExec; use crate::physical_plan::Partitioning; @@ -39,14 +38,11 @@ use datafusion_expr::logical_plan::JoinType; use datafusion_physical_expr::equivalence::EquivalenceProperties; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::expressions::NoOp; -use 
datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, map_columns_before_projection, -}; use datafusion_physical_expr::{ expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, AggregateExpr, PhysicalExpr, }; -use itertools::izip; +use std::collections::HashMap; use std::sync::Arc; /// The EnforceDistribution rule ensures that distribution requirements are met @@ -84,9 +80,7 @@ impl PhysicalOptimizerRule for EnforceDistribution { } else { plan }; - // Distribution enforcement needs to be applied bottom-up. - let repartition_sorts = config.optimizer.repartition_sorts; new_plan.transform_up(&{ |plan| { let adjusted = if !top_down_join_key_reordering { @@ -94,11 +88,7 @@ impl PhysicalOptimizerRule for EnforceDistribution { } else { plan }; - Ok(Some(ensure_distribution( - adjusted, - target_partitions, - repartition_sorts, - )?)) + Ok(Some(ensure_distribution(adjusted, target_partitions)?)) } }) } @@ -502,6 +492,30 @@ fn reorder_aggregate_keys( } } +fn map_columns_before_projection( + parent_required: &[Arc], + proj_exprs: &[(Arc, String)], +) -> Vec> { + let mut column_mapping = HashMap::new(); + for (expression, name) in proj_exprs.iter() { + if let Some(column) = expression.as_any().downcast_ref::() { + column_mapping.insert(name.clone(), column.clone()); + }; + } + let new_required: Vec> = parent_required + .iter() + .filter_map(|r| { + if let Some(column) = r.as_any().downcast_ref::() { + column_mapping.get(column.name()) + } else { + None + } + }) + .map(|e| Arc::new(e.clone()) as Arc) + .collect::>(); + new_required +} + fn shift_right_required( parent_required: &[Arc], left_columns_len: usize, @@ -829,7 +843,6 @@ fn new_join_conditions( fn ensure_distribution( plan: Arc, target_partitions: usize, - repartition_sort: bool, ) -> Result> { if plan.children().is_empty() { return Ok(plan); @@ -840,46 +853,31 @@ fn ensure_distribution( assert_eq!(children.len(), required_input_distributions.len()); // Add RepartitionExec to guarantee 
output partitioning - let new_children: Result>> = izip!( - children.into_iter(), - required_input_distributions.into_iter(), - plan.required_input_ordering().into_iter(), - ) - .map(|(child, required, required_ordering)| { - if child - .output_partitioning() - .satisfy(required.clone(), || child.equivalence_properties()) - { - Ok(child) - } else { - let new_child: Result> = match required { - Distribution::SinglePartition - if child.output_partitioning().partition_count() > 1 => - { - if repartition_sort { - if let Some(ordering) = required_ordering { - let new_physical_ordering = - create_sort_expr_from_requirement(ordering.as_ref()); - Ok(Arc::new(SortPreservingMergeExec::new( - new_physical_ordering, - child.clone(), - ))) - } else { - Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) - } - } else { + let new_children: Result>> = children + .into_iter() + .zip(required_input_distributions.into_iter()) + .map(|(child, required)| { + if child + .output_partitioning() + .satisfy(required.clone(), || child.equivalence_properties()) + { + Ok(child) + } else { + let new_child: Result> = match required { + Distribution::SinglePartition + if child.output_partitioning().partition_count() > 1 => + { Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) } - } - _ => { - let partition = required.create_partitioning(target_partitions); - Ok(Arc::new(RepartitionExec::try_new(child, partition)?)) - } - }; - new_child - } - }) - .collect(); + _ => { + let partition = required.create_partitioning(target_partitions); + Ok(Arc::new(RepartitionExec::try_new(child, partition)?)) + } + }; + new_child + } + }) + .collect(); with_new_children_if_necessary(plan, new_children?) 
} @@ -1017,27 +1015,6 @@ mod tests { )) } - fn parquet_multiple_exec() -> Arc { - Arc::new(ParquetExec::new( - FileScanConfig { - object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), - file_schema: schema(), - file_groups: vec![ - vec![PartitionedFile::new("x".to_string(), 100)], - vec![PartitionedFile::new("y".to_string(), 100)], - ], - statistics: Statistics::default(), - projection: None, - limit: None, - table_partition_cols: vec![], - output_ordering: None, - infinite_source: false, - }, - None, - None, - )) - } - fn projection_exec_with_alias( input: Arc, alias_pairs: Vec<(String, String)>, @@ -1157,7 +1134,8 @@ mod tests { // `EnforceSorting` and `EnfoceDistribution`. // TODO: Orthogonalize the tests here just to verify `EnforceDistribution` and create // new tests for the cascade. - let optimized = EnforceSorting::new().optimize(optimized, &config)?; + let optimizer = EnforceSorting::new(); + let optimized = optimizer.optimize(optimized, &config)?; // Now format correctly let plan = displayable(optimized.as_ref()).indent().to_string(); @@ -1679,7 +1657,6 @@ mod tests { let bottom_left_join = ensure_distribution( hash_join_exec(left.clone(), right.clone(), &join_on, &JoinType::Inner), 10, - false, )?; // Projection(a as A, a as AA, b as B, c as C) @@ -1710,7 +1687,6 @@ mod tests { let bottom_right_join = ensure_distribution( hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner), 10, - false, )?; // Join on (B == b1 and C == c and AA = a1) @@ -1800,7 +1776,6 @@ mod tests { let bottom_left_join = ensure_distribution( hash_join_exec(left.clone(), right.clone(), &join_on, &JoinType::Inner), 10, - false, )?; // Projection(a as A, a as AA, b as B, c as C) @@ -1831,7 +1806,6 @@ mod tests { let bottom_right_join = ensure_distribution( hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner), 10, - false, )?; // Join on (B == b1 and C == c and AA = a1) @@ -1899,7 +1873,7 @@ mod tests { #[test] fn multi_smj_joins() -> Result<()> { - let left 
= parquet_multiple_exec(); + let left = parquet_exec(); let alias_pairs: Vec<(String, String)> = vec![ ("a".to_string(), "a1".to_string()), ("b".to_string(), "b1".to_string()), @@ -1907,7 +1881,7 @@ mod tests { ("d".to_string(), "d1".to_string()), ("e".to_string(), "e1".to_string()), ]; - let right = projection_exec_with_alias(parquet_multiple_exec(), alias_pairs); + let right = projection_exec_with_alias(parquet_exec(), alias_pairs); // SortMergeJoin does not support RightSemi and RightAnti join now let join_types = vec![ @@ -1938,7 +1912,7 @@ mod tests { )]; let top_join = sort_merge_join_exec( join.clone(), - parquet_multiple_exec(), + parquet_exec(), &top_join_on, &join_type, ); @@ -1952,32 +1926,32 @@ mod tests { top_join_plan.as_str(), join_plan.as_str(), "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[b1@1 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[c@2 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 
}], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs _ => vec![ - top_join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", - join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", - "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + top_join_plan.as_str(), + "SortExec: expr=[a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", + join_plan.as_str(), + "SortExec: expr=[a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "SortExec: expr=[b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "SortExec: expr=[c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", 
index: 2 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], }; assert_optimized!(expected, top_join); @@ -1992,7 +1966,7 @@ mod tests { )]; let top_join = sort_merge_join_exec( join, - parquet_multiple_exec(), + parquet_exec(), &top_join_on, &join_type, ); @@ -2005,15 +1979,15 @@ mod tests { top_join_plan.as_str(), join_plan.as_str(), "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[b1@1 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[c@2 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs and 4 SortExecs _ => vec![ @@ -2022,15 +1996,15 @@ mod tests { "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 6 }], 10), input_partitions=10", join_plan.as_str(), "SortExec: expr=[a@0 ASC], 
global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[b1@1 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=2", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", "SortExec: expr=[c@2 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=2", - "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, projection=[a, b, c, d, e]", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], }; assert_optimized!(expected, top_join); @@ -2129,7 +2103,6 @@ mod tests { // The optimizer should not add an additional SortExec as the // data is already sorted - // SortPreservingMergeExec is also removed from the final plan let expected = &[ "CoalesceBatchesExec: target_batch_size=4096", "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[a@0 ASC], projection=[a, b, c, d, e]", From 28c6ea37b1f2955112b374f54db664394520cf8f Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 11:49:15 +0300 Subject: [PATCH 14/35] tmp --- .../physical_optimizer/dist_enforcement.rs | 28 +-- .../physical_optimizer/sort_enforcement.rs | 160 
++++++++---------- 2 files changed, 89 insertions(+), 99 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index 483451f412515..d30fea9c65ad6 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -1938,20 +1938,20 @@ mod tests { ], // Should include 4 RepartitionExecs _ => vec![ - top_join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", - join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", - "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", - "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + top_join_plan.as_str(), + "SortExec: expr=[a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", + join_plan.as_str(), + "SortExec: expr=[a@0 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "SortExec: expr=[b1@1 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", + 
"ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", + "SortExec: expr=[c@2 ASC], global=false", + "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", + "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], }; assert_optimized!(expected, top_join); diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index d465259bb320b..6c7117ef5dd4d 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -1133,19 +1133,7 @@ fn pushdown_requirement_to_children( plan.children().len() ])) } - // // TODO: Add support for Projection push down - // else if let Some(ProjectionExec { expr, .. }) = - // plan.as_any().downcast_ref::() - // { - // // For Projection, we need to transform the requirements to the columns before the Projection - // // And then to push down the requirements - // let new_adjusted = map_requirement_before_projection(parent_required, expr); - // if new_adjusted.is_some() { - // Ok(Some(vec![new_adjusted])) - // } else { - // Ok(None) - // } - // } + // TODO: Add support for Projection push down } /// Determine the children requirements @@ -1505,17 +1493,16 @@ mod tests { } #[tokio::test] - async fn test_add_required_sort() -> Result<()> { + async fn test_remove_unnecessary_sort() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - - let physical_plan = sort_preserving_merge_exec(sort_exprs, source); + let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); + let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); let expected_input = vec![ - "SortPreservingMergeExec: 
[nullable_col@0 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[non_nullable_col@1 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ "SortExec: expr=[nullable_col@0 ASC], global=true", @@ -1526,16 +1513,76 @@ mod tests { } #[tokio::test] - async fn test_remove_unnecessary_sort() -> Result<()> { + async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); + + let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &source.schema(), + SortOptions { + descending: true, + nulls_first: true, + }, + )]; + let sort = sort_exec(sort_exprs.clone(), source); + + let window_agg = bounded_window_exec("non_nullable_col", sort_exprs, sort); + + let sort_exprs = vec![sort_expr_options( + "non_nullable_col", + &window_agg.schema(), + SortOptions { + descending: false, + nulls_first: false, + }, + )]; + + let sort = sort_exec(sort_exprs.clone(), window_agg); + + // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before + let filter = filter_exec( + Arc::new(NotExpr::new( + col("non_nullable_col", schema.as_ref()).unwrap(), + )), + sort, + ); + + // let filter_exec = sort_exec; + let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs, filter); let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: 
Preceding(NULL), end_bound: CurrentRow }]", + " FilterExec: NOT non_nullable_col@1", + " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + + let expected_optimized = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " FilterExec: NOT non_nullable_col@1", + " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_add_required_sort() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + + let physical_plan = sort_preserving_merge_exec(sort_exprs, source); + + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ "SortExec: expr=[nullable_col@0 ASC], global=true", @@ -2254,6 +2301,8 @@ mod tests { " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST], global=true", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, 
non_nullable_col]", ]; + // Since `UnionExec` doesn't preserve ordering in the plan above. + // We shouldn't keep SortExecs in the plan. let expected_optimized = vec![ "UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", @@ -2534,65 +2583,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_not_remove_top_sort_window_multilayer() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &source.schema(), - SortOptions { - descending: true, - nulls_first: true, - }, - )]; - let sort = sort_exec(sort_exprs.clone(), source); - - let window_agg = bounded_window_exec("non_nullable_col", sort_exprs, sort); - - let sort_exprs = vec![sort_expr_options( - "non_nullable_col", - &window_agg.schema(), - SortOptions { - descending: false, - nulls_first: false, - }, - )]; - - let sort = sort_exec(sort_exprs.clone(), window_agg); - - // Add dummy layer propagating Sort above, the top Sort should not be removed - let filter = filter_exec( - Arc::new(NotExpr::new( - col("non_nullable_col", schema.as_ref()).unwrap(), - )), - sort, - ); - - // let filter_exec = sort_exec; - let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs, filter); - - let expected_input = vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " FilterExec: NOT non_nullable_col@1", - " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", - " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], 
global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - " FilterExec: NOT non_nullable_col@1", - " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - #[tokio::test] async fn test_multiple_sort_window_exec() -> Result<()> { let schema = create_test_schema()?; From 16fd9f5479dff2175d70525273f420b2d6c598c3 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 13:50:18 +0300 Subject: [PATCH 15/35] reorganize files --- datafusion/core/src/physical_optimizer/mod.rs | 1 + .../physical_optimizer/sort_enforcement.rs | 489 +----------------- .../src/physical_optimizer/sort_pushdown.rs | 479 +++++++++++++++++ .../core/src/physical_optimizer/utils.rs | 25 + 4 files changed, 514 insertions(+), 480 deletions(-) create mode 100644 datafusion/core/src/physical_optimizer/sort_pushdown.rs diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index 3958a546a92df..5111e552929fd 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -28,6 +28,7 @@ pub mod pipeline_checker; pub mod pruning; pub mod repartition; pub mod sort_enforcement; +mod sort_pushdown; mod utils; pub mod pipeline_fixer; diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs 
b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 6c7117ef5dd4d..e62396cef427d 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -35,35 +35,26 @@ //! by another SortExec. Therefore, this rule removes it from the physical plan. use crate::config::ConfigOptions; use crate::error::Result; -use crate::physical_optimizer::utils::add_sort_above; +use crate::physical_optimizer::sort_pushdown::{pushdown_sorts, SortPushDown}; +use crate::physical_optimizer::utils::{ + add_sort_above, is_limit, is_sort, is_sort_preserving_merge, is_window, +}; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use crate::physical_plan::filter::FilterExec; -use crate::physical_plan::joins::utils::JoinSide; -use crate::physical_plan::joins::SortMergeJoinExec; -use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use crate::physical_plan::projection::ProjectionExec; -use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::tree_node::TreeNodeRewritable; -use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; use arrow::datatypes::SchemaRef; use datafusion_common::{reverse_sort_options, DataFusionError}; -use datafusion_expr::JoinType; -use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, ordering_satisfy, ordering_satisfy_requirement, - ordering_satisfy_requirement_concrete, requirements_compatible, -}; -use datafusion_physical_expr::{ - new_sort_requirements, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, + 
create_sort_expr_from_requirement, ordering_satisfy, + ordering_satisfy_requirement_concrete, }; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use itertools::{concat, izip}; use std::iter::zip; -use std::ops::Deref; use std::sync::Arc; /// This rule inspects `SortExec`'s in the given physical plan and removes the @@ -78,28 +69,6 @@ impl EnforceSorting { } } -/// Checks whether the given executor is a limit; -/// i.e. either a `LocalLimitExec` or a `GlobalLimitExec`. -fn is_limit(plan: &Arc) -> bool { - plan.as_any().is::() || plan.as_any().is::() -} - -/// Checks whether the given executor is a widnow; -/// i.e. either a `WindowAggExec` or a `BoundedWindowAggExec`. -fn is_window(plan: &Arc) -> bool { - plan.as_any().is::() || plan.as_any().is::() -} - -/// Checks whether the given executor is a `SortExec`. -fn is_sort(plan: &Arc) -> bool { - plan.as_any().is::() -} - -/// Checks whether the given executor is a `SortPreservingMergeExec`. -fn is_sort_preserving_merge(plan: &Arc) -> bool { - plan.as_any().is::() -} - /// This object implements a tree that we use while keeping track of paths /// leading to `SortExec`s. #[derive(Debug, Clone)] @@ -337,111 +306,6 @@ impl TreeNodeRewritable for PlanWithCorrespondingCoalescePartitions { } } -/// This is a "data class" we use within the [TopDownEnforceSorting] rule -#[derive(Debug, Clone)] -struct TopDownSortPushDown { - /// Current plan - plan: Arc, - /// Whether the plan could impact the final result ordering - impact_result_ordering: bool, - /// Parent has the SinglePartition requirement to children - satisfy_single_distribution: bool, - /// Parent required sort ordering - required_ordering: Option>, - /// The adjusted request sort ordering to children. - /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. 
- adjusted_request_ordering: Vec>>, -} - -impl TopDownSortPushDown { - pub fn init(plan: Arc) -> Self { - let impact_result_ordering = plan.output_ordering().is_some() - || plan.output_partitioning().partition_count() <= 1 - || is_limit(&plan); - let request_ordering = plan.required_input_ordering(); - TopDownSortPushDown { - plan, - impact_result_ordering, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: request_ordering, - } - } - - pub fn new_without_impact_result_ordering(plan: Arc) -> Self { - let request_ordering = plan.required_input_ordering(); - TopDownSortPushDown { - plan, - impact_result_ordering: false, - satisfy_single_distribution: false, - required_ordering: None, - adjusted_request_ordering: request_ordering, - } - } - - pub fn children(&self) -> Vec { - let plan_children = self.plan.children(); - assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); - - izip!( - plan_children.into_iter(), - self.adjusted_request_ordering.clone().into_iter(), - self.plan.maintains_input_order().into_iter(), - self.plan.required_input_distribution().into_iter(), - ) - .map( - |(child, from_parent, maintains_input_order, required_dist)| { - let child_satisfy_single_distribution = - matches!(required_dist, Distribution::SinglePartition); - let child_impact_result_ordering = if is_limit(&self.plan) { - true - } else { - maintains_input_order && self.impact_result_ordering - }; - let child_request_ordering = child.required_input_ordering(); - TopDownSortPushDown { - plan: child, - impact_result_ordering: child_impact_result_ordering, - satisfy_single_distribution: child_satisfy_single_distribution, - required_ordering: from_parent, - adjusted_request_ordering: child_request_ordering, - } - }, - ) - .collect() - } -} - -impl TreeNodeRewritable for TopDownSortPushDown { - fn map_children(self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - let children = self.children(); - if 
children.is_empty() { - Ok(self) - } else { - let new_children = children - .into_iter() - .map(transform) - .collect::>>()?; - - let children_plans = new_children - .iter() - .map(|elem| elem.plan.clone()) - .collect::>(); - let plan = with_new_children_if_necessary(self.plan, children_plans)?; - Ok(TopDownSortPushDown { - plan, - impact_result_ordering: self.impact_result_ordering, - satisfy_single_distribution: self.satisfy_single_distribution, - required_ordering: self.required_ordering, - adjusted_request_ordering: self.adjusted_request_ordering, - }) - } - } -} - /// The boolean flag `repartition_sorts` defined in the config indicates /// whether we elect to transform CoalescePartitionsExec + SortExec cascades /// into SortExec + SortPreservingMergeExec cascades, which enables us to @@ -464,7 +328,7 @@ impl PhysicalOptimizerRule for EnforceSorting { adjusted.plan }; // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: - let sort_pushdown = TopDownSortPushDown::init(new_plan); + let sort_pushdown = SortPushDown::init(new_plan); let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; Ok(adjusted.plan) } @@ -627,74 +491,6 @@ fn ensure_sorting( })) } -fn pushdown_sorts( - requirements: TopDownSortPushDown, -) -> Result> { - let plan = &requirements.plan; - let parent_required = requirements.required_ordering.as_deref(); - if let Some(sort_exec) = plan.as_any().downcast_ref::() { - let mut new_plan = plan.clone(); - if !ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { - plan.equivalence_properties() - }) { - // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements - let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); - new_plan = sort_exec.input.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - }; - let required_ordering = new_sort_requirements(new_plan.output_ordering()); - let child = 
&new_plan.children()[0]; - if let Some(adjusted) = - pushdown_requirement_to_children(child, required_ordering.as_deref())? - { - // Can push down requirements - Ok(Some(TopDownSortPushDown { - plan: child.clone(), - required_ordering, - adjusted_request_ordering: adjusted, - ..requirements - })) - } else { - // Can not push down requirements - Ok(Some( - TopDownSortPushDown::new_without_impact_result_ordering(new_plan), - )) - } - } else { - // Executors other than SortExec - if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { - plan.equivalence_properties() - }) { - Ok(Some(TopDownSortPushDown { - required_ordering: None, - ..requirements - })) - } else { - // Can not satisfy the parent requirements, check whether the requirements can be pushed down. If not, add new SortExec. - let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); - if let Some(adjusted) = pushdown_requirement_to_children( - plan, - requirements.required_ordering.as_deref(), - )? { - Ok(Some(TopDownSortPushDown { - plan: plan.clone(), - adjusted_request_ordering: adjusted, - ..requirements - })) - } else { - // Can not push down requirements, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, parent_required_expr)?; - Ok(Some( - TopDownSortPushDown::new_without_impact_result_ordering(new_plan), - )) - } - } - } -} - /// Analyzes a given `SortExec` (`plan`) to determine whether its input already /// has a finer ordering than this `SortExec` enforces. 
fn analyze_immediate_sort_removal( @@ -1051,274 +847,6 @@ fn check_alignment( } } -fn pushdown_requirement_to_children( - plan: &Arc, - parent_required: Option<&[PhysicalSortRequirements]>, -) -> Result>>>> { - let maintains_input_order = plan.maintains_input_order(); - if is_window(plan) { - let required_input_ordering = plan.required_input_ordering(); - let request_child = required_input_ordering[0].as_deref(); - let child_plan = plan.children()[0].clone(); - match determine_children_requirement(parent_required, request_child, child_plan) { - RequirementsCompatibility::Satisfy => { - Ok(Some(vec![request_child.map(|r| r.to_vec())])) - } - RequirementsCompatibility::Compatible(adjusted) => Ok(Some(vec![adjusted])), - RequirementsCompatibility::NonCompatible => Ok(None), - } - } else if plan.as_any().is::() { - // UnionExec does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and - // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec - Ok(Some(vec![ - parent_required.map(|elem| elem.to_vec()); - plan.children().len() - ])) - } else if let Some(smj) = plan.as_any().downcast_ref::() { - // If the current plan is SortMergeJoinExec - let left_columns_len = smj.left.schema().fields().len(); - let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); - let expr_source_side = - expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); - match expr_source_side { - Some(JoinSide::Left) if maintains_input_order[0] => { - try_pushdown_requirements_to_join( - plan, - parent_required, - parent_required_expr, - JoinSide::Left, - ) - } - Some(JoinSide::Right) if maintains_input_order[1] => { - let new_right_required = match smj.join_type { - JoinType::Inner | JoinType::Right => { - shift_right_required(parent_required.unwrap(), left_columns_len)? 
- } - JoinType::RightSemi | JoinType::RightAnti => { - parent_required.unwrap().to_vec() - } - _ => Err(DataFusionError::Plan( - "Unexpected SortMergeJoin type here".to_string(), - ))?, - }; - try_pushdown_requirements_to_join( - plan, - Some(new_right_required.deref()), - parent_required_expr, - JoinSide::Right, - ) - } - _ => { - // Can not decide the expr side for SortMergeJoinExec, can not push down - Ok(None) - } - } - } else if maintains_input_order.is_empty() - || !maintains_input_order.iter().any(|o| *o) - || plan.as_any().is::() - || plan.as_any().is::() - // TODO: Add support for Projection push down - || plan.as_any().is::() - || is_limit(plan) - { - // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements. - // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. - // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. - // Pushing down is not beneficial - Ok(None) - } else { - Ok(Some(vec![ - parent_required.map(|elem| elem.to_vec()); - plan.children().len() - ])) - } - // TODO: Add support for Projection push down -} - -/// Determine the children requirements -/// If the children requirements are more specific, do not push down the parent requirements -/// If the the parent requirements are more specific, push down the parent requirements -/// If they are not compatible, need to add Sort. 
-fn determine_children_requirement( - parent_required: Option<&[PhysicalSortRequirements]>, - request_child: Option<&[PhysicalSortRequirements]>, - child_plan: Arc, -) -> RequirementsCompatibility { - if requirements_compatible(request_child, parent_required, || { - child_plan.equivalence_properties() - }) { - // request child requirements are more specific, no need to push down the parent requirements - RequirementsCompatibility::Satisfy - } else if requirements_compatible(parent_required, request_child, || { - child_plan.equivalence_properties() - }) { - // parent requirements are more specific, adjust the request child requirements and push down the new requirements - let adjusted = parent_required.map(|r| r.to_vec()); - RequirementsCompatibility::Compatible(adjusted) - } else { - RequirementsCompatibility::NonCompatible - } -} - -fn try_pushdown_requirements_to_join( - plan: &Arc, - parent_required: Option<&[PhysicalSortRequirements]>, - sort_expr: Vec, - push_side: JoinSide, -) -> Result>>>> { - let child_idx = match push_side { - JoinSide::Left => 0, - JoinSide::Right => 1, - }; - let required_input_ordering = plan.required_input_ordering(); - let request_child = required_input_ordering[child_idx].as_deref(); - let child_plan = plan.children()[child_idx].clone(); - match determine_children_requirement(parent_required, request_child, child_plan) { - RequirementsCompatibility::Satisfy => Ok(None), - RequirementsCompatibility::Compatible(adjusted) => { - let new_adjusted = match push_side { - JoinSide::Left => { - vec![adjusted, required_input_ordering[1].clone()] - } - JoinSide::Right => { - vec![required_input_ordering[0].clone(), adjusted] - } - }; - Ok(Some(new_adjusted)) - } - RequirementsCompatibility::NonCompatible => { - // Can not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, sort_expr)?; - Ok(None) - } - } -} - -fn expr_source_sides( - required_exprs: &[PhysicalSortExpr], - join_type: JoinType, - 
left_columns_len: usize, -) -> Option { - match join_type { - JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { - let all_column_sides = required_exprs - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() < left_columns_len { - Some(JoinSide::Left) - } else { - Some(JoinSide::Right) - } - } else { - None - } - }) - .collect::>(); - - // If the exprs are all coming from one side, the requirements can be pushed down - if all_column_sides.len() != required_exprs.len() { - None - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Left)) - { - Some(JoinSide::Left) - } else if all_column_sides - .iter() - .all(|side| matches!(side, JoinSide::Right)) - { - Some(JoinSide::Right) - } else { - None - } - } - JoinType::LeftSemi | JoinType::LeftAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Left) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Left) - } - } - JoinType::RightSemi | JoinType::RightAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Right) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Right) - } - } - } -} - -fn shift_right_required( - parent_required: &[PhysicalSortRequirements], - left_columns_len: usize, -) -> Result> { - let new_right_required: Vec = parent_required - .iter() - .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - if col.index() >= left_columns_len { - Some(PhysicalSortRequirements { - expr: Arc::new(Column::new( - col.name(), - col.index() - left_columns_len, - )) as Arc, - sort_options: r.sort_options, - }) - } else { - None - } - } else { - None - } - }) - .collect::>(); - if new_right_required.len() != parent_required.len() { - Err(DataFusionError::Plan( - 
"Expect to shift all the parent required column indexes for SortMergeJoin" - .to_string(), - )) - } else { - Ok(new_right_required) - } -} - -/// Define the Requirements Compatibility -#[derive(Debug)] -pub enum RequirementsCompatibility { - /// Requirements satisfy - Satisfy, - /// Requirements compatible - Compatible(Option>), - /// Requirements not compatible - NonCompatible, -} - #[cfg(test)] mod tests { use super::*; @@ -1331,6 +859,7 @@ mod tests { use crate::physical_plan::filter::FilterExec; use crate::physical_plan::joins::utils::JoinOn; use crate::physical_plan::joins::SortMergeJoinExec; + use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs new file mode 100644 index 0000000000000..01a66236d2265 --- /dev/null +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -0,0 +1,479 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+use crate::physical_optimizer::utils::{add_sort_above, is_limit, is_window}; +use crate::physical_plan::filter::FilterExec; +use crate::physical_plan::joins::utils::JoinSide; +use crate::physical_plan::joins::SortMergeJoinExec; +use crate::physical_plan::projection::ProjectionExec; +use crate::physical_plan::repartition::RepartitionExec; +use crate::physical_plan::sorts::sort::SortExec; +use crate::physical_plan::tree_node::TreeNodeRewritable; +use crate::physical_plan::union::UnionExec; +use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; +use datafusion_common::{DataFusionError, Result}; +use datafusion_expr::JoinType; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::utils::{ + create_sort_expr_from_requirement, ordering_satisfy_requirement, + requirements_compatible, +}; +use datafusion_physical_expr::{ + new_sort_requirements, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, +}; +use itertools::izip; +use std::ops::Deref; +use std::sync::Arc; + +/// This is a "data class" we use within the [EnforceSorting] rule to push down SortExec in the plan. +// By pushing down SortExecs through some Executors in the plan we can increase speed. +#[derive(Debug, Clone)] +pub(crate) struct SortPushDown { + /// Current plan + pub plan: Arc, + /// Whether the plan could impact the final result ordering + impact_result_ordering: bool, + /// Parent has the SinglePartition requirement to children + satisfy_single_distribution: bool, + /// Parent required sort ordering + required_ordering: Option>, + /// The adjusted request sort ordering to children. + /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. 
+ adjusted_request_ordering: Vec>>, +} + +impl SortPushDown { + pub fn init(plan: Arc) -> Self { + let impact_result_ordering = plan.output_ordering().is_some() + || plan.output_partitioning().partition_count() <= 1 + || is_limit(&plan); + let request_ordering = plan.required_input_ordering(); + SortPushDown { + plan, + impact_result_ordering, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } + } + + pub fn new_without_impact_result_ordering(plan: Arc) -> Self { + let request_ordering = plan.required_input_ordering(); + SortPushDown { + plan, + impact_result_ordering: false, + satisfy_single_distribution: false, + required_ordering: None, + adjusted_request_ordering: request_ordering, + } + } + + pub fn children(&self) -> Vec { + let plan_children = self.plan.children(); + assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); + + izip!( + plan_children.into_iter(), + self.adjusted_request_ordering.clone().into_iter(), + self.plan.maintains_input_order().into_iter(), + self.plan.required_input_distribution().into_iter(), + ) + .map( + |(child, from_parent, maintains_input_order, required_dist)| { + let child_satisfy_single_distribution = + matches!(required_dist, Distribution::SinglePartition); + let child_impact_result_ordering = if is_limit(&self.plan) { + true + } else { + maintains_input_order && self.impact_result_ordering + }; + let child_request_ordering = child.required_input_ordering(); + SortPushDown { + plan: child, + impact_result_ordering: child_impact_result_ordering, + satisfy_single_distribution: child_satisfy_single_distribution, + required_ordering: from_parent, + adjusted_request_ordering: child_request_ordering, + } + }, + ) + .collect() + } +} + +impl TreeNodeRewritable for SortPushDown { + fn map_children(self, transform: F) -> Result + where + F: FnMut(Self) -> Result, + { + let children = self.children(); + if children.is_empty() { + Ok(self) + } else { + let 
new_children = children + .into_iter() + .map(transform) + .collect::>>()?; + + let children_plans = new_children + .iter() + .map(|elem| elem.plan.clone()) + .collect::>(); + let plan = with_new_children_if_necessary(self.plan, children_plans)?; + Ok(SortPushDown { + plan, + impact_result_ordering: self.impact_result_ordering, + satisfy_single_distribution: self.satisfy_single_distribution, + required_ordering: self.required_ordering, + adjusted_request_ordering: self.adjusted_request_ordering, + }) + } + } +} + +pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result> { + let plan = &requirements.plan; + let parent_required = requirements.required_ordering.as_deref(); + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + let mut new_plan = plan.clone(); + if !ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { + plan.equivalence_properties() + }) { + // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + new_plan = sort_exec.input.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + }; + let required_ordering = new_sort_requirements(new_plan.output_ordering()); + let child = &new_plan.children()[0]; + if let Some(adjusted) = + pushdown_requirement_to_children(child, required_ordering.as_deref())? 
+ { + // Can push down requirements + Ok(Some(SortPushDown { + plan: child.clone(), + required_ordering, + adjusted_request_ordering: adjusted, + ..requirements + })) + } else { + // Can not push down requirements + Ok(Some(SortPushDown::new_without_impact_result_ordering( + new_plan, + ))) + } + } else { + // Executors other than SortExec + if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { + plan.equivalence_properties() + }) { + Ok(Some(SortPushDown { + required_ordering: None, + ..requirements + })) + } else { + // Can not satisfy the parent requirements, check whether the requirements can be pushed down. If not, add new SortExec. + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + if let Some(adjusted) = pushdown_requirement_to_children( + plan, + requirements.required_ordering.as_deref(), + )? { + Ok(Some(SortPushDown { + plan: plan.clone(), + adjusted_request_ordering: adjusted, + ..requirements + })) + } else { + // Can not push down requirements, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, parent_required_expr)?; + Ok(Some(SortPushDown::new_without_impact_result_ordering( + new_plan, + ))) + } + } + } +} + +fn pushdown_requirement_to_children( + plan: &Arc, + parent_required: Option<&[PhysicalSortRequirements]>, +) -> Result>>>> { + let maintains_input_order = plan.maintains_input_order(); + if is_window(plan) { + let required_input_ordering = plan.required_input_ordering(); + let request_child = required_input_ordering[0].as_deref(); + let child_plan = plan.children()[0].clone(); + match determine_children_requirement(parent_required, request_child, child_plan) { + RequirementsCompatibility::Satisfy => { + Ok(Some(vec![request_child.map(|r| r.to_vec())])) + } + RequirementsCompatibility::Compatible(adjusted) => Ok(Some(vec![adjusted])), + RequirementsCompatibility::NonCompatible => Ok(None), + } + } else if plan.as_any().is::() { + // UnionExec 
does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and + // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec + Ok(Some(vec![ + parent_required.map(|elem| elem.to_vec()); + plan.children().len() + ])) + } else if let Some(smj) = plan.as_any().downcast_ref::() { + // If the current plan is SortMergeJoinExec + let left_columns_len = smj.left.schema().fields().len(); + let parent_required_expr = + create_sort_expr_from_requirement(parent_required.unwrap()); + let expr_source_side = + expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); + match expr_source_side { + Some(JoinSide::Left) if maintains_input_order[0] => { + try_pushdown_requirements_to_join( + plan, + parent_required, + parent_required_expr, + JoinSide::Left, + ) + } + Some(JoinSide::Right) if maintains_input_order[1] => { + let new_right_required = match smj.join_type { + JoinType::Inner | JoinType::Right => { + shift_right_required(parent_required.unwrap(), left_columns_len)? + } + JoinType::RightSemi | JoinType::RightAnti => { + parent_required.unwrap().to_vec() + } + _ => Err(DataFusionError::Plan( + "Unexpected SortMergeJoin type here".to_string(), + ))?, + }; + try_pushdown_requirements_to_join( + plan, + Some(new_right_required.deref()), + parent_required_expr, + JoinSide::Right, + ) + } + _ => { + // Can not decide the expr side for SortMergeJoinExec, can not push down + Ok(None) + } + } + } else if maintains_input_order.is_empty() + || !maintains_input_order.iter().any(|o| *o) + || plan.as_any().is::() + || plan.as_any().is::() + // TODO: Add support for Projection push down + || plan.as_any().is::() + || is_limit(plan) + { + // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements. 
+ // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. + // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. + // Pushing down is not beneficial + Ok(None) + } else { + Ok(Some(vec![ + parent_required.map(|elem| elem.to_vec()); + plan.children().len() + ])) + } + // TODO: Add support for Projection push down +} + +/// Determine the children requirements +/// If the children requirements are more specific, do not push down the parent requirements +/// If the parent requirements are more specific, push down the parent requirements +/// If they are not compatible, need to add Sort. +fn determine_children_requirement( + parent_required: Option<&[PhysicalSortRequirements]>, + request_child: Option<&[PhysicalSortRequirements]>, + child_plan: Arc, +) -> RequirementsCompatibility { + if requirements_compatible(request_child, parent_required, || { + child_plan.equivalence_properties() + }) { + // request child requirements are more specific, no need to push down the parent requirements + RequirementsCompatibility::Satisfy + } else if requirements_compatible(parent_required, request_child, || { + child_plan.equivalence_properties() + }) { + // parent requirements are more specific, adjust the request child requirements and push down the new requirements + let adjusted = parent_required.map(|r| r.to_vec()); + RequirementsCompatibility::Compatible(adjusted) + } else { + RequirementsCompatibility::NonCompatible + } +} + +fn try_pushdown_requirements_to_join( + plan: &Arc, + parent_required: Option<&[PhysicalSortRequirements]>, + sort_expr: Vec, + push_side: JoinSide, +) -> Result>>>> { + let child_idx = match push_side { + JoinSide::Left => 0, + JoinSide::Right => 1, + }; + let required_input_ordering = plan.required_input_ordering(); + let request_child = 
required_input_ordering[child_idx].as_deref(); + let child_plan = plan.children()[child_idx].clone(); + match determine_children_requirement(parent_required, request_child, child_plan) { + RequirementsCompatibility::Satisfy => Ok(None), + RequirementsCompatibility::Compatible(adjusted) => { + let new_adjusted = match push_side { + JoinSide::Left => { + vec![adjusted, required_input_ordering[1].clone()] + } + JoinSide::Right => { + vec![required_input_ordering[0].clone(), adjusted] + } + }; + Ok(Some(new_adjusted)) + } + RequirementsCompatibility::NonCompatible => { + // Can not push down, add new SortExec + let mut new_plan = plan.clone(); + add_sort_above(&mut new_plan, sort_expr)?; + Ok(None) + } + } +} + +fn expr_source_sides( + required_exprs: &[PhysicalSortExpr], + join_type: JoinType, + left_columns_len: usize, +) -> Option { + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + let all_column_sides = required_exprs + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() < left_columns_len { + Some(JoinSide::Left) + } else { + Some(JoinSide::Right) + } + } else { + None + } + }) + .collect::>(); + + // If the exprs are all coming from one side, the requirements can be pushed down + if all_column_sides.len() != required_exprs.len() { + None + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Left)) + { + Some(JoinSide::Left) + } else if all_column_sides + .iter() + .all(|side| matches!(side, JoinSide::Right)) + { + Some(JoinSide::Right) + } else { + None + } + } + JoinType::LeftSemi | JoinType::LeftAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Left) + } else { + None + } + }) + .count() + != required_exprs.len() + { + None + } else { + Some(JoinSide::Left) + } + } + JoinType::RightSemi | JoinType::RightAnti => { + if required_exprs + .iter() + .filter_map(|r| { + if 
r.expr.as_any().downcast_ref::().is_some() { + Some(JoinSide::Right) + } else { + None + } + }) + .count() + != required_exprs.len() + { + None + } else { + Some(JoinSide::Right) + } + } + } +} + +fn shift_right_required( + parent_required: &[PhysicalSortRequirements], + left_columns_len: usize, +) -> Result> { + let new_right_required: Vec = parent_required + .iter() + .filter_map(|r| { + if let Some(col) = r.expr.as_any().downcast_ref::() { + if col.index() >= left_columns_len { + Some(PhysicalSortRequirements { + expr: Arc::new(Column::new( + col.name(), + col.index() - left_columns_len, + )) as Arc, + sort_options: r.sort_options, + }) + } else { + None + } + } else { + None + } + }) + .collect::>(); + if new_right_required.len() != parent_required.len() { + Err(DataFusionError::Plan( + "Expect to shift all the parent required column indexes for SortMergeJoin" + .to_string(), + )) + } else { + Ok(new_right_required) + } +} + +/// Define the Requirements Compatibility +#[derive(Debug)] +enum RequirementsCompatibility { + /// Requirements satisfy + Satisfy, + /// Requirements compatible + Compatible(Option>), + /// Requirements not compatible + NonCompatible, +} diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index b6666fbefae1e..7c74fdcd89523 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -21,7 +21,10 @@ use super::optimizer::PhysicalOptimizerRule; use crate::config::ConfigOptions; use crate::error::Result; +use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::sorts::sort::SortExec; +use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_physical_expr::utils::ordering_satisfy; use 
datafusion_physical_expr::PhysicalSortExpr; @@ -67,3 +70,25 @@ pub fn add_sort_above( } Ok(()) } + +/// Checks whether the given executor is a limit; +/// i.e. either a `LocalLimitExec` or a `GlobalLimitExec`. +pub fn is_limit(plan: &Arc) -> bool { + plan.as_any().is::() || plan.as_any().is::() +} + +/// Checks whether the given executor is a window; +/// i.e. either a `WindowAggExec` or a `BoundedWindowAggExec`. +pub fn is_window(plan: &Arc) -> bool { + plan.as_any().is::() || plan.as_any().is::() +} + +/// Checks whether the given executor is a `SortExec`. +pub fn is_sort(plan: &Arc) -> bool { + plan.as_any().is::() +} + +/// Checks whether the given executor is a `SortPreservingMergeExec`. +pub fn is_sort_preserving_merge(plan: &Arc) -> bool { + plan.as_any().is::() +} From 80396b6479b65eb24d3f3bec72c153afa1d7aca2 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 14:01:06 +0300 Subject: [PATCH 16/35] reorganize tests --- .../physical_optimizer/sort_enforcement.rs | 358 ++++++++---------- 1 file changed, 158 insertions(+), 200 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index e62396cef427d..9872d80be9825 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -698,15 +698,8 @@ fn update_child_to_remove_unnecessary_sort( )?; } *sort_onwards = None; - // Deleting sort may invalidate distribution - let requires_single_partition = matches!( - parent.required_input_distribution()[child_idx], - Distribution::SinglePartition - ); - if requires_single_partition && child.output_partitioning().partition_count() > 1 { - *child = Arc::new(CoalescePartitionsExec::new(child.clone())) as _; - } - Ok(()) + // Deleting sort(SortExec+SortPreservingMergeExec) may invalidate distribution requirement + update_child_to_satisfy_distribution(child, parent, child_idx) } /// Removes the 
sort from the plan in `sort_onwards`. @@ -743,6 +736,24 @@ fn remove_corresponding_sort_from_sub_plan( } } +/// Updates child to satisfy the single-partition distribution requirement of its parent, if any. +fn update_child_to_satisfy_distribution( + child: &mut Arc, + parent: &Arc, + child_idx: usize, +) -> Result<()> { + // If distribution requirement is not satisfied, satisfies it by adding + // CoalescePartitionsExec. + let requires_single_partition = matches!( + parent.required_input_distribution()[child_idx], + Distribution::SinglePartition + ); + if requires_single_partition && child.output_partitioning().partition_count() > 1 { + *child = Arc::new(CoalescePartitionsExec::new(child.clone())) as _; + } + Ok(()) +} + /// Converts an [ExecutionPlan] trait object to a [PhysicalSortExpr] slice when possible. fn get_sort_exprs(sort_any: &Arc) -> Result<&[PhysicalSortExpr]> { if let Some(sort_exec) = sort_any.as_any().downcast_ref::() { @@ -1234,36 +1245,6 @@ mod tests { #[tokio::test] async fn test_remove_unnecessary_sort5() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - - let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source); - let input2 = sort_exec( - vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ], - input, - ); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input2); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - // Keep the middle SortExec - let expected_optimized = [ - "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn 
test_remove_unnecessary_sort6() -> Result<()> { let schema = create_test_schema()?; let source1 = repartition_exec(memory_exec(&schema)); @@ -1324,29 +1305,6 @@ mod tests { #[tokio::test] async fn test_remove_unnecessary_spm1() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let input = sort_preserving_merge_exec( - vec![sort_expr("non_nullable_col", &schema)], - source, - ); - let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); - - let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_spm2() -> Result<()> { let schema = create_test_schema()?; let source = memory_exec(&schema); let input = sort_preserving_merge_exec( @@ -1841,6 +1799,103 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_window_multi_path_sort() -> Result<()> { + let schema = create_test_schema()?; + + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + // reverse sorting of sort_exprs2 + let sort_exprs3 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; + let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source2 = parquet_exec_sorted(&schema, sort_exprs2); + let sort1 = sort_exec(sort_exprs3.clone(), source1); + let sort2 = sort_exec(sort_exprs3.clone(), source2); + + let union = union_exec(vec![sort1, sort2]); + let physical_plan = bounded_window_exec("nullable_col", sort_exprs3, union); + + // The 
`WindowAggExec` gets its sorting from multiple children jointly. + // During the removal of `SortExec`s, it should be able to remove the + // corresponding SortExecs together. Also, the inputs of these `SortExec`s + // are not necessarily the same to be able to remove them. + let expected_input = vec![ + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " UnionExec", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + + #[tokio::test] + async fn test_window_multi_path_sort2() -> Result<()> { + let schema = create_test_schema()?; + + let sort_exprs1 = vec![ + sort_expr("nullable_col", &schema), + sort_expr("non_nullable_col", &schema), + ]; + let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; + // reverse 
sorting of sort_exprs2 + let reversed_sort_exprs2 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; + let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); + let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); + let sort2 = sort_exec(reversed_sort_exprs2, source2); + + let union = union_exec(vec![sort1, sort2]); + let coalesce = Arc::new(CoalescePartitionsExec::new(union)) as _; + let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, coalesce); + + // The `WindowAggExec` can get its required sorting from the leaf nodes directly. + // The unnecessary SortExecs should be removed + let expected_input = vec![ + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " CoalescePartitionsExec", + " UnionExec", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + let expected_optimized = vec![ + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], 
projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_union_inputs_different_sorted_with_limit() -> Result<()> { let schema = create_test_schema()?; @@ -2151,103 +2206,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_window_multi_path_sort() -> Result<()> { - let schema = create_test_schema()?; - - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - // reverse sorting of sort_exprs2 - let sort_exprs3 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; - let source1 = parquet_exec_sorted(&schema, sort_exprs1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2); - let sort1 = sort_exec(sort_exprs3.clone(), source1); - let sort2 = sort_exec(sort_exprs3.clone(), source2); - - let union = union_exec(vec![sort1, sort2]); - let physical_plan = bounded_window_exec("nullable_col", sort_exprs3, union); - - // The `WindowAggExec` gets its sorting from multiple children jointly. - // During the removal of `SortExec`s, it should be able to remove the - // corresponding SortExecs together. Also, the inputs of these `SortExec`s - // are not necessarily the same to be able to remove them. 
- let expected_input = vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_window_multi_path_sort2() -> Result<()> { - let schema = create_test_schema()?; - - let sort_exprs1 = vec![ - sort_expr("nullable_col", &schema), - sort_expr("non_nullable_col", &schema), - ]; - let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - // reverse sorting of sort_exprs2 - let reversed_sort_exprs2 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; - let source1 = parquet_exec_sorted(&schema, sort_exprs1); - let source2 = 
parquet_exec_sorted(&schema, sort_exprs2.clone()); - let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); - let sort2 = sort_exec(reversed_sort_exprs2, source2); - - let union = union_exec(vec![sort1, sort2]); - let coalesce = Arc::new(CoalescePartitionsExec::new(union)) as _; - let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, coalesce); - - // The `WindowAggExec` can get its required sorting from the leaf nodes directly. - // The unnecessary SortExecs should be removed - let expected_input = vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " CoalescePartitionsExec", - " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - let expected_optimized = vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); 
- Ok(()) - } - #[tokio::test] async fn test_multilayer_coalesce_partitions() -> Result<()> { let schema = create_test_schema()?; @@ -2286,47 +2244,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_coalesce_propagate() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let repartition = repartition_exec(source); - let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(repartition)); - let repartition = repartition_exec(coalesce_partitions); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - // Add local sort - let sort = Arc::new(SortExec::new_with_partitioning( - sort_exprs.clone(), - repartition, - true, - None, - )) as _; - let spm = sort_preserving_merge_exec(sort_exprs.clone(), sort); - let sort = sort_exec(sort_exprs, spm); - - let physical_plan = sort.clone(); - // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort - // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec. 
- let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - #[tokio::test] // With new change in SortEnforcement EnforceSorting->EnforceDistribution->EnforceSorting // should produce same result with EnforceDistribution+EnforceSorting @@ -2376,6 +2293,47 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_coalesce_propagate() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let repartition = repartition_exec(source); + let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(repartition)); + let repartition = repartition_exec(coalesce_partitions); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + // Add local sort + let sort = Arc::new(SortExec::new_with_partitioning( + sort_exprs.clone(), + repartition, + true, + None, + )) as _; + let spm = sort_preserving_merge_exec(sort_exprs.clone(), sort); + let sort = sort_exec(sort_exprs, spm); + + let physical_plan = sort.clone(); + // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort + // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec. 
+ let expected_input = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", + " CoalescePartitionsExec", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=false", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + /// make PhysicalSortExpr with default options fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr { sort_expr_options(name, schema, SortOptions::default()) From ef331ccbbc7ea7a2565b15f4f64177aaff57cdbb Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 14:27:25 +0300 Subject: [PATCH 17/35] simplify sort pushdown --- .../physical_optimizer/sort_enforcement.rs | 54 +++++++++---------- .../src/physical_optimizer/sort_pushdown.rs | 41 ++++++-------- 2 files changed, 42 insertions(+), 53 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 9872d80be9825..682f953249b6a 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -327,7 +327,7 @@ impl PhysicalOptimizerRule for EnforceSorting { } else { adjusted.plan }; - // Execute a Top-Down process(Preorder Traversal) to ensure the sort requirements: + // Execute a Top-Down process (Preorder Traversal) to push down sorts if they are helpful: let sort_pushdown = 
SortPushDown::init(new_plan); let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; Ok(adjusted.plan) @@ -1132,6 +1132,32 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_remove_unnecessary_sort1() -> Result<()> { + let schema = create_test_schema()?; + let source = memory_exec(&schema); + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), source); + let spm = sort_preserving_merge_exec(sort_exprs, sort); + + let sort_exprs = vec![sort_expr("nullable_col", &schema)]; + let sort = sort_exec(sort_exprs.clone(), spm); + let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); + let expected_input = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + let expected_optimized = vec![ + "SortExec: expr=[nullable_col@0 ASC], global=true", + " MemoryExec: partitions=0, partition_sizes=[]", + ]; + assert_optimized!(expected_input, expected_optimized, physical_plan); + Ok(()) + } + #[tokio::test] async fn test_remove_unnecessary_sort2() -> Result<()> { let schema = create_test_schema()?; @@ -1219,32 +1245,6 @@ mod tests { #[tokio::test] async fn test_remove_unnecessary_sort4() -> Result<()> { - let schema = create_test_schema()?; - let source = memory_exec(&schema); - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), source); - let spm = sort_preserving_merge_exec(sort_exprs, sort); - - let sort_exprs = vec![sort_expr("nullable_col", &schema)]; - let sort = sort_exec(sort_exprs.clone(), spm); - let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); - let expected_input = vec![ - "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " SortPreservingMergeExec: 
[nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " MemoryExec: partitions=0, partition_sizes=[]", - ]; - assert_optimized!(expected_input, expected_optimized, physical_plan); - Ok(()) - } - - #[tokio::test] - async fn test_remove_unnecessary_sort5() -> Result<()> { let schema = create_test_schema()?; let source1 = repartition_exec(memory_exec(&schema)); diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 01a66236d2265..9fdefbd9ed719 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -23,7 +23,7 @@ use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::tree_node::TreeNodeRewritable; use crate::physical_plan::union::UnionExec; -use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; +use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::JoinType; use datafusion_physical_expr::expressions::Column; @@ -46,8 +46,6 @@ pub(crate) struct SortPushDown { pub plan: Arc, /// Whether the plan could impact the final result ordering impact_result_ordering: bool, - /// Parent has the SinglePartition requirement to children - satisfy_single_distribution: bool, /// Parent required sort ordering required_ordering: Option>, /// The adjusted request sort ordering to children. 
@@ -64,7 +62,6 @@ impl SortPushDown { SortPushDown { plan, impact_result_ordering, - satisfy_single_distribution: false, required_ordering: None, adjusted_request_ordering: request_ordering, } @@ -75,7 +72,6 @@ impl SortPushDown { SortPushDown { plan, impact_result_ordering: false, - satisfy_single_distribution: false, required_ordering: None, adjusted_request_ordering: request_ordering, } @@ -89,27 +85,21 @@ impl SortPushDown { plan_children.into_iter(), self.adjusted_request_ordering.clone().into_iter(), self.plan.maintains_input_order().into_iter(), - self.plan.required_input_distribution().into_iter(), - ) - .map( - |(child, from_parent, maintains_input_order, required_dist)| { - let child_satisfy_single_distribution = - matches!(required_dist, Distribution::SinglePartition); - let child_impact_result_ordering = if is_limit(&self.plan) { - true - } else { - maintains_input_order && self.impact_result_ordering - }; - let child_request_ordering = child.required_input_ordering(); - SortPushDown { - plan: child, - impact_result_ordering: child_impact_result_ordering, - satisfy_single_distribution: child_satisfy_single_distribution, - required_ordering: from_parent, - adjusted_request_ordering: child_request_ordering, - } - }, ) + .map(|(child, from_parent, maintains_input_order)| { + let child_impact_result_ordering = if is_limit(&self.plan) { + true + } else { + maintains_input_order && self.impact_result_ordering + }; + let child_request_ordering = child.required_input_ordering(); + SortPushDown { + plan: child, + impact_result_ordering: child_impact_result_ordering, + required_ordering: from_parent, + adjusted_request_ordering: child_request_ordering, + } + }) .collect() } } @@ -136,7 +126,6 @@ impl TreeNodeRewritable for SortPushDown { Ok(SortPushDown { plan, impact_result_ordering: self.impact_result_ordering, - satisfy_single_distribution: self.satisfy_single_distribution, required_ordering: self.required_ordering, adjusted_request_ordering: 
self.adjusted_request_ordering, }) From c50988d5d0e2e307f39335348d61ecde568479ce Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 17:07:49 +0300 Subject: [PATCH 18/35] remove global sort print --- .../physical_optimizer/dist_enforcement.rs | 32 ++-- .../src/physical_optimizer/repartition.rs | 18 +- .../physical_optimizer/sort_enforcement.rs | 164 +++++++++--------- .../core/src/physical_plan/sorts/sort.rs | 16 +- datafusion/core/tests/sql/joins.rs | 24 +-- datafusion/core/tests/sql/window.rs | 8 +- .../tests/sqllogictests/test_files/order.slt | 2 +- .../tests/sqllogictests/test_files/window.slt | 66 +++---- 8 files changed, 159 insertions(+), 171 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index d30fea9c65ad6..8b5a464b89365 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -1925,31 +1925,31 @@ mod tests { vec![ top_join_plan.as_str(), join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", + "SortExec: expr=[a@0 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", + "SortExec: expr=[b1@1 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", + "SortExec: expr=[c@2 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs _ => vec![ 
top_join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", + "SortExec: expr=[a@0 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=10", join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", + "SortExec: expr=[a@0 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", + "SortExec: expr=[b1@1 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", + "SortExec: expr=[c@2 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], @@ -1978,31 +1978,31 @@ mod tests { JoinType::Inner | JoinType::Right => vec![ top_join_plan.as_str(), join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", + "SortExec: expr=[a@0 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", + "SortExec: expr=[b1@1 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", + "SortExec: expr=[c@2 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", "ParquetExec: limit=None, 
partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], // Should include 4 RepartitionExecs and 4 SortExecs _ => vec![ top_join_plan.as_str(), - "SortExec: expr=[b1@6 ASC], global=false", + "SortExec: expr=[b1@6 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 6 }], 10), input_partitions=10", join_plan.as_str(), - "SortExec: expr=[a@0 ASC], global=false", + "SortExec: expr=[a@0 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"a\", index: 0 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[b1@1 ASC], global=false", + "SortExec: expr=[b1@1 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 1 }], 10), input_partitions=1", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortExec: expr=[c@2 ASC], global=false", + "SortExec: expr=[c@2 ASC]", "RepartitionExec: partitioning=Hash([Column { name: \"c\", index: 2 }], 10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c, d, e]", ], @@ -2065,14 +2065,14 @@ mod tests { // Only two RepartitionExecs added let expected = &[ "SortMergeJoin: join_type=Inner, on=[(Column { name: \"b3\", index: 1 }, Column { name: \"b2\", index: 1 }), (Column { name: \"a3\", index: 0 }, Column { name: \"a2\", index: 0 })]", - "SortExec: expr=[b3@1 ASC,a3@0 ASC], global=false", + "SortExec: expr=[b3@1 ASC,a3@0 ASC]", "ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", "ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", "AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b1\", index: 0 }, Column { name: \"a1\", index: 1 }], 10), input_partitions=1", "AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, 
projection=[a, b, c, d, e]", - "SortExec: expr=[b2@1 ASC,a2@0 ASC], global=false", + "SortExec: expr=[b2@1 ASC,a2@0 ASC]", "ProjectionExec: expr=[a@1 as a2, b@0 as b2]", "AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", "RepartitionExec: partitioning=Hash([Column { name: \"b\", index: 0 }, Column { name: \"a\", index: 1 }], 10), input_partitions=1", diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index be6d869e75a42..58e76c5b7a17c 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -635,7 +635,7 @@ mod tests { "GlobalLimitExec: skip=0, fetch=100", "LocalLimitExec: fetch=100", // data is sorted so can't repartition here - "SortExec: expr=[c1@0 ASC], global=true", + "SortExec: expr=[c1@0 ASC]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -653,7 +653,7 @@ mod tests { "FilterExec: c1@0", // data is sorted so can't repartition here even though // filter would benefit from parallelism, the answers might be wrong - "SortExec: expr=[c1@0 ASC], global=true", + "SortExec: expr=[c1@0 ASC]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -741,7 +741,7 @@ mod tests { // need repartiton and resort as the data was not sorted correctly let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", - "SortExec: expr=[c1@0 ASC], global=false", + "SortExec: expr=[c1@0 ASC]", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -841,7 +841,7 @@ mod tests { // needs to repartition / sort as the data was not sorted correctly let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", - "SortExec: expr=[c1@0 ASC], global=false", + "SortExec: expr=[c1@0 ASC]", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ProjectionExec: expr=[c1@0 
as c1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", @@ -874,7 +874,7 @@ mod tests { sort_preserving_merge_exec(sort_exec(projection_exec(parquet_exec()), true)); let expected = &[ - "SortExec: expr=[c1@0 ASC], global=true", + "SortExec: expr=[c1@0 ASC]", "ProjectionExec: expr=[c1@0 as c1]", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -891,7 +891,7 @@ mod tests { let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: expr=[c1@0 ASC], global=false", + "SortExec: expr=[c1@0 ASC]", "FilterExec: c1@0", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", @@ -911,7 +911,7 @@ mod tests { let expected = &[ "SortPreservingMergeExec: [c1@0 ASC]", // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - "SortExec: expr=[c1@0 ASC], global=false", + "SortExec: expr=[c1@0 ASC]", "ProjectionExec: expr=[c1@0 as c1]", "FilterExec: c1@0", // repartition is lowest down @@ -978,7 +978,7 @@ mod tests { "GlobalLimitExec: skip=0, fetch=100", "LocalLimitExec: fetch=100", // data is sorted so can't repartition here - "SortExec: expr=[c1@0 ASC], global=true", + "SortExec: expr=[c1@0 ASC]", // Doesn't parallelize for SortExec without preserve_partitioning "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; @@ -997,7 +997,7 @@ mod tests { "FilterExec: c1@0", // data is sorted so can't repartition here even though // filter would benefit from parallelism, the answers might be wrong - "SortExec: expr=[c1@0 ASC], global=true", + "SortExec: expr=[c1@0 ASC]", // SortExec doesn't benefit from input partitioning "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[c1]", ]; diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs 
b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 682f953249b6a..2b89c29c4d141 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -1040,12 +1040,12 @@ mod tests { let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input); let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", + " SortExec: expr=[non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1094,9 +1094,9 @@ mod tests { let expected_input = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " FilterExec: NOT non_nullable_col@1", - " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], global=true", + " SortExec: expr=[non_nullable_col@1 ASC NULLS LAST]", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " SortExec: expr=[non_nullable_col@1 DESC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; @@ -1104,7 +1104,7 @@ mod tests { "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: 
Following(NULL) }]", " FilterExec: NOT non_nullable_col@1", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[non_nullable_col@1 DESC], global=true", + " SortExec: expr=[non_nullable_col@1 DESC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1125,7 +1125,7 @@ mod tests { " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1145,13 +1145,13 @@ mod tests { let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1180,11 +1180,11 @@ mod tests { let expected_input = vec![ "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: 
expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", + " SortExec: expr=[non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; @@ -1226,10 +1226,10 @@ mod tests { let expected_input = vec![ "AggregateExec: mode=Final, gby=[], aggr=[]", " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=true", + " SortExec: expr=[non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; @@ -1278,10 +1278,10 @@ mod tests { // requirements are not violated. In some cases, we may need to replace // it with a `CoalescePartitionsExec` instead of directly removing it. 
let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " FilterExec: NOT non_nullable_col@1", " SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[non_nullable_col@1 ASC], global=false", + " SortExec: expr=[non_nullable_col@1 ASC]", " UnionExec", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", " MemoryExec: partitions=0, partition_sizes=[]", @@ -1291,7 +1291,7 @@ mod tests { let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " FilterExec: NOT non_nullable_col@1", " UnionExec", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", @@ -1325,7 +1325,7 @@ mod tests { " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1358,20 +1358,20 @@ mod tests { " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " GlobalLimitExec: skip=0, fetch=100", " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // We should keep the bottom `SortExec`. 
let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " GlobalLimitExec: skip=0, fetch=100", " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1390,11 +1390,11 @@ mod tests { let physical_plan = sort_preserving_merge_exec(sort_exprs, sort); let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1416,12 +1416,12 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [non_nullable_col@1 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; let expected_optimized = vec![ - "SortExec: expr=[non_nullable_col@1 ASC], global=true", + "SortExec: expr=[non_nullable_col@1 ASC]", " MemoryExec: 
partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1446,7 +1446,7 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should not add a sort at the output of the union, input plan should not be changed @@ -1477,7 +1477,7 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should not add a sort at the output of the union, input plan should not be changed @@ -1510,16 +1510,16 @@ mod tests { "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: 
[[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1551,20 +1551,20 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // should adjust sorting in the first input of the union such that it is not unnecessarily fine let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1596,20 +1596,20 @@ mod tests { let expected_input = vec![ 
"SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1649,17 +1649,17 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 
ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -1695,7 +1695,7 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", @@ -1707,10 +1707,10 @@ mod tests { let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " ParquetExec: 
limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; @@ -1738,9 +1738,9 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_input, physical_plan); @@ -1783,9 +1783,9 @@ mod tests { // example below. let expected_input = vec![ "UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST,non_nullable_col@1 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; // Since `UnionExec` doesn't preserve ordering in the plan above. 
@@ -1832,9 +1832,9 @@ mod tests { let expected_input = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ @@ -1880,9 +1880,9 @@ mod tests { "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " CoalescePartitionsExec", " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ @@ -1930,21 +1930,21 @@ mod tests { let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + 
" SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " GlobalLimitExec: skip=0, fetch=100", " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " GlobalLimitExec: skip=0, fetch=100", " LocalLimitExec: fetch=100", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -2000,20 +2000,20 @@ mod tests { // can push down the sort requirements and save 1 SortExec vec![ join_plan.as_str(), - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", + " SortExec: expr=[col_a@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ] } _ => { // can not push down the sort requirements vec![ - "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", join_plan2.as_str(), - " SortExec: expr=[nullable_col@0 
ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", + " SortExec: expr=[col_a@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ] } @@ -2074,20 +2074,20 @@ mod tests { // can push down the sort requirements and save 1 SortExec vec![ join_plan.as_str(), - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC,col_b@1 ASC], global=true", + " SortExec: expr=[col_a@0 ASC,col_b@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ] } _ => { // can not push down the sort requirements for Left and Full join. vec![ - "SortExec: expr=[col_a@2 ASC,col_b@3 ASC], global=true", + "SortExec: expr=[col_a@2 ASC,col_b@3 ASC]", join_plan2.as_str(), - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", + " SortExec: expr=[col_a@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ] } @@ -2129,11 +2129,11 @@ mod tests { // can not push down the sort requirements, need to add SortExec let expected_optimized = vec![ - "SortExec: expr=[col_b@3 ASC,col_a@2 ASC], global=true", + "SortExec: expr=[col_b@3 ASC,col_a@2 ASC]", " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " 
SortExec: expr=[col_a@0 ASC], global=true", + " SortExec: expr=[col_a@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -2155,11 +2155,11 @@ mod tests { // can not push down the sort requirements, need to add SortExec let expected_optimized = vec![ - "SortExec: expr=[nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC,col_b@3 ASC,col_a@2 ASC]", " SortMergeJoin: join_type=Inner, on=[(Column { name: \"nullable_col\", index: 0 }, Column { name: \"col_a\", index: 0 })]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[col_a@0 ASC], global=true", + " SortExec: expr=[col_a@0 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[col_a, col_b]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -2191,7 +2191,7 @@ mod tests { "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[nullable_col@0 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; @@ -2199,7 
+2199,7 @@ mod tests { "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC], global=true", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); @@ -2227,7 +2227,7 @@ mod tests { // we should be able to parallelize Sorting also (given that executors in between don't require) // single partition. 
let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " FilterExec: NOT non_nullable_col@1", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", @@ -2235,7 +2235,7 @@ mod tests { ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC]", " FilterExec: NOT non_nullable_col@1", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", @@ -2315,9 +2315,9 @@ mod tests { // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec. let expected_input = vec![ - "SortExec: expr=[nullable_col@0 ASC], global=true", + "SortExec: expr=[nullable_col@0 ASC]", " SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", @@ -2325,7 +2325,7 @@ mod tests { ]; let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", - " SortExec: expr=[nullable_col@0 ASC], global=false", + " SortExec: expr=[nullable_col@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", " MemoryExec: partitions=0, partition_sizes=[]", diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 1decb3c84a647..552924ba666c5 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ 
b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -790,26 +790,14 @@ impl ExecutionPlan for SortExec { t: DisplayFormatType, f: &mut std::fmt::Formatter, ) -> std::fmt::Result { - // let is_global = !self.preserve_partitioning; - let is_global = self.output_partitioning().partition_count() <= 1; match t { DisplayFormatType::Default => { let expr: Vec = self.expr.iter().map(|e| e.to_string()).collect(); match self.fetch { Some(fetch) => { - write!( - f, - "SortExec: fetch={fetch}, expr=[{}], global={}", - expr.join(","), - is_global - ) + write!(f, "SortExec: fetch={fetch}, expr=[{}]", expr.join(","),) } - None => write!( - f, - "SortExec: expr=[{}], global={}", - expr.join(","), - is_global - ), + None => write!(f, "SortExec: expr=[{}]", expr.join(","),), } } } diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 052e05df3316d..b9b4c5cf31d58 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -1869,12 +1869,12 @@ async fn sort_merge_join_on_date32() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = vec![ "SortMergeJoin: join_type=Inner, on=[(Column { name: \"c1\", index: 0 }, Column { name: \"c1\", index: 0 })]", - " SortExec: expr=[c1@0 ASC], global=false", + " SortExec: expr=[c1@0 ASC]", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: expr=[c1@0 ASC], global=false", + " SortExec: expr=[c1@0 ASC]", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -1914,13 +1914,13 @@ async fn sort_merge_join_on_decimal() -> Result<()> { let expected = vec![ 
"ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4]", " SortMergeJoin: join_type=Right, on=[(Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }, Column { name: \"c3\", index: 2 })]", - " SortExec: expr=[CAST(t1.c3 AS Decimal128(10, 2))@4 ASC], global=false", + " SortExec: expr=[CAST(t1.c3 AS Decimal128(10, 2))@4 ASC]", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"CAST(t1.c3 AS Decimal128(10, 2))\", index: 4 }], 2), input_partitions=2", " ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[1]", - " SortExec: expr=[c3@2 ASC], global=false", + " SortExec: expr=[c3@2 ASC]", " CoalesceBatchesExec: target_batch_size=4096", " RepartitionExec: partitioning=Hash([Column { name: \"c3\", index: 2 }], 2), input_partitions=2", " RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1", @@ -1968,7 +1968,7 @@ async fn left_semi_join() -> Result<()> { let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", " CoalesceBatchesExec: target_batch_size=4096", @@ -1983,7 +1983,7 @@ async fn left_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", + "SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", 
index: 0 })]", " MemoryExec: partitions=1, partition_sizes=[1]", @@ -2046,7 +2046,7 @@ async fn left_semi_join() -> Result<()> { let expected = if repartition_joins { vec![ "SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", " CoalesceBatchesExec: target_batch_size=4096", @@ -2060,7 +2060,7 @@ async fn left_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", + "SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(Column { name: \"t1_id\", index: 0 }, Column { name: \"t2_id\", index: 0 })]", " MemoryExec: partitions=1, partition_sizes=[1]", @@ -2238,7 +2238,7 @@ async fn right_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", " CoalesceBatchesExec: target_batch_size=4096", @@ -2252,7 +2252,7 @@ async fn right_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", + "SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", 
index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 1 }, op: NotEq, right: Column { name: \"t1_name\", index: 0 } }", " MemoryExec: partitions=1, partition_sizes=[1]", @@ -2282,7 +2282,7 @@ async fn right_semi_join() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; let expected = if repartition_joins { vec!["SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]", - " SortExec: expr=[t1_id@0 ASC NULLS LAST], global=false", + " SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", " CoalesceBatchesExec: target_batch_size=4096", @@ -2296,7 +2296,7 @@ async fn right_semi_join() -> Result<()> { ] } else { vec![ - "SortExec: expr=[t1_id@0 ASC NULLS LAST], global=true", + "SortExec: expr=[t1_id@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=4096", " HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(Column { name: \"t2_id\", index: 0 }, Column { name: \"t1_id\", index: 0 })], filter=BinaryExpr { left: Column { name: \"t2_name\", index: 0 }, op: NotEq, right: Column { name: \"t1_name\", index: 1 } }", " MemoryExec: partitions=1, partition_sizes=[1]", diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index da0f931f472c4..ed996199921d0 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -201,7 +201,7 @@ mod tests { vec![ "ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, sum3@2 as sum3, min1@3 as min1, min2@4 as min2, min3@5 as min3, max1@6 as max1, max2@7 as max2, max3@8 as max3, cnt1@9 as cnt1, cnt2@10 as cnt2, sumr1@11 as sumr1, sumr2@12 as sumr2, sumr3@13 as sumr3, minr1@14 as minr1, 
minr2@15 as minr2, minr3@16 as minr3, maxr1@17 as maxr1, maxr2@18 as maxr2, maxr3@19 as maxr3, cntr1@20 as cntr1, cntr2@21 as cntr2, sum4@22 as sum4, cnt3@23 as cnt3]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: fetch=5, expr=[inc_col@24 DESC], global=true", + " SortExec: fetch=5, expr=[inc_col@24 DESC]", " ProjectionExec: expr=[SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as sum1, SUM(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@15 as sum2, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as sum3, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as min1, MIN(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@18 as min2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@19 as min3, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@20 as max1, MAX(annotated_data.desc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@21 as max2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as max3, COUNT(UInt8(1)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@23 as cnt1, COUNT(UInt8(1)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@24 as cnt2, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@3 as sumr1, SUM(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@4 as sumr2, SUM(annotated_data.desc_col) ORDER 
BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sumr3, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@6 as minr1, MIN(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@7 as minr2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as minr3, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as maxr1, MAX(annotated_data.desc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@10 as maxr2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@11 as maxr3, COUNT(UInt8(1)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@12 as cntr1, COUNT(UInt8(1)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@13 as cntr2, SUM(annotated_data.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@25 as sum4, COUNT(UInt8(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as cnt3, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.desc_col): Ok(Field { name: \"SUM(annotated_data.desc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, SUM(annotated_data.desc_col): Ok(Field { name: \"SUM(annotated_data.desc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)) }, SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, MIN(annotated_data.desc_col): Ok(Field { name: \"MIN(annotated_data.desc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, MAX(annotated_data.desc_col): Ok(Field { name: \"MAX(annotated_data.desc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: 
Following(Int32(1)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(4)), end_bound: Following(Int32(8)) }, COUNT(UInt8(1)): Ok(Field { name: \"COUNT(UInt8(1))\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)) }]", @@ -276,7 +276,7 @@ mod tests { vec![ "ProjectionExec: expr=[fv1@0 as fv1, fv2@1 as fv2, lv1@2 as lv1, lv2@3 as lv2, nv1@4 as nv1, nv2@5 as nv2, rn1@6 as rn1, rn2@7 as rn2, rank1@8 as rank1, rank2@9 as rank2, dense_rank1@10 as dense_rank1, dense_rank2@11 as dense_rank2, lag1@12 as lag1, lag2@13 as lag2, lead1@14 as lead1, lead2@15 as lead2, fvr1@16 as fvr1, fvr2@17 as fvr2, lvr1@18 as lvr1, lvr2@19 as lvr2, lagr1@20 as lagr1, lagr2@21 as lagr2, leadr1@22 as leadr1, leadr2@23 as leadr2]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: fetch=5, expr=[ts@24 DESC], global=true", + " SortExec: fetch=5, expr=[ts@24 DESC]", " ProjectionExec: expr=[FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as fv2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv1, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as lv2, 
NTH_VALUE(annotated_data.inc_col,Int64(5)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv1, NTH_VALUE(annotated_data.inc_col,Int64(5)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as nv2, ROW_NUMBER() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@17 as rn1, ROW_NUMBER() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@18 as rn2, RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@19 as rank1, RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@20 as rank2, DENSE_RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@22 as dense_rank2, LAG(annotated_data.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@23 as lag1, LAG(annotated_data.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@24 as lag2, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@25 as lead1, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@26 as lead2, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as fvr2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr1, LAST_VALUE(annotated_data.inc_col) ORDER BY 
[annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@6 as lvr2, LAG(annotated_data.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@7 as lagr1, LAG(annotated_data.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as lagr2, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@9 as leadr1, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as leadr2, ts@0 as ts]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(5)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(5))\", data_type: 
Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(5)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(5))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK(): Ok(Field { name: \"RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK(): Ok(Field { name: \"RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK(): Ok(Field { name: \"DENSE_RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK(): Ok(Field { name: \"DENSE_RANK()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data.inc_col,Int64(1),Int64(1001)): Ok(Field { name: 
\"LAG(annotated_data.inc_col,Int64(1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data.inc_col,Int64(2),Int64(1002)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(2),Int64(1002))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(-1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(4),Int64(1004))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { 
units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data.inc_col,Int64(1),Int64(1001)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data.inc_col,Int64(2),Int64(1002)): Ok(Field { name: \"LAG(annotated_data.inc_col,Int64(2),Int64(1002))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LEAD(annotated_data.inc_col,Int64(-1),Int64(1001)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(-1),Int64(1001))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data.inc_col,Int64(4),Int64(1004)): Ok(Field { name: \"LEAD(annotated_data.inc_col,Int64(4),Int64(1004))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }]", @@ -335,7 +335,7 @@ mod tests { vec![ "ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, min1@2 as min1, min2@3 as min2, max1@4 as max1, max2@5 as max2, count1@6 as count1, count2@7 as count2, avg1@8 as avg1, avg2@9 as avg2]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: fetch=5, expr=[inc_col@10 ASC NULLS LAST], global=true", + " SortExec: fetch=5, 
expr=[inc_col@10 ASC NULLS LAST]", " ProjectionExec: expr=[SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@8 as sum1, SUM(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as min1, MIN(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as min2, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as max1, MAX(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as max2, COUNT(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as count1, COUNT(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as count2, AVG(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@12 as avg1, AVG(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@7 as avg2, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: 
Following(Int32(5)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, COUNT(annotated_data.inc_col): Ok(Field { name: \"COUNT(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }, AVG(annotated_data.inc_col): Ok(Field { name: \"AVG(annotated_data.inc_col)\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)) }]", " BoundedWindowAggExec: wdw=[SUM(annotated_data.inc_col): Ok(Field { name: \"SUM(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, MIN(annotated_data.inc_col): Ok(Field { name: \"MIN(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, MAX(annotated_data.inc_col): Ok(Field { name: \"MAX(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, COUNT(annotated_data.inc_col): Ok(Field { name: \"COUNT(annotated_data.inc_col)\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }, AVG(annotated_data.inc_col): 
Ok(Field { name: \"AVG(annotated_data.inc_col)\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)) }]", @@ -389,7 +389,7 @@ mod tests { vec![ "ProjectionExec: expr=[first_value1@0 as first_value1, first_value2@1 as first_value2, last_value1@2 as last_value1, last_value2@3 as last_value2, nth_value1@4 as nth_value1]", " GlobalLimitExec: skip=0, fetch=5", - " SortExec: fetch=5, expr=[inc_col@5 ASC NULLS LAST], global=true", + " SortExec: fetch=5, expr=[inc_col@5 ASC NULLS LAST]", " ProjectionExec: expr=[FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as first_value1, FIRST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as first_value2, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as last_value1, LAST_VALUE(annotated_data.inc_col) ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as last_value2, NTH_VALUE(annotated_data.inc_col,Int64(2)) ORDER BY [annotated_data.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@7 as nth_value1, inc_col@1 as inc_col]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: 
Following(UInt64(1)) }, NTH_VALUE(annotated_data.inc_col,Int64(2)): Ok(Field { name: \"NTH_VALUE(annotated_data.inc_col,Int64(2))\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }]", " BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data.inc_col): Ok(Field { name: \"FIRST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)) }, LAST_VALUE(annotated_data.inc_col): Ok(Field { name: \"LAST_VALUE(annotated_data.inc_col)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)) }]", diff --git a/datafusion/core/tests/sqllogictests/test_files/order.slt b/datafusion/core/tests/sqllogictests/test_files/order.slt index 58d6bde208901..d42d2cf62f1f2 100644 --- a/datafusion/core/tests/sqllogictests/test_files/order.slt +++ b/datafusion/core/tests/sqllogictests/test_files/order.slt @@ -164,7 +164,7 @@ Projection: aggregate_test_100.c1, aggregate_test_100.c2 TableScan: aggregate_test_100 projection=[c1, c2, c3] physical_plan ProjectionExec: expr=[c1@0 as c1, c2@1 as c2] - SortExec: expr=[c2@1 ASC NULLS LAST,c3@2 ASC NULLS LAST], global=true + SortExec: expr=[c2@1 ASC NULLS LAST,c3@2 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3] query II diff --git a/datafusion/core/tests/sqllogictests/test_files/window.slt b/datafusion/core/tests/sqllogictests/test_files/window.slt index b9c1dc03d078b..148c7a4fd0b92 100644 --- a/datafusion/core/tests/sqllogictests/test_files/window.slt +++ 
b/datafusion/core/tests/sqllogictests/test_files/window.slt @@ -271,7 +271,7 @@ Sort: d.b ASC NULLS LAST EmptyRelation physical_plan SortPreservingMergeExec: [b@0 ASC NULLS LAST] - SortExec: expr=[b@0 ASC NULLS LAST], global=false + SortExec: expr=[b@0 ASC NULLS LAST] ProjectionExec: expr=[b@0 as b, MAX(d.a)@1 as max_a] AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[MAX(d.a)] CoalesceBatchesExec: target_batch_size=8192 @@ -356,13 +356,13 @@ Sort: d.b ASC NULLS LAST EmptyRelation physical_plan SortPreservingMergeExec: [b@0 ASC NULLS LAST] - SortExec: expr=[b@0 ASC NULLS LAST], global=false + SortExec: expr=[b@0 ASC NULLS LAST] ProjectionExec: expr=[b@0 as b, MAX(d.a)@1 as max_a, MAX(d.seq)@2 as MAX(d.seq)] AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[MAX(d.a), MAX(d.seq)] AggregateExec: mode=Partial, gby=[b@2 as b], aggr=[MAX(d.a), MAX(d.seq)] ProjectionExec: expr=[ROW_NUMBER() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as seq, a@0 as a, b@1 as b] BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[b@1 ASC NULLS LAST,a@0 ASC NULLS LAST], global=false + SortExec: expr=[b@1 ASC NULLS LAST,a@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([Column { name: "b", index: 1 }], 4), input_partitions=4 UnionExec @@ -1216,7 +1216,7 @@ physical_plan ProjectionExec: expr=[c9@8 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@14 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as sum2] BoundedWindowAggExec: 
wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c9@8 ASC NULLS LAST,c8@7 ASC NULLS LAST], global=true + SortExec: expr=[c9@8 ASC NULLS LAST,c8@7 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] # over_order_by_sort_keys_sorting_prefix_compacting @@ -1236,7 +1236,7 @@ ProjectionExec: expr=[c2@1 as c2, MAX(aggregate_test_100.c9) ORDER BY [aggregate WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c9): Ok(Field { name: "MAX(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow }] BoundedWindowAggExec: wdw=[MIN(aggregate_test_100.c9): Ok(Field { name: "MIN(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c2@1 ASC NULLS LAST,c9@8 ASC NULLS LAST], global=true + SortExec: expr=[c2@1 ASC NULLS 
LAST,c9@8 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] # FIXME: for now we are not detecting prefix of sorting keys in order to re-arrange with global and save one SortExec @@ -1254,13 +1254,13 @@ Sort: aggregate_test_100.c2 ASC NULLS LAST WindowAggr: windowExpr=[[MIN(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] physical_plan -SortExec: expr=[c2@0 ASC NULLS LAST], global=true +SortExec: expr=[c2@0 ASC NULLS LAST] ProjectionExec: expr=[c2@1 as c2, MAX(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@14 as MAX(aggregate_test_100.c9), SUM(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@15 as SUM(aggregate_test_100.c9), MIN(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as MIN(aggregate_test_100.c9)] WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c9): Ok(Field { name: "MAX(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c9@8 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true + SortExec: 
expr=[c9@8 ASC NULLS LAST,c2@1 ASC NULLS LAST] BoundedWindowAggExec: wdw=[MIN(aggregate_test_100.c9): Ok(Field { name: "MIN(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c2@1 ASC NULLS LAST,c9@8 ASC NULLS LAST], global=true + SortExec: expr=[c2@1 ASC NULLS LAST,c9@8 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] # test_window_partition_by_order_by @@ -1281,11 +1281,11 @@ Projection: SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggr physical_plan ProjectionExec: expr=[SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@13 as SUM(aggregate_test_100.c4), COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@14 as COUNT(UInt8(1))] BoundedWindowAggExec: wdw=[COUNT(UInt8(1)): Ok(Field { name: "COUNT(UInt8(1))", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)) }] - SortExec: expr=[c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=false + SortExec: expr=[c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c1", index: 0 }], 2), input_partitions=2 BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c4): Ok(Field { name: "SUM(aggregate_test_100.c4)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), 
end_bound: Following(UInt64(1)) }] - SortExec: expr=[c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=false + SortExec: expr=[c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c1", index: 0 }, Column { name: "c2", index: 1 }], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -1312,7 +1312,7 @@ ProjectionExec: expr=[c9@8 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate GlobalLimitExec: skip=0, fetch=5 BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c9@8 DESC], global=true + SortExec: expr=[c9@8 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] query III @@ -1353,7 +1353,7 @@ ProjectionExec: expr=[c9@8 as c9, FIRST_VALUE(aggregate_test_100.c9) ORDER BY [a GlobalLimitExec: skip=0, fetch=5 BoundedWindowAggExec: wdw=[FIRST_VALUE(aggregate_test_100.c9): Ok(Field { name: "FIRST_VALUE(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }, LAG(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: "LAG(aggregate_test_100.c9,Int64(2),Int64(10101))", data_type: UInt64, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)) }, LEAD(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: "LEAD(aggregate_test_100.c9,Int64(2),Int64(10101))", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)) }] BoundedWindowAggExec: wdw=[FIRST_VALUE(aggregate_test_100.c9): Ok(Field { name: "FIRST_VALUE(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }, LAG(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: "LAG(aggregate_test_100.c9,Int64(2),Int64(10101))", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(aggregate_test_100.c9,Int64(2),Int64(10101)): Ok(Field { name: "LEAD(aggregate_test_100.c9,Int64(2),Int64(10101))", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }] - SortExec: expr=[c9@8 DESC], global=true + SortExec: expr=[c9@8 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] query IIIIIII @@ -1395,9 +1395,9 @@ physical_plan ProjectionExec: expr=[c9@8 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@14 as rn1, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@13 as rn2] GlobalLimitExec: skip=0, fetch=5 
BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c9@8 ASC NULLS LAST], global=true + SortExec: expr=[c9@8 ASC NULLS LAST] BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c9@8 DESC], global=true + SortExec: expr=[c9@8 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] @@ -1437,10 +1437,10 @@ physical_plan ProjectionExec: expr=[c9@8 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@15 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@13 as sum2, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@14 as rn2] GlobalLimitExec: skip=0, fetch=5 BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c9@8 ASC NULLS LAST,c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true + SortExec: expr=[c9@8 ASC NULLS LAST,c1@0 ASC NULLS LAST,c2@1 ASC NULLS LAST] BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: 
"ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c9@8 DESC,c1@0 DESC], global=true + SortExec: expr=[c9@8 DESC,c1@0 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] query IIII @@ -1519,17 +1519,17 @@ ProjectionExec: expr=[SUM(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] GlobalLimitExec: skip=0, fetch=5 WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }] BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: 
"SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 ASC NULLS LAST,c2@1 ASC NULLS LAST], global=true + SortExec: expr=[c3@2 ASC NULLS LAST,c2@1 ASC NULLS LAST] BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 ASC NULLS LAST,c1@0 ASC], global=true + SortExec: expr=[c3@2 ASC NULLS LAST,c1@0 ASC] BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 ASC NULLS LAST,c1@0 DESC], global=true + SortExec: expr=[c3@2 ASC NULLS LAST,c1@0 DESC] WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }] - SortExec: expr=[c3@2 DESC NULLS LAST], global=true + SortExec: expr=[c3@2 DESC NULLS LAST] WindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)) }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }, SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)) }, 
SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)) }] BoundedWindowAggExec: wdw=[SUM(null_cases.c1): Ok(Field { name: "SUM(null_cases.c1)", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 DESC,c1@0 ASC NULLS LAST], global=true + SortExec: expr=[c3@2 DESC,c1@0 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/null_cases.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3] query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @@ -1604,7 +1604,7 @@ ProjectionExec: expr=[c9@8 as c9, SUM(aggregate_test_100.c9) ORDER BY [aggregate GlobalLimitExec: skip=0, fetch=5 BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 DESC], global=true + SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] @@ -1648,7 +1648,7 @@ ProjectionExec: expr=[c9@8 as c9, SUM(aggregate_test_100.c9) PARTITION BY [aggre GlobalLimitExec: skip=0, fetch=5 BoundedWindowAggExec: 
wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)) }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] - SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 DESC], global=true + SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 DESC] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] query III @@ -1692,7 +1692,7 @@ ProjectionExec: expr=[c3@2 as c3, SUM(aggregate_test_100.c9) ORDER BY [aggregate GlobalLimitExec: skip=0, fetch=5 WindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)) }] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 + c4@3 DESC,c9@8 DESC,c2@1 ASC NULLS LAST], global=true + SortExec: expr=[c3@2 + c4@3 DESC,c9@8 DESC,c2@1 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] query III @@ -1784,12 +1784,12 @@ GlobalLimitExec: skip=0, fetch=5 SortPreservingMergeExec: [c3@0 
ASC NULLS LAST] ProjectionExec: expr=[c3@2 as c3, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as sum1, SUM(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@14 as sum2] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 ASC NULLS LAST,c9@8 DESC], global=false + SortExec: expr=[c3@2 ASC NULLS LAST,c9@8 DESC] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c3", index: 2 }], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow }] - SortExec: expr=[c3@2 DESC,c9@8 DESC,c2@1 ASC NULLS LAST], global=true + SortExec: expr=[c3@2 DESC,c9@8 DESC,c2@1 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] @@ -1824,7 +1824,7 @@ physical_plan SortPreservingMergeExec: [c1@0 ASC NULLS LAST] ProjectionExec: expr=[c1@0 as c1, ROW_NUMBER() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@13 as rn1] BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] - SortExec: expr=[c1@0 ASC NULLS LAST], global=false + SortExec: expr=[c1@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c1", index: 0 }], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -1947,11 +1947,11 @@ Sort: aggregate_test_100.c1 ASC NULLS LAST WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] physical_plan -SortExec: expr=[c1@0 ASC NULLS LAST], global=true +SortExec: expr=[c1@0 ASC NULLS LAST] CoalescePartitionsExec ProjectionExec: expr=[c1@0 as c1, ROW_NUMBER() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@13 as rn1] BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] - SortExec: expr=[c1@0 ASC NULLS LAST], global=false + SortExec: expr=[c1@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c1", index: 0 }], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -1974,13 +1974,13 @@ Sort: aggregate_test_100.c1 ASC NULLS LAST WindowAggr: windowExpr=[[SUM(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING]] TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] physical_plan -SortExec: expr=[c1@0 ASC NULLS 
LAST], global=true +SortExec: expr=[c1@0 ASC NULLS LAST] ProjectionExec: expr=[c1@0 as c1, SUM(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@13 as sum1, SUM(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@14 as sum2] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)) }] SortPreservingMergeExec: [c9@8 ASC NULLS LAST] - SortExec: expr=[c9@8 ASC NULLS LAST], global=false + SortExec: expr=[c9@8 ASC NULLS LAST] BoundedWindowAggExec: wdw=[SUM(aggregate_test_100.c9): Ok(Field { name: "SUM(aggregate_test_100.c9)", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(3)) }] - SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 ASC NULLS LAST], global=false + SortExec: expr=[c1@0 ASC NULLS LAST,c9@8 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([Column { name: "c1", index: 0 }], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -2006,7 +2006,7 @@ ProjectionExec: expr=[ARRAYAGG(aggregate_test_100.c13)@0 as array_agg1] AggregateExec: mode=Partial, gby=[], aggr=[ARRAYAGG(aggregate_test_100.c13)] RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 GlobalLimitExec: skip=0, fetch=1 - SortExec: fetch=1, expr=[c13@0 ASC NULLS LAST], global=true + SortExec: fetch=1, expr=[c13@0 ASC NULLS LAST] CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c13] From 
1102a12129f4c3fb9379e8951d0ba2c9a073eb2a Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 17:23:39 +0300 Subject: [PATCH 19/35] remove unnecessary parameter --- .../src/physical_optimizer/sort_pushdown.rs | 35 ++----------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 9fdefbd9ed719..fb5fe48cf3a0f 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -44,8 +44,6 @@ use std::sync::Arc; pub(crate) struct SortPushDown { /// Current plan pub plan: Arc, - /// Whether the plan could impact the final result ordering - impact_result_ordering: bool, /// Parent required sort ordering required_ordering: Option>, /// The adjusted request sort ordering to children. @@ -55,23 +53,9 @@ pub(crate) struct SortPushDown { impl SortPushDown { pub fn init(plan: Arc) -> Self { - let impact_result_ordering = plan.output_ordering().is_some() - || plan.output_partitioning().partition_count() <= 1 - || is_limit(&plan); let request_ordering = plan.required_input_ordering(); SortPushDown { plan, - impact_result_ordering, - required_ordering: None, - adjusted_request_ordering: request_ordering, - } - } - - pub fn new_without_impact_result_ordering(plan: Arc) -> Self { - let request_ordering = plan.required_input_ordering(); - SortPushDown { - plan, - impact_result_ordering: false, required_ordering: None, adjusted_request_ordering: request_ordering, } @@ -84,18 +68,11 @@ impl SortPushDown { izip!( plan_children.into_iter(), self.adjusted_request_ordering.clone().into_iter(), - self.plan.maintains_input_order().into_iter(), ) - .map(|(child, from_parent, maintains_input_order)| { - let child_impact_result_ordering = if is_limit(&self.plan) { - true - } else { - maintains_input_order && self.impact_result_ordering - }; + .map(|(child, from_parent)| { let 
child_request_ordering = child.required_input_ordering(); SortPushDown { plan: child, - impact_result_ordering: child_impact_result_ordering, required_ordering: from_parent, adjusted_request_ordering: child_request_ordering, } @@ -125,7 +102,6 @@ impl TreeNodeRewritable for SortPushDown { let plan = with_new_children_if_necessary(self.plan, children_plans)?; Ok(SortPushDown { plan, - impact_result_ordering: self.impact_result_ordering, required_ordering: self.required_ordering, adjusted_request_ordering: self.adjusted_request_ordering, }) @@ -157,13 +133,10 @@ pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result Result Date: Wed, 15 Mar 2023 17:46:58 +0300 Subject: [PATCH 20/35] Updates --- .../src/physical_optimizer/sort_pushdown.rs | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index fb5fe48cf3a0f..4d7a9623c3b6a 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -112,6 +112,11 @@ impl TreeNodeRewritable for SortPushDown { pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result> { let plan = &requirements.plan; let parent_required = requirements.required_ordering.as_deref(); + let err = || { + DataFusionError::Execution( + "Expects parent requirement to contain something".to_string(), + ) + }; if let Some(sort_exec) = plan.as_any().downcast_ref::() { let mut new_plan = plan.clone(); if !ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { @@ -119,11 +124,12 @@ pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result Result Result, parent_required: Option<&[PhysicalSortRequirements]>, ) -> Result>>>> { + let err = || { + DataFusionError::Execution( + "Expects parent requirement to contain something".to_string(), + ) + }; let maintains_input_order = 
plan.maintains_input_order(); if is_window(plan) { let required_input_ordering = plan.required_input_ordering(); @@ -197,7 +207,7 @@ fn pushdown_requirement_to_children( // If the current plan is SortMergeJoinExec let left_columns_len = smj.left.schema().fields().len(); let parent_required_expr = - create_sort_expr_from_requirement(parent_required.unwrap()); + create_sort_expr_from_requirement(parent_required.ok_or_else(err)?); let expr_source_side = expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); match expr_source_side { @@ -211,11 +221,12 @@ fn pushdown_requirement_to_children( } Some(JoinSide::Right) if maintains_input_order[1] => { let new_right_required = match smj.join_type { - JoinType::Inner | JoinType::Right => { - shift_right_required(parent_required.unwrap(), left_columns_len)? - } + JoinType::Inner | JoinType::Right => shift_right_required( + parent_required.ok_or_else(err)?, + left_columns_len, + )?, JoinType::RightSemi | JoinType::RightAnti => { - parent_required.unwrap().to_vec() + parent_required.ok_or_else(err)?.to_vec() } _ => Err(DataFusionError::Plan( "Unexpected SortMergeJoin type here".to_string(), From 071b05a3d0281232cd40e364dc84078694f16e88 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 15 Mar 2023 17:54:30 +0300 Subject: [PATCH 21/35] simplifications --- .../src/physical_optimizer/sort_pushdown.rs | 47 ++++++++----------- .../core/src/physical_plan/sorts/sort.rs | 4 +- .../windows/bounded_window_agg_exec.rs | 4 +- .../physical_plan/windows/window_agg_exec.rs | 4 +- datafusion/core/tests/sql/explain_analyze.rs | 3 +- datafusion/core/tests/sql/window.rs | 38 --------------- 6 files changed, 24 insertions(+), 76 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 4d7a9623c3b6a..7c941671be4b1 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ 
b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -112,11 +112,8 @@ impl TreeNodeRewritable for SortPushDown { pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result> { let plan = &requirements.plan; let parent_required = requirements.required_ordering.as_deref(); - let err = || { - DataFusionError::Execution( - "Expects parent requirement to contain something".to_string(), - ) - }; + let err_msg = "Expects parent requirement to contain something"; + let err = || DataFusionError::Execution(err_msg.to_string()); if let Some(sort_exec) = plan.as_any().downcast_ref::() { let mut new_plan = plan.clone(); if !ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { @@ -149,28 +146,25 @@ pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result, parent_required: Option<&[PhysicalSortRequirements]>, ) -> Result>>>> { - let err = || { - DataFusionError::Execution( - "Expects parent requirement to contain something".to_string(), - ) - }; + let err_msg = "Expects parent requirement to contain something"; + let err = || DataFusionError::Execution(err_msg.to_string()); let maintains_input_order = plan.maintains_input_order(); if is_window(plan) { let required_input_ordering = plan.required_input_ordering(); diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 552924ba666c5..c3fc06206ca15 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -795,9 +795,9 @@ impl ExecutionPlan for SortExec { let expr: Vec = self.expr.iter().map(|e| e.to_string()).collect(); match self.fetch { Some(fetch) => { - write!(f, "SortExec: fetch={fetch}, expr=[{}]", expr.join(","),) + write!(f, "SortExec: fetch={fetch}, expr=[{}]", expr.join(",")) } - None => write!(f, "SortExec: expr=[{}]", expr.join(","),), + None => write!(f, "SortExec: expr=[{}]", expr.join(",")), } } } diff --git 
a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 36efd86356bd2..57863459d6360 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -125,9 +125,7 @@ impl BoundedWindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self - .output_ordering() - .unwrap_or_else(|| self.sort_keys.as_deref().unwrap_or(&[])); + let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index 43614e34adc82..a0afa3a27bebc 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -115,9 +115,7 @@ impl WindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self - .output_ordering() - .unwrap_or_else(|| self.sort_keys.as_deref().unwrap_or(&[])); + let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index f693b12e1c02a..6f7150d2a53b6 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -582,7 +582,6 @@ async fn explain_analyze_runs_optimizers() { } #[tokio::test] -#[ignore] async fn 
test_physical_plan_display_indent() { // Hard code target_partitions as it appears in the RepartitionExec output let config = SessionConfig::new() @@ -601,7 +600,7 @@ async fn test_physical_plan_display_indent() { let expected = vec![ "GlobalLimitExec: skip=0, fetch=10", " SortPreservingMergeExec: [the_min@2 DESC]", - " SortExec: fetch=10, expr=[the_min@2 DESC], global=false", + " SortExec: fetch=10, expr=[the_min@2 DESC]", " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)@2 as the_min]", " AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(aggregate_test_100.c12), MIN(aggregate_test_100.c12)]", " CoalesceBatchesExec: target_batch_size=4096", diff --git a/datafusion/core/tests/sql/window.rs b/datafusion/core/tests/sql/window.rs index ed996199921d0..8f95eba572bad 100644 --- a/datafusion/core/tests/sql/window.rs +++ b/datafusion/core/tests/sql/window.rs @@ -18,7 +18,6 @@ use super::*; use ::parquet::arrow::arrow_writer::ArrowWriter; use ::parquet::file::properties::WriterProperties; -use arrow::util::pretty::print_batches; use datafusion::execution::options::ReadOptions; #[tokio::test] @@ -420,40 +419,3 @@ mod tests { Ok(()) } } - -fn print_plan(plan: &Arc) -> Result<()> { - let formatted = displayable(plan.as_ref()).indent().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - println!("{:#?}", actual); - Ok(()) -} - -#[tokio::test] -async fn test_projection_wrong_push_down() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::with_config(config); - register_aggregate_csv(&ctx).await?; - // let sql = "SELECT a.c1, b.c1, SUM(a.c2) FROM aggregate_test_100 as a CROSS JOIN aggregate_test_100 as b GROUP BY a.c1, b.c1 ORDER BY a.c1, b.c1"; - let sql = "SELECT c9, - SUM(c5) OVER(ORDER BY c4 RANGE BETWEEN 3 PRECEDING AND 1 FOLLOWING) as summation2, - SUM(c4) OVER(ORDER BY c3 RANGE 3 PRECEDING) as summation3, - SUM(c4) OVER(ORDER BY 
c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation6, - SUM(c4) OVER(ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation7, - SUM(c2) OVER(PARTITION BY c5 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation10, - SUM(c4) OVER(PARTITION BY c1 ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation11, - SUM(c2) OVER(PARTITION BY c1 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation14, - SUM(c4) OVER(PARTITION BY c5 ORDER BY c5 RANGE UNBOUNDED PRECEDING) as summation15, - SUM(c2) OVER(PARTITION BY c5, c7, c9 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation20, - SUM(c2) OVER(PARTITION BY c5 ORDER BY c5 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as summation21 -FROM aggregate_test_100 -ORDER BY c9;"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - print_plan(&physical_plan)?; - - let actual = execute_to_batches(&ctx, sql).await; - print_batches(&actual)?; - Ok(()) -} From 777aa1c9505ca71027f880d5b429bfad4071e263 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Wed, 15 Mar 2023 22:24:51 -0500 Subject: [PATCH 22/35] Refactors and simplifications part 1 --- datafusion/core/src/dataframe.rs | 8 +- .../physical_optimizer/dist_enforcement.rs | 26 +- .../src/physical_optimizer/repartition.rs | 11 +- .../physical_optimizer/sort_enforcement.rs | 6 +- .../src/physical_optimizer/sort_pushdown.rs | 43 +-- .../core/src/physical_optimizer/utils.rs | 10 +- .../physical_plan/joins/sort_merge_join.rs | 13 +- .../joins/symmetric_hash_join.rs | 19 +- datafusion/core/src/physical_plan/mod.rs | 4 +- .../sorts/sort_preserving_merge.rs | 7 +- .../windows/bounded_window_agg_exec.rs | 29 +- .../physical_plan/windows/window_agg_exec.rs | 31 +- datafusion/physical-expr/src/lib.rs | 7 +- datafusion/physical-expr/src/sort_expr.rs | 87 
+++--- datafusion/physical-expr/src/utils.rs | 271 +++++++----------- 15 files changed, 243 insertions(+), 329 deletions(-) diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index f675476321c3d..af41725db5186 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -639,11 +639,9 @@ impl DataFrame { .and_then(|r| r.columns().first()) .and_then(|c| c.as_any().downcast_ref::()) .and_then(|a| a.values().first()) - .ok_or_else(|| { - DataFusionError::Internal( - "Unexpected output when collecting for count()".to_string(), - ) - })? as usize; + .ok_or(DataFusionError::Internal( + "Unexpected output when collecting for count()".to_string(), + ))? as usize; Ok(len) } diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index 8b5a464b89365..a48d002397d59 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -38,11 +38,11 @@ use datafusion_expr::logical_plan::JoinType; use datafusion_physical_expr::equivalence::EquivalenceProperties; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::expressions::NoOp; +use datafusion_physical_expr::utils::map_columns_before_projection; use datafusion_physical_expr::{ expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, AggregateExpr, PhysicalExpr, }; -use std::collections::HashMap; use std::sync::Arc; /// The EnforceDistribution rule ensures that distribution requirements are met @@ -492,30 +492,6 @@ fn reorder_aggregate_keys( } } -fn map_columns_before_projection( - parent_required: &[Arc], - proj_exprs: &[(Arc, String)], -) -> Vec> { - let mut column_mapping = HashMap::new(); - for (expression, name) in proj_exprs.iter() { - if let Some(column) = expression.as_any().downcast_ref::() { - column_mapping.insert(name.clone(), column.clone()); - }; - } - let new_required: 
Vec> = parent_required - .iter() - .filter_map(|r| { - if let Some(column) = r.as_any().downcast_ref::() { - column_mapping.get(column.name()) - } else { - None - } - }) - .map(|e| Arc::new(e.clone()) as Arc) - .collect::>(); - new_required -} - fn shift_right_required( parent_required: &[Arc], left_columns_len: usize, diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 58e76c5b7a17c..683c28cdb43f3 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -330,7 +330,9 @@ mod tests { use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::union::UnionExec; use crate::physical_plan::{displayable, DisplayFormatType, Statistics}; - use datafusion_physical_expr::{new_sort_requirements, PhysicalSortRequirements}; + use datafusion_physical_expr::{ + make_sort_requirements_from_exprs, PhysicalSortRequirement, + }; fn schema() -> SchemaRef { Arc::new(Schema::new(vec![Field::new("c1", DataType::Boolean, true)])) @@ -1148,9 +1150,10 @@ mod tests { } // model that it requires the output ordering of its input - fn required_input_ordering(&self) -> Vec>> { - let ordering_requirements = new_sort_requirements(self.output_ordering()); - vec![ordering_requirements] + fn required_input_ordering(&self) -> Vec>> { + vec![self + .output_ordering() + .map(make_sort_requirements_from_exprs)] } fn with_new_children( diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 2b89c29c4d141..752a800478772 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -49,7 +49,7 @@ use crate::physical_plan::{with_new_children_if_necessary, Distribution, Executi use arrow::datatypes::SchemaRef; use datafusion_common::{reverse_sort_options, 
DataFusionError}; use datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, ordering_satisfy, + make_sort_exprs_from_requirements, ordering_satisfy, ordering_satisfy_requirement_concrete, }; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; @@ -438,7 +438,7 @@ fn ensure_sorting( &plan, idx, )?; - let sort_expr = create_sort_expr_from_requirement(&required_ordering); + let sort_expr = make_sort_exprs_from_requirements(&required_ordering); add_sort_above(child, sort_expr)?; if is_sort(child) { *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); @@ -449,7 +449,7 @@ fn ensure_sorting( } (Some(required), None) => { // Ordering requirement is not met, we should add a `SortExec` to the plan. - let sort_expr = create_sort_expr_from_requirement(&required); + let sort_expr = make_sort_exprs_from_requirements(&required); add_sort_above(child, sort_expr)?; *sort_onwards = Some(ExecTree::new(child.clone(), idx, vec![])); } diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 7c941671be4b1..41c9e26da5180 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -28,11 +28,12 @@ use datafusion_common::{DataFusionError, Result}; use datafusion_expr::JoinType; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{ - create_sort_expr_from_requirement, ordering_satisfy_requirement, + make_sort_exprs_from_requirements, ordering_satisfy_requirement, requirements_compatible, }; use datafusion_physical_expr::{ - new_sort_requirements, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, + make_sort_requirements_from_exprs, PhysicalExpr, PhysicalSortExpr, + PhysicalSortRequirement, }; use itertools::izip; use std::ops::Deref; @@ -45,10 +46,10 @@ pub(crate) struct SortPushDown { /// Current plan pub plan: Arc, /// Parent required sort ordering - 
required_ordering: Option>, + required_ordering: Option>, /// The adjusted request sort ordering to children. /// By default they are the same as the plan's required input ordering, but can be adjusted based on parent required sort ordering properties. - adjusted_request_ordering: Vec>>, + adjusted_request_ordering: Vec>>, } impl SortPushDown { @@ -121,11 +122,13 @@ pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result Result Result, - parent_required: Option<&[PhysicalSortRequirements]>, -) -> Result>>>> { + parent_required: Option<&[PhysicalSortRequirement]>, +) -> Result>>>> { let err_msg = "Expects parent requirement to contain something"; let err = || DataFusionError::Execution(err_msg.to_string()); let maintains_input_order = plan.maintains_input_order(); @@ -198,7 +201,7 @@ fn pushdown_requirement_to_children( // If the current plan is SortMergeJoinExec let left_columns_len = smj.left.schema().fields().len(); let parent_required_expr = - create_sort_expr_from_requirement(parent_required.ok_or_else(err)?); + make_sort_exprs_from_requirements(parent_required.ok_or_else(err)?); let expr_source_side = expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); match expr_source_side { @@ -262,8 +265,8 @@ fn pushdown_requirement_to_children( /// If the the parent requirements are more specific, push down the parent requirements /// If they are not compatible, need to add Sort. 
fn determine_children_requirement( - parent_required: Option<&[PhysicalSortRequirements]>, - request_child: Option<&[PhysicalSortRequirements]>, + parent_required: Option<&[PhysicalSortRequirement]>, + request_child: Option<&[PhysicalSortRequirement]>, child_plan: Arc, ) -> RequirementsCompatibility { if requirements_compatible(request_child, parent_required, || { @@ -284,10 +287,10 @@ fn determine_children_requirement( fn try_pushdown_requirements_to_join( plan: &Arc, - parent_required: Option<&[PhysicalSortRequirements]>, + parent_required: Option<&[PhysicalSortRequirement]>, sort_expr: Vec, push_side: JoinSide, -) -> Result>>>> { +) -> Result>>>> { let child_idx = match push_side { JoinSide::Left => 0, JoinSide::Right => 1, @@ -396,20 +399,20 @@ fn expr_source_sides( } fn shift_right_required( - parent_required: &[PhysicalSortRequirements], + parent_required: &[PhysicalSortRequirement], left_columns_len: usize, -) -> Result> { - let new_right_required: Vec = parent_required +) -> Result> { + let new_right_required: Vec = parent_required .iter() .filter_map(|r| { if let Some(col) = r.expr.as_any().downcast_ref::() { if col.index() >= left_columns_len { - Some(PhysicalSortRequirements { + Some(PhysicalSortRequirement { expr: Arc::new(Column::new( col.name(), col.index() - left_columns_len, )) as Arc, - sort_options: r.sort_options, + options: r.options, }) } else { None @@ -435,7 +438,7 @@ enum RequirementsCompatibility { /// Requirements satisfy Satisfy, /// Requirements compatible - Compatible(Option>), + Compatible(Option>), /// Requirements not compatible NonCompatible, } diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 7c74fdcd89523..abd828d91be05 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -72,23 +72,23 @@ pub fn add_sort_above( } /// Checks whether the given executor is a limit; -/// i.e. 
either a `LocalLimitExec` or a `GlobalLimitExec`. +/// i.e. either a [`LocalLimitExec`] or a [`GlobalLimitExec`]. pub fn is_limit(plan: &Arc) -> bool { plan.as_any().is::() || plan.as_any().is::() } -/// Checks whether the given executor is a widnow; -/// i.e. either a `WindowAggExec` or a `BoundedWindowAggExec`. +/// Checks whether the given executor is a window; +/// i.e. either a [`WindowAggExec`] or a [`BoundedWindowAggExec`]. pub fn is_window(plan: &Arc) -> bool { plan.as_any().is::() || plan.as_any().is::() } -/// Checks whether the given executor is a `SortExec`. +/// Checks whether the given executor is a [`SortExec`]. pub fn is_sort(plan: &Arc) -> bool { plan.as_any().is::() } -/// Checks whether the given executor is a `SortPreservingMergeExec`. +/// Checks whether the given executor is a [`SortPreservingMergeExec`]. pub fn is_sort_preserving_merge(plan: &Arc) -> bool { plan.as_any().is::() } diff --git a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs index 035ffa4dfa089..bcf08126626c3 100644 --- a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs +++ b/datafusion/core/src/physical_plan/joins/sort_merge_join.rs @@ -34,7 +34,9 @@ use arrow::compute::{concat_batches, take, SortOptions}; use arrow::datatypes::{DataType, SchemaRef, TimeUnit}; use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; -use datafusion_physical_expr::{new_sort_requirements, PhysicalSortRequirements}; +use datafusion_physical_expr::{ + make_sort_requirements_from_exprs, PhysicalSortRequirement, +}; use futures::{Stream, StreamExt}; use crate::error::DataFusionError; @@ -222,10 +224,11 @@ impl ExecutionPlan for SortMergeJoinExec { ] } - fn required_input_ordering(&self) -> Vec>> { - let left_requirements = new_sort_requirements(Some(&self.left_sort_exprs)); - let right_requirements = new_sort_requirements(Some(&self.right_sort_exprs)); - vec![left_requirements, right_requirements] + fn 
required_input_ordering(&self) -> Vec>> { + vec![ + Some(make_sort_requirements_from_exprs(&self.left_sort_exprs)), + Some(make_sort_requirements_from_exprs(&self.right_sort_exprs)), + ] } fn output_partitioning(&self) -> Partitioning { diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 649c5753a8a2c..fd57782dfc4bb 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -46,7 +46,9 @@ use hashbrown::{raw::RawTable, HashSet}; use datafusion_common::{utils::bisect, ScalarValue}; use datafusion_physical_expr::intervals::{ExprIntervalGraph, Interval}; -use datafusion_physical_expr::{new_sort_requirements, PhysicalSortRequirements}; +use datafusion_physical_expr::{ + make_sort_requirements_from_exprs, PhysicalSortRequirement, +}; use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; @@ -391,12 +393,15 @@ impl ExecutionPlan for SymmetricHashJoinExec { self.schema.clone() } - fn required_input_ordering(&self) -> Vec>> { - let left_ordering_requirements = - new_sort_requirements(self.left.output_ordering()); - let right_ordering_requirements = - new_sort_requirements(self.right.output_ordering()); - vec![left_ordering_requirements, right_ordering_requirements] + fn required_input_ordering(&self) -> Vec>> { + vec![ + self.left + .output_ordering() + .map(make_sort_requirements_from_exprs), + self.right + .output_ordering() + .map(make_sort_requirements_from_exprs), + ] } fn unbounded_output(&self, children: &[bool]) -> Result { diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs index 1b576c5b6888c..3b11fcc40d202 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ b/datafusion/core/src/physical_plan/mod.rs @@ -141,7 +141,7 @@ pub trait ExecutionPlan: Debug + Send + Sync { /// NOTE that checking 
`!is_empty()` does **not** check for a /// required input ordering. Instead, the correct check is that at /// least one entry must be `Some` - fn required_input_ordering(&self) -> Vec>> { + fn required_input_ordering(&self) -> Vec>> { vec![None; self.children().len()] } @@ -594,7 +594,7 @@ use datafusion_physical_expr::{ expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, }; pub use datafusion_physical_expr::{AggregateExpr, PhysicalExpr}; -use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirements}; +use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirement}; /// Applies an optional projection to a [`SchemaRef`], returning the /// projected schema diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index 2d7018f38a335..edacae4052d83 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -47,7 +47,7 @@ use crate::physical_plan::{ SendableRecordBatchStream, Statistics, }; use datafusion_physical_expr::{ - new_sort_requirements, EquivalenceProperties, PhysicalSortRequirements, + make_sort_requirements_from_exprs, EquivalenceProperties, PhysicalSortRequirement, }; /// Sort preserving merge execution plan @@ -127,9 +127,8 @@ impl ExecutionPlan for SortPreservingMergeExec { vec![Distribution::UnspecifiedDistribution] } - fn required_input_ordering(&self) -> Vec>> { - let ordering_requirements = new_sort_requirements(Some(&self.expr)); - vec![ordering_requirements] + fn required_input_ordering(&self) -> Vec>> { + vec![Some(make_sort_requirements_from_exprs(&self.expr))] } fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 57863459d6360..fd2252ca862d8 100644 --- 
a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -55,7 +55,7 @@ use datafusion_physical_expr::window::{ WindowAggState, WindowState, }; use datafusion_physical_expr::{ - EquivalenceProperties, PhysicalExpr, PhysicalSortRequirements, + EquivalenceProperties, PhysicalExpr, PhysicalSortRequirement, }; use indexmap::IndexMap; use log::debug; @@ -169,29 +169,22 @@ impl ExecutionPlan for BoundedWindowAggExec { self.input().output_ordering() } - fn required_input_ordering(&self) -> Vec>> { - let expr_partition_keys = self.window_expr()[0].partition_by(); - let expr_order_keys = self.window_expr()[0].order_by(); + fn required_input_ordering(&self) -> Vec>> { + let partition_keys = self.window_expr()[0].partition_by(); + let order_keys = self.window_expr()[0].order_by(); let requirements = self.sort_keys.as_deref().map(|ordering| { ordering .iter() .map(|o| { - let is_partition_only = - expr_partition_keys.iter().any(|e| e.eq(&o.expr)) - && !expr_order_keys.iter().any(|e| e.expr.eq(&o.expr)); - if is_partition_only { - PhysicalSortRequirements { - expr: o.expr.clone(), - sort_options: None, - } - } else { - PhysicalSortRequirements { - expr: o.expr.clone(), - sort_options: Some(o.options), - } + let in_partition_keys = partition_keys.iter().any(|e| o.expr.eq(e)); + let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); + let not_partition_only = !in_partition_keys || in_order_keys; + PhysicalSortRequirement { + expr: o.expr.clone(), + options: not_partition_only.then_some(o.options), } }) - .collect::>() + .collect() }); vec![requirements] } diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index a0afa3a27bebc..db767b5ff8d31 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -39,10 +39,9 
@@ use arrow::{ record_batch::RecordBatch, }; use datafusion_common::DataFusionError; -use datafusion_physical_expr::PhysicalSortRequirements; +use datafusion_physical_expr::PhysicalSortRequirement; use futures::stream::Stream; use futures::{ready, StreamExt}; -use log::debug; use std::any::Any; use std::ops::Range; use std::pin::Pin; @@ -173,36 +172,28 @@ impl ExecutionPlan for WindowAggExec { vec![true] } - fn required_input_ordering(&self) -> Vec>> { - let expr_partition_keys = self.window_expr()[0].partition_by(); - let expr_order_keys = self.window_expr()[0].order_by(); + fn required_input_ordering(&self) -> Vec>> { + let partition_keys = self.window_expr()[0].partition_by(); + let order_keys = self.window_expr()[0].order_by(); let requirements = self.sort_keys.as_deref().map(|ordering| { ordering .iter() .map(|o| { - let is_partition_only = - expr_partition_keys.iter().any(|e| e.eq(&o.expr)) - && !expr_order_keys.iter().any(|e| e.expr.eq(&o.expr)); - if is_partition_only { - PhysicalSortRequirements { - expr: o.expr.clone(), - sort_options: None, - } - } else { - PhysicalSortRequirements { - expr: o.expr.clone(), - sort_options: Some(o.options), - } + let in_partition_keys = partition_keys.iter().any(|e| o.expr.eq(e)); + let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); + let not_partition_only = !in_partition_keys || in_order_keys; + PhysicalSortRequirement { + expr: o.expr.clone(), + options: not_partition_only.then_some(o.options), } }) - .collect::>() + .collect() }); vec![requirements] } fn required_input_distribution(&self) -> Vec { if self.partition_keys.is_empty() { - debug!("No partition defined for WindowAggExec!!!"); vec![Distribution::SinglePartition] } else { vec![Distribution::HashPartitioned(self.partition_keys.clone())] diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 5f082e03a593e..161200e50f799 100644 --- a/datafusion/physical-expr/src/lib.rs +++ 
b/datafusion/physical-expr/src/lib.rs @@ -53,10 +53,11 @@ pub use equivalence::EquivalentClass; pub use physical_expr::{AnalysisContext, ExprBoundaries, PhysicalExpr, PhysicalExprRef}; pub use planner::create_physical_expr; pub use scalar_function::ScalarFunctionExpr; -pub use sort_expr::PhysicalSortExpr; -pub use sort_expr::PhysicalSortRequirements; +pub use sort_expr::{ + make_sort_requirements_from_exprs, PhysicalSortExpr, PhysicalSortRequirement, +}; pub use utils::{ - expr_list_eq_any_order, expr_list_eq_strict_order, new_sort_requirements, + expr_list_eq_any_order, expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, normalize_out_expr_with_alias_schema, normalize_sort_expr_with_equivalence_properties, sort_expr_list_eq_strict_order, split_conjunction, diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs index a25802e9980ab..5683d4abee231 100644 --- a/datafusion/physical-expr/src/sort_expr.rs +++ b/datafusion/physical-expr/src/sort_expr.rs @@ -41,14 +41,7 @@ impl PartialEq for PhysicalSortExpr { impl std::fmt::Display for PhysicalSortExpr { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let opts_string = match (self.options.descending, self.options.nulls_first) { - (true, true) => "DESC", - (true, false) => "DESC NULLS LAST", - (false, true) => "ASC", - (false, false) => "ASC NULLS LAST", - }; - - write!(f, "{} {}", self.expr, opts_string) + write!(f, "{} {}", self.expr, to_string(&self.options)) } } @@ -70,55 +63,69 @@ impl PhysicalSortExpr { }) } - pub fn satisfy(&self, requirement: &PhysicalSortRequirements) -> bool { - if requirement.sort_options.is_some() { - self.options == requirement.sort_options.unwrap() - && self.expr.eq(&requirement.expr) - } else { - self.expr.eq(&requirement.expr) - } + pub fn satisfy(&self, requirement: &PhysicalSortRequirement) -> bool { + self.expr.eq(&requirement.expr) + && requirement + .options + .map_or(true, |opts| self.options == opts) } 
} /// Represents sort requirement associated with a plan #[derive(Clone, Debug)] -pub struct PhysicalSortRequirements { +pub struct PhysicalSortRequirement { /// Physical expression representing the column to sort pub expr: Arc, /// Option to specify how the given column should be sorted. - /// If not specified, the PhysicalSortRequirements does not have specific requirements on the sort options. - pub sort_options: Option, + /// If unspecified, there is no constraint on sort options. + pub options: Option, } -impl PartialEq for PhysicalSortRequirements { - fn eq(&self, other: &PhysicalSortRequirements) -> bool { - self.sort_options == other.sort_options && self.expr.eq(&other.expr) +impl From for PhysicalSortRequirement { + fn from(value: PhysicalSortExpr) -> Self { + Self { + expr: value.expr, + options: Some(value.options), + } + } +} + +impl PartialEq for PhysicalSortRequirement { + fn eq(&self, other: &PhysicalSortRequirement) -> bool { + self.options == other.options && self.expr.eq(&other.expr) } } -impl std::fmt::Display for PhysicalSortRequirements { +impl std::fmt::Display for PhysicalSortRequirement { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let opts_string = if let Some(sort_options) = self.sort_options { - match (sort_options.descending, sort_options.nulls_first) { - (true, true) => "DESC", - (true, false) => "DESC NULLS LAST", - (false, true) => "ASC", - (false, false) => "ASC NULLS LAST", - } - } else { - "NA" - }; + let opts_string = self.options.as_ref().map_or("NA", to_string); write!(f, "{} {}", self.expr, opts_string) } } -impl PhysicalSortRequirements { - /// Requirement is compatible with the other means the current requirement is equal or more specific than the other - pub fn compatible(&self, other: &PhysicalSortRequirements) -> bool { - if other.sort_options.is_some() { - self.eq(other) - } else { - self.expr.eq(&other.expr) - } +impl PhysicalSortRequirement { + /// Returns whether this requirement is equal or more 
specific than `other`. + pub fn compatible(&self, other: &PhysicalSortRequirement) -> bool { + self.expr.eq(&other.expr) + && other.options.map_or(true, |other_opts| { + self.options.map_or(false, |opts| opts == other_opts) + }) + } +} + +pub fn make_sort_requirements_from_exprs( + ordering: &[PhysicalSortExpr], +) -> Vec { + ordering.iter().map(|e| e.clone().into()).collect() +} + +/// Returns the SQL string representation of the given [SortOptions] object. +#[inline] +fn to_string(options: &SortOptions) -> &str { + match (options.descending, options.nulls_first) { + (true, true) => "DESC", + (true, false) => "DESC NULLS LAST", + (false, true) => "ASC", + (false, false) => "ASC NULLS LAST", } } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 0f67bb4290b87..76b2871886a5b 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -19,7 +19,7 @@ use crate::equivalence::EquivalentClass; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::rewrite::{TreeNodeRewritable, TreeNodeRewriter}; use crate::{ - EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirements, + EquivalenceProperties, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, }; use arrow::datatypes::SchemaRef; use datafusion_common::{DataFusionError, Result}; @@ -118,33 +118,21 @@ pub fn normalize_out_expr_with_alias_schema( alias_map: &HashMap>, schema: &SchemaRef, ) -> Arc { - let expr_clone = expr.clone(); - expr_clone + expr.clone() .transform(&|expr| { - let normalized_form: Option> = - match expr.as_any().downcast_ref::() { - Some(column) => { - let out = alias_map - .get(column) - .map(|c| { - let out_col: Arc = - Arc::new(c[0].clone()); - out_col - }) - .or_else(|| match schema.index_of(column.name()) { - // Exactly matching, return None, no need to do the transform - Ok(idx) if column.index() == idx => None, - _ => { - let out_col: Arc = - 
Arc::new(UnKnownColumn::new(column.name())); - Some(out_col) - } - }); - out - } - None => None, - }; - Ok(normalized_form) + Ok(match expr.as_any().downcast_ref::() { + Some(column) => { + alias_map + .get(column) + .map(|c| Arc::new(c[0].clone()) as _) + .or_else(|| match schema.index_of(column.name()) { + // Exactly matching, return None, no need to do the transform + Ok(idx) if column.index() == idx => None, + _ => Some(Arc::new(UnKnownColumn::new(column.name())) as _), + }) + } + None => None, + }) }) .unwrap_or(expr) } @@ -153,38 +141,21 @@ pub fn normalize_expr_with_equivalence_properties( expr: Arc, eq_properties: &[EquivalentClass], ) -> Arc { - let expr_clone = expr.clone(); - expr_clone + expr.clone() .transform(&|expr| match expr.as_any().downcast_ref::() { Some(column) => { - let mut normalized: Option> = None; for class in eq_properties { if class.contains(column) { - normalized = Some(Arc::new(class.head().clone())); - break; + return Ok(Some(Arc::new(class.head().clone()))); } } - Ok(normalized) + Ok(None) } None => Ok(None), }) .unwrap_or(expr) } -pub fn new_sort_requirements( - sort_keys: Option<&[PhysicalSortExpr]>, -) -> Option> { - sort_keys.map(|ordering| { - ordering - .iter() - .map(|o| PhysicalSortRequirements { - expr: o.expr.clone(), - sort_options: Some(o.options), - }) - .collect::>() - }) -} - pub fn normalize_sort_expr_with_equivalence_properties( sort_expr: PhysicalSortExpr, eq_properties: &[EquivalentClass], @@ -203,24 +174,25 @@ pub fn normalize_sort_expr_with_equivalence_properties( } pub fn normalize_sort_requirement_with_equivalence_properties( - sort_requirement: PhysicalSortRequirements, + sort_requirement: PhysicalSortRequirement, eq_properties: &[EquivalentClass], -) -> PhysicalSortRequirements { +) -> PhysicalSortRequirement { let normalized_expr = normalize_expr_with_equivalence_properties( sort_requirement.expr.clone(), eq_properties, ); if sort_requirement.expr.ne(&normalized_expr) { - PhysicalSortRequirements { + 
PhysicalSortRequirement { expr: normalized_expr, - sort_options: sort_requirement.sort_options, + options: sort_requirement.options, } } else { sort_requirement } } -/// Checks whether the required [PhysicalSortExpr]s are satisfied by the provided [PhysicalSortExpr]s. +/// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the +/// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy EquivalenceProperties>( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortExpr]>, @@ -235,6 +207,8 @@ pub fn ordering_satisfy EquivalenceProperties>( } } +/// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the +/// provided [`PhysicalSortExpr`]s. fn ordering_satisfy_concrete EquivalenceProperties>( provided: &[PhysicalSortExpr], required: &[PhysicalSortExpr], @@ -245,35 +219,29 @@ fn ordering_satisfy_concrete EquivalenceProperties>( } else if required .iter() .zip(provided.iter()) - .all(|(order1, order2)| order1.eq(order2)) + .all(|(req, given)| req.eq(given)) { true } else if let eq_classes @ [_, ..] = equal_properties().classes() { - let normalized_required_exprs = required + required .iter() .map(|e| { normalize_sort_expr_with_equivalence_properties(e.clone(), eq_classes) }) - .collect::>(); - let normalized_provided_exprs = provided - .iter() - .map(|e| { + .zip(provided.iter().map(|e| { normalize_sort_expr_with_equivalence_properties(e.clone(), eq_classes) - }) - .collect::>(); - normalized_required_exprs - .iter() - .zip(normalized_provided_exprs.iter()) - .all(|(order1, order2)| order1.eq(order2)) + })) + .all(|(req, given)| req.eq(&given)) } else { false } } -/// Checks whether the required ordering requirements are satisfied by the provided [PhysicalSortExpr]s. +/// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the +/// provided [`PhysicalSortExpr`]s. 
pub fn ordering_satisfy_requirement EquivalenceProperties>( provided: Option<&[PhysicalSortExpr]>, - required: Option<&[PhysicalSortRequirements]>, + required: Option<&[PhysicalSortRequirement]>, equal_properties: F, ) -> bool { match (provided, required) { @@ -285,9 +253,11 @@ pub fn ordering_satisfy_requirement EquivalenceProperties>( } } +/// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the +/// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_requirement_concrete EquivalenceProperties>( provided: &[PhysicalSortExpr], - required: &[PhysicalSortRequirements], + required: &[PhysicalSortRequirement], equal_properties: F, ) -> bool { if required.len() > provided.len() { @@ -295,11 +265,11 @@ pub fn ordering_satisfy_requirement_concrete EquivalencePropertie } else if required .iter() .zip(provided.iter()) - .all(|(order1, order2)| order2.satisfy(order1)) + .all(|(req, given)| given.satisfy(req)) { true } else if let eq_classes @ [_, ..] = equal_properties().classes() { - let normalized_requirements = required + required .iter() .map(|e| { normalize_sort_requirement_with_equivalence_properties( @@ -307,81 +277,80 @@ pub fn ordering_satisfy_requirement_concrete EquivalencePropertie eq_classes, ) }) - .collect::>(); - let normalized_provided_exprs = provided - .iter() - .map(|e| { + .zip(provided.iter().map(|e| { normalize_sort_expr_with_equivalence_properties(e.clone(), eq_classes) - }) - .collect::>(); - normalized_requirements - .iter() - .zip(normalized_provided_exprs.iter()) - .all(|(order1, order2)| order2.satisfy(order1)) + })) + .all(|(req, given)| given.satisfy(&req)) } else { false } } -/// Provided requirements are compatible with the required, which means the provided requirements are equal or more specific than the required +/// Checks whether the given [`PhysicalSortRequirement`]s are equal or more +/// specific than the provided [`PhysicalSortRequirement`]s. 
pub fn requirements_compatible EquivalenceProperties>( - provided: Option<&[PhysicalSortRequirements]>, - required: Option<&[PhysicalSortRequirements]>, + provided: Option<&[PhysicalSortRequirement]>, + required: Option<&[PhysicalSortRequirement]>, equal_properties: F, ) -> bool { match (provided, required) { (_, None) => true, (None, Some(_)) => false, (Some(provided), Some(required)) => { - if required.len() > provided.len() { - false - } else if required - .iter() - .zip(provided.iter()) - .all(|(req, pro)| pro.compatible(req)) - { - true - } else if let eq_classes @ [_, ..] = equal_properties().classes() { - let normalized_required = required - .iter() - .map(|e| { - normalize_sort_requirement_with_equivalence_properties( - e.clone(), - eq_classes, - ) - }) - .collect::>(); - let normalized_provided = provided - .iter() - .map(|e| { - normalize_sort_requirement_with_equivalence_properties( - e.clone(), - eq_classes, - ) - }) - .collect::>(); - normalized_required - .iter() - .zip(normalized_provided.iter()) - .all(|(req, pro)| pro.compatible(req)) - } else { - false - } + requirements_compatible_concrete(provided, required, equal_properties) } } } +/// Checks whether the given [`PhysicalSortRequirement`]s are equal or more +/// specific than the provided [`PhysicalSortRequirement`]s. +fn requirements_compatible_concrete EquivalenceProperties>( + provided: &[PhysicalSortRequirement], + required: &[PhysicalSortRequirement], + equal_properties: F, +) -> bool { + if required.len() > provided.len() { + false + } else if required + .iter() + .zip(provided.iter()) + .all(|(req, given)| given.compatible(req)) + { + true + } else if let eq_classes @ [_, ..] 
= equal_properties().classes() { + required + .iter() + .map(|e| { + normalize_sort_requirement_with_equivalence_properties( + e.clone(), + eq_classes, + ) + }) + .zip(provided.iter().map(|e| { + normalize_sort_requirement_with_equivalence_properties( + e.clone(), + eq_classes, + ) + })) + .all(|(req, given)| given.compatible(&req)) + } else { + false + } +} + pub fn map_columns_before_projection( parent_required: &[Arc], proj_exprs: &[(Arc, String)], ) -> Vec> { - let mut column_mapping = HashMap::new(); - for (expression, name) in proj_exprs.iter() { - if let Some(column) = expression.as_any().downcast_ref::() { - column_mapping.insert(name.clone(), column.clone()); - }; - } - let new_required: Vec> = parent_required + let column_mapping = proj_exprs + .iter() + .filter_map(|(expr, name)| { + expr.as_any() + .downcast_ref::() + .map(|column| (name.clone(), column.clone())) + }) + .collect::>(); + parent_required .iter() .filter_map(|r| { if let Some(column) = r.as_any().downcast_ref::() { @@ -390,57 +359,24 @@ pub fn map_columns_before_projection( None } }) - .map(|e| Arc::new(e.clone()) as Arc) - .collect::>(); - new_required -} - -pub fn map_requirement_before_projection( - parent_required: Option<&[PhysicalSortRequirements]>, - proj_exprs: &[(Arc, String)], -) -> Option> { - println!("parent_required: {:?}", parent_required); - println!("proj_exprs: {:?}", proj_exprs); - if let Some(requirement) = parent_required { - let required_expr = create_sort_expr_from_requirement(requirement) - .iter() - .map(|sort_expr| sort_expr.expr.clone()) - .collect::>(); - println!("required_expr:{:?}", required_expr); - let new_exprs = map_columns_before_projection(&required_expr, proj_exprs); - println!("new_exprs:{:?}", new_exprs); - if new_exprs.len() == requirement.len() { - let new_request = new_exprs - .iter() - .zip(requirement.iter()) - .map(|(new, old)| PhysicalSortRequirements { - expr: new.clone(), - sort_options: old.sort_options, - }) - .collect::>(); - 
Some(new_request) - } else { - None - } - } else { - None - } + .map(|e| Arc::new(e.clone()) as _) + .collect() } -pub fn create_sort_expr_from_requirement( - required: &[PhysicalSortRequirements], +pub fn make_sort_exprs_from_requirements( + required: &[PhysicalSortRequirement], ) -> Vec { - let parent_required_expr = required + required .iter() - .map(|prop| { - if prop.sort_options.is_some() { + .map(|requirement| { + if let Some(options) = requirement.options { PhysicalSortExpr { - expr: prop.expr.clone(), - options: prop.sort_options.unwrap(), + expr: requirement.expr.clone(), + options, } } else { PhysicalSortExpr { - expr: prop.expr.clone(), + expr: requirement.expr.clone(), options: SortOptions { // By default, create sort key with ASC is true and NULLS LAST to be consistent with // PostgreSQL's rule: https://www.postgresql.org/docs/current/queries-order.html @@ -450,8 +386,7 @@ pub fn create_sort_expr_from_requirement( } } }) - .collect::>(); - parent_required_expr + .collect() } #[derive(Clone, Debug)] From de2d11e3eaea5c9781be6536ff661afcc8457682 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Fri, 17 Mar 2023 00:09:53 -0500 Subject: [PATCH 23/35] Refactors and simplifications part 2 --- .../physical_optimizer/sort_enforcement.rs | 112 +++++++------- .../src/physical_optimizer/sort_pushdown.rs | 144 ++++++------------ .../core/src/physical_optimizer/utils.rs | 12 ++ 3 files changed, 112 insertions(+), 156 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 752a800478772..92bb89a477103 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -37,7 +37,8 @@ use crate::config::ConfigOptions; use crate::error::Result; use crate::physical_optimizer::sort_pushdown::{pushdown_sorts, SortPushDown}; use crate::physical_optimizer::utils::{ - add_sort_above, is_limit, is_sort, 
is_sort_preserving_merge, is_window, + add_sort_above, is_coalesce_partitions, is_limit, is_sort, is_sort_preserving_merge, + is_window, }; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -105,8 +106,8 @@ impl ExecTree { } } -/// This object is used within the [EnforceSorting] rule to track the closest -/// `SortExec` descendant(s) for every child of a plan. +/// This object is used within the [`EnforceSorting`] rule to track the closest +/// [`SortExec`] descendant(s) for every child of a plan. #[derive(Debug, Clone)] struct PlanWithCorrespondingSort { plan: Arc, @@ -204,7 +205,7 @@ impl TreeNodeRewritable for PlanWithCorrespondingSort { } /// This object is used within the [EnforceSorting] rule to track the closest -/// `CoalescePartitionsExec` descendant(s) for every child of a plan. +/// [`CoalescePartitionsExec`] descendant(s) for every child of a plan. #[derive(Debug, Clone)] struct PlanWithCorrespondingCoalescePartitions { plan: Arc, @@ -245,7 +246,7 @@ impl PlanWithCorrespondingCoalescePartitions { if plan.children().is_empty() { // Plan has no children, there is nothing to propagate. 
None - } else if plan.as_any().is::() { + } else if is_coalesce_partitions(&plan) { Some(ExecTree::new(plan, idx, vec![])) } else { let children = item @@ -317,6 +318,8 @@ impl PhysicalOptimizerRule for EnforceSorting { config: &ConfigOptions, ) -> Result> { let plan_requirements = PlanWithCorrespondingSort::new(plan); + // Execute a bottom-up traversal to enforce sorting requirements, + // remove unnecessary sorts, and optimize sort-sensitive operators: let adjusted = plan_requirements.transform_up(&ensure_sorting)?; let new_plan = if config.optimizer.repartition_sorts { let plan_with_coalesce_partitions = @@ -327,7 +330,8 @@ impl PhysicalOptimizerRule for EnforceSorting { } else { adjusted.plan }; - // Execute a Top-Down process (Preorder Traversal) to push down sorts if they are helpful: + // Execute a top-down traversal to exploit sort push-down opportunities + // missed by the bottom-up traversal: let sort_pushdown = SortPushDown::init(new_plan); let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; Ok(adjusted.plan) @@ -343,38 +347,34 @@ impl PhysicalOptimizerRule for EnforceSorting { } /// This function turns plans of the form -/// "SortExec: expr=[a@0 ASC]", +/// "SortExec: expr=\[a@0 ASC\]", /// " CoalescePartitionsExec", /// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", /// to -/// "SortPreservingMergeExec: [a@0 ASC]", -/// " SortExec: expr=[a@0 ASC]", +/// "SortPreservingMergeExec: \[a@0 ASC\]", +/// " SortExec: expr=\[a@0 ASC\]", /// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", -/// by following connections from `CoalescePartitionsExec`s to `SortExec`s. +/// by following connections from [`CoalescePartitionsExec`]s to [`SortExec`]s. /// By performing sorting in parallel, we can increase performance in some scenarios. 
fn parallelize_sorts( requirements: PlanWithCorrespondingCoalescePartitions, ) -> Result> { let plan = requirements.plan; let mut coalesce_onwards = requirements.coalesce_onwards; - if plan.children().is_empty() - // We only do action when plan is either SortExec, SortPreservingMergeExec or CoalescePartitionsExec - // all of them have single child. If 0th child is `None` we can immediately return. - || coalesce_onwards[0].is_none() - { + if plan.children().is_empty() || coalesce_onwards[0].is_none() { + // We only take an action when the plan is either a SortExec, a + // SortPreservingMergeExec or a CoalescePartitionsExec, and they + // all have a single child. Therefore, if the first child is `None`, + // we can return immediately. return Ok(None); - } - // We know that `plan` has children, so `coalesce_onwards` is non-empty. - if (is_sort(&plan) || is_sort_preserving_merge(&plan)) - // Make sure that Sort is actually global sort + } else if (is_sort(&plan) || is_sort_preserving_merge(&plan)) && plan.output_partitioning().partition_count() <= 1 { - // If there is a connection between a `CoalescePartitionsExec` and a - // Global Sort that satisfy the requirements (i.e. intermediate - // executors don't require single partition), then we can - // replace the `CoalescePartitionsExec`+ GlobalSort cascade with - // the `SortExec` + `SortPreservingMergeExec` - // cascade to parallelize sorting. + // If there is a connection between a CoalescePartitionsExec and a + // global sort that satisfy the requirements (i.e. intermediate + // executors don't require single partition), then we can replace + // the CoalescePartitionsExec + Sort cascade with a SortExec + + // SortPreservingMergeExec cascade to parallelize sorting. 
let mut prev_layer = plan.clone(); update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; let sort_exprs = get_sort_exprs(&plan)?; @@ -384,7 +384,7 @@ fn parallelize_sorts( plan: Arc::new(spm), coalesce_onwards: vec![None], })); - } else if plan.as_any().is::() { + } else if is_coalesce_partitions(&plan) { // There is an unnecessary `CoalescePartitionExec` in the plan. let mut prev_layer = plan.clone(); update_child_to_remove_coalesce(&mut prev_layer, &mut coalesce_onwards[0])?; @@ -479,7 +479,8 @@ fn ensure_sorting( } else if is_sort_preserving_merge(&plan) && children[0].output_partitioning().partition_count() <= 1 { - // sort preserving merge can removed. Input already has single partition + // This SortPreservingMergeExec is unnecessary, input already has a + // single partition. return Ok(Some(PlanWithCorrespondingSort { plan: children[0].clone(), sort_onwards: vec![sort_onwards[0].clone()], @@ -491,8 +492,8 @@ fn ensure_sorting( })) } -/// Analyzes a given `SortExec` (`plan`) to determine whether its input already -/// has a finer ordering than this `SortExec` enforces. +/// Analyzes a given [`SortExec`] (`plan`) to determine whether its input +/// already has a finer ordering than it enforces. fn analyze_immediate_sort_removal( plan: &Arc, sort_onwards: &[Option], @@ -539,8 +540,8 @@ fn analyze_immediate_sort_removal( None } -/// Analyzes a [WindowAggExec] or a [BoundedWindowAggExec] to determine whether -/// it may allow removing a sort. +/// Analyzes a [`WindowAggExec`] or a [`BoundedWindowAggExec`] to determine +/// whether it may allow removing a sort. 
fn analyze_window_sort_removal( sort_tree: &mut ExecTree, window_exec: &Arc, @@ -588,8 +589,7 @@ fn analyze_window_sort_removal( )?; if !can_skip_sorting { return Ok(None); - } - if let Some(first_should_reverse) = first_should_reverse { + } else if let Some(first_should_reverse) = first_should_reverse { if first_should_reverse != should_reverse { return Ok(None); } @@ -661,26 +661,20 @@ fn update_child_to_remove_coalesce( fn remove_corresponding_coalesce_in_sub_plan( coalesce_onwards: &mut ExecTree, ) -> Result> { - Ok( - if coalesce_onwards - .plan - .as_any() - .is::() - { - // We can safely use the 0th index since we have a `CoalescePartitionsExec`. - coalesce_onwards.plan.children()[0].clone() - } else { - let plan = coalesce_onwards.plan.clone(); - let mut children = plan.children(); - for item in &mut coalesce_onwards.children { - children[item.idx] = remove_corresponding_coalesce_in_sub_plan(item)?; - } - plan.with_new_children(children)? - }, - ) + Ok(if is_coalesce_partitions(&coalesce_onwards.plan) { + // We can safely use the 0th index since we have a `CoalescePartitionsExec`. + coalesce_onwards.plan.children()[0].clone() + } else { + let plan = coalesce_onwards.plan.clone(); + let mut children = plan.children(); + for item in &mut coalesce_onwards.children { + children[item.idx] = remove_corresponding_coalesce_in_sub_plan(item)?; + } + plan.with_new_children(children)? + }) } -/// Updates child to remove the unnecessary sorting below it. +/// Updates child to remove the unnecessary sort below it. fn update_child_to_remove_unnecessary_sort( child: &mut Arc, sort_onwards: &mut Option, @@ -698,8 +692,10 @@ fn update_child_to_remove_unnecessary_sort( )?; } *sort_onwards = None; - // Deleting sort(SortExec+SortPreservingMergeExec) may invalidate distribution requirement - update_child_to_satisfy_distribution(child, parent, child_idx) + // Deleting a merging sort may invalidate distribution requirements. 
+ // Ensure that we stay compliant with such requirements: + update_child_to_satisfy_distribution(child, parent, child_idx); + Ok(()) } /// Removes the sort from the plan in `sort_onwards`. @@ -741,8 +737,8 @@ fn update_child_to_satisfy_distribution( child: &mut Arc, parent: &Arc, child_idx: usize, -) -> Result<()> { - // If distribution requirement is not satisfied, satisfies it by adding +) { + // If distribution requirement is not met, satisfy it by adding a // CoalescePartitionsExec. let requires_single_partition = matches!( parent.required_input_distribution()[child_idx], @@ -751,7 +747,6 @@ fn update_child_to_satisfy_distribution( if requires_single_partition && child.output_partitioning().partition_count() > 1 { *child = Arc::new(CoalescePartitionsExec::new(child.clone())) as _; } - Ok(()) } /// Converts an [ExecutionPlan] trait object to a [PhysicalSortExpr] slice when possible. @@ -1088,7 +1083,6 @@ mod tests { sort, ); - // let filter_exec = sort_exec; let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs, filter); let expected_input = vec![ @@ -1504,8 +1498,8 @@ mod tests { let physical_plan = sort_preserving_merge_exec(sort_exprs, union); // Input is an invalid plan. In this case rule should add required sorting in appropriate places. - // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy required ordering - // of SortPreservingMergeExec. + // First ParquetExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy the + // required ordering of SortPreservingMergeExec. 
let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 41c9e26da5180..5bff79088a828 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -use crate::physical_optimizer::utils::{add_sort_above, is_limit, is_window}; +use crate::physical_optimizer::utils::{add_sort_above, is_limit, is_union, is_window}; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::joins::utils::JoinSide; use crate::physical_plan::joins::SortMergeJoinExec; @@ -22,7 +22,6 @@ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::tree_node::TreeNodeRewritable; -use crate::physical_plan::union::UnionExec; use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::JoinType; @@ -32,15 +31,15 @@ use datafusion_physical_expr::utils::{ requirements_compatible, }; use datafusion_physical_expr::{ - make_sort_requirements_from_exprs, PhysicalExpr, PhysicalSortExpr, - PhysicalSortRequirement, + make_sort_requirements_from_exprs, PhysicalSortExpr, PhysicalSortRequirement, }; use itertools::izip; use std::ops::Deref; use std::sync::Arc; -/// This is a "data class" we use within the [EnforceSorting] rule to push down SortExec in the plan. -// By pushing down SortExecs through some Executors in the plan we can increase speed. +/// This is a "data class" we use within the [`EnforceSorting`] rule to push +/// down [`SortExec`] in the plan. 
In some cases, we can reduce the total +/// computational cost by pushing down `SortExec`s through some executors. #[derive(Debug, Clone)] pub(crate) struct SortPushDown { /// Current plan @@ -63,11 +62,8 @@ impl SortPushDown { } pub fn children(&self) -> Vec { - let plan_children = self.plan.children(); - assert_eq!(plan_children.len(), self.adjusted_request_ordering.len()); - izip!( - plan_children.into_iter(), + self.plan.children().into_iter(), self.adjusted_request_ordering.clone().into_iter(), ) .map(|(child, from_parent)| { @@ -83,44 +79,35 @@ impl SortPushDown { } impl TreeNodeRewritable for SortPushDown { - fn map_children(self, transform: F) -> Result + fn map_children(mut self, transform: F) -> Result where F: FnMut(Self) -> Result, { let children = self.children(); - if children.is_empty() { - Ok(self) - } else { - let new_children = children + if !children.is_empty() { + let children_plans = children .into_iter() .map(transform) + .map(|r| r.map(|s| s.plan)) .collect::>>()?; - let children_plans = new_children - .iter() - .map(|elem| elem.plan.clone()) - .collect::>(); - let plan = with_new_children_if_necessary(self.plan, children_plans)?; - Ok(SortPushDown { - plan, - required_ordering: self.required_ordering, - adjusted_request_ordering: self.adjusted_request_ordering, - }) - } + self.plan = with_new_children_if_necessary(self.plan, children_plans)?; + }; + Ok(self) } } pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result> { let plan = &requirements.plan; let parent_required = requirements.required_ordering.as_deref(); - let err_msg = "Expects parent requirement to contain something"; - let err = || DataFusionError::Execution(err_msg.to_string()); + const ERR_MSG: &str = "Expects parent requirement to contain something"; + let err = || DataFusionError::Plan(ERR_MSG.to_string()); if let Some(sort_exec) = plan.as_any().downcast_ref::() { let mut new_plan = plan.clone(); if !ordering_satisfy_requirement(plan.output_ordering(), 
parent_required, || { plan.equivalence_properties() }) { - // If the current plan is a SortExec, modify current SortExec to satisfy the parent requirements + // If the current plan is a SortExec, modify it to satisfy parent requirements: let parent_required_expr = make_sort_exprs_from_requirements(parent_required.ok_or_else(err)?); new_plan = sort_exec.input.clone(); @@ -129,7 +116,7 @@ pub(crate) fn pushdown_sorts(requirements: SortPushDown) -> Result Result Result, parent_required: Option<&[PhysicalSortRequirement]>, ) -> Result>>>> { - let err_msg = "Expects parent requirement to contain something"; - let err = || DataFusionError::Execution(err_msg.to_string()); + const ERR_MSG: &str = "Expects parent requirement to contain something"; + let err = || DataFusionError::Plan(ERR_MSG.to_string()); let maintains_input_order = plan.maintains_input_order(); if is_window(plan) { let required_input_ordering = plan.required_input_ordering(); @@ -190,7 +177,7 @@ fn pushdown_requirement_to_children( RequirementsCompatibility::Compatible(adjusted) => Ok(Some(vec![adjusted])), RequirementsCompatibility::NonCompatible => Ok(None), } - } else if plan.as_any().is::() { + } else if is_union(plan) { // UnionExec does not have real sort requirements for its input. 
Here we change the adjusted_request_ordering to UnionExec's output ordering and // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec Ok(Some(vec![ @@ -313,8 +300,7 @@ fn try_pushdown_requirements_to_join( } RequirementsCompatibility::NonCompatible => { // Can not push down, add new SortExec - let mut new_plan = plan.clone(); - add_sort_above(&mut new_plan, sort_expr)?; + add_sort_above(&mut plan.clone(), sort_expr)?; Ok(None) } } @@ -330,15 +316,13 @@ fn expr_source_sides( let all_column_sides = required_exprs .iter() .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { + r.expr.as_any().downcast_ref::().map(|col| { if col.index() < left_columns_len { - Some(JoinSide::Left) + JoinSide::Left } else { - Some(JoinSide::Right) + JoinSide::Right } - } else { - None - } + }) }) .collect::>(); @@ -359,42 +343,14 @@ fn expr_source_sides( None } } - JoinType::LeftSemi | JoinType::LeftAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Left) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Left) - } - } - JoinType::RightSemi | JoinType::RightAnti => { - if required_exprs - .iter() - .filter_map(|r| { - if r.expr.as_any().downcast_ref::().is_some() { - Some(JoinSide::Right) - } else { - None - } - }) - .count() - != required_exprs.len() - { - None - } else { - Some(JoinSide::Right) - } - } + JoinType::LeftSemi | JoinType::LeftAnti => required_exprs + .iter() + .all(|e| e.expr.as_any().downcast_ref::().is_some()) + .then_some(JoinSide::Left), + JoinType::RightSemi | JoinType::RightAnti => required_exprs + .iter() + .all(|e| e.expr.as_any().downcast_ref::().is_some()) + .then_some(JoinSide::Right), } } @@ -405,30 +361,24 @@ fn shift_right_required( let new_right_required: Vec = parent_required .iter() .filter_map(|r| { - if let Some(col) = r.expr.as_any().downcast_ref::() { - 
if col.index() >= left_columns_len { - Some(PhysicalSortRequirement { - expr: Arc::new(Column::new( - col.name(), - col.index() - left_columns_len, - )) as Arc, - options: r.options, - }) - } else { - None - } - } else { - None - } + r.expr.as_any().downcast_ref::().and_then(|col| { + (col.index() >= left_columns_len).then_some(PhysicalSortRequirement { + expr: Arc::new(Column::new( + col.name(), + col.index() - left_columns_len, + )) as _, + options: r.options, + }) + }) }) .collect::>(); - if new_right_required.len() != parent_required.len() { + if new_right_required.len() == parent_required.len() { + Ok(new_right_required) + } else { Err(DataFusionError::Plan( "Expect to shift all the parent required column indexes for SortMergeJoin" .to_string(), )) - } else { - Ok(new_right_required) } } diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index abd828d91be05..3fb67e7877ccb 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -21,9 +21,11 @@ use super::optimizer::PhysicalOptimizerRule; use crate::config::ConfigOptions; use crate::error::Result; +use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_physical_expr::utils::ordering_satisfy; @@ -92,3 +94,13 @@ pub fn is_sort(plan: &Arc) -> bool { pub fn is_sort_preserving_merge(plan: &Arc) -> bool { plan.as_any().is::() } + +/// Checks whether the given executor is a [`CoalescePartitionsExec`]. 
+pub fn is_coalesce_partitions(plan: &Arc) -> bool { + plan.as_any().is::() +} + +/// Checks whether the given executor is a [`UnionExec`]. +pub fn is_union(plan: &Arc) -> bool { + plan.as_any().is::() +} From c2f1b5c1f09feba091ea289cb8f3d8b487168153 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 17 Mar 2023 10:11:07 +0300 Subject: [PATCH 24/35] simplifications --- .../physical_optimizer/sort_enforcement.rs | 121 ++++++++---------- .../windows/bounded_window_agg_exec.rs | 5 +- .../physical_plan/windows/window_agg_exec.rs | 5 +- 3 files changed, 55 insertions(+), 76 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 92bb89a477103..e2c05e43d3e29 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -38,7 +38,7 @@ use crate::error::Result; use crate::physical_optimizer::sort_pushdown::{pushdown_sorts, SortPushDown}; use crate::physical_optimizer::utils::{ add_sort_above, is_coalesce_partitions, is_limit, is_sort, is_sort_preserving_merge, - is_window, + is_union, is_window, }; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -432,12 +432,7 @@ fn ensure_sorting( || child.equivalence_properties(), ) { // Make sure we preserve the ordering requirements: - update_child_to_remove_unnecessary_sort( - child, - sort_onwards, - &plan, - idx, - )?; + update_child_to_remove_unnecessary_sort(child, sort_onwards, &plan)?; let sort_expr = make_sort_exprs_from_requirements(&required_ordering); add_sort_above(child, sort_expr)?; if is_sort(child) { @@ -456,13 +451,8 @@ fn ensure_sorting( (None, Some(_)) => { // We have a `SortExec` whose effect may be neutralized by // another order-imposing operator. Remove this sort. 
- if !plan.maintains_input_order()[idx] { - update_child_to_remove_unnecessary_sort( - child, - sort_onwards, - &plan, - idx, - )?; + if !plan.maintains_input_order()[idx] || is_union(&plan) { + update_child_to_remove_unnecessary_sort(child, sort_onwards, &plan)?; } } (None, None) => {} @@ -679,7 +669,6 @@ fn update_child_to_remove_unnecessary_sort( child: &mut Arc, sort_onwards: &mut Option, parent: &Arc, - child_idx: usize, ) -> Result<()> { if let Some(sort_onwards) = sort_onwards { let requires_single_partition = matches!( @@ -692,9 +681,6 @@ fn update_child_to_remove_unnecessary_sort( )?; } *sort_onwards = None; - // Deleting a merging sort may invalidate distribution requirements. - // Ensure that we stay compliant with such requirements: - update_child_to_satisfy_distribution(child, parent, child_idx); Ok(()) } @@ -704,8 +690,8 @@ fn remove_corresponding_sort_from_sub_plan( requires_single_partition: bool, ) -> Result> { // A `SortExec` is always at the bottom of the tree. - if is_sort(&sort_onwards.plan) { - Ok(sort_onwards.plan.children()[0].clone()) + let mut updated_plan = if is_sort(&sort_onwards.plan) { + sort_onwards.plan.children()[0].clone() } else { let plan = &sort_onwards.plan; let mut children = plan.children(); @@ -718,35 +704,19 @@ fn remove_corresponding_sort_from_sub_plan( remove_corresponding_sort_from_sub_plan(item, requires_single_partition)?; } if is_sort_preserving_merge(plan) { - let child = &children[0]; - if requires_single_partition - && child.output_partitioning().partition_count() > 1 - { - Ok(Arc::new(CoalescePartitionsExec::new(child.clone()))) - } else { - Ok(child.clone()) - } + children[0].clone() } else { - plan.clone().with_new_children(children) + plan.clone().with_new_children(children)? } + }; + // Deleting a merging sort may invalidate distribution requirements. 
+ // Ensure that we stay compliant with such requirements: + if requires_single_partition + && updated_plan.output_partitioning().partition_count() > 1 + { + updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan.clone())); } -} - -/// Updates child to remove the unnecessary sorting below it. -fn update_child_to_satisfy_distribution( - child: &mut Arc, - parent: &Arc, - child_idx: usize, -) { - // If distribution requirement is not met, satisfy it by adding a - // CoalescePartitionsExec. - let requires_single_partition = matches!( - parent.required_input_distribution()[child_idx], - Distribution::SinglePartition - ); - if requires_single_partition && child.output_partitioning().partition_count() > 1 { - *child = Arc::new(CoalescePartitionsExec::new(child.clone())) as _; - } + Ok(updated_plan) } /// Converts an [ExecutionPlan] trait object to a [PhysicalSortExpr] slice when possible. @@ -1728,7 +1698,7 @@ mod tests { let union = union_exec(vec![sort1, sort2]); let physical_plan = sort_preserving_merge_exec(sort_exprs3, union); - // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec + // Union has unnecessarily fine ordering below it. We should be able to replace them with absolutely necessary ordering. 
let expected_input = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", @@ -1737,7 +1707,16 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", ]; - assert_optimized!(expected_input, expected_input, physical_plan); + // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec + let expected_output = vec![ + "SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", + ]; + assert_optimized!(expected_input, expected_output, physical_plan); Ok(()) } @@ -1803,21 +1782,25 @@ mod tests { ]; let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; // reverse sorting of sort_exprs2 - let sort_exprs3 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; + let sort_exprs3 = vec![ + sort_expr("nullable_col", &schema), + sort_expr_options( + "non_nullable_col", + &schema, + SortOptions { + descending: false, + nulls_first: false, + }, + ), + ]; let source1 = parquet_exec_sorted(&schema, sort_exprs1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2); + let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); let sort1 = sort_exec(sort_exprs3.clone(), source1); - let sort2 = sort_exec(sort_exprs3.clone(), source2); + let sort2 = sort_exec(sort_exprs3, source2); let union = union_exec(vec![sort1, sort2]); - let physical_plan = bounded_window_exec("nullable_col", sort_exprs3, union); + let spm = sort_preserving_merge_exec(sort_exprs2.clone(), union); + let physical_plan = bounded_window_exec("nullable_col", 
sort_exprs2, spm); // The `WindowAggExec` gets its sorting from multiple children jointly. // During the removal of `SortExec`s, it should be able to remove the @@ -1825,17 +1808,19 @@ mod tests { // are not necessarily the same to be able to remove them. let expected_input = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC NULLS LAST]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC NULLS LAST]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ - "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, 
output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index fd2252ca862d8..9e70e6caead7d 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -170,18 +170,15 @@ impl ExecutionPlan for BoundedWindowAggExec { } fn required_input_ordering(&self) -> Vec>> { - let partition_keys = self.window_expr()[0].partition_by(); let order_keys = self.window_expr()[0].order_by(); let requirements = self.sort_keys.as_deref().map(|ordering| { ordering .iter() .map(|o| { - let in_partition_keys = partition_keys.iter().any(|e| o.expr.eq(e)); let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); - let not_partition_only = !in_partition_keys || in_order_keys; PhysicalSortRequirement { expr: o.expr.clone(), - options: not_partition_only.then_some(o.options), + options: in_order_keys.then_some(o.options), } }) .collect() diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index db767b5ff8d31..28c5f02f57aa4 100644 --- 
a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -173,18 +173,15 @@ impl ExecutionPlan for WindowAggExec { } fn required_input_ordering(&self) -> Vec>> { - let partition_keys = self.window_expr()[0].partition_by(); let order_keys = self.window_expr()[0].order_by(); let requirements = self.sort_keys.as_deref().map(|ordering| { ordering .iter() .map(|o| { - let in_partition_keys = partition_keys.iter().any(|e| o.expr.eq(e)); let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); - let not_partition_only = !in_partition_keys || in_order_keys; PhysicalSortRequirement { expr: o.expr.clone(), - options: not_partition_only.then_some(o.options), + options: in_order_keys.then_some(o.options), } }) .collect() From ecb91f42adf575f159409ba624d65882df2aed44 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 17 Mar 2023 16:38:07 +0300 Subject: [PATCH 25/35] remove_sort_keys parameters from window --- .../physical_optimizer/sort_enforcement.rs | 9 -- datafusion/core/src/physical_plan/planner.rs | 41 +------ .../windows/bounded_window_agg_exec.rs | 22 +--- .../core/src/physical_plan/windows/mod.rs | 105 +++++++++++++++++- .../physical_plan/windows/window_agg_exec.rs | 22 +--- datafusion/core/tests/sql/select.rs | 4 +- datafusion/core/tests/window_fuzz.rs | 15 +-- datafusion/proto/src/physical_plan/mod.rs | 1 - 8 files changed, 125 insertions(+), 94 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index e2c05e43d3e29..5b7b2506aa96e 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -549,7 +549,6 @@ fn analyze_window_sort_removal( }; let mut first_should_reverse = None; - let mut physical_ordering_common = vec![]; for sort_any in sort_tree.get_leaves() { let sort_output_ordering = 
sort_any.output_ordering(); // Variable `sort_any` will either be a `SortExec` or a @@ -566,11 +565,6 @@ fn analyze_window_sort_removal( DataFusionError::Plan("A SortExec should have output ordering".to_string()) })?; if let Some(physical_ordering) = physical_ordering { - if physical_ordering_common.is_empty() - || physical_ordering.len() < physical_ordering_common.len() - { - physical_ordering_common = physical_ordering.to_vec(); - } let (can_skip_sorting, should_reverse) = can_skip_sort( window_expr[0].partition_by(), required_ordering, @@ -620,7 +614,6 @@ fn analyze_window_sort_removal( new_child, new_schema, partition_keys.to_vec(), - Some(physical_ordering_common), )?) as _ } else { Arc::new(WindowAggExec::try_new( @@ -628,7 +621,6 @@ fn analyze_window_sort_removal( new_child, new_schema, partition_keys.to_vec(), - Some(physical_ordering_common), )?) as _ }; return Ok(Some(PlanWithCorrespondingSort::new(new_plan))); @@ -2380,7 +2372,6 @@ mod tests { input.clone(), input.schema(), vec![], - Some(sort_exprs), ) .unwrap(), ) diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 0dba182d8a355..564b8d353bdd0 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -25,7 +25,6 @@ use super::{ }; use crate::datasource::source_as_provider; use crate::execution::context::{ExecutionProps, SessionState}; -use crate::logical_expr::utils::generate_sort_key; use crate::logical_expr::{ Aggregate, EmptyRelation, Join, Projection, Sort, SubqueryAlias, TableScan, Unnest, Window, @@ -544,63 +543,35 @@ impl DefaultPhysicalPlanner { vec![] }; - let get_sort_keys = |expr: &Expr| match expr { + let get_sort_keys = |expr: & Expr| match expr { Expr::WindowFunction(WindowFunction{ ref partition_by, ref order_by, .. 
- }) => generate_sort_key(partition_by, order_by), + }) => (partition_by.to_vec(), order_by.to_vec()), Expr::Alias(expr, _) => { // Convert &Box to &T match &**expr { Expr::WindowFunction(WindowFunction{ ref partition_by, ref order_by, - ..}) => generate_sort_key(partition_by, order_by), + ..}) => (partition_by.to_vec(), order_by.to_vec()), _ => unreachable!(), } } _ => unreachable!(), }; - let sort_keys = get_sort_keys(&window_expr[0])?; + let sort_keys = get_sort_keys(&window_expr[0]); if window_expr.len() > 1 { debug_assert!( window_expr[1..] .iter() - .all(|expr| get_sort_keys(expr).unwrap() == sort_keys), + .all(|expr| get_sort_keys(expr) == sort_keys), "all window expressions shall have the same sort keys, as guaranteed by logical planning" ); } let logical_input_schema = input.schema(); - - let physical_sort_keys = if sort_keys.is_empty() { - None - } else { - let physical_input_schema = input_exec.schema(); - let sort_keys = sort_keys - .iter() - .map(|(e, _)| match e { - Expr::Sort(expr::Sort { - expr, - asc, - nulls_first, - }) => create_physical_sort_expr( - expr, - logical_input_schema, - &physical_input_schema, - SortOptions { - descending: !*asc, - nulls_first: *nulls_first, - }, - session_state.execution_props(), - ), - _ => unreachable!(), - }) - .collect::>>()?; - Some(sort_keys) - }; - let physical_input_schema = input_exec.schema(); let window_expr = window_expr .iter() @@ -625,7 +596,6 @@ impl DefaultPhysicalPlanner { input_exec, physical_input_schema, physical_partition_keys, - physical_sort_keys, )?) } else { Arc::new(WindowAggExec::try_new( @@ -633,7 +603,6 @@ impl DefaultPhysicalPlanner { input_exec, physical_input_schema, physical_partition_keys, - physical_sort_keys, )?) 
}) } diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 9e70e6caead7d..bcc77e0db3481 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -50,6 +50,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use crate::physical_plan::windows::calc_requirements; use datafusion_physical_expr::window::{ PartitionBatchState, PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowAggState, WindowState, @@ -73,8 +74,6 @@ pub struct BoundedWindowAggExec { input_schema: SchemaRef, /// Partition Keys pub partition_keys: Vec>, - /// Sort Keys - pub sort_keys: Option>, /// Execution metrics metrics: ExecutionPlanMetricsSet, } @@ -86,7 +85,6 @@ impl BoundedWindowAggExec { input: Arc, input_schema: SchemaRef, partition_keys: Vec>, - sort_keys: Option>, ) -> Result { let schema = create_schema(&input_schema, &window_expr)?; let schema = Arc::new(schema); @@ -96,7 +94,6 @@ impl BoundedWindowAggExec { schema, input_schema, partition_keys, - sort_keys, metrics: ExecutionPlanMetricsSet::new(), }) } @@ -125,7 +122,7 @@ impl BoundedWindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]); + let sort_keys = self.input.output_ordering().unwrap_or(&[]); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); @@ -170,19 +167,9 @@ impl ExecutionPlan for BoundedWindowAggExec { } fn required_input_ordering(&self) -> Vec>> { + let partition_bys = self.window_expr()[0].partition_by(); let order_keys = self.window_expr()[0].order_by(); - let requirements = self.sort_keys.as_deref().map(|ordering| { - ordering - .iter() - 
.map(|o| { - let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); - PhysicalSortRequirement { - expr: o.expr.clone(), - options: in_order_keys.then_some(o.options), - } - }) - .collect() - }); + let requirements = calc_requirements(partition_bys, order_keys); vec![requirements] } @@ -212,7 +199,6 @@ impl ExecutionPlan for BoundedWindowAggExec { children[0].clone(), self.input_schema.clone(), self.partition_keys.clone(), - self.sort_keys.clone(), )?)) } diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index bdb9aa32645f6..0602822ac36c6 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -46,6 +46,7 @@ pub use bounded_window_agg_exec::BoundedWindowAggExec; pub use datafusion_physical_expr::window::{ BuiltInWindowExpr, PlainAggregateWindowExpr, WindowExpr, }; +use datafusion_physical_expr::PhysicalSortRequirement; pub use window_agg_exec::WindowAggExec; /// Create a physical expression for window function @@ -187,6 +188,30 @@ fn create_built_in_window_expr( }) } +pub(crate) fn calc_requirements( + partition_by_exprs: &[Arc], + orderby_sort_exprs: &[PhysicalSortExpr], +) -> Option> { + let mut sort_reqs = vec![]; + for partition_by in partition_by_exprs { + sort_reqs.push(PhysicalSortRequirement { + expr: partition_by.clone(), + options: None, + }); + } + for PhysicalSortExpr { expr, options } in orderby_sort_exprs { + let contains = sort_reqs.iter().any(|e| expr.eq(&e.expr)); + if !contains { + sort_reqs.push(PhysicalSortRequirement { + expr: expr.clone(), + options: Some(*options), + }); + } + } + // Convert empty result to None. 
Otherwise wrap result inside Some() + (!sort_reqs.is_empty()).then_some(sort_reqs) +} + #[cfg(test)] mod tests { use super::*; @@ -198,6 +223,7 @@ mod tests { use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, assert_is_pending}; use arrow::array::*; + use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::cast::as_primitive_array; @@ -210,6 +236,79 @@ mod tests { Ok((csv, schema)) } + fn create_test_schema2() -> Result { + let a = Field::new("a", DataType::Int32, true); + let b = Field::new("b", DataType::Int32, true); + let c = Field::new("c", DataType::Int32, true); + let d = Field::new("d", DataType::Int32, true); + let schema = Arc::new(Schema::new(vec![a, b, c, d])); + Ok(schema) + } + + #[tokio::test] + async fn test_calc_requirements() -> Result<()> { + let schema = create_test_schema2()?; + let test_data = vec![ + // PARTITION BY a, ORDER BY b ASC NULLS FIRST + ( + vec!["a"], + vec![("b", true, true)], + vec![("a", None), ("b", Some((true, true)))], + ), + // PARTITION BY a, ORDER BY a ASC NULLS FIRST + (vec!["a"], vec![("a", true, true)], vec![("a", None)]), + // PARTITION BY a, ORDER BY b ASC NULLS FIRST, c DESC NULLS LAST + ( + vec!["a"], + vec![("b", true, true), ("c", false, false)], + vec![ + ("a", None), + ("b", Some((true, true))), + ("c", Some((false, false))), + ], + ), + // PARTITION BY a, c, ORDER BY b ASC NULLS FIRST, c DESC NULLS LAST + ( + vec!["a", "c"], + vec![("b", true, true), ("c", false, false)], + vec![("a", None), ("c", None), ("b", Some((true, true)))], + ), + ]; + for (pb_params, ob_params, expected_params) in test_data { + let mut partitionbys = vec![]; + for col_name in pb_params { + partitionbys.push(col(col_name, &schema)?); + } + + let mut orderbys = vec![]; + for (col_name, descending, nulls_first) in ob_params { + let expr = col(col_name, &schema)?; + let options = SortOptions { + 
descending, + nulls_first, + }; + orderbys.push(PhysicalSortExpr { expr, options }); + } + + let mut expected: Option> = None; + for (col_name, reqs) in expected_params { + let options = reqs.map(|(descending, nulls_first)| SortOptions { + descending, + nulls_first, + }); + let expr = col(col_name, &schema)?; + let res = PhysicalSortRequirement { expr, options }; + if let Some(expected) = &mut expected { + expected.push(res); + } else { + expected = Some(vec![res]); + } + } + assert_eq!(calc_requirements(&partitionbys, &orderbys), expected); + } + Ok(()) + } + #[tokio::test] async fn window_function_with_udaf() -> Result<()> { #[derive(Debug)] @@ -269,7 +368,7 @@ mod tests { input, schema.clone(), vec![], - None, + // None, )?); let result: Vec = collect(window_exec, task_ctx).await?; @@ -323,7 +422,7 @@ mod tests { input, schema.clone(), vec![], - None, + // None, )?); let result: Vec = collect(window_exec, task_ctx).await?; @@ -371,7 +470,7 @@ mod tests { blocking_exec, schema, vec![], - None, + // None, )?); let fut = collect(window_agg_exec, task_ctx); diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index 28c5f02f57aa4..75598f1d5266f 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -24,6 +24,7 @@ use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::metrics::{ BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, }; +use crate::physical_plan::windows::calc_requirements; use crate::physical_plan::{ ColumnStatistics, DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream, @@ -61,8 +62,6 @@ pub struct WindowAggExec { input_schema: SchemaRef, /// Partition Keys pub partition_keys: Vec>, - /// Sort Keys - pub sort_keys: Option>, /// Execution metrics metrics: ExecutionPlanMetricsSet, } @@ -74,7 
+73,6 @@ impl WindowAggExec { input: Arc, input_schema: SchemaRef, partition_keys: Vec>, - sort_keys: Option>, ) -> Result { let schema = create_schema(&input_schema, &window_expr)?; let schema = Arc::new(schema); @@ -85,7 +83,6 @@ impl WindowAggExec { schema, input_schema, partition_keys, - sort_keys, metrics: ExecutionPlanMetricsSet::new(), }) } @@ -114,7 +111,7 @@ impl WindowAggExec { let mut result = vec![]; // All window exprs have the same partition by, so we just use the first one: let partition_by = self.window_expr()[0].partition_by(); - let sort_keys = self.sort_keys.as_deref().unwrap_or(&[]); + let sort_keys = self.input.output_ordering().unwrap_or(&[]); for item in partition_by { if let Some(a) = sort_keys.iter().find(|&e| e.expr.eq(item)) { result.push(a.clone()); @@ -173,19 +170,9 @@ impl ExecutionPlan for WindowAggExec { } fn required_input_ordering(&self) -> Vec>> { + let partition_bys = self.window_expr()[0].partition_by(); let order_keys = self.window_expr()[0].order_by(); - let requirements = self.sort_keys.as_deref().map(|ordering| { - ordering - .iter() - .map(|o| { - let in_order_keys = order_keys.iter().any(|e| o.expr.eq(&e.expr)); - PhysicalSortRequirement { - expr: o.expr.clone(), - options: in_order_keys.then_some(o.options), - } - }) - .collect() - }); + let requirements = calc_requirements(partition_bys, order_keys); vec![requirements] } @@ -210,7 +197,6 @@ impl ExecutionPlan for WindowAggExec { children[0].clone(), self.input_schema.clone(), self.partition_keys.clone(), - self.sort_keys.clone(), )?)) } diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 00cc06bb44da1..4e4b12403e18e 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -841,7 +841,7 @@ async fn sort_on_window_null_string() -> Result<()> { ]) .unwrap(); - let ctx = SessionContext::with_config(SessionConfig::new().with_target_partitions(2)); + let ctx = 
SessionContext::with_config(SessionConfig::new().with_target_partitions(1)); ctx.register_batch("test", batch)?; let sql = @@ -875,7 +875,7 @@ async fn sort_on_window_null_string() -> Result<()> { assert_batches_eq!(expected, &actual); let sql = - "SELECT d2, row_number() OVER (partition by d2 order by d2 desc) as rn1 FROM test"; + "SELECT d2, row_number() OVER (partition by d2 order by d2 desc) as rn1 FROM test ORDER BY d2 desc"; let actual = execute_to_batches(&ctx, sql).await; // NULLS FIRST diff --git a/datafusion/core/tests/window_fuzz.rs b/datafusion/core/tests/window_fuzz.rs index e03758600938e..0ce43c52ab68b 100644 --- a/datafusion/core/tests/window_fuzz.rs +++ b/datafusion/core/tests/window_fuzz.rs @@ -331,9 +331,10 @@ async fn run_window_test( } let concat_input_record = concat_batches(&schema, &input1).unwrap(); - let exec1 = Arc::new( - MemoryExec::try_new(&[vec![concat_input_record]], schema.clone(), None).unwrap(), - ); + let memory_exec = + MemoryExec::try_new(&[vec![concat_input_record]], schema.clone(), None).unwrap(); + let memory_exec = memory_exec.with_sort_information(sort_keys.clone()); + let exec1 = Arc::new(memory_exec); let usual_window_exec = Arc::new( WindowAggExec::try_new( vec![create_window_expr( @@ -349,12 +350,13 @@ async fn run_window_test( exec1, schema.clone(), vec![], - Some(sort_keys.clone()), ) .unwrap(), ); - let exec2 = - Arc::new(MemoryExec::try_new(&[input1.clone()], schema.clone(), None).unwrap()); + let memory_exec2 = + MemoryExec::try_new(&[input1.clone()], schema.clone(), None).unwrap(); + let memory_exec2 = memory_exec2.with_sort_information(sort_keys); + let exec2 = Arc::new(memory_exec2); let running_window_exec = Arc::new( BoundedWindowAggExec::try_new( vec![create_window_expr( @@ -370,7 +372,6 @@ async fn run_window_test( exec2, schema.clone(), vec![], - Some(sort_keys), ) .unwrap(), ); diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 
c31ff37474339..cc23398cdfb75 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -330,7 +330,6 @@ impl AsExecutionPlan for PhysicalPlanNode { input, Arc::new((&input_schema).try_into()?), vec![], - None, )?)) } PhysicalPlanType::Aggregate(hash_agg) => { From 8ea3f47768d53b8ad98bbfdf2c170d8f398d7ea0 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 17 Mar 2023 18:47:23 +0300 Subject: [PATCH 26/35] Update window multi_path test --- .../physical_optimizer/sort_enforcement.rs | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 5b7b2506aa96e..eaf4c63d85237 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -1774,25 +1774,22 @@ mod tests { ]; let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; // reverse sorting of sort_exprs2 - let sort_exprs3 = vec![ - sort_expr("nullable_col", &schema), - sort_expr_options( - "non_nullable_col", - &schema, - SortOptions { - descending: false, - nulls_first: false, - }, - ), - ]; + let sort_exprs3 = vec![sort_expr_options( + "nullable_col", + &schema, + SortOptions { + descending: true, + nulls_first: false, + }, + )]; let source1 = parquet_exec_sorted(&schema, sort_exprs1); - let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); + let source2 = parquet_exec_sorted(&schema, sort_exprs2); let sort1 = sort_exec(sort_exprs3.clone(), source1); - let sort2 = sort_exec(sort_exprs3, source2); + let sort2 = sort_exec(sort_exprs3.clone(), source2); let union = union_exec(vec![sort1, sort2]); - let spm = sort_preserving_merge_exec(sort_exprs2.clone(), union); - let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, spm); + let spm = sort_preserving_merge_exec(sort_exprs3.clone(), union); + let physical_plan = 
bounded_window_exec("nullable_col", sort_exprs3, spm); // The `WindowAggExec` gets its sorting from multiple children jointly. // During the removal of `SortExec`s, it should be able to remove the @@ -1800,19 +1797,20 @@ mod tests { // are not necessarily the same to be able to remove them. let expected_input = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortPreservingMergeExec: [nullable_col@0 ASC]", + " SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST]", " UnionExec", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC NULLS LAST]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC NULLS LAST]", + " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " SortPreservingMergeExec: [nullable_col@0 ASC]", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", + " SortExec: expr=[nullable_col@0 ASC]", + " CoalescePartitionsExec", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) From 719f9a80b8236982260b5241c8bd573a82e958fe Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Sat, 18 Mar 2023 23:49:08 +0300 Subject: [PATCH 27/35] consider existing ordering during Coalesce --- .../physical_optimizer/sort_enforcement.rs | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index eaf4c63d85237..ed5cd1f9612f1 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -706,7 +706,16 @@ fn remove_corresponding_sort_from_sub_plan( if requires_single_partition && updated_plan.output_partitioning().partition_count() > 1 { - updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan.clone())); + // If there is existing ordering, to preserve ordering use SortPreservingMergeExec + // instead of CoalescePartitionsExec. 
+ if let Some(ordering) = updated_plan.output_ordering() { + updated_plan = Arc::new(SortPreservingMergeExec::new( + ordering.to_vec(), + updated_plan, + )); + } else { + updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan.clone())); + } } Ok(updated_plan) } @@ -1806,11 +1815,10 @@ mod tests { ]; let expected_optimized = vec![ "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL) }]", - " SortExec: expr=[nullable_col@0 ASC]", - " CoalescePartitionsExec", - " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortPreservingMergeExec: [nullable_col@0 ASC]", + " UnionExec", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) From 67af8d2f44db96a9da1bf280a1aba9143a3b3bc2 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Sat, 18 Mar 2023 23:58:33 +0300 Subject: [PATCH 28/35] retract assertion in planner --- datafusion/core/src/physical_plan/planner.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 564b8d353bdd0..6770675be4ac5 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -25,6 +25,7 @@ use 
super::{ }; use crate::datasource::source_as_provider; use crate::execution::context::{ExecutionProps, SessionState}; +use crate::logical_expr::utils::generate_sort_key; use crate::logical_expr::{ Aggregate, EmptyRelation, Join, Projection, Sort, SubqueryAlias, TableScan, Unnest, Window, @@ -543,30 +544,30 @@ impl DefaultPhysicalPlanner { vec![] }; - let get_sort_keys = |expr: & Expr| match expr { + let get_sort_keys = |expr: &Expr| match expr { Expr::WindowFunction(WindowFunction{ ref partition_by, ref order_by, .. - }) => (partition_by.to_vec(), order_by.to_vec()), + }) => generate_sort_key(partition_by, order_by), Expr::Alias(expr, _) => { // Convert &Box to &T match &**expr { Expr::WindowFunction(WindowFunction{ ref partition_by, ref order_by, - ..}) => (partition_by.to_vec(), order_by.to_vec()), + ..}) => generate_sort_key(partition_by, order_by), _ => unreachable!(), } } _ => unreachable!(), }; - let sort_keys = get_sort_keys(&window_expr[0]); + let sort_keys = get_sort_keys(&window_expr[0])?; if window_expr.len() > 1 { debug_assert!( window_expr[1..] .iter() - .all(|expr| get_sort_keys(expr) == sort_keys), + .all(|expr| get_sort_keys(expr).unwrap() == sort_keys), "all window expressions shall have the same sort keys, as guaranteed by logical planning" ); } From 9a7577e412720d568908de48977b05f06edee86d Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 20 Mar 2023 10:50:05 +0300 Subject: [PATCH 29/35] remove todo. 
--- .../physical_optimizer/sort_enforcement.rs | 40 ++++++++----------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index ed5cd1f9612f1..663c6e280ea39 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -547,6 +547,10 @@ fn analyze_window_sort_removal( "Expects to receive either WindowAggExec of BoundedWindowAggExec".to_string(), )); }; + let n_req = window_exec.required_input_ordering()[0] + .as_ref() + .map(|elem| elem.len()) + .unwrap_or(0); let mut first_should_reverse = None; for sort_any in sort_tree.get_leaves() { @@ -556,14 +560,11 @@ fn analyze_window_sort_removal( // Therefore, we can use the 0th index without loss of generality. let sort_input = sort_any.children()[0].clone(); let physical_ordering = sort_input.output_ordering(); - // TODO: Once we can ensure that required ordering information propagates with - // the necessary lineage information, compare `physical_ordering` and the - // ordering required by the window executor instead of `sort_output_ordering`. - // This will enable us to handle cases such as (a,b) -> Sort -> (a,b,c) -> Required(a,b). - // Currently, we can not remove such sorts. let required_ordering = sort_output_ordering.ok_or_else(|| { DataFusionError::Plan("A SortExec should have output ordering".to_string()) })?; + // First n_req element of the sort output corresponds to required section of the window_exec. 
+ let required_ordering = &required_ordering[0..n_req]; if let Some(physical_ordering) = physical_ordering { let (can_skip_sorting, should_reverse) = can_skip_sort( window_expr[0].partition_by(), @@ -1833,40 +1834,31 @@ mod tests { sort_expr("non_nullable_col", &schema), ]; let sort_exprs2 = vec![sort_expr("nullable_col", &schema)]; - // reverse sorting of sort_exprs2 - let reversed_sort_exprs2 = vec![sort_expr_options( - "nullable_col", - &schema, - SortOptions { - descending: true, - nulls_first: false, - }, - )]; - let source1 = parquet_exec_sorted(&schema, sort_exprs1); + let source1 = parquet_exec_sorted(&schema, sort_exprs2.clone()); let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone()); - let sort1 = sort_exec(reversed_sort_exprs2.clone(), source1); - let sort2 = sort_exec(reversed_sort_exprs2, source2); + let sort1 = sort_exec(sort_exprs1.clone(), source1); + let sort2 = sort_exec(sort_exprs1.clone(), source2); let union = union_exec(vec![sort1, sort2]); - let coalesce = Arc::new(CoalescePartitionsExec::new(union)) as _; - let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, coalesce); + let spm = Arc::new(SortPreservingMergeExec::new(sort_exprs1, union)) as _; + let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, spm); // The `WindowAggExec` can get its required sorting from the leaf nodes directly. 
// The unnecessary SortExecs should be removed let expected_input = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", - " CoalescePartitionsExec", + " SortPreservingMergeExec: [nullable_col@0 ASC,non_nullable_col@1 ASC]", " UnionExec", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", - " SortExec: expr=[nullable_col@0 DESC NULLS LAST]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", + " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; let expected_optimized = vec![ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }]", " SortPreservingMergeExec: [nullable_col@0 ASC]", " UnionExec", - " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], projection=[nullable_col, non_nullable_col]", + " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", " ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); From 
44f3c6f35ecd9d6d8239521181ca645cf7f86676 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 20 Mar 2023 11:15:16 +0300 Subject: [PATCH 30/35] remove unnecessary repartition from plan --- .../physical_optimizer/sort_enforcement.rs | 23 ++++++++++++------- .../core/src/physical_optimizer/utils.rs | 6 +++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 663c6e280ea39..77819fe32bdee 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -37,8 +37,8 @@ use crate::config::ConfigOptions; use crate::error::Result; use crate::physical_optimizer::sort_pushdown::{pushdown_sorts, SortPushDown}; use crate::physical_optimizer::utils::{ - add_sort_above, is_coalesce_partitions, is_limit, is_sort, is_sort_preserving_merge, - is_union, is_window, + add_sort_above, is_coalesce_partitions, is_limit, is_repartition, is_sort, + is_sort_preserving_merge, is_union, is_window, }; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -635,7 +635,7 @@ fn update_child_to_remove_coalesce( coalesce_onwards: &mut Option, ) -> Result<()> { if let Some(coalesce_onwards) = coalesce_onwards { - *child = remove_corresponding_coalesce_in_sub_plan(coalesce_onwards)?; + *child = remove_corresponding_coalesce_in_sub_plan(coalesce_onwards, child)?; } Ok(()) } @@ -643,15 +643,23 @@ fn update_child_to_remove_coalesce( /// Removes the `CoalescePartitions` from the plan in `coalesce_onwards`. fn remove_corresponding_coalesce_in_sub_plan( coalesce_onwards: &mut ExecTree, + parent: &Arc, ) -> Result> { Ok(if is_coalesce_partitions(&coalesce_onwards.plan) { // We can safely use the 0th index since we have a `CoalescePartitionsExec`. 
- coalesce_onwards.plan.children()[0].clone() + let mut new_plan = coalesce_onwards.plan.children()[0].clone(); + while new_plan.output_partitioning() == parent.output_partitioning() + && is_repartition(&new_plan) + && is_repartition(parent) + { + new_plan = new_plan.children()[0].clone() + } + new_plan } else { let plan = coalesce_onwards.plan.clone(); let mut children = plan.children(); for item in &mut coalesce_onwards.children { - children[item.idx] = remove_corresponding_coalesce_in_sub_plan(item)?; + children[item.idx] = remove_corresponding_coalesce_in_sub_plan(item, &plan)?; } plan.with_new_children(children)? }) @@ -2295,9 +2303,8 @@ mod tests { let expected_optimized = vec![ "SortPreservingMergeExec: [nullable_col@0 ASC]", " SortExec: expr=[nullable_col@0 ASC]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", - " MemoryExec: partitions=0, partition_sizes=[]", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=0", + " MemoryExec: partitions=0, partition_sizes=[]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan); Ok(()) diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 3fb67e7877ccb..b3485571c76b4 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -23,6 +23,7 @@ use crate::config::ConfigOptions; use crate::error::Result; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::union::UnionExec; @@ -104,3 +105,8 @@ pub fn is_coalesce_partitions(plan: &Arc) -> bool { pub fn is_union(plan: 
&Arc) -> bool { plan.as_any().is::() } + +/// Checks whether the given executor is a [`RepartitionExec`]. +pub fn is_repartition(plan: &Arc) -> bool { + plan.as_any().is::() +} From 9a49a34fbb63c9c260b1b8eb6fa31e1f3e47e1e1 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 20 Mar 2023 17:52:39 +0300 Subject: [PATCH 31/35] update comments --- datafusion/core/src/physical_optimizer/sort_enforcement.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 77819fe32bdee..ef96721080c41 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -560,11 +560,12 @@ fn analyze_window_sort_removal( // Therefore, we can use the 0th index without loss of generality. let sort_input = sort_any.children()[0].clone(); let physical_ordering = sort_input.output_ordering(); - let required_ordering = sort_output_ordering.ok_or_else(|| { + let sort_output_ordering = sort_output_ordering.ok_or_else(|| { DataFusionError::Plan("A SortExec should have output ordering".to_string()) })?; - // First n_req element of the sort output corresponds to required section of the window_exec. - let required_ordering = &required_ordering[0..n_req]; + // It is enough to check whether first n_req element of the sort output satisfies window_exec requirement. + // Because length of window_exec requirement is n_req. 
+ let required_ordering = &sort_output_ordering[0..n_req]; if let Some(physical_ordering) = physical_ordering { let (can_skip_sorting, should_reverse) = can_skip_sort( window_expr[0].partition_by(), From a2f0f70e87687620aa893b2028384276c18b4143 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 20 Mar 2023 17:27:40 -0500 Subject: [PATCH 32/35] Remove commented out code --- datafusion/core/src/physical_plan/windows/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 0602822ac36c6..f7f9bb76b3f44 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -368,7 +368,6 @@ mod tests { input, schema.clone(), vec![], - // None, )?); let result: Vec = collect(window_exec, task_ctx).await?; @@ -422,7 +421,6 @@ mod tests { input, schema.clone(), vec![], - // None, )?); let result: Vec = collect(window_exec, task_ctx).await?; @@ -470,7 +468,6 @@ mod tests { blocking_exec, schema, vec![], - // None, )?); let fut = collect(window_agg_exec, task_ctx); From 9406c5ba7c45f1328aa1380443dc3cb63cd19e50 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 27 Mar 2023 09:58:45 +0300 Subject: [PATCH 33/35] Address reviews --- .../physical_optimizer/dist_enforcement.rs | 29 +++++++++++++++++-- datafusion/physical-expr/src/sort_expr.rs | 6 ++-- datafusion/physical-expr/src/utils.rs | 11 +++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index a48d002397d59..ffc4c9231237b 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -991,6 +991,30 @@ mod tests { )) } + // Created a sorted parquet exec with multiple files + fn parquet_exec_multiple_sorted( + output_ordering: Option>, + ) -> Arc { 
+ Arc::new(ParquetExec::new( + FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test:///").unwrap(), + file_schema: schema(), + file_groups: vec![ + vec![PartitionedFile::new("x".to_string(), 100)], + vec![PartitionedFile::new("y".to_string(), 100)], + ], + statistics: Statistics::default(), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering, + infinite_source: false, + }, + None, + None, + )) + } + fn projection_exec_with_alias( input: Arc, alias_pairs: Vec<(String, String)>, @@ -2069,7 +2093,7 @@ mod tests { }]; // Scan some sorted parquet files - let exec = parquet_exec_with_sort(Some(sort_key.clone())); + let exec = parquet_exec_multiple_sorted(Some(sort_key.clone())); // CoalesceBatchesExec to mimic behavior after a filter let exec = Arc::new(CoalesceBatchesExec::new(exec, 4096)); @@ -2080,8 +2104,9 @@ mod tests { // The optimizer should not add an additional SortExec as the // data is already sorted let expected = &[ + "SortPreservingMergeExec: [a@0 ASC]", "CoalesceBatchesExec: target_batch_size=4096", - "ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[a@0 ASC], projection=[a, b, c, d, e]", + "ParquetExec: limit=None, partitions={2 groups: [[x], [y]]}, output_ordering=[a@0 ASC], projection=[a, b, c, d, e]", ]; assert_optimized!(expected, exec); Ok(()) diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs index 5683d4abee231..bf62dde20059a 100644 --- a/datafusion/physical-expr/src/sort_expr.rs +++ b/datafusion/physical-expr/src/sort_expr.rs @@ -41,7 +41,7 @@ impl PartialEq for PhysicalSortExpr { impl std::fmt::Display for PhysicalSortExpr { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{} {}", self.expr, to_string(&self.options)) + write!(f, "{} {}", self.expr, to_str(&self.options)) } } @@ -98,7 +98,7 @@ impl PartialEq for PhysicalSortRequirement { impl std::fmt::Display for PhysicalSortRequirement { fn fmt(&self, f: 
&mut std::fmt::Formatter) -> std::fmt::Result { - let opts_string = self.options.as_ref().map_or("NA", to_string); + let opts_string = self.options.as_ref().map_or("NA", to_str); write!(f, "{} {}", self.expr, opts_string) } } @@ -121,7 +121,7 @@ pub fn make_sort_requirements_from_exprs( /// Returns the SQL string representation of the given [SortOptions] object. #[inline] -fn to_string(options: &SortOptions) -> &str { +fn to_str(options: &SortOptions) -> &str { match (options.descending, options.nulls_first) { (true, true) => "DESC", (true, false) => "DESC NULLS LAST", diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index b35b71ea53715..efa2c1037bb2a 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -338,6 +338,14 @@ fn requirements_compatible_concrete EquivalenceProperties>( } } +/// This function maps the requirement after a ProjectionExec back +/// to the executor for its input. +// Specifically, `ProjectionExec` changes the index of `Column`s in the schema of its input executor. +// This function converts a requirement expressed against the ProjectionExec schema into a requirement +// expressed against the schema of the input executor of the ProjectionExec. +// For instance, Column{"a", 0} would turn to Column{"a", 1}. Please note that this function assumes that +// the name of the Column is unique. If we have a requirement such as Column{"a", 0}, Column{"a", 1}, +// this function will produce an incorrect result (it will only emit a single Column as a result). pub fn map_columns_before_projection( parent_required: &[Arc], proj_exprs: &[(Arc, String)], @@ -363,6 +371,9 @@ pub fn map_columns_before_projection( .collect() } +/// This function converts `PhysicalSortRequirement` to `PhysicalSortExpr` +/// for each entry in the input. If the required ordering is None for an entry, +/// the default ordering `ASC, NULLS LAST` is given.
pub fn make_sort_exprs_from_requirements( required: &[PhysicalSortRequirement], ) -> Vec { From 6699a2e6683ff80ec39f9ba15370d3b17096bfef Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 3 Apr 2023 09:45:23 +0300 Subject: [PATCH 34/35] update comments --- .../core/src/physical_optimizer/sort_pushdown.rs | 2 +- datafusion/core/src/physical_optimizer/utils.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 30d0b03898c75..fbfe635cf8a20 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -158,6 +158,7 @@ pub(crate) fn pushdown_sorts( if ordering_satisfy_requirement(plan.output_ordering(), parent_required, || { plan.equivalence_properties() }) { + // Satisfies parent requirements, immediately return. return Ok(Transformed::Yes(SortPushDown { required_ordering: None, ..requirements @@ -257,7 +258,6 @@ fn pushdown_requirement_to_children( { // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements. // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. - // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering. // Pushing down is not beneficial Ok(None) } else { diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 609d8bc9bf164..2fa833bb7e9e0 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -75,39 +75,39 @@ pub fn add_sort_above( Ok(()) } -/// Checks whether the given executor is a limit; +/// Checks whether the given operator is a limit; /// i.e. 
either a [`LocalLimitExec`] or a [`GlobalLimitExec`]. pub fn is_limit(plan: &Arc) -> bool { plan.as_any().is::() || plan.as_any().is::() } -/// Checks whether the given executor is a window; +/// Checks whether the given operator is a window; /// i.e. either a [`WindowAggExec`] or a [`BoundedWindowAggExec`]. pub fn is_window(plan: &Arc) -> bool { plan.as_any().is::() || plan.as_any().is::() } -/// Checks whether the given executor is a [`SortExec`]. +/// Checks whether the given operator is a [`SortExec`]. pub fn is_sort(plan: &Arc) -> bool { plan.as_any().is::() } -/// Checks whether the given executor is a [`SortPreservingMergeExec`]. +/// Checks whether the given operator is a [`SortPreservingMergeExec`]. pub fn is_sort_preserving_merge(plan: &Arc) -> bool { plan.as_any().is::() } -/// Checks whether the given executor is a [`CoalescePartitionsExec`]. +/// Checks whether the given operator is a [`CoalescePartitionsExec`]. pub fn is_coalesce_partitions(plan: &Arc) -> bool { plan.as_any().is::() } -/// Checks whether the given executor is a [`UnionExec`]. +/// Checks whether the given operator is a [`UnionExec`]. pub fn is_union(plan: &Arc) -> bool { plan.as_any().is::() } -/// Checks whether the given executor is a [`RepartitionExec`]. +/// Checks whether the given operator is a [`RepartitionExec`]. 
pub fn is_repartition(plan: &Arc) -> bool { plan.as_any().is::() } From 87256398ed6eac4964b268677cbfe24c9a636ee1 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 3 Apr 2023 10:16:21 +0300 Subject: [PATCH 35/35] address reviews --- datafusion/core/src/physical_optimizer/sort_pushdown.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index fbfe635cf8a20..07d0002548dee 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -146,7 +146,7 @@ pub(crate) fn pushdown_sorts( // Can push down requirements Ok(Transformed::Yes(SortPushDown { plan: child.clone(), - required_ordering, + required_ordering: None, adjusted_request_ordering: adjusted, })) } else { @@ -168,8 +168,8 @@ pub(crate) fn pushdown_sorts( if let Some(adjusted) = pushdown_requirement_to_children(plan, parent_required)? { Ok(Transformed::Yes(SortPushDown { plan: plan.clone(), + required_ordering: None, adjusted_request_ordering: adjusted, - ..requirements })) } else { // Can not push down requirements, add new SortExec: