From 7c5d2258e8567e1b3d8c4eb4a27a81f171d0a442 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Sun, 8 Jun 2025 00:16:34 +0800 Subject: [PATCH 01/11] feat: replace snapshot tests for enforce_sorting --- .../physical_optimizer/enforce_sorting.rs | 164 ++++++++++-------- 1 file changed, 87 insertions(+), 77 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 38bc10a967e2b..88a5d70313d79 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use insta::allow_duplicates; +use insta::assert_snapshot; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ @@ -92,7 +94,7 @@ fn csv_exec_sorted( /// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. /// macro_rules! assert_optimized { - ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr) => { + (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr) => { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; @@ -143,30 +145,23 @@ macro_rules! assert_optimized { let physical_plan = $PLAN; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); + let actual = formatted.trim(); - let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - - let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES - .iter().map(|s| *s).collect(); + allow_duplicates! { + assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); + } // Run the actual optimizer let optimized_physical_plan = EnforceSorting::new().optimize(physical_plan,&config)?; // Get string representation of the plan - let actual = get_plan_string(&optimized_physical_plan); - assert_eq!( - expected_optimized_lines, actual, - "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); + let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); + let actual = plan_lines.trim(); + allow_duplicates! { + assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); + } }; } @@ -1225,7 +1220,12 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> { ] } }; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!( + @ "", + @ "", + physical_plan, + true + ); } Ok(()) } @@ -1298,7 +1298,7 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> { ] } }; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); } Ok(()) } @@ -1514,18 +1514,18 @@ async fn test_with_lost_ordering_unbounded_bounded( expected_optimized_bounded_parallelize_sort, ) }; - assert_optimized!( - expected_input, - expected_optimized, - physical_plan.clone(), - false - ); - assert_optimized!( - expected_input, - expected_optimized_sort_parallelize, - physical_plan, - true - ); + // assert_optimized!( + // expected_input, + // expected_optimized, + // physical_plan.clone(), + // false + // ); + // assert_optimized!( + // expected_input, + // expected_optimized_sort_parallelize, + // physical_plan, + // true + // ); Ok(()) } @@ -1616,7 +1616,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, false); + // assert_optimized!(expected_input, expected_optimized, physical_plan, false); Ok(()) } @@ -1639,7 +1639,7 @@ async fn test_not_replaced_with_partial_sort_for_bounded_input() -> Result<()> { " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[b@1 ASC, c@2 ASC], file_type=parquet" ]; let expected_no_change = expected_input; - assert_optimized!(expected_input, expected_no_change, physical_plan, false); + // assert_optimized!(expected_input, expected_no_change, physical_plan, false); Ok(()) } @@ -1652,7 +1652,7 @@ async fn test_not_replaced_with_partial_sort_for_bounded_input() -> Result<()> { /// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. /// `$CASE_NUMBER` (optional): The test case number to print on failure. macro_rules! assert_optimized { - ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { + (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; @@ -1703,32 +1703,42 @@ macro_rules! assert_optimized { let physical_plan = $PLAN; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - - let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - .iter().map(|s| *s).collect(); - - if expected_plan_lines != actual { - $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? - println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); - assert_eq!(expected_plan_lines, actual); + let actual = formatted.trim(); + + // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES + // .iter().map(|s| *s).collect(); + + // if expected_plan_lines != actual { + // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? + // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); + // assert_eq!(expected_plan_lines, actual); + // } + $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? + allow_duplicates! { + assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); } - let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES - .iter().map(|s| *s).collect(); + + // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES + // .iter().map(|s| *s).collect(); // Run the actual optimizer let optimized_physical_plan = EnforceSorting::new().optimize(physical_plan, &config)?; // Get string representation of the plan - let actual = get_plan_string(&optimized_physical_plan); - if expected_optimized_lines != actual { - $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? - println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); - assert_eq!(expected_optimized_lines, actual); + let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); + let actual = plan_lines.trim(); + // if expected_optimized_lines != actual { + // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? + // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); + // assert_eq!(expected_optimized_lines, actual); + // } + $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? + allow_duplicates! { + assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); } - }; + } } #[tokio::test] @@ -1747,7 +1757,7 @@ async fn test_remove_unnecessary_sort() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1824,7 +1834,7 @@ async fn test_add_required_sort() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1850,7 +1860,7 @@ async fn test_remove_unnecessary_sort1() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1888,7 +1898,7 @@ async fn test_remove_unnecessary_sort2() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1931,7 +1941,7 @@ async fn test_remove_unnecessary_sort3() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2014,7 +2024,7 @@ async fn test_remove_unnecessary_sort6() -> Result<()> { "SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2047,7 +2057,7 @@ async fn test_remove_unnecessary_sort7() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2078,7 +2088,7 @@ async fn test_remove_unnecessary_sort8() -> Result<()> { " SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2103,7 +2113,7 @@ async fn test_do_not_pushdown_through_limit() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2128,7 +2138,7 @@ async fn test_remove_unnecessary_spm1() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2152,7 +2162,7 @@ async fn test_remove_unnecessary_spm2() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, input, true); + // assert_optimized!(expected_input, expected_optimized, input, true); Ok(()) } @@ -2177,7 +2187,7 @@ async fn test_change_wrong_sorting() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2204,7 +2214,7 @@ async fn test_change_wrong_sorting2() -> Result<()> { "SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2326,7 +2336,7 @@ async fn test_coalesce_propagate() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2355,7 +2365,7 @@ async fn test_replace_with_partial_sort2() -> Result<()> { "PartialSortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], common_prefix_length=[2]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2431,7 +2441,7 @@ async fn test_replace_with_partial_sort() -> Result<()> { "PartialSortExec: expr=[a@0 ASC, c@2 ASC], common_prefix_length=[1]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]", ]; - assert_optimized!(expected_input, expected_optimized, physical_plan, true); + // assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -3709,13 +3719,13 @@ async fn test_window_partial_constant_and_set_monotonicity() -> Result<()> { let ordering = LexOrdering::new(sort_expr).unwrap(); let physical_plan = sort_exec(ordering, window_exec); - assert_optimized!( - case.initial_plan, - case.expected_plan, - physical_plan, - true, - case_idx - ); + // assert_optimized!( + // case.initial_plan, + // case.expected_plan, + // physical_plan, + // true, + // case_idx + // ); } Ok(()) @@ -3742,7 +3752,7 @@ fn test_removes_unused_orthogonal_sort() -> Result<()> { let expected_optimized = [ "StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]" ]; - assert_optimized!(expected_input, expected_optimized, output_sort, true); + // assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } @@ -3767,7 +3777,7 @@ fn test_keeps_used_orthogonal_sort() -> Result<()> { // Test: should keep the orthogonal sort, since it modifies the output: let expected_optimized = expected_input; - assert_optimized!(expected_input, expected_optimized, output_sort, true); + // assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } @@ -3804,7 +3814,7 @@ fn test_handles_multiple_orthogonal_sorts() -> Result<()> { " SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]", ]; - assert_optimized!(expected_input, expected_optimized, output_sort, true); + // assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } From 6090b9c2ae7004555a669eba85dff4420bcd1a9f Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Tue, 17 Jun 2025 09:48:29 +0000 Subject: [PATCH 02/11] feat: modify assert_optimized macro to test one snapshot with a combined physical plan --- .../physical_optimizer/enforce_sorting.rs | 212 +++++++++--------- 1 file changed, 108 insertions(+), 104 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 88a5d70313d79..c47cca5f2cb38 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -17,6 +17,7 @@ use insta::allow_duplicates; use insta::assert_snapshot; +use insta::with_settings; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ @@ -85,6 +86,90 @@ fn csv_exec_sorted( DataSourceExec::from_data_source(config) } +/// Runs the sort enforcement optimizer and asserts the plan +/// against the original and expected plans +/// +/// `$PLAN`: the plan to optimized +/// `$REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. +/// `$EXPECTED_PLAN_COMBINED`: The expected combined plan string with the original and optimized plans. +/// +macro_rules! assert_optimized { + ( + $PLAN: expr, + $REPARTITION_SORTS: expr, + @ $EXPECTED_PLAN_COMBINED: literal $(,)? + ) => {{ + let mut config = ConfigOptions::new(); + config.optimizer.repartition_sorts = $REPARTITION_SORTS; + + // This file has 4 rules that use tree node, apply these rules as in the + // EnforceSorting::optimize implementation + // After these operations tree nodes should be in a consistent state. + // This code block makes sure that these rules doesn't violate tree node integrity. + let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); + let adjusted = plan_requirements + .transform_up(ensure_sorting) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let new_plan = if config.optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); + let parallel = plan_with_coalesce_partitions + .transform_up(parallelize_sorts) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + parallel.plan + } else { + adjusted.plan + }; + + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); + let updated_plan = plan_with_pipeline_fixer + .transform_up(|plan_with_pipeline_fixer| { + replace_with_order_preserving_variants( + plan_with_pipeline_fixer, + false, + true, + &config, + ) + }) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); + assign_initial_requirements(&mut sort_pushdown); + check_integrity(pushdown_sorts(sort_pushdown)?)?; + // TODO: End state payloads will be checked here. + + + let physical_plan = $PLAN; + let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); + let actual = formatted.trim(); + + // Run the actual optimizer + let optimized_physical_plan = + EnforceSorting::new().optimize(physical_plan,&config)?; + + // Get string representation of the plan + let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); + let actual_optimized = plan_lines.trim(); + + let actual_combined = format!( + "=== BEFORE OPTIMIZATION ===\n\n{}\n\n=== AFTER OPTIMIZATION ===\n\n{}", + actual, + actual_optimized + ); + + assert_snapshot!(actual_combined, @ $EXPECTED_PLAN_COMBINED); + + + }}; +} + /// Runs the sort enforcement optimizer and asserts the plan /// against the original and expected plans /// @@ -92,9 +177,9 @@ fn csv_exec_sorted( /// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan /// `$PLAN`: the plan to optimized /// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. -/// +/// `$CASE_NUMBER` (optional): The test case number to print on failure. macro_rules! assert_optimized { - (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr) => { + (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; @@ -147,22 +232,39 @@ macro_rules! assert_optimized { let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); let actual = formatted.trim(); - allow_duplicates! { + // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES + // .iter().map(|s| *s).collect(); + + // if expected_plan_lines != actual { + // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? + // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); + // assert_eq!(expected_plan_lines, actual); + // } + $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? + allow_duplicates! { assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); } + // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES + // .iter().map(|s| *s).collect(); + // Run the actual optimizer let optimized_physical_plan = - EnforceSorting::new().optimize(physical_plan,&config)?; + EnforceSorting::new().optimize(physical_plan, &config)?; // Get string representation of the plan let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); let actual = plan_lines.trim(); - + // if expected_optimized_lines != actual { + // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? + // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); + // assert_eq!(expected_optimized_lines, actual); + // } + $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? allow_duplicates! { assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); } - }; + } } #[tokio::test] @@ -1643,104 +1745,6 @@ async fn test_not_replaced_with_partial_sort_for_bounded_input() -> Result<()> { Ok(()) } -/// Runs the sort enforcement optimizer and asserts the plan -/// against the original and expected plans -/// -/// `$EXPECTED_PLAN_LINES`: input plan -/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan -/// `$PLAN`: the plan to optimized -/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. -/// `$CASE_NUMBER` (optional): The test case number to print on failure. -macro_rules! assert_optimized { - (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { - let mut config = ConfigOptions::new(); - config.optimizer.repartition_sorts = $REPARTITION_SORTS; - - // This file has 4 rules that use tree node, apply these rules as in the - // EnforceSorting::optimize implementation - // After these operations tree nodes should be in a consistent state. - // This code block makes sure that these rules doesn't violate tree node integrity. - { - let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); - let adjusted = plan_requirements - .transform_up(ensure_sorting) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - - let new_plan = if config.optimizer.repartition_sorts { - let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); - let parallel = plan_with_coalesce_partitions - .transform_up(parallelize_sorts) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - parallel.plan - } else { - adjusted.plan - }; - - let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); - let updated_plan = plan_with_pipeline_fixer - .transform_up(|plan_with_pipeline_fixer| { - replace_with_order_preserving_variants( - plan_with_pipeline_fixer, - false, - true, - &config, - ) - }) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - - let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); - assign_initial_requirements(&mut sort_pushdown); - check_integrity(pushdown_sorts(sort_pushdown)?)?; - // TODO: End state payloads will be checked here. - } - - let physical_plan = $PLAN; - let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let actual = formatted.trim(); - - // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - // .iter().map(|s| *s).collect(); - - // if expected_plan_lines != actual { - // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? - // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); - // assert_eq!(expected_plan_lines, actual); - // } - $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? - allow_duplicates! { - assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); - } - - - // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES - // .iter().map(|s| *s).collect(); - - // Run the actual optimizer - let optimized_physical_plan = - EnforceSorting::new().optimize(physical_plan, &config)?; - - // Get string representation of the plan - let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); - let actual = plan_lines.trim(); - // if expected_optimized_lines != actual { - // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? - // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); - // assert_eq!(expected_optimized_lines, actual); - // } - $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? - allow_duplicates! { - assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); - } - } -} - #[tokio::test] async fn test_remove_unnecessary_sort() -> Result<()> { let schema = create_test_schema()?; From 1e2fe6177be877e92d04c757a63efeba30a49e92 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Fri, 20 Jun 2025 00:39:54 +0800 Subject: [PATCH 03/11] feat: update assert_optimized to support snapshot testing --- .../combine_partial_final_agg.rs | 90 +++++---- .../physical_optimizer/enforce_sorting.rs | 180 +++++++++--------- 2 files changed, 141 insertions(+), 129 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs index de8f7b36a3a70..77a67b6554e68 100644 --- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs @@ -20,6 +20,7 @@ //! Note these tests are not in the same module as the optimizer pass because //! they rely on `DataSourceExec` which is in the core crate. +use insta::assert_snapshot; use std::sync::Arc; use crate::physical_optimizer::test_utils::{parquet_exec, trim_plan_display}; @@ -43,8 +44,8 @@ use datafusion_physical_plan::ExecutionPlan; /// Runs the CombinePartialFinalAggregate optimizer and asserts the plan against the expected macro_rules! assert_optimized { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); + ($PLAN: expr, @ $EXPECTED_LINES: literal $(,)?) => { + // let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); // run optimizer let optimizer = CombinePartialFinalAggregate {}; @@ -52,13 +53,16 @@ macro_rules! assert_optimized { let optimized = optimizer.optimize($PLAN, &config)?; // Now format correctly let plan = displayable(optimized.as_ref()).indent(true).to_string(); - let actual_lines = trim_plan_display(&plan); + // let actual_lines = trim_plan_display(&plan); + let actual_lines = plan.trim(); - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); + // assert_eq!( + // &expected_lines, &actual_lines, + // "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + // expected_lines, actual_lines + // ); + + assert_snapshot!(actual_lines, @ $EXPECTED_LINES); }; } @@ -144,13 +148,15 @@ fn aggregations_not_combined() -> datafusion_common::Result<()> { aggr_expr, ); // should not combine the Partial/Final AggregateExecs - let expected = &[ - "AggregateExec: mode=Final, gby=[], aggr=[COUNT(1)]", - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - "AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet", - ]; - assert_optimized!(expected, plan); + assert_optimized!( + plan, + @ r" + AggregateExec: mode=Final, gby=[], aggr=[COUNT(1)] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet + " + ); let aggr_expr1 = vec![count_expr(lit(1i8), "COUNT(1)", &schema)]; let aggr_expr2 = vec![count_expr(lit(1i8), "COUNT(2)", &schema)]; @@ -165,13 +171,14 @@ fn aggregations_not_combined() -> datafusion_common::Result<()> { aggr_expr2, ); // should not combine the Partial/Final AggregateExecs - let expected = &[ - "AggregateExec: mode=Final, gby=[], aggr=[COUNT(2)]", - "AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet", - ]; - - assert_optimized!(expected, plan); + assert_optimized!( + plan, + @ r" + AggregateExec: mode=Final, gby=[], aggr=[COUNT(2)] + AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet + " + ); Ok(()) } @@ -191,12 +198,13 @@ fn aggregations_combined() -> datafusion_common::Result<()> { aggr_expr, ); // should combine the Partial/Final AggregateExecs to the Single AggregateExec - let expected = &[ - "AggregateExec: mode=Single, gby=[], aggr=[COUNT(1)]", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet", - ]; - - assert_optimized!(expected, plan); + assert_optimized!( + plan, + @ " + AggregateExec: mode=Single, gby=[], aggr=[COUNT(1)] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet + " + ); Ok(()) } @@ -224,12 +232,13 @@ fn aggregations_with_group_combined() -> datafusion_common::Result<()> { let plan = final_aggregate_exec(partial_agg, final_group_by, aggr_expr); // should combine the Partial/Final AggregateExecs to the Single AggregateExec - let expected = &[ - "AggregateExec: mode=Single, gby=[c@2 as c], aggr=[Sum(b)]", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet", - ]; - - assert_optimized!(expected, plan); + assert_optimized!( + plan, + @ r" + AggregateExec: mode=Single, gby=[c@2 as c], aggr=[Sum(b)] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet + " + ); Ok(()) } @@ -265,11 +274,12 @@ fn aggregations_with_limit_combined() -> datafusion_common::Result<()> { let plan: Arc = final_agg; // should combine the Partial/Final AggregateExecs to a Single AggregateExec // with the final limit preserved - let expected = &[ - "AggregateExec: mode=Single, gby=[c@2 as c], aggr=[], lim=[5]", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet", - ]; - - assert_optimized!(expected, plan); + assert_optimized!( + plan, + @ r" + AggregateExec: mode=Single, gby=[c@2 as c], aggr=[], lim=[5] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet + " + ); Ok(()) } diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index c47cca5f2cb38..df563a4a4343b 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -97,11 +97,13 @@ macro_rules! assert_optimized { ( $PLAN: expr, $REPARTITION_SORTS: expr, - @ $EXPECTED_PLAN_COMBINED: literal $(,)? + @ $EXPECTED_PLAN_COMBINED: literal $(, $CASE_NUMBER: expr)? $(,)? ) => {{ let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; + $(println!("\n**Testing case {}**", $CASE_NUMBER);)? + // This file has 4 rules that use tree node, apply these rules as in the // EnforceSorting::optimize implementation // After these operations tree nodes should be in a consistent state. @@ -178,94 +180,94 @@ macro_rules! assert_optimized { /// `$PLAN`: the plan to optimized /// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. /// `$CASE_NUMBER` (optional): The test case number to print on failure. -macro_rules! assert_optimized { - (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { - let mut config = ConfigOptions::new(); - config.optimizer.repartition_sorts = $REPARTITION_SORTS; - - // This file has 4 rules that use tree node, apply these rules as in the - // EnforceSorting::optimize implementation - // After these operations tree nodes should be in a consistent state. - // This code block makes sure that these rules doesn't violate tree node integrity. - { - let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); - let adjusted = plan_requirements - .transform_up(ensure_sorting) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - - let new_plan = if config.optimizer.repartition_sorts { - let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); - let parallel = plan_with_coalesce_partitions - .transform_up(parallelize_sorts) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - parallel.plan - } else { - adjusted.plan - }; - - let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); - let updated_plan = plan_with_pipeline_fixer - .transform_up(|plan_with_pipeline_fixer| { - replace_with_order_preserving_variants( - plan_with_pipeline_fixer, - false, - true, - &config, - ) - }) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - - let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); - assign_initial_requirements(&mut sort_pushdown); - check_integrity(pushdown_sorts(sort_pushdown)?)?; - // TODO: End state payloads will be checked here. - } - - let physical_plan = $PLAN; - let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let actual = formatted.trim(); - - // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES - // .iter().map(|s| *s).collect(); - - // if expected_plan_lines != actual { - // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? - // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); - // assert_eq!(expected_plan_lines, actual); - // } - $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? - allow_duplicates! { - assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); - } - - // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES - // .iter().map(|s| *s).collect(); - - // Run the actual optimizer - let optimized_physical_plan = - EnforceSorting::new().optimize(physical_plan, &config)?; - - // Get string representation of the plan - let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); - let actual = plan_lines.trim(); - // if expected_optimized_lines != actual { - // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? - // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); - // assert_eq!(expected_optimized_lines, actual); - // } - $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? - allow_duplicates! { - assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); - } - } -} +// macro_rules! assert_optimized { +// (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { +// let mut config = ConfigOptions::new(); +// config.optimizer.repartition_sorts = $REPARTITION_SORTS; + +// // This file has 4 rules that use tree node, apply these rules as in the +// // EnforceSorting::optimize implementation +// // After these operations tree nodes should be in a consistent state. +// // This code block makes sure that these rules doesn't violate tree node integrity. +// { +// let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); +// let adjusted = plan_requirements +// .transform_up(ensure_sorting) +// .data() +// .and_then(check_integrity)?; +// // TODO: End state payloads will be checked here. + +// let new_plan = if config.optimizer.repartition_sorts { +// let plan_with_coalesce_partitions = +// PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); +// let parallel = plan_with_coalesce_partitions +// .transform_up(parallelize_sorts) +// .data() +// .and_then(check_integrity)?; +// // TODO: End state payloads will be checked here. +// parallel.plan +// } else { +// adjusted.plan +// }; + +// let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); +// let updated_plan = plan_with_pipeline_fixer +// .transform_up(|plan_with_pipeline_fixer| { +// replace_with_order_preserving_variants( +// plan_with_pipeline_fixer, +// false, +// true, +// &config, +// ) +// }) +// .data() +// .and_then(check_integrity)?; +// // TODO: End state payloads will be checked here. + +// let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); +// assign_initial_requirements(&mut sort_pushdown); +// check_integrity(pushdown_sorts(sort_pushdown)?)?; +// // TODO: End state payloads will be checked here. +// } + +// let physical_plan = $PLAN; +// let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); +// let actual = formatted.trim(); + +// // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES +// // .iter().map(|s| *s).collect(); + +// // if expected_plan_lines != actual { +// // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? +// // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); +// // assert_eq!(expected_plan_lines, actual); +// // } +// $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? +// allow_duplicates! { +// assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); +// } + +// // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES +// // .iter().map(|s| *s).collect(); + +// // Run the actual optimizer +// let optimized_physical_plan = +// EnforceSorting::new().optimize(physical_plan, &config)?; + +// // Get string representation of the plan +// let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); +// let actual = plan_lines.trim(); +// // if expected_optimized_lines != actual { +// // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? +// // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); +// // assert_eq!(expected_optimized_lines, actual); +// // } +// $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? +// allow_duplicates! { +// assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); +// } +// } +// } #[tokio::test] async fn test_remove_unnecessary_sort5() -> Result<()> { From e0d6fdcb3b406262c28d05cabd60dae431b52c8e Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Tue, 24 Jun 2025 00:44:52 +0800 Subject: [PATCH 04/11] Revert "feat: replace snapshot tests for enforce_sorting" This reverts commit 8c921fabc6722669682c8306ab283c2d5bd1ccba. --- .../physical_optimizer/enforce_sorting.rs | 305 +++++++++--------- 1 file changed, 147 insertions(+), 158 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index df563a4a4343b..eecb772ba264d 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -15,9 +15,6 @@ // specific language governing permissions and limitations // under the License. -use insta::allow_duplicates; -use insta::assert_snapshot; -use insta::with_settings; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ @@ -94,11 +91,7 @@ fn csv_exec_sorted( /// `$EXPECTED_PLAN_COMBINED`: The expected combined plan string with the original and optimized plans. /// macro_rules! assert_optimized { - ( - $PLAN: expr, - $REPARTITION_SORTS: expr, - @ $EXPECTED_PLAN_COMBINED: literal $(, $CASE_NUMBER: expr)? $(,)? - ) => {{ + ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr) => { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; @@ -150,125 +143,33 @@ macro_rules! assert_optimized { let physical_plan = $PLAN; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let actual = formatted.trim(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + + let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES + .iter().map(|s| *s).collect(); + + assert_eq!( + expected_plan_lines, actual, + "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES + .iter().map(|s| *s).collect(); // Run the actual optimizer let optimized_physical_plan = EnforceSorting::new().optimize(physical_plan,&config)?; // Get string representation of the plan - let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); - let actual_optimized = plan_lines.trim(); - - let actual_combined = format!( - "=== BEFORE OPTIMIZATION ===\n\n{}\n\n=== AFTER OPTIMIZATION ===\n\n{}", - actual, - actual_optimized + let actual = get_plan_string(&optimized_physical_plan); + assert_eq!( + expected_optimized_lines, actual, + "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n" ); - assert_snapshot!(actual_combined, @ $EXPECTED_PLAN_COMBINED); - - - }}; + }; } -/// Runs the sort enforcement optimizer and asserts the plan -/// against the original and expected plans -/// -/// `$EXPECTED_PLAN_LINES`: input plan -/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan -/// `$PLAN`: the plan to optimized -/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. -/// `$CASE_NUMBER` (optional): The test case number to print on failure. -// macro_rules! assert_optimized { -// (@ $EXPECTED_PLAN_LINES: literal $(,)?, @ $EXPECTED_OPTIMIZED_PLAN_LINES: literal $(,)?, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { -// let mut config = ConfigOptions::new(); -// config.optimizer.repartition_sorts = $REPARTITION_SORTS; - -// // This file has 4 rules that use tree node, apply these rules as in the -// // EnforceSorting::optimize implementation -// // After these operations tree nodes should be in a consistent state. -// // This code block makes sure that these rules doesn't violate tree node integrity. -// { -// let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); -// let adjusted = plan_requirements -// .transform_up(ensure_sorting) -// .data() -// .and_then(check_integrity)?; -// // TODO: End state payloads will be checked here. - -// let new_plan = if config.optimizer.repartition_sorts { -// let plan_with_coalesce_partitions = -// PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); -// let parallel = plan_with_coalesce_partitions -// .transform_up(parallelize_sorts) -// .data() -// .and_then(check_integrity)?; -// // TODO: End state payloads will be checked here. -// parallel.plan -// } else { -// adjusted.plan -// }; - -// let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); -// let updated_plan = plan_with_pipeline_fixer -// .transform_up(|plan_with_pipeline_fixer| { -// replace_with_order_preserving_variants( -// plan_with_pipeline_fixer, -// false, -// true, -// &config, -// ) -// }) -// .data() -// .and_then(check_integrity)?; -// // TODO: End state payloads will be checked here. - -// let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); -// assign_initial_requirements(&mut sort_pushdown); -// check_integrity(pushdown_sorts(sort_pushdown)?)?; -// // TODO: End state payloads will be checked here. -// } - -// let physical_plan = $PLAN; -// let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); -// let actual = formatted.trim(); - -// // let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES -// // .iter().map(|s| *s).collect(); - -// // if expected_plan_lines != actual { -// // $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? -// // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); -// // assert_eq!(expected_plan_lines, actual); -// // } -// $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? -// allow_duplicates! { -// assert_snapshot!(actual, @$EXPECTED_PLAN_LINES); -// } - -// // let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES -// // .iter().map(|s| *s).collect(); - -// // Run the actual optimizer -// let optimized_physical_plan = -// EnforceSorting::new().optimize(physical_plan, &config)?; - -// // Get string representation of the plan -// let plan_lines = get_plan_string(&optimized_physical_plan).join("\n"); -// let actual = plan_lines.trim(); -// // if expected_optimized_lines != actual { -// // $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? -// // println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); -// // assert_eq!(expected_optimized_lines, actual); -// // } -// $(println!("\n**Testing Original Plan for case {}**", $CASE_NUMBER);)? -// allow_duplicates! { -// assert_snapshot!(actual, @ $EXPECTED_OPTIMIZED_PLAN_LINES); -// } -// } -// } - #[tokio::test] async fn test_remove_unnecessary_sort5() -> Result<()> { let left_schema = create_test_schema2()?; @@ -1402,7 +1303,7 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> { ] } }; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); } Ok(()) } @@ -1618,18 +1519,18 @@ async fn test_with_lost_ordering_unbounded_bounded( expected_optimized_bounded_parallelize_sort, ) }; - // assert_optimized!( - // expected_input, - // expected_optimized, - // physical_plan.clone(), - // false - // ); - // assert_optimized!( - // expected_input, - // expected_optimized_sort_parallelize, - // physical_plan, - // true - // ); + assert_optimized!( + expected_input, + expected_optimized, + physical_plan.clone(), + false + ); + assert_optimized!( + expected_input, + expected_optimized_sort_parallelize, + physical_plan, + true + ); Ok(()) } @@ -1720,7 +1621,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, false); + assert_optimized!(expected_input, expected_optimized, physical_plan, false); Ok(()) } @@ -1743,10 +1644,98 @@ async fn test_not_replaced_with_partial_sort_for_bounded_input() -> Result<()> { " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[b@1 ASC, c@2 ASC], file_type=parquet" ]; let expected_no_change = expected_input; - // assert_optimized!(expected_input, expected_no_change, physical_plan, false); + assert_optimized!(expected_input, expected_no_change, physical_plan, false); Ok(()) } +/// Runs the sort enforcement optimizer and asserts the plan +/// against the original and expected plans +/// +/// `$EXPECTED_PLAN_LINES`: input plan +/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan +/// `$PLAN`: the plan to optimized +/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. +/// `$CASE_NUMBER` (optional): The test case number to print on failure. +macro_rules! assert_optimized { + ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => { + let mut config = ConfigOptions::new(); + config.optimizer.repartition_sorts = $REPARTITION_SORTS; + + // This file has 4 rules that use tree node, apply these rules as in the + // EnforceSorting::optimize implementation + // After these operations tree nodes should be in a consistent state. + // This code block makes sure that these rules doesn't violate tree node integrity. + { + let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); + let adjusted = plan_requirements + .transform_up(ensure_sorting) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let new_plan = if config.optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); + let parallel = plan_with_coalesce_partitions + .transform_up(parallelize_sorts) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + parallel.plan + } else { + adjusted.plan + }; + + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); + let updated_plan = plan_with_pipeline_fixer + .transform_up(|plan_with_pipeline_fixer| { + replace_with_order_preserving_variants( + plan_with_pipeline_fixer, + false, + true, + &config, + ) + }) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); + assign_initial_requirements(&mut sort_pushdown); + check_integrity(pushdown_sorts(sort_pushdown)?)?; + // TODO: End state payloads will be checked here. + } + + let physical_plan = $PLAN; + let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + + let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES + .iter().map(|s| *s).collect(); + + if expected_plan_lines != actual { + $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)? + println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual); + assert_eq!(expected_plan_lines, actual); + } + + let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES + .iter().map(|s| *s).collect(); + + // Run the actual optimizer + let optimized_physical_plan = + EnforceSorting::new().optimize(physical_plan, &config)?; + + // Get string representation of the plan + let actual = get_plan_string(&optimized_physical_plan); + if expected_optimized_lines != actual { + $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)? + println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual); + assert_eq!(expected_optimized_lines, actual); + } + }; +} + #[tokio::test] async fn test_remove_unnecessary_sort() -> Result<()> { let schema = create_test_schema()?; @@ -1763,7 +1752,7 @@ async fn test_remove_unnecessary_sort() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1840,7 +1829,7 @@ async fn test_add_required_sort() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1866,7 +1855,7 @@ async fn test_remove_unnecessary_sort1() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1904,7 +1893,7 @@ async fn test_remove_unnecessary_sort2() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -1947,7 +1936,7 @@ async fn test_remove_unnecessary_sort3() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2030,7 +2019,7 @@ async fn test_remove_unnecessary_sort6() -> Result<()> { "SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2063,7 +2052,7 @@ async fn test_remove_unnecessary_sort7() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2094,7 +2083,7 @@ async fn test_remove_unnecessary_sort8() -> Result<()> { " SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2119,7 +2108,7 @@ async fn test_do_not_pushdown_through_limit() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2144,7 +2133,7 @@ async fn test_remove_unnecessary_spm1() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2168,7 +2157,7 @@ async fn test_remove_unnecessary_spm2() -> Result<()> { " SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, input, true); + assert_optimized!(expected_input, expected_optimized, input, true); Ok(()) } @@ -2193,7 +2182,7 @@ async fn test_change_wrong_sorting() -> Result<()> { "SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2220,7 +2209,7 @@ async fn test_change_wrong_sorting2() -> Result<()> { "SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2342,7 +2331,7 @@ async fn test_coalesce_propagate() -> Result<()> { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " DataSourceExec: partitions=1, partition_sizes=[0]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2371,7 +2360,7 @@ async fn test_replace_with_partial_sort2() -> Result<()> { "PartialSortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], common_prefix_length=[2]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -2447,7 +2436,7 @@ async fn test_replace_with_partial_sort() -> Result<()> { "PartialSortExec: expr=[a@0 ASC, c@2 ASC], common_prefix_length=[1]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]", ]; - // assert_optimized!(expected_input, expected_optimized, physical_plan, true); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); Ok(()) } @@ -3725,13 +3714,13 @@ async fn test_window_partial_constant_and_set_monotonicity() -> Result<()> { let ordering = LexOrdering::new(sort_expr).unwrap(); let physical_plan = sort_exec(ordering, window_exec); - // assert_optimized!( - // case.initial_plan, - // case.expected_plan, - // physical_plan, - // true, - // case_idx - // ); + assert_optimized!( + case.initial_plan, + case.expected_plan, + physical_plan, + true, + case_idx + ); } Ok(()) @@ -3758,7 +3747,7 @@ fn test_removes_unused_orthogonal_sort() -> Result<()> { let expected_optimized = [ "StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]" ]; - // assert_optimized!(expected_input, expected_optimized, output_sort, true); + assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } @@ -3783,7 +3772,7 @@ fn test_keeps_used_orthogonal_sort() -> Result<()> { // Test: should keep the orthogonal sort, since it modifies the output: let expected_optimized = expected_input; - // assert_optimized!(expected_input, expected_optimized, output_sort, true); + assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } @@ -3820,7 +3809,7 @@ fn test_handles_multiple_orthogonal_sorts() -> Result<()> { " SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]", " StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]", ]; - // assert_optimized!(expected_input, expected_optimized, output_sort, true); + assert_optimized!(expected_input, expected_optimized, output_sort, true); Ok(()) } From 18644f5d0808aff67b4e4a661930900421ff992a Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Mon, 30 Jun 2025 07:16:14 +0000 Subject: [PATCH 05/11] feat: migrate core test to insta --- .../combine_partial_final_agg.rs | 9 +- .../physical_optimizer/enforce_sorting.rs | 77 ++- .../limited_distinct_aggregation.rs | 255 +++++---- .../physical_optimizer/projection_pushdown.rs | 522 +++++++++++------- .../physical_optimizer/sanity_checker.rs | 151 ++--- .../tests/physical_optimizer/test_utils.rs | 21 +- datafusion/core/tests/sql/joins.rs | 109 ++-- 7 files changed, 646 insertions(+), 498 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs index 77a67b6554e68..7e31600cdc4d5 100644 --- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs @@ -23,7 +23,7 @@ use insta::assert_snapshot; use std::sync::Arc; -use crate::physical_optimizer::test_utils::{parquet_exec, trim_plan_display}; +use crate::physical_optimizer::test_utils::parquet_exec; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::config::ConfigOptions; @@ -53,15 +53,8 @@ macro_rules! assert_optimized { let optimized = optimizer.optimize($PLAN, &config)?; // Now format correctly let plan = displayable(optimized.as_ref()).indent(true).to_string(); - // let actual_lines = trim_plan_display(&plan); let actual_lines = plan.trim(); - // assert_eq!( - // &expected_lines, &actual_lines, - // "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - // expected_lines, actual_lines - // ); - assert_snapshot!(actual_lines, @ $EXPECTED_LINES); }; } diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index eecb772ba264d..a6e7cded15b03 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -95,50 +95,50 @@ macro_rules! assert_optimized { let mut config = ConfigOptions::new(); config.optimizer.repartition_sorts = $REPARTITION_SORTS; - $(println!("\n**Testing case {}**", $CASE_NUMBER);)? - // This file has 4 rules that use tree node, apply these rules as in the // EnforceSorting::optimize implementation // After these operations tree nodes should be in a consistent state. // This code block makes sure that these rules doesn't violate tree node integrity. - let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); - let adjusted = plan_requirements - .transform_up(ensure_sorting) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. - - let new_plan = if config.optimizer.repartition_sorts { - let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); - let parallel = plan_with_coalesce_partitions - .transform_up(parallelize_sorts) + { + let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); + let adjusted = plan_requirements + .transform_up(ensure_sorting) .data() .and_then(check_integrity)?; // TODO: End state payloads will be checked here. - parallel.plan - } else { - adjusted.plan - }; - let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); - let updated_plan = plan_with_pipeline_fixer - .transform_up(|plan_with_pipeline_fixer| { - replace_with_order_preserving_variants( - plan_with_pipeline_fixer, - false, - true, - &config, - ) - }) - .data() - .and_then(check_integrity)?; - // TODO: End state payloads will be checked here. + let new_plan = if config.optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); + let parallel = plan_with_coalesce_partitions + .transform_up(parallelize_sorts) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + parallel.plan + } else { + adjusted.plan + }; + + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); + let updated_plan = plan_with_pipeline_fixer + .transform_up(|plan_with_pipeline_fixer| { + replace_with_order_preserving_variants( + plan_with_pipeline_fixer, + false, + true, + &config, + ) + }) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. - let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); - assign_initial_requirements(&mut sort_pushdown); - check_integrity(pushdown_sorts(sort_pushdown)?)?; - // TODO: End state payloads will be checked here. + let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); + assign_initial_requirements(&mut sort_pushdown); + check_integrity(pushdown_sorts(sort_pushdown)?)?; + // TODO: End state payloads will be checked here. + } let physical_plan = $PLAN; @@ -1225,12 +1225,7 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> { ] } }; - assert_optimized!( - @ "", - @ "", - physical_plan, - true - ); + assert_optimized!(expected_input, expected_optimized, physical_plan, true); } Ok(()) } diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs index 409d392f7819e..8e4f25cb11dff 100644 --- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs @@ -17,11 +17,12 @@ //! Integration tests for [`LimitedDistinctAggregation`] physical optimizer rule +use insta::assert_snapshot; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ - assert_plan_matches_expected, build_group_by, mock_data, parquet_exec_with_sort, - schema, TestAggregate, + build_group_by, mock_data, parquet_exec_with_sort, plan_matches_expected, schema, + TestAggregate, }; use arrow::datatypes::DataType; @@ -39,16 +40,12 @@ use datafusion_physical_plan::{ ExecutionPlan, }; -async fn assert_results_match_expected( - plan: Arc, - expected: &str, -) -> Result<()> { +async fn assert_results_match_expected(plan: Arc) -> Result { let cfg = SessionConfig::new().with_target_partitions(1); let ctx = SessionContext::new_with_config(cfg); let batches = collect(plan, ctx.task_ctx()).await?; let actual = format!("{}", pretty_format_batches(&batches)?); - assert_eq!(actual, expected); - Ok(()) + Ok(actual) } #[tokio::test] @@ -77,27 +74,34 @@ async fn test_partial_final() -> Result<()> { Arc::new(final_agg), 4, // fetch ); - // expected to push the limit to the Partial and Final AggregateExecs - let expected = [ - "LocalLimitExec: fetch=4", - "AggregateExec: mode=Final, gby=[a@0 as a], aggr=[], lim=[4]", - "AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[], lim=[4]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; - let expected = r#" -+---+ -| a | -+---+ -| 1 | -| 2 | -| | -| 4 | -+---+ -"# - .trim(); - assert_results_match_expected(plan, expected).await?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=4 + AggregateExec: mode=Final, gby=[a@0 as a], aggr=[], lim=[4] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[], lim=[4] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); + let expected = assert_results_match_expected(plan).await?; + let actual = expected.trim(); + assert_snapshot!( + actual, + @r" + +---+ + | a | + +---+ + | 1 | + | 2 | + | | + | 4 | + +---+ + " + ); + Ok(()) } @@ -120,25 +124,32 @@ async fn test_single_local() -> Result<()> { 4, // fetch ); // expected to push the limit to the AggregateExec - let expected = [ - "LocalLimitExec: fetch=4", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; - let expected = r#" -+---+ -| a | -+---+ -| 1 | -| 2 | -| | -| 4 | -+---+ -"# - .trim(); - assert_results_match_expected(plan, expected).await?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=4 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); + let expected = assert_results_match_expected(plan).await?; + let actual = expected.trim(); + assert_snapshot!( + actual, + @r" + +---+ + | a | + +---+ + | 1 | + | 2 | + | | + | 4 | + +---+ + " + ); Ok(()) } @@ -162,24 +173,31 @@ async fn test_single_global() -> Result<()> { Some(3), // fetch ); // expected to push the skip+fetch limit to the AggregateExec - let expected = [ - "GlobalLimitExec: skip=1, fetch=3", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; - let expected = r#" -+---+ -| a | -+---+ -| 2 | -| | -| 4 | -+---+ -"# - .trim(); - assert_results_match_expected(plan, expected).await?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + GlobalLimitExec: skip=1, fetch=3 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); + let expected = assert_results_match_expected(plan).await?; + let actual = expected.trim(); + assert_snapshot!( + actual, + @r" + +---+ + | a | + +---+ + | 2 | + | | + | 4 | + +---+ + " + ); Ok(()) } @@ -210,26 +228,33 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> { 4, // fetch ); // expected to push the limit to the outer AggregateExec only - let expected = [ - "LocalLimitExec: fetch=4", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]", - "AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; - let expected = r#" -+---+ -| a | -+---+ -| 1 | -| 2 | -| | -| 4 | -+---+ -"# - .trim(); - assert_results_match_expected(plan, expected).await?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=4 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4] + AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); + let expected = assert_results_match_expected(plan).await?; + let actual = expected.trim(); + assert_snapshot!( + actual, + @r" + +---+ + | a | + +---+ + | 1 | + | 2 | + | | + | 4 | + +---+ + " + ); Ok(()) } @@ -258,13 +283,17 @@ fn test_has_order_by() -> Result<()> { 10, // fetch ); // expected not to push the limit to the AggregateExec - let expected = [ - "LocalLimitExec: fetch=10", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], ordering_mode=Sorted", - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - let plan = Arc::new(limit_exec) as _; - assert_plan_matches_expected(&plan, &expected)?; + let plan: Arc = Arc::new(limit_exec); + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=10 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], ordering_mode=Sorted + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet + " + ); Ok(()) } @@ -287,13 +316,17 @@ fn test_no_group_by() -> Result<()> { 10, // fetch ); // expected not to push the limit to the AggregateExec - let expected = [ - "LocalLimitExec: fetch=10", - "AggregateExec: mode=Single, gby=[], aggr=[]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=10 + AggregateExec: mode=Single, gby=[], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); Ok(()) } @@ -317,13 +350,17 @@ fn test_has_aggregate_expression() -> Result<()> { 10, // fetch ); // expected not to push the limit to the AggregateExec - let expected = [ - "LocalLimitExec: fetch=10", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=10 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); Ok(()) } @@ -355,12 +392,16 @@ fn test_has_filter() -> Result<()> { ); // expected not to push the limit to the AggregateExec // TODO(msirek): open an issue for `filter_expr` of `AggregateExec` not printing out - let expected = [ - "LocalLimitExec: fetch=10", - "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]", - "DataSourceExec: partitions=1, partition_sizes=[1]", - ]; let plan: Arc = Arc::new(limit_exec); - assert_plan_matches_expected(&plan, &expected)?; + let formatted = plan_matches_expected(&plan)?; + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=10 + AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)] + DataSourceExec: partitions=1, partition_sizes=[1] + " + ); Ok(()) } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 1f8aad0f23345..585bb3fb360f0 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -59,6 +59,7 @@ use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; +use insta::assert_snapshot; use itertools::Itertools; /// Mocked UDF @@ -424,19 +425,27 @@ fn test_csv_after_projection() -> Result<()> { ], csv.clone(), )?); - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[b@2 as b, d@0 as d]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[d, c, b], file_type=csv, has_header=false", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[b@2 as b, d@0 as d] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[d, c, b], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = - ["DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, d], file_type=csv, has_header=false"]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, d], file_type=csv, has_header=false" + ); Ok(()) } @@ -452,18 +461,28 @@ fn test_memory_after_projection() -> Result<()> { ], memory.clone(), )?); - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[d@2 as d, e@3 as e, a@1 as a]", - " DataSourceExec: partitions=0, partition_sizes=[]", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[d@2 as d, e@3 as e, a@1 as a] + DataSourceExec: partitions=0, partition_sizes=[] + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = ["DataSourceExec: partitions=0, partition_sizes=[]"]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @"DataSourceExec: partitions=0, partition_sizes=[]" + ); + assert_eq!( after_optimize .clone() @@ -630,22 +649,31 @@ fn test_projection_after_projection() -> Result<()> { child_projection.clone(), )?); - let initial = get_plan_string(&top_projection); - let expected_initial = [ - "ProjectionExec: expr=[new_b@3 as new_b, c@0 + new_e@1 as binary, new_b@3 as newest_b]", - " ProjectionExec: expr=[c@2 as c, e@4 as new_e, a@0 as a, b@1 as new_b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&top_projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[new_b@3 as new_b, c@0 + new_e@1 as binary, new_b@3 as newest_b] + ProjectionExec: expr=[c@2 as c, e@4 as new_e, a@0 as a, b@1 as new_b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(top_projection, &ConfigOptions::new())?; - let expected = [ - "ProjectionExec: expr=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -686,24 +714,33 @@ fn test_output_req_after_projection() -> Result<()> { sort_req.clone(), )?); - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " OutputRequirementExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + OutputRequirementExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected: [&str; 3] = [ - "OutputRequirementExec", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + OutputRequirementExec + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); - assert_eq!(get_plan_string(&after_optimize), expected); let expected_reqs = OrderingRequirements::new( [ PhysicalSortRequirement::new( @@ -766,23 +803,32 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { ], coalesce_partitions, )?); - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d] + CoalescePartitionsExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "CoalescePartitionsExec", - " ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + CoalescePartitionsExec + ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -813,23 +859,32 @@ fn test_filter_after_projection() -> Result<()> { filter.clone(), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d]", - " FilterExec: b@1 - a@0 > d@3 - a@0", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d] + FilterExec: b@1 - a@0 > d@3 - a@0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "FilterExec: b@1 - a_new@0 > d@2 - a_new@0", - " ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + FilterExec: b@1 - a_new@0 > d@2 - a_new@0 + ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -898,26 +953,35 @@ fn test_join_after_projection() -> Result<()> { ], join, )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, a@5 as a_from_right, c@7 as c_from_right]", - " SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, a@5 as a_from_right, c@7 as c_from_right] + SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", - " ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " ProjectionExec: expr=[a@0 as a_from_right, c@2 as c_from_right]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 + ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + ProjectionExec: expr=[a@0 as a_from_right, c@2 as c_from_right] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let expected_filter_col_ind = vec![ ColumnIndex { @@ -1017,25 +1081,34 @@ fn test_join_after_required_projection() -> Result<()> { ], join, )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]", - " SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e] + SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]", - " SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e] + SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1085,23 +1158,31 @@ fn test_nested_loop_join_after_projection() -> Result<()> { vec![(col_left_c, "c".to_string())], Arc::clone(&join), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c]", - " NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c] + NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); - let after_optimize = + let after_optimize_string = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1, projection=[c@2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize_string).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1, projection=[c@2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + + ); Ok(()) } @@ -1169,18 +1250,33 @@ fn test_hash_join_after_projection() -> Result<()> { ], join.clone(), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@7 as c_from_right]", " HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@7 as c_from_right] + HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); // HashJoinExec only returns result after projection. Because there are some alias columns in the projection, the ProjectionExec is not removed. - let expected = ["ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@3 as c_from_right]", " HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"]; - assert_eq!(get_plan_string(&after_optimize), expected); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@3 as c_from_right] + HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let projection = Arc::new(ProjectionExec::try_new( vec![ @@ -1194,10 +1290,18 @@ fn test_hash_join_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); // Comparing to the previous result, this projection don't have alias columns either change the order of output fields. So the ProjectionExec is removed. - let expected = ["HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"]; - assert_eq!(get_plan_string(&after_optimize), expected); + assert_snapshot!( + actual, + @r" + HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1224,23 +1328,30 @@ fn test_repartition_after_projection() -> Result<()> { ], repartition, )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new]", - " RepartitionExec: partitioning=Hash([a@0, b@1, d@3], 6), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new] + RepartitionExec: partitioning=Hash([a@0, b@1, d@3], 6), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1", - " ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1 + ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); assert_eq!( after_optimize @@ -1286,23 +1397,30 @@ fn test_sort_after_projection() -> Result<()> { Arc::new(sort_exec), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " SortExec: expr=[b@1 ASC, c@2 + a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + SortExec: expr=[b@1 ASC, c@2 + a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false]", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1331,23 +1449,30 @@ fn test_sort_preserving_after_projection() -> Result<()> { Arc::new(sort_exec), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " SortPreservingMergeExec: [b@1 ASC, c@2 + a@0 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + SortPreservingMergeExec: [b@1 ASC, c@2 + a@0 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC]", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC] + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1365,29 +1490,36 @@ fn test_union_after_projection() -> Result<()> { union.clone(), )?) as _; - let initial = get_plan_string(&projection); - let expected_initial = [ - "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(initial, expected_initial); + let initial = get_plan_string(&projection).join("\n"); + let actual = initial.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "UnionExec", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + UnionExec + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1439,11 +1571,15 @@ fn test_partition_col_projection_pushdown() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "ProjectionExec: expr=[string_col@1 as string_col, partition_col@2 as partition_col, int_col@0 as int_col]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[string_col@1 as string_col, partition_col@2 as partition_col, int_col@0 as int_col] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false + " + ); Ok(()) } @@ -1479,11 +1615,15 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let expected = [ - "ProjectionExec: expr=[string_col@1 as string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col@0 as int_col]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false" - ]; - assert_eq!(get_plan_string(&after_optimize), expected); + let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let actual = after_optimize_string.trim(); + assert_snapshot!( + actual, + @r" + ProjectionExec: expr=[string_col@1 as string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col@0 as int_col] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false + " + ); Ok(()) } diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs index f7d68e5d899c9..5d62ea4ccb20e 100644 --- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use insta::assert_snapshot; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ @@ -398,14 +399,6 @@ fn assert_sanity_check(plan: &Arc, is_sane: bool) { ); } -/// Check if the plan we created is as expected by comparing the plan -/// formatted as a string. -fn assert_plan(plan: &dyn ExecutionPlan, expected_lines: Vec<&str>) { - let plan_str = displayable(plan).indent(true).to_string(); - let actual_lines: Vec<&str> = plan_str.trim().lines().collect(); - assert_eq!(actual_lines, expected_lines); -} - #[tokio::test] /// Tests that plan is valid when the sort requirements are satisfied. async fn test_bounded_window_agg_sort_requirement() -> Result<()> { @@ -422,11 +415,16 @@ async fn test_bounded_window_agg_sort_requirement() -> Result<()> { .into(); let sort = sort_exec(ordering.clone(), source); let bw = bounded_window_exec("c9", ordering, sort); - assert_plan(bw.as_ref(), vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]", - " SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]" - ]); + let plan_str = displayable(bw.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r#" + BoundedWindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] + SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + "# + ); assert_sanity_check(&bw, true); Ok(()) } @@ -445,10 +443,15 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> { }, )]; let bw = bounded_window_exec("c9", sort_exprs, source); - assert_plan(bw.as_ref(), vec![ - "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]", - " DataSourceExec: partitions=1, partition_sizes=[0]" - ]); + let plan_str = displayable(bw.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r#" + BoundedWindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] + DataSourceExec: partitions=1, partition_sizes=[0] + "# + ); // Order requirement of the `BoundedWindowAggExec` is not satisfied. We expect to receive error during sanity check. assert_sanity_check(&bw, false); Ok(()) @@ -462,12 +465,14 @@ async fn test_global_limit_single_partition() -> Result<()> { let source = memory_exec(&schema); let limit = global_limit_exec(source, 0, Some(100)); - assert_plan( - limit.as_ref(), - vec![ - "GlobalLimitExec: skip=0, fetch=100", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(limit.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + GlobalLimitExec: skip=0, fetch=100 + DataSourceExec: partitions=1, partition_sizes=[0] + " ); assert_sanity_check(&limit, true); Ok(()) @@ -481,13 +486,15 @@ async fn test_global_limit_multi_partition() -> Result<()> { let source = memory_exec(&schema); let limit = global_limit_exec(repartition_exec(source), 0, Some(100)); - assert_plan( - limit.as_ref(), - vec![ - "GlobalLimitExec: skip=0, fetch=100", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(limit.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + GlobalLimitExec: skip=0, fetch=100 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[0] + " ); // Distribution requirement of the `GlobalLimitExec` is not satisfied. We expect to receive error during sanity check. assert_sanity_check(&limit, false); @@ -501,12 +508,14 @@ async fn test_local_limit() -> Result<()> { let source = memory_exec(&schema); let limit = local_limit_exec(source, 100); - assert_plan( - limit.as_ref(), - vec![ - "LocalLimitExec: fetch=100", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(limit.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + LocalLimitExec: fetch=100 + DataSourceExec: partitions=1, partition_sizes=[0] + " ); assert_sanity_check(&limit, true); Ok(()) @@ -540,17 +549,19 @@ async fn test_sort_merge_join_satisfied() -> Result<()> { let join_ty = JoinType::Inner; let smj = sort_merge_join_exec(left, right, &join_on, &join_ty); - assert_plan( - smj.as_ref(), - vec![ - "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]", - " RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1", - " SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(smj.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)] + RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1 + SortExec: expr=[c9@0 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + " ); assert_sanity_check(&smj, true); Ok(()) @@ -588,16 +599,18 @@ async fn test_sort_merge_join_order_missing() -> Result<()> { let join_ty = JoinType::Inner; let smj = sort_merge_join_exec(left, right, &join_on, &join_ty); - assert_plan( - smj.as_ref(), - vec![ - "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]", - " RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1", - " SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(smj.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)] + RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1 + SortExec: expr=[c9@0 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[0] + " ); // Order requirement for the `SortMergeJoin` is not satisfied for right child. We expect to receive error during sanity check. assert_sanity_check(&smj, false); @@ -634,17 +647,19 @@ async fn test_sort_merge_join_dist_missing() -> Result<()> { let join_ty = JoinType::Inner; let smj = sort_merge_join_exec(left, right, &join_on, &join_ty); - assert_plan( - smj.as_ref(), - vec![ - "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]", - " RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1", - " SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " DataSourceExec: partitions=1, partition_sizes=[0]", - ], + let plan_str = displayable(smj.as_ref()).indent(true).to_string(); + let actual = plan_str.trim(); + assert_snapshot!( + actual, + @r" + SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)] + RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1 + SortExec: expr=[c9@0 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[0] + " ); // Distribution requirement for the `SortMergeJoin` is not satisfied for right child (has round-robin partitioning). We expect to receive error during sanity check. assert_sanity_check(&smj, false); diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index c91a70989be49..d1b51adf73e28 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -509,13 +509,6 @@ pub fn check_integrity(context: PlanContext) -> Result Vec<&str> { - plan.split('\n') - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - .collect() -} - // construct a stream partition for test purposes #[derive(Debug)] pub struct TestStreamPartition { @@ -629,25 +622,15 @@ pub fn build_group_by(input_schema: &SchemaRef, columns: Vec) -> Physica PhysicalGroupBy::new_single(group_by_expr.clone()) } -pub fn assert_plan_matches_expected( - plan: &Arc, - expected: &[&str], -) -> Result<()> { - let expected_lines: Vec<&str> = expected.to_vec(); +pub fn plan_matches_expected(plan: &Arc) -> Result { let config = ConfigOptions::new(); let optimized = LimitedDistinctAggregation::new().optimize(Arc::clone(plan), &config)?; let optimized_result = displayable(optimized.as_ref()).indent(true).to_string(); - let actual_lines = trim_plan_display(&optimized_result); - - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n" - ); - Ok(()) + Ok(optimized_result) } /// Describe the type of aggregate being tested diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 729542d27e3fd..7ac63210a9671 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -13,7 +13,9 @@ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License. +// under the License.\ + +use insta::assert_snapshot; use datafusion::assert_batches_eq; use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable}; @@ -62,28 +64,21 @@ async fn join_change_in_planner() -> Result<()> { let dataframe = ctx.sql(sql).await?; let physical_plan = dataframe.create_physical_plan().await?; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let expected = { - [ - "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false" - ] - }; - let mut actual: Vec<&str> = formatted.trim().lines().collect(); - // Remove CSV lines - actual.remove(4); - actual.remove(7); + let actual = formatted.trim(); - assert_eq!( - expected, - actual[..], - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + assert_snapshot!( + actual, + @r" + SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10 + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + " ); Ok(()) } @@ -130,28 +125,21 @@ async fn join_no_order_on_filter() -> Result<()> { let dataframe = ctx.sql(sql).await?; let physical_plan = dataframe.create_physical_plan().await?; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let expected = { - [ - "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false" - ] - }; - let mut actual: Vec<&str> = formatted.trim().lines().collect(); - // Remove CSV lines - actual.remove(4); - actual.remove(7); + let actual = formatted.trim(); - assert_eq!( - expected, - actual[..], - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + assert_snapshot!( + actual, + @r" + SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10 + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST] + " ); Ok(()) } @@ -180,28 +168,21 @@ async fn join_change_in_planner_without_sort() -> Result<()> { let dataframe = ctx.sql(sql).await?; let physical_plan = dataframe.create_physical_plan().await?; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); - let expected = { - [ - "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false", - " CoalesceBatchesExec: target_batch_size=8192", - " RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8", - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", - // " DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false" - ] - }; - let mut actual: Vec<&str> = formatted.trim().lines().collect(); - // Remove CSV lines - actual.remove(4); - actual.remove(7); + let actual = formatted.trim(); - assert_eq!( - expected, - actual[..], - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + assert_snapshot!( + actual, + @r" + SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10 + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8 + RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 + StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true + " ); Ok(()) } From 61acb341f172d9a56284b7c602e0ceebbf5d0eb6 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Mon, 30 Jun 2025 07:29:05 +0000 Subject: [PATCH 06/11] fix format --- .../core/tests/physical_optimizer/enforce_sorting.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index a6e7cded15b03..06395eb41e17c 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -86,10 +86,10 @@ fn csv_exec_sorted( /// Runs the sort enforcement optimizer and asserts the plan /// against the original and expected plans /// +/// `$EXPECTED_PLAN_LINES`: input plan +/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan /// `$PLAN`: the plan to optimized -/// `$REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. -/// `$EXPECTED_PLAN_COMBINED`: The expected combined plan string with the original and optimized plans. -/// +/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. macro_rules! assert_optimized { ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr) => { let mut config = ConfigOptions::new(); @@ -140,7 +140,6 @@ macro_rules! assert_optimized { // TODO: End state payloads will be checked here. } - let physical_plan = $PLAN; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); From 773ccc10ba95bb47f67e73064448b9c6f0ef4a85 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Mon, 30 Jun 2025 07:30:11 +0000 Subject: [PATCH 07/11] fix format --- datafusion/core/tests/physical_optimizer/enforce_sorting.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 06395eb41e17c..38bc10a967e2b 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -90,6 +90,7 @@ fn csv_exec_sorted( /// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan /// `$PLAN`: the plan to optimized /// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option. +/// macro_rules! assert_optimized { ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr) => { let mut config = ConfigOptions::new(); From 792e5d6100cad4b50cdad47aff6427a5373e8ac0 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Mon, 30 Jun 2025 07:44:04 +0000 Subject: [PATCH 08/11] fix typo --- datafusion/core/tests/sql/joins.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 7ac63210a9671..fbe7e3a00f542 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -13,7 +13,7 @@ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License.\ +// under the License. use insta::assert_snapshot; From f5652e813ce5c717d2c95a552d363957e4cc74f5 Mon Sep 17 00:00:00 2001 From: Ian Lai Date: Tue, 1 Jul 2025 05:35:42 +0000 Subject: [PATCH 09/11] refactor: rename function --- .../combine_partial_final_agg.rs | 2 -- .../limited_distinct_aggregation.rs | 28 +++++++++---------- .../tests/physical_optimizer/test_utils.rs | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs index 7e31600cdc4d5..9c76f6ab6f58b 100644 --- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs @@ -45,8 +45,6 @@ use datafusion_physical_plan::ExecutionPlan; /// Runs the CombinePartialFinalAggregate optimizer and asserts the plan against the expected macro_rules! assert_optimized { ($PLAN: expr, @ $EXPECTED_LINES: literal $(,)?) => { - // let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); - // run optimizer let optimizer = CombinePartialFinalAggregate {}; let config = ConfigOptions::new(); diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs index 8e4f25cb11dff..22b5fefeedcde 100644 --- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs @@ -21,7 +21,7 @@ use insta::assert_snapshot; use std::sync::Arc; use crate::physical_optimizer::test_utils::{ - build_group_by, mock_data, parquet_exec_with_sort, plan_matches_expected, schema, + build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort, schema, TestAggregate, }; @@ -40,7 +40,7 @@ use datafusion_physical_plan::{ ExecutionPlan, }; -async fn assert_results_match_expected(plan: Arc) -> Result { +async fn run_plan_and_format(plan: Arc) -> Result { let cfg = SessionConfig::new().with_target_partitions(1); let ctx = SessionContext::new_with_config(cfg); let batches = collect(plan, ctx.task_ctx()).await?; @@ -75,7 +75,7 @@ async fn test_partial_final() -> Result<()> { 4, // fetch ); let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -86,7 +86,7 @@ async fn test_partial_final() -> Result<()> { DataSourceExec: partitions=1, partition_sizes=[1] " ); - let expected = assert_results_match_expected(plan).await?; + let expected = run_plan_and_format(plan).await?; let actual = expected.trim(); assert_snapshot!( actual, @@ -125,7 +125,7 @@ async fn test_single_local() -> Result<()> { ); // expected to push the limit to the AggregateExec let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -135,7 +135,7 @@ async fn test_single_local() -> Result<()> { DataSourceExec: partitions=1, partition_sizes=[1] " ); - let expected = assert_results_match_expected(plan).await?; + let expected = run_plan_and_format(plan).await?; let actual = expected.trim(); assert_snapshot!( actual, @@ -174,7 +174,7 @@ async fn test_single_global() -> Result<()> { ); // expected to push the skip+fetch limit to the AggregateExec let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -184,7 +184,7 @@ async fn test_single_global() -> Result<()> { DataSourceExec: partitions=1, partition_sizes=[1] " ); - let expected = assert_results_match_expected(plan).await?; + let expected = run_plan_and_format(plan).await?; let actual = expected.trim(); assert_snapshot!( actual, @@ -229,7 +229,7 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> { ); // expected to push the limit to the outer AggregateExec only let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -240,7 +240,7 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> { DataSourceExec: partitions=1, partition_sizes=[1] " ); - let expected = assert_results_match_expected(plan).await?; + let expected = run_plan_and_format(plan).await?; let actual = expected.trim(); assert_snapshot!( actual, @@ -284,7 +284,7 @@ fn test_has_order_by() -> Result<()> { ); // expected not to push the limit to the AggregateExec let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -317,7 +317,7 @@ fn test_no_group_by() -> Result<()> { ); // expected not to push the limit to the AggregateExec let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -351,7 +351,7 @@ fn test_has_aggregate_expression() -> Result<()> { ); // expected not to push the limit to the AggregateExec let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, @@ -393,7 +393,7 @@ fn test_has_filter() -> Result<()> { // expected not to push the limit to the AggregateExec // TODO(msirek): open an issue for `filter_expr` of `AggregateExec` not printing out let plan: Arc = Arc::new(limit_exec); - let formatted = plan_matches_expected(&plan)?; + let formatted = get_optimized_plan(&plan)?; let actual = formatted.trim(); assert_snapshot!( actual, diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index d1b51adf73e28..7fb0f795f2944 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -622,7 +622,7 @@ pub fn build_group_by(input_schema: &SchemaRef, columns: Vec) -> Physica PhysicalGroupBy::new_single(group_by_expr.clone()) } -pub fn plan_matches_expected(plan: &Arc) -> Result { +pub fn get_optimized_plan(plan: &Arc) -> Result { let config = ConfigOptions::new(); let optimized = From 40497e8cbca13ce2ac1e2b9f9df20b4e3b968bad Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Wed, 2 Jul 2025 22:37:02 +0800 Subject: [PATCH 10/11] fix: remove trimming --- .../limited_distinct_aggregation.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs index 22b5fefeedcde..ad15d6803413b 100644 --- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs @@ -87,9 +87,8 @@ async fn test_partial_final() -> Result<()> { " ); let expected = run_plan_and_format(plan).await?; - let actual = expected.trim(); assert_snapshot!( - actual, + expected, @r" +---+ | a | @@ -136,9 +135,8 @@ async fn test_single_local() -> Result<()> { " ); let expected = run_plan_and_format(plan).await?; - let actual = expected.trim(); assert_snapshot!( - actual, + expected, @r" +---+ | a | @@ -185,9 +183,8 @@ async fn test_single_global() -> Result<()> { " ); let expected = run_plan_and_format(plan).await?; - let actual = expected.trim(); assert_snapshot!( - actual, + expected, @r" +---+ | a | @@ -241,9 +238,8 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> { " ); let expected = run_plan_and_format(plan).await?; - let actual = expected.trim(); assert_snapshot!( - actual, + expected, @r" +---+ | a | From f10d7458474e19872fb445f954a1ae8311121ce3 Mon Sep 17 00:00:00 2001 From: Cheng-Yuan-Lai Date: Wed, 2 Jul 2025 23:22:44 +0800 Subject: [PATCH 11/11] refactor: replace get_plan_string with displayable in projection_pushdown --- .../physical_optimizer/projection_pushdown.rs | 122 ++++++++++++------ 1 file changed, 79 insertions(+), 43 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 585bb3fb360f0..6964965a6431a 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -57,7 +57,7 @@ use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::union::UnionExec; -use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; +use datafusion_physical_plan::{displayable, ExecutionPlan}; use insta::assert_snapshot; use itertools::Itertools; @@ -425,7 +425,7 @@ fn test_csv_after_projection() -> Result<()> { ], csv.clone(), )?); - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -439,7 +439,9 @@ fn test_csv_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -461,7 +463,7 @@ fn test_memory_after_projection() -> Result<()> { ], memory.clone(), )?); - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -475,7 +477,9 @@ fn test_memory_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -649,7 +653,9 @@ fn test_projection_after_projection() -> Result<()> { child_projection.clone(), )?); - let initial = get_plan_string(&top_projection).join("\n"); + let initial = displayable(top_projection.as_ref()) + .indent(true) + .to_string(); let actual = initial.trim(); assert_snapshot!( @@ -664,7 +670,9 @@ fn test_projection_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(top_projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -714,7 +722,7 @@ fn test_output_req_after_projection() -> Result<()> { sort_req.clone(), )?); - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -729,7 +737,9 @@ fn test_output_req_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -803,7 +813,7 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { ], coalesce_partitions, )?); - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -818,7 +828,9 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -850,7 +862,7 @@ fn test_filter_after_projection() -> Result<()> { )), )); let filter = Arc::new(FilterExec::try_new(predicate, csv)?); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("a", 0)), "a_new".to_string()), (Arc::new(Column::new("b", 1)), "b".to_string()), @@ -859,7 +871,7 @@ fn test_filter_after_projection() -> Result<()> { filter.clone(), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -874,7 +886,9 @@ fn test_filter_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -943,7 +957,7 @@ fn test_join_after_projection() -> Result<()> { None, StreamJoinPartitionMode::SinglePartition, )?); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("c", 2)), "c_from_left".to_string()), (Arc::new(Column::new("b", 1)), "b_from_left".to_string()), @@ -953,7 +967,7 @@ fn test_join_after_projection() -> Result<()> { ], join, )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -969,7 +983,9 @@ fn test_join_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -1066,7 +1082,7 @@ fn test_join_after_required_projection() -> Result<()> { None, StreamJoinPartitionMode::SinglePartition, )?); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("a", 5)), "a".to_string()), (Arc::new(Column::new("b", 6)), "b".to_string()), @@ -1081,7 +1097,7 @@ fn test_join_after_required_projection() -> Result<()> { ], join, )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( @@ -1097,7 +1113,9 @@ fn test_join_after_required_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( @@ -1154,11 +1172,11 @@ fn test_nested_loop_join_after_projection() -> Result<()> { None, )?) as _; - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![(col_left_c, "c".to_string())], Arc::clone(&join), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1172,7 +1190,9 @@ fn test_nested_loop_join_after_projection() -> Result<()> { let after_optimize_string = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize_string).join("\n"); + let after_optimize_string = displayable(after_optimize_string.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1241,7 +1261,7 @@ fn test_hash_join_after_projection() -> Result<()> { PartitionMode::Auto, NullEquality::NullEqualsNull, )?); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("c", 2)), "c_from_left".to_string()), (Arc::new(Column::new("b", 1)), "b_from_left".to_string()), @@ -1250,7 +1270,7 @@ fn test_hash_join_after_projection() -> Result<()> { ], join.clone(), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1264,7 +1284,9 @@ fn test_hash_join_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); // HashJoinExec only returns result after projection. Because there are some alias columns in the projection, the ProjectionExec is not removed. @@ -1290,7 +1312,9 @@ fn test_hash_join_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); // Comparing to the previous result, this projection don't have alias columns either change the order of output fields. So the ProjectionExec is removed. @@ -1320,7 +1344,7 @@ fn test_repartition_after_projection() -> Result<()> { 6, ), )?); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("b", 1)), "b_new".to_string()), (Arc::new(Column::new("a", 0)), "a".to_string()), @@ -1328,7 +1352,7 @@ fn test_repartition_after_projection() -> Result<()> { ], repartition, )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1342,7 +1366,9 @@ fn test_repartition_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1388,7 +1414,7 @@ fn test_sort_after_projection() -> Result<()> { .into(), csv, ); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("c", 2)), "c".to_string()), (Arc::new(Column::new("a", 0)), "new_a".to_string()), @@ -1397,7 +1423,7 @@ fn test_sort_after_projection() -> Result<()> { Arc::new(sort_exec), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1411,7 +1437,9 @@ fn test_sort_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1440,7 +1468,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { .into(), csv, ); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("c", 2)), "c".to_string()), (Arc::new(Column::new("a", 0)), "new_a".to_string()), @@ -1449,7 +1477,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { Arc::new(sort_exec), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1463,7 +1491,9 @@ fn test_sort_preserving_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1481,7 +1511,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { fn test_union_after_projection() -> Result<()> { let csv = create_simple_csv_exec(); let union = Arc::new(UnionExec::new(vec![csv.clone(), csv.clone(), csv])); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ (Arc::new(Column::new("c", 2)), "c".to_string()), (Arc::new(Column::new("a", 0)), "new_a".to_string()), @@ -1490,7 +1520,7 @@ fn test_union_after_projection() -> Result<()> { union.clone(), )?) as _; - let initial = get_plan_string(&projection).join("\n"); + let initial = displayable(projection.as_ref()).indent(true).to_string(); let actual = initial.trim(); assert_snapshot!( actual, @@ -1506,7 +1536,9 @@ fn test_union_after_projection() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1550,7 +1582,7 @@ fn test_partition_col_projection_pushdown() -> Result<()> { let source = partitioned_data_source(); let partitioned_schema = source.schema(); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ ( col("string_col", partitioned_schema.as_ref())?, @@ -1571,7 +1603,9 @@ fn test_partition_col_projection_pushdown() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual, @@ -1589,7 +1623,7 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { let source = partitioned_data_source(); let partitioned_schema = source.schema(); - let projection = Arc::new(ProjectionExec::try_new( + let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ ( col("string_col", partitioned_schema.as_ref())?, @@ -1615,7 +1649,9 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { let after_optimize = ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?; - let after_optimize_string = get_plan_string(&after_optimize).join("\n"); + let after_optimize_string = displayable(after_optimize.as_ref()) + .indent(true) + .to_string(); let actual = after_optimize_string.trim(); assert_snapshot!( actual,