-
Notifications
You must be signed in to change notification settings - Fork 2.2k
fix: handle duplicate WindowFunction expressions in Substrait consumer #15211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ea30dd7
1ad1b97
f5ac0db
e577695
d1d1aab
c4f3ffa
404b70a
696bf5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1059,7 +1059,7 @@ pub async fn from_project_rel( | |
| p: &ProjectRel, | ||
| ) -> Result<LogicalPlan> { | ||
| if let Some(input) = p.input.as_ref() { | ||
| let mut input = LogicalPlanBuilder::from(consumer.consume_rel(input).await?); | ||
| let input = consumer.consume_rel(input).await?; | ||
| let original_schema = Arc::clone(input.schema()); | ||
|
|
||
| // Ensure that all expressions have a unique display name, so that | ||
|
|
@@ -1075,6 +1075,10 @@ pub async fn from_project_rel( | |
| // leaving only explicit expressions. | ||
|
|
||
| let mut explicit_exprs: Vec<Expr> = vec![]; | ||
| // For WindowFunctions, we need to wrap them in a Window relation. If there are duplicates, | ||
| // we can do the window'ing only once, then the project will duplicate the result. | ||
| // Order here doesn't matter since LPB::window_plan sorts the expressions. | ||
| let mut window_exprs: HashSet<Expr> = HashSet::new(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't order of the expressions matter? If so, I think you could use an
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like an improvement over what is on main, so I think we could merge this PR as is, but this seems the potential reordering might cause potentially confusing intermittently variable output
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, I don't think it matters per se, since the project below then puts things into right places based on the names. However, for consistency it might be nice for the order to stay. Let me see if I can quickly do that!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually no, the ordering we use here doesn't matter, since the LogicalPlanBuilder::window_plan calls
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @Blizzara |
||
| for expr in &p.expressions { | ||
| let e = consumer | ||
| .consume_expression(expr, input.clone().schema()) | ||
|
|
@@ -1084,18 +1088,24 @@ pub async fn from_project_rel( | |
| // Adding the same expression here and in the project below | ||
| // works because the project's builder uses columnize_expr(..) | ||
| // to transform it into a column reference | ||
| input = input.window(vec![e.clone()])? | ||
| window_exprs.insert(e.clone()); | ||
| } | ||
| explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?); | ||
| } | ||
|
|
||
| let input = if !window_exprs.is_empty() { | ||
| LogicalPlanBuilder::window_plan(input, window_exprs)? | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has logic built in to separate the expressions by their windows, so it's kinda nice. |
||
| } else { | ||
| input | ||
| }; | ||
|
|
||
| let mut final_exprs: Vec<Expr> = vec![]; | ||
| for index in 0..original_schema.fields().len() { | ||
| let e = Expr::Column(Column::from(original_schema.qualified_field(index))); | ||
| final_exprs.push(name_tracker.get_uniquely_named_expr(e)?); | ||
| } | ||
| final_exprs.append(&mut explicit_exprs); | ||
| input.project(final_exprs)?.build() | ||
| project(input, final_exprs) | ||
| } else { | ||
| not_impl_err!("Projection without an input is not supported") | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,10 @@ mod tests { | |
| "Projection: NOT DATA.D AS EXPR$0\ | ||
| \n TableScan: DATA" | ||
| ); | ||
|
|
||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just added these for all of the cases, since some of my iterations passed logical plan creation but failed at physical planning stage. |
||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
|
|
@@ -71,6 +75,63 @@ mod tests { | |
| \n WindowAggr: windowExpr=[[sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]\ | ||
| \n TableScan: DATA" | ||
| ); | ||
|
|
||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn double_window_function() -> Result<()> { | ||
| // Confirms a WindowExpr can be repeated in the same project. | ||
| // This wouldn't normally happen with DF-created plans since CSE would eliminate the duplicate. | ||
|
|
||
| // File generated with substrait-java's Isthmus: | ||
| // ./isthmus-cli/build/graal/isthmus --create "create table data (a int)" "select ROW_NUMBER() OVER (), ROW_NUMBER() OVER () AS aliased from data"; | ||
| let proto_plan = | ||
| read_json("tests/testdata/test_plans/double_window.substrait.json"); | ||
| let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; | ||
| let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; | ||
|
|
||
| assert_eq!( | ||
| format!("{}", plan), | ||
| "Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW__temp__0 AS ALIASED\ | ||
| \n WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ | ||
| \n TableScan: DATA" | ||
| ); | ||
|
|
||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn double_window_function_distinct_windows() -> Result<()> { | ||
| // Confirms a single project can have multiple window functions with separate windows in it. | ||
| // This wouldn't normally happen with DF-created plans since logical optimizer would | ||
| // separate them out. | ||
|
|
||
| // File generated with substrait-java's Isthmus: | ||
| // ./isthmus-cli/build/graal/isthmus --create "create table data (a int)" "select ROW_NUMBER() OVER (), ROW_NUMBER() OVER (PARTITION BY a) from data"; | ||
| let proto_plan = read_json( | ||
| "tests/testdata/test_plans/double_window_distinct_windows.substrait.json", | ||
| ); | ||
| let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; | ||
| let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; | ||
|
|
||
| assert_eq!( | ||
| format!("{}", plan), | ||
| "Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$1\ | ||
| \n WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ | ||
| \n WindowAggr: windowExpr=[[row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ | ||
| \n TableScan: DATA" | ||
| ); | ||
|
|
||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
|
|
@@ -86,7 +147,7 @@ mod tests { | |
|
|
||
| assert_eq!(format!("{}", &plan), "Values: (List([1, 2]))"); | ||
|
|
||
| // Need to trigger execution to ensure that Arrow has validated the plan | ||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
|
||
| Ok(()) | ||
|
|
@@ -107,6 +168,9 @@ mod tests { | |
| \n TableScan: sales" | ||
| ); | ||
|
|
||
| // Trigger execution to ensure plan validity | ||
| DataFrame::new(ctx.state(), plan).show().await?; | ||
|
|
||
| Ok(()) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| { | ||
| "extensionUris": [ | ||
| { | ||
| "extensionUriAnchor": 1, | ||
| "uri": "/functions_arithmetic.yaml" | ||
| } | ||
| ], | ||
| "extensions": [ | ||
| { | ||
| "extensionFunction": { | ||
| "extensionUriReference": 1, | ||
| "functionAnchor": 0, | ||
| "name": "row_number:" | ||
| } | ||
| } | ||
| ], | ||
| "relations": [ | ||
| { | ||
| "root": { | ||
| "input": { | ||
| "project": { | ||
| "common": { | ||
| "emit": { | ||
| "outputMapping": [ | ||
| 1, | ||
| 2 | ||
| ] | ||
| } | ||
| }, | ||
| "input": { | ||
| "read": { | ||
| "common": { | ||
| "direct": { | ||
| } | ||
| }, | ||
| "baseSchema": { | ||
| "names": [ | ||
| "A" | ||
| ], | ||
| "struct": { | ||
| "types": [ | ||
| { | ||
| "i32": { | ||
| "typeVariationReference": 0, | ||
| "nullability": "NULLABILITY_NULLABLE" | ||
| } | ||
| } | ||
| ], | ||
| "typeVariationReference": 0, | ||
| "nullability": "NULLABILITY_REQUIRED" | ||
| } | ||
| }, | ||
| "namedTable": { | ||
| "names": [ | ||
| "DATA" | ||
| ] | ||
| } | ||
| } | ||
| }, | ||
| "expressions": [ | ||
| { | ||
| "windowFunction": { | ||
| "functionReference": 0, | ||
| "partitions": [], | ||
| "sorts": [], | ||
| "upperBound": { | ||
| "currentRow": { | ||
| } | ||
| }, | ||
| "lowerBound": { | ||
| "unbounded": { | ||
| } | ||
| }, | ||
| "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", | ||
| "outputType": { | ||
| "i64": { | ||
| "typeVariationReference": 0, | ||
| "nullability": "NULLABILITY_REQUIRED" | ||
| } | ||
| }, | ||
| "args": [], | ||
| "arguments": [], | ||
| "invocation": "AGGREGATION_INVOCATION_ALL", | ||
| "options": [], | ||
| "boundsType": "BOUNDS_TYPE_ROWS" | ||
| } | ||
| }, | ||
| { | ||
| "windowFunction": { | ||
| "functionReference": 0, | ||
| "partitions": [], | ||
| "sorts": [], | ||
| "upperBound": { | ||
| "currentRow": { | ||
| } | ||
| }, | ||
| "lowerBound": { | ||
| "unbounded": { | ||
| } | ||
| }, | ||
| "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", | ||
| "outputType": { | ||
| "i64": { | ||
| "typeVariationReference": 0, | ||
| "nullability": "NULLABILITY_REQUIRED" | ||
| } | ||
| }, | ||
| "args": [], | ||
| "arguments": [], | ||
| "invocation": "AGGREGATION_INVOCATION_ALL", | ||
| "options": [], | ||
| "boundsType": "BOUNDS_TYPE_ROWS" | ||
| } | ||
| } | ||
| ] | ||
| } | ||
| }, | ||
| "names": [ | ||
| "EXPR$0", | ||
| "ALIASED" | ||
| ] | ||
| } | ||
| } | ||
| ], | ||
| "expectedTypeUrls": [] | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this allows feeding in a HashSet