-
Notifications
You must be signed in to change notification settings - Fork 2.1k
refactor: port HashExpr proto hooks #22502
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,8 @@ use arrow::{ | |
| use datafusion_common::Result; | ||
| use datafusion_common::hash_utils::RandomState; | ||
| use datafusion_common::hash_utils::{create_hashes, with_hashes}; | ||
| #[cfg(feature = "proto")] | ||
| use datafusion_common::internal_err; | ||
| use datafusion_expr::ColumnarValue; | ||
| use datafusion_physical_expr_common::physical_expr::{ | ||
| DynHash, PhysicalExpr, PhysicalExprRef, | ||
|
|
@@ -199,6 +201,55 @@ impl PhysicalExpr for HashExpr { | |
| fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
| write!(f, "{}", self.description) | ||
| } | ||
|
|
||
| #[cfg(feature = "proto")] | ||
| fn try_to_proto( | ||
| &self, | ||
| ctx: &datafusion_physical_expr_common::physical_expr::proto_encode::PhysicalExprEncodeCtx<'_>, | ||
| ) -> Result<Option<datafusion_proto_models::protobuf::PhysicalExprNode>> { | ||
| use datafusion_proto_models::protobuf; | ||
| let on_columns = ctx.encode_children_expressions(&self.on_columns)?; | ||
| Ok(Some(protobuf::PhysicalExprNode { | ||
| expr_id: None, | ||
| expr_type: Some(protobuf::physical_expr_node::ExprType::HashExpr( | ||
| protobuf::PhysicalHashExprNode { | ||
| on_columns, | ||
| seed0: self.seed(), | ||
| description: self.description.clone(), | ||
| }, | ||
| )), | ||
| })) | ||
| } | ||
| } | ||
|
|
||
| #[cfg(feature = "proto")] | ||
| impl HashExpr { | ||
| /// Reconstruct a [`HashExpr`] from its protobuf representation. | ||
| /// | ||
| /// Takes the whole [`PhysicalExprNode`], the exact inverse of what | ||
| /// [`PhysicalExpr::try_to_proto`] produces, so every expression's | ||
| /// `try_from_proto` shares one signature. Child sub-expressions are | ||
| /// decoded recursively via [`PhysicalExprDecodeCtx::decode`]. | ||
| /// | ||
| /// [`PhysicalExprNode`]: datafusion_proto_models::protobuf::PhysicalExprNode | ||
| /// [`PhysicalExpr::try_to_proto`]: datafusion_physical_expr_common::physical_expr::PhysicalExpr::try_to_proto | ||
| /// [`PhysicalExprDecodeCtx::decode`]: datafusion_physical_expr_common::physical_expr::proto_decode::PhysicalExprDecodeCtx::decode | ||
| pub fn try_from_proto( | ||
| node: &datafusion_proto_models::protobuf::PhysicalExprNode, | ||
| ctx: &datafusion_physical_expr_common::physical_expr::proto_decode::PhysicalExprDecodeCtx<'_>, | ||
| ) -> Result<Arc<dyn PhysicalExpr>> { | ||
| use datafusion_proto_models::protobuf; | ||
| let hash_expr = match &node.expr_type { | ||
| Some(protobuf::physical_expr_node::ExprType::HashExpr(h)) => h, | ||
| _ => return internal_err!("PhysicalExprNode is not a HashExpr"), | ||
| }; | ||
|
Comment on lines
+242
to
+245
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might make sense to add a helper for this to |
||
| let on_columns = ctx.decode_children_expressions(&hash_expr.on_columns)?; | ||
| Ok(Arc::new(HashExpr::new( | ||
| on_columns, | ||
| SeededRandomState::with_seed(hash_expr.seed0), | ||
| hash_expr.description.clone(), | ||
| ))) | ||
| } | ||
| } | ||
|
|
||
| /// Physical expression that checks join keys in a [`Map`] (hash table or array map). | ||
|
|
@@ -498,6 +549,172 @@ mod tests { | |
| assert_eq!(compute_hash(&expr1), compute_hash(&expr2)); | ||
| } | ||
|
|
||
| #[cfg(feature = "proto")] | ||
| mod proto_tests { | ||
| use super::*; | ||
| use arrow::datatypes::{DataType, Field}; | ||
| use datafusion_common::internal_datafusion_err; | ||
| use datafusion_physical_expr_common::physical_expr::proto_decode::{ | ||
| PhysicalExprDecode, PhysicalExprDecodeCtx, | ||
| }; | ||
| use datafusion_physical_expr_common::physical_expr::proto_encode::{ | ||
| PhysicalExprEncode, PhysicalExprEncodeCtx, | ||
| }; | ||
| use datafusion_proto_models::protobuf; | ||
|
|
||
| struct TestEncoder; | ||
|
|
||
| impl PhysicalExprEncode for TestEncoder { | ||
| fn encode( | ||
| &self, | ||
| expr: &Arc<dyn PhysicalExpr>, | ||
| ) -> Result<protobuf::PhysicalExprNode> { | ||
| let ctx = PhysicalExprEncodeCtx::new(self); | ||
| expr.try_to_proto(&ctx)?.ok_or_else(|| { | ||
| internal_datafusion_err!("test encoder cannot encode {expr:?}") | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| struct TestDecoder; | ||
|
|
||
| impl PhysicalExprDecode for TestDecoder { | ||
| fn decode( | ||
| &self, | ||
| node: &protobuf::PhysicalExprNode, | ||
| schema: &Schema, | ||
| ) -> Result<Arc<dyn PhysicalExpr>> { | ||
| let ctx = PhysicalExprDecodeCtx::new(schema, self); | ||
| match &node.expr_type { | ||
| Some(protobuf::physical_expr_node::ExprType::Column(_)) => { | ||
| Column::try_from_proto(node, &ctx) | ||
| } | ||
| _ => internal_err!("test decoder cannot decode {node:?}"), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| fn test_decode_ctx<'a>( | ||
| schema: &'a Schema, | ||
| decoder: &'a TestDecoder, | ||
| ) -> PhysicalExprDecodeCtx<'a> { | ||
| PhysicalExprDecodeCtx::new(schema, decoder) | ||
| } | ||
|
Comment on lines
+565
to
+602
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we put these into https://github.com/apache/datafusion/blob/main/datafusion/physical-expr/src/proto_test_util.rs ? |
||
|
|
||
| #[test] | ||
| fn hash_expr_try_to_proto() { | ||
| let expr = HashExpr::new( | ||
| vec![Arc::new(Column::new("a", 0)), Arc::new(Column::new("b", 1))], | ||
| SeededRandomState::with_seed(42), | ||
| "hash_join".to_string(), | ||
| ); | ||
| let encoder = TestEncoder; | ||
| let ctx = PhysicalExprEncodeCtx::new(&encoder); | ||
|
|
||
| let proto = expr.try_to_proto(&ctx).unwrap().unwrap(); | ||
|
|
||
| assert_eq!(proto.expr_id, None); | ||
| let hash_expr = match proto.expr_type.unwrap() { | ||
| protobuf::physical_expr_node::ExprType::HashExpr(hash_expr) => hash_expr, | ||
| other => panic!("expected HashExpr, got {other:?}"), | ||
| }; | ||
| assert_eq!(hash_expr.seed0, 42); | ||
| assert_eq!(hash_expr.description, "hash_join"); | ||
| assert_eq!(hash_expr.on_columns.len(), 2); | ||
| assert!( | ||
| hash_expr | ||
| .on_columns | ||
| .iter() | ||
| .all(|expr| expr.expr_id.is_none()) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn hash_expr_try_from_proto() { | ||
| let schema = Schema::new(vec![ | ||
| Field::new("a", DataType::Int32, false), | ||
| Field::new("b", DataType::Utf8, true), | ||
| ]); | ||
| let decoder = TestDecoder; | ||
| let ctx = test_decode_ctx(&schema, &decoder); | ||
| let proto = protobuf::PhysicalExprNode { | ||
| expr_id: None, | ||
| expr_type: Some(protobuf::physical_expr_node::ExprType::HashExpr( | ||
| protobuf::PhysicalHashExprNode { | ||
| on_columns: vec![ | ||
| protobuf::PhysicalExprNode { | ||
| expr_id: None, | ||
| expr_type: Some( | ||
| protobuf::physical_expr_node::ExprType::Column( | ||
| protobuf::PhysicalColumn { | ||
| name: "a".to_string(), | ||
| index: 0, | ||
| }, | ||
| ), | ||
| ), | ||
| }, | ||
| protobuf::PhysicalExprNode { | ||
| expr_id: None, | ||
| expr_type: Some( | ||
| protobuf::physical_expr_node::ExprType::Column( | ||
| protobuf::PhysicalColumn { | ||
| name: "b".to_string(), | ||
| index: 1, | ||
| }, | ||
| ), | ||
| ), | ||
| }, | ||
| ], | ||
| seed0: 42, | ||
| description: "hash_join".to_string(), | ||
| }, | ||
| )), | ||
| }; | ||
|
|
||
| let expr = HashExpr::try_from_proto(&proto, &ctx).unwrap(); | ||
| let expr = expr.downcast_ref::<HashExpr>().unwrap(); | ||
|
|
||
| assert_eq!(expr.seed(), 42); | ||
| assert_eq!(expr.description(), "hash_join"); | ||
| assert_eq!(expr.on_columns().len(), 2); | ||
| assert_eq!( | ||
| expr.on_columns()[0] | ||
| .downcast_ref::<Column>() | ||
| .map(|col| (col.name(), col.index())), | ||
| Some(("a", 0)) | ||
| ); | ||
| assert_eq!( | ||
| expr.on_columns()[1] | ||
| .downcast_ref::<Column>() | ||
| .map(|col| (col.name(), col.index())), | ||
| Some(("b", 1)) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn hash_expr_try_from_proto_rejects_wrong_node_type() { | ||
| let schema = Schema::empty(); | ||
| let decoder = TestDecoder; | ||
| let ctx = test_decode_ctx(&schema, &decoder); | ||
| let proto = protobuf::PhysicalExprNode { | ||
| expr_id: None, | ||
| expr_type: Some(protobuf::physical_expr_node::ExprType::Column( | ||
| protobuf::PhysicalColumn { | ||
| name: "a".to_string(), | ||
| index: 0, | ||
| }, | ||
| )), | ||
| }; | ||
|
|
||
| let err = HashExpr::try_from_proto(&proto, &ctx).unwrap_err(); | ||
| assert!( | ||
| err.to_string() | ||
| .contains("PhysicalExprNode is not a HashExpr"), | ||
| "{err}" | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_hash_table_lookup_expr_eq_same() { | ||
| let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use the helpers from #22513?