-
Notifications
You must be signed in to change notification settings - Fork 4.5k
[BEAM-9547] Lift associative aggregations. #12469
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0858f29
7a169fd
d24e048
c5ddbe6
afaf3e6
b101ee2
238d1e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
|
|
||
| from __future__ import absolute_import | ||
|
|
||
| import sys | ||
| import unittest | ||
|
|
||
| import numpy as np | ||
|
|
@@ -36,7 +37,7 @@ def _run_test(self, func, *args): | |
| expected = func(*args) | ||
| actual = expressions.Session({}).evaluate(func(*deferred_args)._expr) | ||
| self.assertTrue( | ||
| expected.equals(actual), | ||
| getattr(expected, 'equals', expected.__eq__)(actual), | ||
| 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual)) | ||
|
|
||
| def test_series_arithmetic(self): | ||
|
|
@@ -81,6 +82,26 @@ def test_loc(self): | |
| self._run_test(lambda df: df.loc[df.A > 10], df) | ||
| self._run_test(lambda df: df.loc[lambda df: df.A > 10], df) | ||
|
|
||
| def test_series_agg(self): | ||
| s = pd.Series(list(range(16))) | ||
| self._run_test(lambda s: s.agg('sum'), s) | ||
| self._run_test(lambda s: s.agg(['sum']), s) | ||
| with beam.dataframe.allow_non_parallel_operations(): | ||
| self._run_test(lambda s: s.agg(['sum', 'mean']), s) | ||
| self._run_test(lambda s: s.agg(['mean']), s) | ||
| self._run_test(lambda s: s.agg('mean'), s) | ||
|
|
||
| @unittest.skipIf(sys.version_info < (3, 6), 'Nondeterministic dict ordering.') | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be reasonable to re-order the columns by name when asserting equality?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Column ordering seems to be a fairly fundamental property of dataframes that I'd prefer to check in general, and 3.5 won't be supported for long. |
||
| def test_dataframe_agg(self): | ||
| df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, 3, 5, 7]}) | ||
| self._run_test(lambda df: df.agg('sum'), df) | ||
| with beam.dataframe.allow_non_parallel_operations(): | ||
| self._run_test(lambda df: df.agg(['sum', 'mean']), df) | ||
| self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'sum'}), df) | ||
| self._run_test(lambda df: df.agg({'A': 'sum', 'B': 'mean'}), df) | ||
| self._run_test(lambda df: df.agg({'A': ['sum', 'mean']}), df) | ||
| self._run_test(lambda df: df.agg({'A': ['sum', 'mean'], 'B': 'min'}), df) | ||
|
|
||
|
|
||
| class AllowNonParallelTest(unittest.TestCase): | ||
| def _use_non_parallel_operation(self): | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -125,6 +125,7 @@ def default_label(self): | |||||||||||||||
| return '%s:%s' % (self.stage.ops, id(self)) | ||||||||||||||||
|
|
||||||||||||||||
| def expand(self, pcolls): | ||||||||||||||||
|
|
||||||||||||||||
| scalar_inputs = [expr for expr in self.stage.inputs if is_scalar(expr)] | ||||||||||||||||
| tabular_inputs = [ | ||||||||||||||||
| expr for expr in self.stage.inputs if not is_scalar(expr) | ||||||||||||||||
|
|
@@ -180,6 +181,22 @@ def __init__(self, inputs, partitioning): | |||||||||||||||
| self.ops = [] | ||||||||||||||||
| self.outputs = set() | ||||||||||||||||
|
|
||||||||||||||||
| def __repr__(self, indent=0): | ||||||||||||||||
| if indent: | ||||||||||||||||
| sep = '\n' + ' ' * indent | ||||||||||||||||
| else: | ||||||||||||||||
| sep = '' | ||||||||||||||||
| return ( | ||||||||||||||||
| "Stage[%sinputs=%s, %spartitioning=%s, %sops=%s, %soutputs=%s]" % ( | ||||||||||||||||
| sep, | ||||||||||||||||
| self.inputs, | ||||||||||||||||
| sep, | ||||||||||||||||
| self.partitioning, | ||||||||||||||||
| sep, | ||||||||||||||||
| self.ops, | ||||||||||||||||
| sep, | ||||||||||||||||
| self.outputs)) | ||||||||||||||||
|
|
||||||||||||||||
| # First define some helper functions. | ||||||||||||||||
| def output_is_partitioned_by(expr, stage, partitioning): | ||||||||||||||||
| if partitioning == partitionings.Nothing(): | ||||||||||||||||
|
|
@@ -244,6 +261,11 @@ def expr_to_stages(expr): | |||||||||||||||
| # It also must be declared as an output of the producing stage. | ||||||||||||||||
| expr_to_stage(arg).outputs.add(arg) | ||||||||||||||||
| stage.ops.append(expr) | ||||||||||||||||
| # Ensure that any inputs for the overall transform are added | ||||||||||||||||
| # in downstream stages. | ||||||||||||||||
| for arg in expr.args(): | ||||||||||||||||
| if arg in inputs: | ||||||||||||||||
| stage.inputs.add(arg) | ||||||||||||||||
|
Comment on lines +266 to +268
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||||||||||||||||
| # This is a list as given expression may be available in many stages. | ||||||||||||||||
| return [stage] | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we're missing this alias in `Series`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call. Done.