-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Interactive: clean up when pipeline is out of scope #12339
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -251,18 +251,32 @@ def inspector(self): | |
| return self._inspector | ||
|
|
||
| def cleanup(self, pipeline=None): | ||
| """Cleans up cached states for the given pipeline. Cleans up | ||
| for all pipelines if no specific pipeline is given.""" | ||
| """Cleans up cached states for the given pipeline. Noop if the given | ||
| pipeline is absent from the environment. Cleans up for all pipelines | ||
| if no pipeline is specified.""" | ||
| if pipeline: | ||
| from apache_beam.runners.interactive import background_caching_job as bcj | ||
| bcj.attempt_to_cancel_background_caching_job(pipeline) | ||
| bcj.attempt_to_stop_test_stream_service(pipeline) | ||
| cache_manager = self.get_cache_manager(pipeline) | ||
| if cache_manager: | ||
| cache_manager.cleanup() | ||
| else: | ||
| for _, job in self._background_caching_jobs.items(): | ||
| if job: | ||
| job.cancel() | ||
| for _, controller in self._test_stream_service_controllers.items(): | ||
| if controller: | ||
| controller.stop() | ||
| for _, cache_manager in self._cache_managers.items(): | ||
| cache_manager.cleanup() | ||
| if cache_manager: | ||
| cache_manager.cleanup() | ||
|
|
||
| self.evict_background_caching_job(pipeline) | ||
| self.evict_test_stream_service_controller(pipeline) | ||
| self.evict_computed_pcollections(pipeline) | ||
| self.evict_cached_source_signature(pipeline) | ||
| self.evict_pipeline_result(pipeline) | ||
|
|
||
| def watch(self, watchable): | ||
| """Watches a watchable. | ||
|
|
@@ -343,9 +357,13 @@ def set_pipeline_result(self, pipeline, result): | |
| 'apache_beam.runners.runner.PipelineResult or its subclass') | ||
| self._main_pipeline_results[str(id(pipeline))] = result | ||
|
|
||
| def evict_pipeline_result(self, pipeline): | ||
| """Evicts the tracking of given pipeline run. Noop if absent.""" | ||
| return self._main_pipeline_results.pop(str(id(pipeline)), None) | ||
| def evict_pipeline_result(self, pipeline=None): | ||
| """Evicts the last run result of the given pipeline. Noop if the pipeline | ||
| is absent from the environment. If no pipeline is specified, evicts for all | ||
| pipelines.""" | ||
| if pipeline: | ||
| return self._main_pipeline_results.pop(str(id(pipeline)), None) | ||
| self._main_pipeline_results.clear() | ||
|
|
||
| def pipeline_result(self, pipeline): | ||
| """Gets the pipeline run result. None if absent.""" | ||
|
|
@@ -364,26 +382,37 @@ def get_background_caching_job(self, pipeline): | |
| """Gets the background caching job started from the given pipeline.""" | ||
| return self._background_caching_jobs.get(str(id(pipeline)), None) | ||
|
|
||
| def evict_background_caching_job(self, pipeline=None): | ||
| """Evicts the background caching job started from the given pipeline. Noop | ||
| if the given pipeline is absent from the environment. If no pipeline is | ||
| specified, evicts for all pipelines.""" | ||
| if pipeline: | ||
| return self._background_caching_jobs.pop(str(id(pipeline)), None) | ||
| self._background_caching_jobs.clear() | ||
|
|
||
| def set_test_stream_service_controller(self, pipeline, controller): | ||
| """Sets the test stream service controller that has started a gRPC server | ||
| serving the test stream for any job started from the given user-defined | ||
| serving the test stream for any job started from the given user defined | ||
| pipeline. | ||
| """ | ||
| self._test_stream_service_controllers[str(id(pipeline))] = controller | ||
|
|
||
| def get_test_stream_service_controller(self, pipeline): | ||
| """Gets the test stream service controller that has started a gRPC server | ||
| serving the test stream for any job started from the given user-defined | ||
| serving the test stream for any job started from the given user defined | ||
| pipeline. | ||
| """ | ||
| return self._test_stream_service_controllers.get(str(id(pipeline)), None) | ||
|
|
||
| def evict_test_stream_service_controller(self, pipeline): | ||
| """Evicts and pops the test stream service controller that has started a | ||
| gRPC server serving the test stream for any job started from the given | ||
| user-defined pipeline. | ||
| user defined pipeline. Noop if the given pipeline is absent from the | ||
| environment. If no pipeline is specified, evicts for all pipelines. | ||
| """ | ||
| return self._test_stream_service_controllers.pop(str(id(pipeline)), None) | ||
| if pipeline: | ||
| return self._test_stream_service_controllers.pop(str(id(pipeline)), None) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. should we check that it's running / stopped before evicting?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto. |
||
| self._test_stream_service_controllers.clear() | ||
|
|
||
| def is_terminated(self, pipeline): | ||
| """Queries if the most recent job (by executing the given pipeline) state | ||
|
|
@@ -400,13 +429,15 @@ def get_cached_source_signature(self, pipeline): | |
| return self._cached_source_signature.get(str(id(pipeline)), set()) | ||
|
|
||
| def evict_cached_source_signature(self, pipeline=None): | ||
| """Evicts the signature generated for each recorded source of the given | ||
| pipeline. Noop if the given pipeline is absent from the environment. If no | ||
| pipeline is specified, evicts for all pipelines.""" | ||
| if pipeline: | ||
| self._cached_source_signature.pop(str(id(pipeline)), None) | ||
| else: | ||
| self._cached_source_signature.clear() | ||
| return self._cached_source_signature.pop(str(id(pipeline)), None) | ||
| self._cached_source_signature.clear() | ||
|
|
||
| def track_user_pipelines(self): | ||
| """Record references to all user-defined pipeline instances watched in | ||
| """Record references to all user defined pipeline instances watched in | ||
| current environment. | ||
|
|
||
| Current static global singleton interactive environment holds references to | ||
|
|
@@ -416,18 +447,35 @@ def track_user_pipelines(self): | |
| then handle them differently. | ||
|
|
||
| This is invoked every time a PTransform is to be applied if the current | ||
| code execution is under ipython due to the possibility that any user-defined | ||
| code execution is under ipython due to the possibility that any user defined | ||
| pipeline can be re-evaluated through notebook cell re-execution at any time. | ||
|
|
||
| Each time this is invoked, it will check if there is a cache manager | ||
| already created for each user defined pipeline. If not, create one for it. | ||
|
|
||
| If a pipeline is no longer watched due to re-execution while its | ||
| PCollections are still in watched scope, the pipeline becomes anonymous but | ||
| still accessible indirectly through references to its PCollections. This | ||
| function also clears up internal states for those anonymous pipelines once | ||
| all their PCollections are anonymous. | ||
| """ | ||
| self._tracked_user_pipelines = set() | ||
| for watching in self.watching(): | ||
| for _, val in watching: | ||
| if isinstance(val, beam.pipeline.Pipeline): | ||
| self._tracked_user_pipelines.add(val) | ||
| _ = self.get_cache_manager(val, create_if_absent=True) | ||
| all_tracked_pipeline_ids = set(self._background_caching_jobs.keys()).union( | ||
| set(self._test_stream_service_controllers.keys()), | ||
| set(self._cache_managers.keys()), | ||
| {str(id(pcoll.pipeline)) | ||
| for pcoll in self._computed_pcolls}, | ||
| set(self._cached_source_signature.keys()), | ||
| set(self._main_pipeline_results.keys())) | ||
| inspectable_pipelines = self._inspector.inspectable_pipelines | ||
| for pipeline in all_tracked_pipeline_ids: | ||
| if pipeline not in inspectable_pipelines: | ||
| self.cleanup(pipeline) | ||
|
|
||
| @property | ||
| def tracked_user_pipelines(self): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we check that it's running / stopped before evicting?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That should not be necessary.
The idea is to let
`cleanup` clear all states and `evict_*` functions only do eviction for each field. Because if we `stop` a test stream, we'll probably also have to stop the background caching job and any other in-memory state that is related. There is rarely a case where one of the fields gets evicted without clearing the other fields, except in some unit tests, for example: This also makes sure cleanup code doesn't redundantly show up in multiple places or cause an infinite loop when evict/cleanup are called by different modules.
stopa test stream, probably, we'll also have to stop the background caching job and any other in-memory states that is related. There is rarely a case that one of the fields get evicted without clearing other fields except in some unit tests, for example:This also makes sure cleanup code doesn't redundantly show up in multiple places or cause infinite loop when evicting/cleanup are called by different modules.