Skip to content

Flaky distributed.deploy.tests.test_spec_cluster::test_adaptive_killed_worker #6794

Description

@gjoseph92
_________________________ test_adaptive_killed_worker __________________________

args = (), kwds = {}

    @wraps(func)
    def inner(*args, **kwds):
>       with self._recreate_cm():

../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:78: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
distributed/utils_test.py:1904: in clean
    with check_thread_leak() if threads else nullcontext():
../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    @contextmanager
    def check_thread_leak():
        """Context manager to ensure we haven't leaked any threads"""
        active_threads_start = threading.enumerate()
    
        yield
    
        start = time()
        while True:
            bad_threads = [
                thread
                for thread in threading.enumerate()
                if thread not in active_threads_start
                # FIXME this looks like a genuine leak that needs fixing
                and "watch message queue" not in thread.name
            ]
            if not bad_threads:
                break
            else:
                sleep(0.01)
            if time() > start + 5:
                # Raise an error with information about leaked threads
                from distributed import profile
    
                bad_thread = bad_threads[0]
                call_stacks = profile.call_stack(sys._current_frames()[bad_thread.ident])
>               assert False, (bad_thread, call_stacks)
E               AssertionError: (<Thread(asyncio_3, started daemon 123145605582848)>, ['  File "/Users/runner/miniconda3/envs/dask-distributed/lib/pyt...ibuted/lib/python3.10/concurrent/futures/thread.py", line 81, in _worker
E                 \twork_item = work_queue.get(block=True)
E                 '])
E               assert False

distributed/utils_test.py:1755: AssertionError
----------------------------- Captured stderr call -----------------------------
2022-07-26 14:46:57,289 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:54792
2022-07-26 14:46:57,289 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:54792
2022-07-26 14:46:57,289 - distributed.worker - INFO -          dashboard at:          10.79.11.16:54793
2022-07-26 14:46:57,289 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54755
2022-07-26 14:46:57,289 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:46:57,289 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:46:57,289 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:46:57,290 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-p3uk5khp
2022-07-26 14:46:57,290 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:46:58,033 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54755
2022-07-26 14:46:58,034 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:46:58,035 - distributed.core - INFO - Starting established connection
2022-07-26 14:46:58,174 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:54792
2022-07-26 14:46:58,177 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-ea338bf0-1457-492a-b791-3dc97bd58bfd Address tcp://10.79.11.16:54792 Status: Status.closing
2022-07-26 14:47:00,481 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:54845
2022-07-26 14:47:00,482 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:54845
2022-07-26 14:47:00,482 - distributed.worker - INFO -          dashboard at:          10.79.11.16:54846
2022-07-26 14:47:00,482 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54755
2022-07-26 14:47:00,482 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:00,482 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:47:00,482 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:47:00,482 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-bwpenfrc
2022-07-26 14:47:00,482 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:01,331 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54755
2022-07-26 14:47:01,332 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:01,333 - distributed.core - INFO - Starting established connection
2022-07-26 14:47:01,471 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:54845
2022-07-26 14:47:01,475 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-d17dea54-fc9a-405c-b029-58f458f5c865 Address tcp://10.79.11.16:54845 Status: Status.closing

https://github.com/dask/distributed/runs/7522388304?check_suite_focus=true#step:11:2209

Metadata

Metadata

Assignees

No one assigned

    Labels

    flaky test — Intermittent failures on CI.

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions