Skip to content

Flaky distributed.deploy.tests.test_spec_cluster::test_restart #6795

Description

@gjoseph92
_________________________________ test_restart _________________________________

args = (), kwds = {}

    @wraps(func)
    def inner(*args, **kwds):
>       with self._recreate_cm():

../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:78: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
distributed/utils_test.py:1904: in clean
    with check_thread_leak() if threads else nullcontext():
../../../miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:142: in __exit__
    next(self.gen)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    @contextmanager
    def check_thread_leak():
        """Context manager to ensure we haven't leaked any threads"""
        active_threads_start = threading.enumerate()
    
        yield
    
        start = time()
        while True:
            bad_threads = [
                thread
                for thread in threading.enumerate()
                if thread not in active_threads_start
                # FIXME this looks like a genuine leak that needs fixing
                and "watch message queue" not in thread.name
            ]
            if not bad_threads:
                break
            else:
                sleep(0.01)
            if time() > start + 5:
                # Raise an error with information about leaked threads
                from distributed import profile
    
                bad_thread = bad_threads[0]
                call_stacks = profile.call_stack(sys._current_frames()[bad_thread.ident])
>               assert False, (bad_thread, call_stacks)
E               AssertionError: (<Thread(asyncio_4, started daemon 123145657024512)>, ['  File "/Users/runner/miniconda3/envs/dask-distributed/lib/pyt...ibuted/lib/python3.10/concurrent/futures/thread.py", line 81, in _worker
E                 \twork_item = work_queue.get(block=True)
E                 '])
E               assert False

distributed/utils_test.py:1755: AssertionError
----------------------------- Captured stderr call -----------------------------
2022-07-26 14:47:09,998 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:54996
2022-07-26 14:47:09,999 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:54996
2022-07-26 14:47:09,999 - distributed.worker - INFO -          dashboard at:          10.79.11.16:54998
2022-07-26 14:47:09,999 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54947
2022-07-26 14:47:09,999 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:09,999 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:47:09,999 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:47:09,999 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-j6b4ssyq
2022-07-26 14:47:09,999 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:10,002 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:54997
2022-07-26 14:47:10,003 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:54997
2022-07-26 14:47:10,003 - distributed.worker - INFO -          dashboard at:          10.79.11.16:54999
2022-07-26 14:47:10,003 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54947
2022-07-26 14:47:10,003 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:10,003 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:47:10,003 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:47:10,003 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-175h2tkz
2022-07-26 14:47:10,003 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:10,822 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54947
2022-07-26 14:47:10,822 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:10,825 - distributed.core - INFO - Starting established connection
2022-07-26 14:47:10,832 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54947
2022-07-26 14:47:10,833 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:10,835 - distributed.core - INFO - Starting established connection
2022-07-26 14:47:10,858 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:54996
2022-07-26 14:47:10,858 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:54997
2022-07-26 14:47:10,861 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-a09f495d-5495-448a-85d9-d2c5f735753c Address tcp://10.79.11.16:54996 Status: Status.closing
2022-07-26 14:47:10,862 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-da6fefef-0473-410b-9b09-012c1a356e24 Address tcp://10.79.11.16:54997 Status: Status.closing
2022-07-26 14:47:11,123 - distributed.nanny - WARNING - Restarting worker
2022-07-26 14:47:11,160 - distributed.nanny - WARNING - Restarting worker
2022-07-26 14:47:13,037 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:55053
2022-07-26 14:47:13,037 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:55053
2022-07-26 14:47:13,038 - distributed.worker - INFO -          dashboard at:          10.79.11.16:55054
2022-07-26 14:47:13,038 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54947
2022-07-26 14:47:13,038 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,038 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:47:13,038 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:47:13,038 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-7l0_0qgj
2022-07-26 14:47:13,038 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,133 - distributed.worker - INFO -       Start worker at:    tcp://10.79.11.16:55056
2022-07-26 14:47:13,133 - distributed.worker - INFO -          Listening to:    tcp://10.79.11.16:55056
2022-07-26 14:47:13,133 - distributed.worker - INFO -          dashboard at:          10.79.11.16:55057
2022-07-26 14:47:13,133 - distributed.worker - INFO - Waiting to connect to:    tcp://10.79.11.16:54947
2022-07-26 14:47:13,133 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,134 - distributed.worker - INFO -               Threads:                          1
2022-07-26 14:47:13,134 - distributed.worker - INFO -                Memory:                   4.67 GiB
2022-07-26 14:47:13,134 - distributed.worker - INFO -       Local Directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/dask-worker-space/worker-u5v6idqd
2022-07-26 14:47:13,134 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,850 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54947
2022-07-26 14:47:13,851 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,865 - distributed.core - INFO - Starting established connection
2022-07-26 14:47:13,926 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
2022-07-26 14:47:13,926 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
2022-07-26 14:47:13,954 - distributed.worker - INFO -         Registered to:    tcp://10.79.11.16:54947
2022-07-26 14:47:13,955 - distributed.worker - INFO - -------------------------------------------------
2022-07-26 14:47:13,961 - distributed.core - INFO - Starting established connection
2022-07-26 14:47:14,061 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:55053
2022-07-26 14:47:14,061 - distributed.worker - INFO - Stopping worker at tcp://10.79.11.16:55056
2022-07-26 14:47:14,064 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-6688e5a5-bb12-4f82-b7f7-055093c8fb29 Address tcp://10.79.11.16:55053 Status: Status.closing
2022-07-26 14:47:14,065 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-500c5cbf-98b1-4a42-bdd3-0873c4507deb Address tcp://10.79.11.16:55056 Status: Status.closing

https://github.com/dask/distributed/runs/7522388304?check_suite_focus=true#step:11:2289

Metadata

Metadata

Assignees

No one assigned

    Labels

    flaky test: Intermittent failures on CI.

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions