diff --git a/distributed/tests/test_worker.py b/distributed/tests/test_worker.py index 2488fbc447a..20719b21ddb 100644 --- a/distributed/tests/test_worker.py +++ b/distributed/tests/test_worker.py @@ -1783,6 +1783,28 @@ async def close(self): assert not s.workers +@gen_cluster( + client=True, + nthreads=[("", 1)], + Worker=Nanny, + worker_kwargs={"heartbeat_interval": "1ms"}, +) +async def test_heartbeat_missing_restarts(c, s: Scheduler, n: Nanny): + old_heartbeat_handler = s.handlers["heartbeat_worker"] + s.handlers["heartbeat_worker"] = lambda *args, **kwargs: {"status": "missing"} + + assert n.process + await n.process.stopped.wait() + + assert not s.workers + s.handlers["heartbeat_worker"] = old_heartbeat_handler + + await n.process.running.wait() + assert n.status == Status.running + + await c.wait_for_workers(1) + + @gen_cluster(nthreads=[]) async def test_bad_local_directory(s): try: diff --git a/distributed/worker.py b/distributed/worker.py index a8f4b6ba942..1277a78b1d0 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -1245,7 +1245,8 @@ async def heartbeat(self): logger.error( f"Scheduler was unaware of this worker {self.address!r}. Shutting down." ) - await self.close() + # Something is out of sync; have the nanny restart us if possible. + await self.close(nanny=False) return self.scheduler_delay = response["time"] - middle