dask · crusaderky · Jul 2, 2026 · Jul 1, 2026 · Jul 2, 2026 · Jul 2, 2026
@@ -658,6 +658,14 @@ async def _handle_stream(self, stream, address):
         if stream is None:
             # Preparation failed
             return
+        if self.tcp_server is None:
+            # stop() was called after the connection was accepted, but before this
+            # method could run. abort_handshaking_comms() has already run and won't
+            # take care of this comm; if we left the stream dangling, the client
+            # would hang forever in the comm handshake, which is deliberately not
+            # subject to timeouts (see distributed.comm.core.connect()).
+            stream.close()
+            return
         logger.debug("Incoming connection from %r to %r", address, self.contact_address)
         local_address = self.prefix + get_stream_address(stream)
         comm = self.comm_class(stream, local_address, address, self.deserialize)

@@ -989,6 +989,26 @@ def get_connector(self):
         listener.stop()
 
 
+@gen_test()
+async def test_stop_listener_during_handle_stream(tcp):
+    """The listener is stopped after a connection has been accepted, but before the
+    server could start handling it. The accepted stream must be closed, so that the
+    client fails fast instead of hanging forever in the comm handshake, which is
+    deliberately not subject to the connect timeout (see test_handshake_slow_comm).
+    """
+    listener = await listen("tcp://127.0.0.1", echo)
+    orig_handle_stream = listener._handle_stream  # real handler, run after stop()
+
+    async def stop_then_handle_stream(stream, address):
+        listener.stop()  # stop between accept and handling
+        await orig_handle_stream(stream, address)
+
+    listener.tcp_server.handle_stream = stop_then_handle_stream  # wrap the handler
+
+    with pytest.raises(CommClosedError):
+        await wait_for(connect(listener.contact_address), timeout=5)  # must not hang
+
+
 async def check_connect_timeout(addr):
     t1 = time()
     with pytest.raises(IOError):

@@ -1296,9 +1296,19 @@ async def tensordot_stress(c, s):
             break
     else:
         raise RuntimeError("Expected 'update_graph' event not found")
-    # Test that we didn't recompute any tasks during the stress test
+    # Test that we didn't recompute any tasks during the stress test.
+    # Exception: a retired worker may complete tasks after the AMM RetireWorker
+    # policy has measured that it holds no unique keys, but before the worker is
+    # actually removed. Such tasks are lost and will be recomputed elsewhere
+    # (see RetireWorker.done).
     await async_poll_for(lambda: not s.tasks)
-    assert sum(t.start == "memory" for t in s.transition_log) == expected_tasks
+    lost = sum(
+        len(msg["lost-computed-tasks"])
+        for _, msg in await c.get_events("all")
+        if msg["action"] == "remove-worker" and msg["expected"]
+    )
+    actual = sum(t.start == "memory" for t in s.transition_log)
+    assert expected_tasks <= actual <= expected_tasks + lost
 
 
 @pytest.mark.slow
@@ -1372,6 +1382,7 @@ async def test_ReduceReplicas_stress(c, s, *workers):
     },
     scheduler_kwargs={"transition_counter_max": 500_000},
     worker_kwargs={"transition_counter_max": 500_000},
+    timeout=180,  # Normally runs in ~5s, but has been observed to take up to 48s
 )
 async def test_RetireWorker_stress(c, s, *workers, use_ReduceReplicas):
     """It is safe to retire the best part of a cluster in the middle of a computation"""