diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 581e17b8027..72e50b1b1b9 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -12,6 +12,7 @@ from distributed import Client, Scheduler, wait from distributed.comm import connect, listen, parse_address, ucx +from distributed.comm.core import CommClosedError from distributed.comm.registry import backends, get_backend from distributed.deploy.local import LocalCluster from distributed.diagnostics.nvml import has_cuda_context @@ -367,3 +368,15 @@ async def test_ucx_unreachable( ): with pytest.raises(OSError, match="Timed out trying to connect to"): await Client("ucx://255.255.255.255:12345", timeout=1, asynchronous=True) + + +@gen_test() +async def test_comm_closed_on_read_error(): + reader, writer = await get_comm_pair() + + # Depending on the UCP protocol selected, it may raise either + # `asyncio.TimeoutError` or `CommClosedError`, so validate either one. + with pytest.raises((asyncio.TimeoutError, CommClosedError)): + await asyncio.wait_for(reader.read(), 0.01) + + assert reader.closed() diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 16ce82d49d2..9c73a538156 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -325,12 +325,14 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): await self.ep.recv(header) header = struct.unpack(header_fmt, header) cuda_frames, sizes = header[:nframes], header[nframes:] - except ( - ucp.exceptions.UCXCloseError, - ucp.exceptions.UCXCanceled, - ) + (getattr(ucp.exceptions, "UCXConnectionReset", ()),): + except BaseException as e: + # In addition to UCX exceptions, may be CancelledError or a another + # "low-level" exception. The only safe thing to do is to abort. + # (See also https://github.com/dask/distributed/pull/6574). self.abort() - raise CommClosedError("Connection closed by writer") + raise CommClosedError( + f"Connection closed by writer.\nInner exception: {e!r}" + ) else: # Recv frames frames = [