From dcb8a8c8628db9a27ff3554d5cc8eef5f62344c0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Sep 2022 13:17:14 -0700 Subject: [PATCH 1/6] Catch `BaseException` on UCX read error This change will ensure `CancelledError`s are catched upon shutting down the Dask cluster, which may otherwise raise various errors. See also https://github.com/dask/distributed/pull/6574 . --- distributed/comm/ucx.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 16ce82d49d2..664c8bc0405 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -325,10 +325,10 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): await self.ep.recv(header) header = struct.unpack(header_fmt, header) cuda_frames, sizes = header[:nframes], header[nframes:] - except ( - ucp.exceptions.UCXCloseError, - ucp.exceptions.UCXCanceled, - ) + (getattr(ucp.exceptions, "UCXConnectionReset", ()),): + except BaseException: + # In addition to UCX exceptions, may be CancelledError or a another + # "low-level" exception. The only safe thing to do is to abort. + # (See also https://github.com/dask/distributed/pull/6574). self.abort() raise CommClosedError("Connection closed by writer") else: From 97b0d5d9994968742f888016918927b893dc398a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Sep 2022 13:33:34 -0700 Subject: [PATCH 2/6] Test UCX for read errors --- distributed/comm/tests/test_ucx.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 581e17b8027..5d0b01caab9 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -367,3 +367,13 @@ async def test_ucx_unreachable( ): with pytest.raises(OSError, match="Timed out trying to connect to"): await Client("ucx://255.255.255.255:12345", timeout=1, asynchronous=True) + + +@gen_test() +async def test_comm_closed_on_read_error(): + reader, writer = await get_comm_pair() + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(reader.read(), 0.01) + + assert reader.closed() From ec8d7027ee20bad4c93d107a460a1cb4b733c0ee Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Sep 2022 14:21:08 -0700 Subject: [PATCH 3/6] Validate test independent of selected UCP protocol --- distributed/comm/tests/test_ucx.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5d0b01caab9..72e50b1b1b9 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -12,6 +12,7 @@ from distributed import Client, Scheduler, wait from distributed.comm import connect, listen, parse_address, ucx +from distributed.comm.core import CommClosedError from distributed.comm.registry import backends, get_backend from distributed.deploy.local import LocalCluster from distributed.diagnostics.nvml import has_cuda_context @@ -373,7 +374,9 @@ async def test_ucx_unreachable( async def test_comm_closed_on_read_error(): reader, writer = await get_comm_pair() - with pytest.raises(asyncio.TimeoutError): + # Depending on the UCP protocol selected, it may raise either + # `asyncio.TimeoutError` or `CommClosedError`, so validate either one. + with pytest.raises((asyncio.TimeoutError, CommClosedError)): await asyncio.wait_for(reader.read(), 0.01) assert reader.closed() From da64c0353dd2070908fe3e198c593db52b2b26ab Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 28 Sep 2022 11:06:54 +0200 Subject: [PATCH 4/6] Include the inner exception when raising `CommClosedError` Co-authored-by: Lawrence Mitchell --- distributed/comm/ucx.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 664c8bc0405..5899d1d4e68 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -325,12 +325,13 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): await self.ep.recv(header) header = struct.unpack(header_fmt, header) cuda_frames, sizes = header[:nframes], header[nframes:] - except BaseException: + except BaseException as e: # In addition to UCX exceptions, may be CancelledError or a another # "low-level" exception. The only safe thing to do is to abort. # (See also https://github.com/dask/distributed/pull/6574). self.abort() - raise CommClosedError("Connection closed by writer") + raise CommClosedError("Connection closed by writer.\n" + "Inner exception: {e!r}") else: # Recv frames frames = [ From 514dc68984674a43906c341bc8ef24cb086a1bb1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 28 Sep 2022 12:01:33 +0200 Subject: [PATCH 5/6] Fix missing f-string prefix Co-authored-by: Lawrence Mitchell --- distributed/comm/ucx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 5899d1d4e68..326b6077d5d 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -331,7 +331,7 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): # (See also https://github.com/dask/distributed/pull/6574). self.abort() raise CommClosedError("Connection closed by writer.\n" - "Inner exception: {e!r}") + f"Inner exception: {e!r}") else: # Recv frames frames = [ From 582bc2ce8326e82ee63e6d77a0d53454e6d8b5b7 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 29 Sep 2022 10:12:38 -0700 Subject: [PATCH 6/6] Fix linting --- distributed/comm/ucx.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 326b6077d5d..9c73a538156 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -330,8 +330,9 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): # "low-level" exception. The only safe thing to do is to abort. # (See also https://github.com/dask/distributed/pull/6574). self.abort() - raise CommClosedError("Connection closed by writer.\n" - f"Inner exception: {e!r}") + raise CommClosedError( + f"Connection closed by writer.\nInner exception: {e!r}" + ) else: # Recv frames frames = [