From a9c17f8ec82b57d015cf54f5d53190acc266a1e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Jan 2019 11:49:17 -0800 Subject: [PATCH 01/68] ENH: UCX-based Comms * Stubs for classes --- distributed/comm/__init__.py | 5 + distributed/comm/tests/__init__.py | 0 distributed/comm/tests/test_comms.py | 12 +- distributed/comm/tests/test_ucx.py | 194 +++++++++++++++ distributed/comm/ucx.py | 316 ++++++++++++++++++++++++ distributed/core.py | 7 +- distributed/deploy/local.py | 40 ++- distributed/deploy/tests/test_local.py | 15 ++ distributed/protocol/__init__.py | 6 + distributed/protocol/cuda.py | 38 +++ distributed/protocol/tests/test_cuda.py | 33 +++ distributed/protocol/utils.py | 4 + distributed/utils.py | 7 +- 13 files changed, 665 insertions(+), 12 deletions(-) create mode 100644 distributed/comm/tests/__init__.py create mode 100644 distributed/comm/tests/test_ucx.py create mode 100644 distributed/comm/ucx.py create mode 100644 distributed/protocol/cuda.py create mode 100644 distributed/protocol/tests/test_cuda.py diff --git a/distributed/comm/__init__.py b/distributed/comm/__init__.py index dfda0459a54..2a4670b5707 100644 --- a/distributed/comm/__init__.py +++ b/distributed/comm/__init__.py @@ -12,6 +12,11 @@ def _register_transports(): from . import inproc from . import tcp + try: + from . 
import ucx + except ImportError: + pass + _register_transports() diff --git a/distributed/comm/tests/__init__.py b/distributed/comm/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index 0cfeb7a63a8..cdf40d3f2e7 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -25,7 +25,8 @@ from distributed.comm import (tcp, inproc, connect, listen, CommClosedError, parse_address, parse_host_port, unparse_host_port, resolve_address, - get_address_host, get_local_address_for) + get_address_host, get_local_address_for, + registry) EXTERNAL_IP4 = get_ip() @@ -458,7 +459,7 @@ def handle_comm(comm): # Check listener properties bound_addr = listener.listen_address bound_scheme, bound_loc = parse_address(bound_addr) - assert bound_scheme in ('inproc', 'tcp', 'tls') + assert bound_scheme in registry.backends assert bound_scheme == parse_address(addr)[0] if check_listen_addr is not None: @@ -501,6 +502,12 @@ def client_communicate(key, delay=0): listener.stop() +@gen_test() +def test_ucx_client_server(): + pytest.importorskip("distributed.comm.ucx") + yield check_client_server('ucx://10.33.225.160') + + def tcp_eq(expected_host, expected_port=None): def checker(loc): host, port = parse_host_port(loc) @@ -598,6 +605,7 @@ def test_inproc_client_server(): yield check_client_server(inproc.new_address(), inproc_check()) + # # TLS certificate handling # diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py new file mode 100644 index 00000000000..5379b321f19 --- /dev/null +++ b/distributed/comm/tests/test_ucx.py @@ -0,0 +1,194 @@ +import asyncio +import itertools + +import pytest +import dask +import numpy as np + +from distributed.comm import ucx, listen, connect +from distributed.comm.registry import backends, get_backend +from distributed.comm import ucx, parse_address, parse_host_port +from 
distributed.protocol import to_serialize +from distributed.utils_test import gen_test + +from .test_comms import check_deserialize +import ucp_py as ucp + + +ADDRESS = ucx.ADDRESS +HOST, PORT = parse_host_port(ADDRESS.lstrip("ucx://")) +HOST = 'ucx://' + HOST +# Currently having some issues with re-using ports. +# Tests just hang. Still debugging. +port_counter = itertools.count(PORT) + + +def test_parse_address(): + result = ucx._parse_address("ucx://10.33.225.160") + assert result == ("ucx", "10.33.225.160") + + +def test_parse_host_port(): + assert ucx._parse_host_port("10.33.225.160") == ("10.33.225.160", 13337) + assert ucx._parse_host_port("10.33.225.160:13337") == ("10.33.225.160", 13337) + assert ucx._parse_host_port("10.33.225.160:13338") == ("10.33.225.160", 13338) + + +def test_registered(): + assert "ucx" in backends + backend = get_backend("ucx") + assert isinstance(backend, ucx.UCXBackend) + + +async def get_comm_pair(listen_addr, listen_args=None, connect_args=None, **kwargs): + q = asyncio.queues.Queue() + + async def handle_comm(comm): + await q.put(comm) + + listener = listen(listen_addr, handle_comm, connection_args=listen_args, **kwargs) + listener.start() + + comm = await connect( + listener.contact_address, connection_args=connect_args, **kwargs + ) + serv_com = await q.get() + return comm, serv_com + + +@pytest.mark.asyncio +async def test_ping_pong(): + address = "{}:{}".format(HOST, next(port_counter)) + com, serv_com = await get_comm_pair(address) + msg = {"op": "ping"} + await com.write(msg) + result = await serv_com.read() + assert result == msg + result["op"] = "pong" + + await serv_com.write(result) + + result = await com.read() + assert result == {"op": "pong"} + + await com.close() + await serv_com.close() + + +@pytest.mark.asyncio +async def test_comm_objs(): + address = "{}:{}".format(HOST, next(port_counter)) + comm, serv_com = await get_comm_pair(address) + + assert comm.peer_address == address + scheme, loc = 
parse_address(comm.peer_address) + assert scheme == 'ucx' + + assert comm.peer_address == address + scheme, loc = parse_address(serv_com.peer_address) + assert scheme == 'ucx' + + +def test_ucx_specific(): + """ + Test concrete UCX API. + """ + # TODO: + # 1. ensure exceptions in handle_comm fail the test + # 2. Use dict in read / write, put seralization there. + # 3. Test peer_address + # 4. Test cleanup + async def f(): + address = "{}:{}".format(HOST, next(port_counter)) + + async def handle_comm(comm): + # XXX: failures here don't fail the build yet + msg = await comm.read() + msg["op"] = "pong" + await comm.write(msg) + assert comm.closed() is False + await comm.close() + assert comm.closed + + listener = ucx.UCXListener(address, handle_comm) + listener.start() + host, port = listener.get_host_port() + assert host.count(".") == 3 + assert port > 0 + + connector = ucx.UCXConnector() + l = [] + + async def client_communicate(key, delay=0): + addr = "%s:%d" % (host, port) + comm = await connector.connect(addr) + # TODO: peer_address + # assert comm.peer_address == 'ucx://' + addr + assert comm.extra_info == {} + msg = {"op": "ping", "data": key} + await comm.write(msg) + if delay: + await asyncio.sleep(delay) + msg = await comm.read() + assert msg == {"op": "pong", "data": key} + l.append(key) + return comm + assert comm.closed() is False + await comm.close() + assert comm.closed + + comm = await client_communicate(key=1234, delay=0.5) + + # Many clients at once + N = 2 + futures = [client_communicate(key=i, delay=0.05) for i in range(N)] + await asyncio.gather(*futures) + assert set(l) == {1234} | set(range(N)) + + asyncio.run(f()) + + +@pytest.mark.asyncio +async def test_ping_pong_data(): + data = np.ones((10, 10)) + # TODO: broken for large arrays + address = "{}:{}".format(HOST, next(port_counter)) + com, serv_com = await get_comm_pair(address) + msg = {"op": "ping", "data": to_serialize(data)} + await com.write(msg) + result = await serv_com.read() + 
result["op"] = "pong" + data2 = result.pop('data') + np.testing.assert_array_equal(data2, data) + + await serv_com.write(result) + + result = await com.read() + assert result == {"op": "pong"} + + await com.close() + await serv_com.close() + + +@gen_test() +def test_ucx_deserialize(): + yield check_deserialize("tcp://") + + +@pytest.mark.asyncio +async def test_ping_pong_cupy(): + cupy = pytest.importorskip('cupy') + address = "{}:{}".format(HOST, next(port_counter)) + com, serv_com = await get_comm_pair(address) + + arr = cupy.random.random((100, 10)) + msg = {"op": "ping", 'data': to_serialize(arr)} + + await com.write(msg) + result = await serv_com.read() + data2 = result.pop('data') + + assert result['op'] == 'ping' + cupy.testing.assert_array_equal(arr, data2) + await com.close() + await serv_com.close() diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py new file mode 100644 index 00000000000..541817c9465 --- /dev/null +++ b/distributed/comm/ucx.py @@ -0,0 +1,316 @@ +""" +:ref:`UCX`_ based communications for distributed. + +See :ref:`communcations` for more. + +.. 
_UCX: https://github.com/openucx/ucx +""" +import asyncio +import itertools +import logging +import sys +import struct +import msgpack + +from dask import config + +from .addressing import parse_host_port, unparse_host_port +from .core import Comm, Connector, Listener +from .registry import Backend, backends +from .utils import ensure_concrete_host, to_frames, from_frames +from ..utils import ensure_ip, get_ip, get_ipv6, nbytes + +import ucp_py as ucp + +logger = logging.getLogger(__name__) +MAX_MSG_LOG = 23 +PORT = 13337 +IP = ucp.get_address() +DEFAULT_ADDRESS = f"ucx://{IP}:{PORT}" + +# set in ~/.dask/config.yaml +# or DASK_DISTRIBUTED__COMM__UCXADDRESS +ADDRESS = DEFAULT_ADDRESS +_PORT_COUNTER = itertools.count(PORT) + +_INITIALIZED = False + + +def _ucp_init(): + global _INITIALIZED + + if not _INITIALIZED: + ucp.init() + _INITIALIZED = True + + +# ---------------------------------------------------------------------------- +# Addressing +# TODO: Parts of these should probably be moved to `comm/addressing.py` +# ---------------------------------------------------------------------------- + + +def _parse_address(addr: str, strict=False) -> tuple: + """ + >>> _parse_address("ucx://10.33.225.160") + """ + if not addr.startswith("ucx://"): + raise ValueError("Invalid url scheme {}".format(addr)) + + proto, address = addr.split("://", 1) + return proto, address + + +def _parse_host_port(address: str, default_port=None) -> tuple: + """ + Parse an endpoint address given in the form "host:port". 
+ + >>> _parse_host_port("10.33.225.160:13337") + ("10.33.225.160", 13337) + """ + if address.startswith("ucx://"): + _, address = _parse_address(address) + + default_port = default_port or 13337 + return parse_host_port(address, default_port=default_port) + + +def _unparse_host_port(host, port=None): + return unparse_host_port(host, port) + + +def get_endpoint_address(endpoint): + # TODO: ucx-py: 18 + pass + + +# ---------------------------------------------------------------------------- +# Comm Interface +# ---------------------------------------------------------------------------- + + +class UCX(Comm): + """Comm object using UCP. + + Parameters + ---------- + ep : ucp.ucp_py_ep + The UCP endpoint. + address : str + The address, prefixed with `ucx://` to use. + deserialize : bool, default True + Whether to deserialize data in :meth:`distributed.protocol.loads` + """ + + def __init__(self, ep: ucp.ucp_py_ep, + address: str, + listener_instance, + deserialize=True): + logger.info("UCX.__init__ %s %s", address, listener_instance) + self.ep = ep + assert address.startswith("ucx") + self.address = address + self.listener_instance = listener_instance + default_port = next(_PORT_COUNTER) + self._host, self._port = _parse_host_port(address, default_port) + self._local_addr = None + self.deserialize = deserialize + + # finalizer? + + @property + def local_address(self) -> str: + return self._local_addr + + @property + def peer_address(self) -> str: + # XXX: This isn't quite for the server (from UCXListener). + # We need the port? Or the tag? 
+ return self.address + + async def write(self, msg: dict, serializers=None, on_error: str = "message"): + frames = await to_frames( + msg, serializers=serializers, on_error=on_error + ) # TODO: context= + nframes = struct.pack("Q", len(frames)) + await self.ep.send_obj(nframes, sys.getsizeof(nframes)) # send number of frames + + for frame in frames: + if isinstance(frame, memoryview): + # TODO: UCX-PY #27 + frame = frame.tobytes() + size = sys.getsizeof(frame) + await self.ep.send_obj(frame, size) + return sum(map(nbytes, frames)) + + async def read(self, deserializers=None): + resp = await self.ep.recv_future() + # XXX: this breaks things, e.g. test_ucx_specific + # dummy = b'0' * 8 + # resp = await self.ep.recv_obj(dummy, 41) + obj = ucp.get_obj_from_msg(resp) + n_frames, = struct.unpack("Q", obj) + + # Notes: + # 1. Eventually, ucp_msg will be our abstraction over GPU vs. CPU + # memory. We won't need to worry about checking for the destination + # here. The message object will have a reference to the region of + # memory. Downstream of us (say from _frames) will deserialize + # appropriately, based on whether it's a GPU or CPU object. + # 2. We will still deserialize the header early, to check the *length* + # of the next message. This lets us use the faster `recv_obj` to + # read the next nbytes. + _header_start = b'\x83\xa7headers' + + frames = [] + msg = {} + + for i in range(n_frames): + resp = await self.ep.recv_future() + frame = ucp.get_obj_from_msg(resp) + frames.append(frame) + + msg = await from_frames( + frames, deserialize=self.deserialize, deserializers=deserializers + ) + return msg + + def abort(self): + if self.ep: + ucp.destroy_ep(self.ep) + self.ep = None + # if self.listener_instance: + # ucp.stop_listener(self.listener_instance) + + async def close(self): + # TODO: Handle in-flight messages? 
+ self.abort() + + def closed(self): + return self.ep is None + + +class UCXConnector(Connector): + prefix = "ucx://" + comm_class = UCX + encrypted = False + + client = ... # TODO: add a client here? + + async def connect(self, address: str, deserialize=True, **connection_args) -> UCX: + logger.debug("UCXConnector.connect") + _ucp_init() + + ip, port = _parse_host_port(address) + ep = ucp.get_endpoint(ip.encode(), port) + return self.comm_class(ep, self.prefix + address, + listener_instance=None, + deserialize=deserialize) + + +class UCXListener(Listener): + prefix = UCXConnector.prefix + comm_class = UCXConnector.comm_class + encrypted = UCXConnector.encrypted + + def __init__( + self, + address: str, + comm_handler: None, + deserialize=False, + **connection_args, + ): + logger.debug("UCXListener.__init__") + if not address.startswith("ucx"): + address = "ucx://" + address + self.address = address + self.ip, self.port = _parse_host_port(address, default_port=next(_PORT_COUNTER)) + self.comm_handler = comm_handler + self.deserialize = deserialize + self.ep = None # type: ucp.ucp_py_ep + self.listener_instance = None # type: ucp.ListenerFuture + + # XXX: The init may be required to take args like + # {'require_encryption': None, 'ssl_context': None} + self.connection_args = connection_args + + def start(self): + async def serve_forever(client_ep, listener_instance): + ucx = UCX(client_ep, self.address, listener_instance, + deserialize=self.deserialize) + self.listener_instance = listener_instance + if self.comm_handler: + await self.comm_handler(ucx) + + _ucp_init() + # XXX: the port handling is probably incorrect. + # need to figure out if `server_port=None` is + # server_port=13337, or server_port="next free port" + server = ucp.start_listener( + serve_forever, listener_port=self.port, is_coroutine=True + ) + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.get_event_loop() + + # Does someone need to hold onto this task? 
+ loop.create_task(server.coroutine) + + def stop(self): + # What all should this do? + # ucp.stop_listener(self.ep) # do this here? + if self.ep: + ucp.destroy_ep(self.ep) + # if self.listener_instance: + # ucp.stop_listener(self.listener_instance) + + def get_host_port(self): + # TODO: TCP raises if this hasn't started yet. + return self.ip, self.port + + @property + def listen_address(self): + return self.prefix + _unparse_host_port(*self.get_host_port()) + + @property + def contact_address(self): + host, port = self.get_host_port() + host = ensure_concrete_host(host) # TODO: ensure_concrete_host + return self.prefix + unparse_host_port(host, port) + + +class UCXBackend(Backend): + # I / O + + def get_connector(self): + return UCXConnector() + + def get_listener(self, loc, handle_comm, deserialize, **connection_args): + return UCXListener(loc, handle_comm, deserialize, **connection_args) + + # Address handling + # This duplicates BaseTCPBackend + + def get_address_host(self, loc): + return _parse_host_port(loc)[0] + + def get_address_host_port(self, loc): + return _parse_host_port(loc) + + def resolve_address(self, loc): + host, port = parse_host_port(loc) + return _unparse_host_port(ensure_ip(host), port) + + def get_local_address_for(self, loc): + host, port = parse_host_port(loc) + host = ensure_ip(host) + if ":" in host: + local_host = get_ipv6(host) + else: + local_host = get_ip(host) + return unparse_host_port(local_host, None) + + +backends["ucx"] = UCXBackend() diff --git a/distributed/core.py b/distributed/core.py index bf08abebc8c..80e52458d11 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -396,6 +396,9 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): if op == 'close-stream': closed = True break + # XXX: getting a KeyError here. Our + # not stream_handlers. 
+ # It's /pdb handler = self.stream_handlers[op] handler(**merge(extra, msg)) else: @@ -412,7 +415,9 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): pdb.set_trace() raise finally: - comm.close() # TODO: why do we need this now? + yield comm.close() # TODO: why do we need this now? + # ^ Good question :) comm.close can be a + # coroutine, in which case this isn't doing anything. assert comm.closed() @gen.coroutine diff --git a/distributed/deploy/local.py b/distributed/deploy/local.py index 72a12cfaced..fa2dc5f3ad5 100644 --- a/distributed/deploy/local.py +++ b/distributed/deploy/local.py @@ -57,6 +57,10 @@ class LocalCluster(Cluster): service_kwargs: Dict[str, Dict] Extra keywords to hand to the running services security : Security + protocol: str (optiona) + Protocol to use like ``tcp://``, ``tls://``, ``inproc://`` + This defaults to sensible choice given other keyword arguments like + ``processes`` and ``security`` Examples -------- @@ -82,7 +86,8 @@ def __init__(self, n_workers=None, threads_per_worker=None, processes=True, loop=None, start=None, ip=None, scheduler_port=0, silence_logs=logging.WARN, diagnostics_port=8787, services=None, worker_services=None, service_kwargs=None, - asynchronous=False, security=None, **worker_kwargs): + asynchronous=False, security=None, protocol=None, + **worker_kwargs): if start is not None: msg = ("The start= parameter is deprecated. " "LocalCluster always starts. 
" @@ -92,6 +97,20 @@ def __init__(self, n_workers=None, threads_per_worker=None, processes=True, self.status = None self.processes = processes + + if protocol is None: + if ip and '://' in ip: + protocol = ip.split('://')[0] + elif security: + protocol = 'tls://' + elif not self.processes and not scheduler_port: + protocol = 'inproc://' + else: + protocol = 'tcp://' + if not protocol.endswith('://'): + protocol = protocol + '://' + self.protocol = protocol + self.silence_logs = silence_logs self._asynchronous = asynchronous self.security = security @@ -185,16 +204,21 @@ def _start(self, ip=None, n_workers=0): """ if self.status == 'running': return - if (ip is None) and (not self.scheduler_port) and (not self.processes): - # Use inproc transport for optimization - scheduler_address = 'inproc://' - elif ip is not None and ip.startswith('tls://'): - scheduler_address = ('%s:%d' % (ip, self.scheduler_port)) + + if self.protocol == 'inproc://': + address = self.protocol else: if ip is None: ip = '127.0.0.1' - scheduler_address = (ip, self.scheduler_port) - self.scheduler.start(scheduler_address) + + if '://' in ip: + address = ip + else: + address = self.protocol + ip + if self.scheduler_port: + address += ':' + str(self.scheduler_port) + + self.scheduler.start(address) yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)] diff --git a/distributed/deploy/tests/test_local.py b/distributed/deploy/tests/test_local.py index 26d3d28784a..48f737a99bf 100644 --- a/distributed/deploy/tests/test_local.py +++ b/distributed/deploy/tests/test_local.py @@ -555,5 +555,20 @@ def _(): cluster.sync(_) +def test_protocol_inproc(loop): + with LocalCluster(protocol='inproc://', loop=loop, processes=False) as cluster: + assert cluster.scheduler.address.startswith('inproc://') + + +def test_protocol_tcp(loop): + with LocalCluster(protocol='tcp', loop=loop, processes=False) as cluster: + assert cluster.scheduler.address.startswith('tcp://') + + +def 
test_protocol_ip(loop): + with LocalCluster(ip='tcp://127.0.0.2', loop=loop, processes=False) as cluster: + assert cluster.scheduler.address.startswith('tcp://127.0.0.2') + + if sys.version_info >= (3, 5): from distributed.deploy.tests.py3_test_deploy import * # noqa F401 diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index bd8f7331c8e..a56871b5ad5 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -63,3 +63,9 @@ def _register_sklearn(): @dask_deserialize.register_lazy("torchvision") def _register_torch(): from . import torch + + +@dask_serialize.register_lazy("cupy") +@dask_deserialize.register_lazy("cupy") +def _register_cupy(): + from . import cuda diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py new file mode 100644 index 00000000000..c9bc7875c8f --- /dev/null +++ b/distributed/protocol/cuda.py @@ -0,0 +1,38 @@ +""" +Efficient serialization GPU arrays. +""" +import cupy +from .serialize import dask_serialize, dask_deserialize + +# Some questions +# 1.Do we need *protocol-dependent* serialization? +# I assume we want this kind of serialization only when +# in UCP. +# 2. What does ucp-py need to know about? + + +@dask_serialize.register(cupy.ndarray) +def serialize_cupy_ndarray(x): + # TODO: handle non-contiguous + # shape + # typestr + # descr + # data + # version + # strides (noncontiguous-only) + header = x.__cuda_array_interface__.copy() + header['device'] = x.device.id + header['lengths'] = (x.nbytes,) # one per stride + header['compression'] = (None,) # TODO + # TODO: I don't think ucx-py should have to worry about + # MemoryPointer. Maybe some thin wrapper. 
+ return header, [x.data] + + +@dask_deserialize.register(cupy.ndarray) +def deserialize_cupy_array(header, frames): + # MemoryPointer { PoolMemory, offset } + frame, = frames + arr = cupy.ndarray(header['shape'], dtype=header['typestr'], + memptr=frame) + return arr diff --git a/distributed/protocol/tests/test_cuda.py b/distributed/protocol/tests/test_cuda.py new file mode 100644 index 00000000000..e8e95224b0d --- /dev/null +++ b/distributed/protocol/tests/test_cuda.py @@ -0,0 +1,33 @@ +import pytest + + +cupy = pytest.importorskip("cupy") +from distributed.protocol import serialize, deserialize + + +def test_serialize(): + x = cupy.ones((5000, 50)) + header, frames = serialize(x) + type_ = 'cupy.core.core.ndarray' + _, [type_serialized] = serialize(type_) + + expected_header = { + 'shape': (5000, 50), + 'typestr': " Date: Fri, 22 Feb 2019 08:39:48 -0800 Subject: [PATCH 02/68] CUDA failing --- distributed/comm/tests/test_ucx.py | 4 ++- distributed/comm/ucx.py | 55 +++++++++++++++++++----------- distributed/protocol/cuda.py | 19 ++++------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5379b321f19..1e8cb866c02 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -176,12 +176,14 @@ def test_ucx_deserialize(): @pytest.mark.asyncio +@pytest.mark.xfail(reason="UCX") # memory is garbage... async def test_ping_pong_cupy(): cupy = pytest.importorskip('cupy') address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - arr = cupy.random.random((100, 10)) + # TODO: ucx-py doesn't handle 2d yet. 
+ arr = cupy.random.random(100,) msg = {"op": "ping", 'data': to_serialize(arr)} await com.write(msg) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 541817c9465..05e15d115e2 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -133,41 +133,56 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): msg, serializers=serializers, on_error=on_error ) # TODO: context= nframes = struct.pack("Q", len(frames)) - await self.ep.send_obj(nframes, sys.getsizeof(nframes)) # send number of frames + await self.ep.send_obj(nframes) # send number of frames for frame in frames: - if isinstance(frame, memoryview): - # TODO: UCX-PY #27 - frame = frame.tobytes() - size = sys.getsizeof(frame) - await self.ep.send_obj(frame, size) + await self.ep.send_obj(frame) return sum(map(nbytes, frames)) async def read(self, deserializers=None): resp = await self.ep.recv_future() - # XXX: this breaks things, e.g. test_ucx_specific - # dummy = b'0' * 8 - # resp = await self.ep.recv_obj(dummy, 41) obj = ucp.get_obj_from_msg(resp) n_frames, = struct.unpack("Q", obj) - # Notes: - # 1. Eventually, ucp_msg will be our abstraction over GPU vs. CPU - # memory. We won't need to worry about checking for the destination - # here. The message object will have a reference to the region of - # memory. Downstream of us (say from _frames) will deserialize - # appropriately, based on whether it's a GPU or CPU object. - # 2. We will still deserialize the header early, to check the *length* - # of the next message. This lets us use the faster `recv_obj` to - # read the next nbytes. - _header_start = b'\x83\xa7headers' + # TODO: see if we care about deserializing all headers. 
+ # We could probably do some tricks to make this less expensive, + # (if it's even expensive in the first place) + header_start = b'\x83\xa7headers' + peek_bytes = len(header_start) frames = [] msg = {} + gpu_inbound = False + size = () + for i in range(n_frames): - resp = await self.ep.recv_future() + if size: + # XXX: when do we get multiple keys here? Non-contiguous? + assert len(size) == 1 + size, = size + resp = await self.ep.recv_obj(size, cuda=gpu_inbound) + # prepare for the next (header) recv + size = () + gpu_inbound = False + else: + resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) + if type(frame) == memoryview: + if frame[:peek_bytes] == header_start: + # we have a header. Let's see if + # 1. We know the next frame's length (for fast recv) + # 2. We know the next frame's memory destination (GPU or CPU). + headers = msgpack.loads(frame, use_list=False) + keys = headers[b'keys'] + for key in keys: + header = headers[b'headers'][key] + size = header.get(b'lengths', ()) + if size: + if header.get(b'is_cuda', False): + gpu_inbound = True + break + frames.append(frame) msg = await from_frames( diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index c9bc7875c8f..cc82527914e 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -14,25 +14,20 @@ @dask_serialize.register(cupy.ndarray) def serialize_cupy_ndarray(x): # TODO: handle non-contiguous - # shape - # typestr - # descr - # data - # version - # strides (noncontiguous-only) + # TODO: handle 2d header = x.__cuda_array_interface__.copy() - header['device'] = x.device.id header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO - # TODO: I don't think ucx-py should have to worry about - # MemoryPointer. Maybe some thin wrapper. 
- return header, [x.data] + header['is_cuda'] = True + return header, [x] @dask_deserialize.register(cupy.ndarray) def deserialize_cupy_array(header, frames): # MemoryPointer { PoolMemory, offset } frame, = frames - arr = cupy.ndarray(header['shape'], dtype=header['typestr'], - memptr=frame) + # TODO: put this in ucx... as a kind of "fixup" + frame.typestr = header['typestr'] + frame._shape = header['shape'] + arr = cupy.asarray(frame) return arr From 4fc6acd9490958120497d91a28e6ed63ab5b73f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 11:20:19 -0800 Subject: [PATCH 03/68] fixups --- distributed/comm/tests/test_ucx.py | 12 ++++++------ distributed/protocol/cuda.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 1e8cb866c02..188085f23e7 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -2,7 +2,6 @@ import itertools import pytest -import dask import numpy as np from distributed.comm import ucx, listen, connect @@ -12,7 +11,6 @@ from distributed.utils_test import gen_test from .test_comms import check_deserialize -import ucp_py as ucp ADDRESS = ucx.ADDRESS @@ -176,14 +174,16 @@ def test_ucx_deserialize(): @pytest.mark.asyncio -@pytest.mark.xfail(reason="UCX") # memory is garbage... -async def test_ping_pong_cupy(): +@pytest.mark.parametrize('shape', [ + (100,), + (10, 10) +]) +async def test_ping_pong_cupy(shape): cupy = pytest.importorskip('cupy') address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - # TODO: ucx-py doesn't handle 2d yet. 
- arr = cupy.random.random(100,) + arr = cupy.random.random(shape) msg = {"op": "ping", 'data': to_serialize(arr)} await com.write(msg) diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index cc82527914e..be9fdd9ae15 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -15,19 +15,31 @@ def serialize_cupy_ndarray(x): # TODO: handle non-contiguous # TODO: handle 2d + # TODO: 0d + + if x.flags.c_contiguous or x.flags.f_contiguous: + strides = x.strides + data = x.ravel() # order='K' + else: + x = cupy.ascontiguousarray(x) + strides = x.strides + data = x.ravel() + + dtype = (0, x.dtype.str) + header = x.__cuda_array_interface__.copy() header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO header['is_cuda'] = True - return header, [x] + header['dtype'] = dtype + return header, [data] @dask_deserialize.register(cupy.ndarray) def deserialize_cupy_array(header, frames): - # MemoryPointer { PoolMemory, offset } frame, = frames # TODO: put this in ucx... 
as a kind of "fixup" frame.typestr = header['typestr'] - frame._shape = header['shape'] + frame.shape = header['shape'] arr = cupy.asarray(frame) return arr From d64c5ccc03eac11cb48253594837a1e7c8302608 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 13:19:39 -0800 Subject: [PATCH 04/68] wip --- distributed/comm/ucx.py | 26 +++++++++++-------- distributed/protocol/__init__.py | 12 +++++++++ distributed/protocol/cudf.py | 38 +++++++++++++++++++++++++++ distributed/protocol/numba.py | 44 ++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 10 deletions(-) create mode 100644 distributed/protocol/cudf.py create mode 100644 distributed/protocol/numba.py diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 05e15d115e2..1de7ff04c8a 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -153,18 +153,23 @@ async def read(self, deserializers=None): frames = [] msg = {} - gpu_inbound = False - size = () + # For cudf, we would ideally do + # header = recv_future() + # columns [recv_obj(size, cuda=True) for size in header['sizes']] + # So maybe make gpu_inbound an int that has the number of remaining + # gpu_inbound recvs? + + gpu_inbound = 0 + size = [] for i in range(n_frames): if size: + this_size = size.pop() # XXX: when do we get multiple keys here? Non-contiguous? 
- assert len(size) == 1 - size, = size - resp = await self.ep.recv_obj(size, cuda=gpu_inbound) + resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) # prepare for the next (header) recv - size = () - gpu_inbound = False + if gpu_inbound: + gpu_inbound -= 1 else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) @@ -177,10 +182,11 @@ async def read(self, deserializers=None): keys = headers[b'keys'] for key in keys: header = headers[b'headers'][key] - size = header.get(b'lengths', ()) + size = list(header.get(b'lengths', [])) if size: - if header.get(b'is_cuda', False): - gpu_inbound = True + size = size[::-1] + if header.get(b'is_cuda', 0): + gpu_inbound = int(header[b'is_cuda']) break frames.append(frame) diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index a56871b5ad5..67dc66b7d0d 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -69,3 +69,15 @@ def _register_torch(): @dask_deserialize.register_lazy("cupy") def _register_cupy(): from . import cuda + + +@dask_serialize.register_lazy("numba") +@dask_deserialize.register_lazy("numba") +def _register_cupy(): + from . import numba + + +@dask_serialize.register_lazy("cudf") +@dask_deserialize.register_lazy("cudf") +def _register_cupy(): + from . import cudf diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py new file mode 100644 index 00000000000..05f3215fc0f --- /dev/null +++ b/distributed/protocol/cudf.py @@ -0,0 +1,38 @@ +import cudf +from .serialize import dask_serialize, dask_deserialize +from .numba import serialize_numba_ndarray, deserialize_numba_ndarray + + +@dask_serialize.register(cudf.DataFrame) +def serialize_cudf_dataframe(x): + # TODO: does cudf support duplicate columns? 
+ print('hey!') + sub_headers = [] + arrays = [] + + for label, col in x.iteritems(): + header, (frame,) = serialize_numba_ndarray(col.to_gpu_array()) + sub_headers.append(header) + arrays.append(frame) + + header = { + 'lengths': [len(x)] * x.shape[1], + 'is_cuda': True, + 'subheaders': sub_headers, + 'columns': x.columns, # TODO + } + + return header, arrays + + +@dask_deserialize.register(cudf.DataFrame) +def serialize_cudf_dataframe(header, frames): + assert len(frames) == len(header['columns']) + arrays = [] + + for subheader, frame in zip(header['subheaders'], frames): + array = deserialize_numba_ndarray(subheader, [frame]) + arrays.append(array) + + objs = list(zip(header['columns'], arrays)) + return cudf.DataFrame(objs) diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py new file mode 100644 index 00000000000..798619edbdf --- /dev/null +++ b/distributed/protocol/numba.py @@ -0,0 +1,44 @@ +import numba.cuda +from .serialize import dask_serialize, dask_deserialize + + +@dask_serialize.register(numba.cuda.devicearray.DeviceNDArray) +def serialize_numba_ndarray(x): + # TODO: handle non-contiguous + # TODO: handle 2d + # TODO: 0d + + if x.flags['C_CONTIGUOUS'] or x.flags['F_CONTIGUOUS']: + strides = x.strides + if x.ndim > 1: + data = x.ravel() # order='K' + else: + data = x + else: + raise ValueError("Array must be contiguous") + x = numba.ascontiguousarray(x) + strides = x.strides + if x.ndim > 1: + data = x.ravel() + else: + data = x + + dtype = (0, x.dtype.str) + nbytes = data.dtype.itemsize * data.size + + header = x.__cuda_array_interface__.copy() + header['lengths'] = (nbytes,) # one per stride + header['compression'] = (None,) # TODO + header['is_cuda'] = True + header['dtype'] = dtype + return header, [data] + + +@dask_deserialize.register(numba.cuda.devicearray.DeviceNDArray) +def deserialize_numba_ndarray(header, frames): + frame, = frames + # TODO: put this in ucx... 
as a kind of "fixup" + frame.typestr = header['typestr'] + frame.shape = header['shape'] + arr, _ = numba.cuda.devicearray.auto_device(frame) + return arr From b28668b2610ba623c7784fc0b918bc3b000a180e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 14:08:18 -0800 Subject: [PATCH 05/68] zero copy --- distributed/protocol/cudf.py | 52 +++++++++++++++++++++++++++-------- distributed/protocol/numba.py | 2 +- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 05f3215fc0f..66fee05ea68 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -6,20 +6,33 @@ @dask_serialize.register(cudf.DataFrame) def serialize_cudf_dataframe(x): # TODO: does cudf support duplicate columns? - print('hey!') sub_headers = [] - arrays = [] + arrays = [] + null_masks = [] + null_headers = [] + null_counts = {} for label, col in x.iteritems(): - header, (frame,) = serialize_numba_ndarray(col.to_gpu_array()) + header, [frame] = serialize_numba_ndarray(col.data.mem) + header['name'] = label sub_headers.append(header) arrays.append(frame) + if col.null_count: + header, [frame] = serialize_numba_ndarray(col.nullmask.mem) + header['name'] = label + null_headers.append(header) + null_masks.append(frame) + null_counts[label] = col.null_count + + arrays.extend(null_masks) header = { - 'lengths': [len(x)] * x.shape[1], + 'lengths': [len(x)] * len(arrays), 'is_cuda': True, 'subheaders': sub_headers, - 'columns': x.columns, # TODO + 'columns': x.columns, + 'null_counts': null_counts, + 'null_subheaders': null_headers } return header, arrays @@ -27,12 +40,29 @@ def serialize_cudf_dataframe(x): @dask_deserialize.register(cudf.DataFrame) def serialize_cudf_dataframe(header, frames): - assert len(frames) == len(header['columns']) - arrays = [] + # TODO: duplicate columns + + columns = header['columns'] + n_columns = len(header['columns']) + n_masks = len(header['null_subheaders']) - for 
subheader, frame in zip(header['subheaders'], frames): + masks = {} + pairs = [] + + for i in range(n_masks): + subheader = header['null_subheaders'][i] + frame = frames[n_columns + i] + mask = deserialize_numba_ndarray(subheader, [frame]) + masks[subheader['name']] = mask + + for subheader, frame in zip(header['subheaders'], frames[:n_columns]): + name = subheader['name'] array = deserialize_numba_ndarray(subheader, [frame]) - arrays.append(array) - objs = list(zip(header['columns'], arrays)) - return cudf.DataFrame(objs) + if name in masks: + series = cudf.Series.from_masked_array(array, masks[name]) + else: + series = cudf.Series(array) + pairs.append((name, series)) + + return cudf.DataFrame(pairs) diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 798619edbdf..c81e991e60a 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -40,5 +40,5 @@ def deserialize_numba_ndarray(header, frames): # TODO: put this in ucx... as a kind of "fixup" frame.typestr = header['typestr'] frame.shape = header['shape'] - arr, _ = numba.cuda.devicearray.auto_device(frame) + arr, _ = numba.cuda.devicearray.auto_device(frame) return arr From 4fcafae6727e2455d96282b15435f7040f55c651 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 04:47:51 -0800 Subject: [PATCH 06/68] wip --- distributed/comm/tests/test_ucx.py | 50 +++++++++++++++++++++++++----- distributed/comm/ucx.py | 13 ++++++-- distributed/protocol/cudf.py | 2 +- distributed/protocol/numba.py | 2 +- 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 188085f23e7..a5fcf5353ed 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -45,13 +45,12 @@ async def handle_comm(comm): await q.put(comm) listener = listen(listen_addr, handle_comm, connection_args=listen_args, **kwargs) - listener.start() - - comm = await connect( - 
listener.contact_address, connection_args=connect_args, **kwargs - ) - serv_com = await q.get() - return comm, serv_com + with listener: + comm = await connect( + listener.contact_address, connection_args=connect_args, **kwargs + ) + serv_com = await q.get() + return comm, serv_com @pytest.mark.asyncio @@ -194,3 +193,40 @@ async def test_ping_pong_cupy(shape): cupy.testing.assert_array_equal(arr, data2) await com.close() await serv_com.close() + + +@pytest.mark.asyncio +async def test_ping_pong_numba(): + numba = pytest.importorskip("numba") + numpy = pytest.importorskip("numpy") + + import distributed.protocol.numba # noqa + address = "{}:{}".format(HOST, next(port_counter)) + + arr = np.arange(10) + arr = numba.cuda.to_device(arr) + + com, serv_com = await get_comm_pair(address) + msg = {"op": "ping", 'data': to_serialize(arr)} + + await com.write(msg) + result = await serv_com.read() + data2 = result.pop('data') + assert result['op'] == 'ping' + + +@pytest.mark.asyncio +async def test_ping_pong_cudf(): + cudf = pytest.importorskip("cudf") + import distributed.protocol.cudf # noqa + + df = cudf.DataFrame({"A": [1, 2, None], "B": [1., 2., None]}) + address = "{}:{}".format(HOST, next(port_counter)) + + com, serv_com = await get_comm_pair(address) + msg = {"op": "ping", 'data': to_serialize(df)} + + await com.write(msg) + result = await serv_com.read() + data2 = result.pop('data') + assert result['op'] == 'ping' diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 1de7ff04c8a..822326d3eea 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -162,9 +162,11 @@ async def read(self, deserializers=None): gpu_inbound = 0 size = [] + # TODO: this multi-send for cudf broke things. for i in range(n_frames): if size: this_size = size.pop() + print("this size", this_size) # XXX: when do we get multiple keys here? Non-contiguous? 
resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) # prepare for the next (header) recv @@ -254,6 +256,7 @@ def __init__( # XXX: The init may be required to take args like # {'require_encryption': None, 'ssl_context': None} self.connection_args = connection_args + self._task = None def start(self): async def serve_forever(client_ep, listener_instance): @@ -277,15 +280,19 @@ async def serve_forever(client_ep, listener_instance): loop = asyncio.get_event_loop() # Does someone need to hold onto this task? - loop.create_task(server.coroutine) + t = loop.create_task(server.coroutine) + self._task = t def stop(self): # What all should this do? - # ucp.stop_listener(self.ep) # do this here? + if self._task: + print("Cancelling task!") + self._task.cancel() + if self.ep: ucp.destroy_ep(self.ep) # if self.listener_instance: - # ucp.stop_listener(self.listener_instance) + # ucp.stop_listener(self.listener_instance) def get_host_port(self): # TODO: TCP raises if this hasn't started yet. diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 66fee05ea68..7e14622ea04 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -30,7 +30,7 @@ def serialize_cudf_dataframe(x): 'lengths': [len(x)] * len(arrays), 'is_cuda': True, 'subheaders': sub_headers, - 'columns': x.columns, + 'columns': x.columns.tolist(), # TODO: ugh... 'null_counts': null_counts, 'null_subheaders': null_headers } diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index c81e991e60a..4d85afca08a 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -40,5 +40,5 @@ def deserialize_numba_ndarray(header, frames): # TODO: put this in ucx... 
as a kind of "fixup" frame.typestr = header['typestr'] frame.shape = header['shape'] - arr, _ = numba.cuda.devicearray.auto_device(frame) + arr, _ = numba.cuda.as_cuda_array(frame) return arr From f33ba29815f0c7516db2d06ab99d40691c9b3c7d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 04:50:48 -0800 Subject: [PATCH 07/68] BUG: Ensure proper cleanup in comm_pair tests --- distributed/comm/tests/test_ucx.py | 13 ++++++------- distributed/comm/ucx.py | 9 ++++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5379b321f19..3eff282a960 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -47,13 +47,12 @@ async def handle_comm(comm): await q.put(comm) listener = listen(listen_addr, handle_comm, connection_args=listen_args, **kwargs) - listener.start() - - comm = await connect( - listener.contact_address, connection_args=connect_args, **kwargs - ) - serv_com = await q.get() - return comm, serv_com + with listener: + comm = await connect( + listener.contact_address, connection_args=connect_args, **kwargs + ) + serv_com = await q.get() + return comm, serv_com @pytest.mark.asyncio diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 541817c9465..87d0323d99d 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -229,6 +229,7 @@ def __init__( self.deserialize = deserialize self.ep = None # type: ucp.ucp_py_ep self.listener_instance = None # type: ucp.ListenerFuture + self._task = None # XXX: The init may be required to take args like # {'require_encryption': None, 'ssl_context': None} @@ -255,12 +256,14 @@ async def serve_forever(client_ep, listener_instance): except RuntimeError: loop = asyncio.get_event_loop() - # Does someone need to hold onto this task? - loop.create_task(server.coroutine) + t = loop.create_task(server.coroutine) + self._task = t def stop(self): # What all should this do? 
- # ucp.stop_listener(self.ep) # do this here? + if self._task: + self._task.cancel() + if self.ep: ucp.destroy_ep(self.ep) # if self.listener_instance: From b61e56d2fdaf4061f1a2f7ef0dd62f670bb4acb2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 07:17:33 -0800 Subject: [PATCH 08/68] Reset reader_added before listening --- distributed/comm/ucx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 87d0323d99d..f94475d9d60 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -244,6 +244,10 @@ async def serve_forever(client_ep, listener_instance): await self.comm_handler(ucx) _ucp_init() + # Workaround for hanging test in + # pytest distributed/comm/tests/test_ucx.py::test_comm_objs -vs --count=2 + # on the second time through. + ucp._libs.ucp_py.reader_added = 0 # XXX: the port handling is probably incorrect. # need to figure out if `server_port=None` is # server_port=13337, or server_port="next free port" From 89df2bd0d05908b45676e088661e0e8e5f83b31c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 22 Feb 2019 08:39:48 -0800 Subject: [PATCH 09/68] CUDA failing --- distributed/comm/tests/test_ucx.py | 4 ++- distributed/comm/ucx.py | 55 +++++++++++++++++++----------- distributed/protocol/cuda.py | 19 ++++------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 3eff282a960..6f39d4254b3 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -175,12 +175,14 @@ def test_ucx_deserialize(): @pytest.mark.asyncio +@pytest.mark.xfail(reason="UCX") # memory is garbage... async def test_ping_pong_cupy(): cupy = pytest.importorskip('cupy') address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - arr = cupy.random.random((100, 10)) + # TODO: ucx-py doesn't handle 2d yet. 
+ arr = cupy.random.random(100,) msg = {"op": "ping", 'data': to_serialize(arr)} await com.write(msg) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index f94475d9d60..f75c76e1d8c 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -133,41 +133,56 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): msg, serializers=serializers, on_error=on_error ) # TODO: context= nframes = struct.pack("Q", len(frames)) - await self.ep.send_obj(nframes, sys.getsizeof(nframes)) # send number of frames + await self.ep.send_obj(nframes) # send number of frames for frame in frames: - if isinstance(frame, memoryview): - # TODO: UCX-PY #27 - frame = frame.tobytes() - size = sys.getsizeof(frame) - await self.ep.send_obj(frame, size) + await self.ep.send_obj(frame) return sum(map(nbytes, frames)) async def read(self, deserializers=None): resp = await self.ep.recv_future() - # XXX: this breaks things, e.g. test_ucx_specific - # dummy = b'0' * 8 - # resp = await self.ep.recv_obj(dummy, 41) obj = ucp.get_obj_from_msg(resp) n_frames, = struct.unpack("Q", obj) - # Notes: - # 1. Eventually, ucp_msg will be our abstraction over GPU vs. CPU - # memory. We won't need to worry about checking for the destination - # here. The message object will have a reference to the region of - # memory. Downstream of us (say from _frames) will deserialize - # appropriately, based on whether it's a GPU or CPU object. - # 2. We will still deserialize the header early, to check the *length* - # of the next message. This lets us use the faster `recv_obj` to - # read the next nbytes. - _header_start = b'\x83\xa7headers' + # TODO: see if we care about deserializing all headers. 
+ # We could probably do some tricks to make this less expensive, + # (if it's even expensive in the first place) + header_start = b'\x83\xa7headers' + peek_bytes = len(header_start) frames = [] msg = {} + gpu_inbound = False + size = () + for i in range(n_frames): - resp = await self.ep.recv_future() + if size: + # XXX: when do we get multiple keys here? Non-contiguous? + assert len(size) == 1 + size, = size + resp = await self.ep.recv_obj(size, cuda=gpu_inbound) + # prepare for the next (header) recv + size = () + gpu_inbound = False + else: + resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) + if type(frame) == memoryview: + if frame[:peek_bytes] == header_start: + # we have a header. Let's see if + # 1. We know the next frame's length (for fast recv) + # 2. We know the next frame's memory destination (GPU or CPU). + headers = msgpack.loads(frame, use_list=False) + keys = headers[b'keys'] + for key in keys: + header = headers[b'headers'][key] + size = header.get(b'lengths', ()) + if size: + if header.get(b'is_cuda', False): + gpu_inbound = True + break + frames.append(frame) msg = await from_frames( diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index c9bc7875c8f..cc82527914e 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -14,25 +14,20 @@ @dask_serialize.register(cupy.ndarray) def serialize_cupy_ndarray(x): # TODO: handle non-contiguous - # shape - # typestr - # descr - # data - # version - # strides (noncontiguous-only) + # TODO: handle 2d header = x.__cuda_array_interface__.copy() - header['device'] = x.device.id header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO - # TODO: I don't think ucx-py should have to worry about - # MemoryPointer. Maybe some thin wrapper. 
- return header, [x.data] + header['is_cuda'] = True + return header, [x] @dask_deserialize.register(cupy.ndarray) def deserialize_cupy_array(header, frames): # MemoryPointer { PoolMemory, offset } frame, = frames - arr = cupy.ndarray(header['shape'], dtype=header['typestr'], - memptr=frame) + # TODO: put this in ucx... as a kind of "fixup" + frame.typestr = header['typestr'] + frame._shape = header['shape'] + arr = cupy.asarray(frame) return arr From 5b5171652cd90619bcacd1de842ef654b5a29978 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 11:20:19 -0800 Subject: [PATCH 10/68] fixups --- distributed/comm/tests/test_ucx.py | 12 ++++++------ distributed/protocol/cuda.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 6f39d4254b3..311aba24e80 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -2,7 +2,6 @@ import itertools import pytest -import dask import numpy as np from distributed.comm import ucx, listen, connect @@ -12,7 +11,6 @@ from distributed.utils_test import gen_test from .test_comms import check_deserialize -import ucp_py as ucp ADDRESS = ucx.ADDRESS @@ -175,14 +173,16 @@ def test_ucx_deserialize(): @pytest.mark.asyncio -@pytest.mark.xfail(reason="UCX") # memory is garbage... -async def test_ping_pong_cupy(): +@pytest.mark.parametrize('shape', [ + (100,), + (10, 10) +]) +async def test_ping_pong_cupy(shape): cupy = pytest.importorskip('cupy') address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - # TODO: ucx-py doesn't handle 2d yet. 
- arr = cupy.random.random(100,) + arr = cupy.random.random(shape) msg = {"op": "ping", 'data': to_serialize(arr)} await com.write(msg) diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index cc82527914e..be9fdd9ae15 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -15,19 +15,31 @@ def serialize_cupy_ndarray(x): # TODO: handle non-contiguous # TODO: handle 2d + # TODO: 0d + + if x.flags.c_contiguous or x.flags.f_contiguous: + strides = x.strides + data = x.ravel() # order='K' + else: + x = cupy.ascontiguousarray(x) + strides = x.strides + data = x.ravel() + + dtype = (0, x.dtype.str) + header = x.__cuda_array_interface__.copy() header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO header['is_cuda'] = True - return header, [x] + header['dtype'] = dtype + return header, [data] @dask_deserialize.register(cupy.ndarray) def deserialize_cupy_array(header, frames): - # MemoryPointer { PoolMemory, offset } frame, = frames # TODO: put this in ucx... 
as a kind of "fixup" frame.typestr = header['typestr'] - frame._shape = header['shape'] + frame.shape = header['shape'] arr = cupy.asarray(frame) return arr From a71c896961d698e902cc77de966bca0a06a024c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 13:19:39 -0800 Subject: [PATCH 11/68] wip --- distributed/comm/ucx.py | 26 +++++++++++-------- distributed/protocol/__init__.py | 12 +++++++++ distributed/protocol/cudf.py | 38 +++++++++++++++++++++++++++ distributed/protocol/numba.py | 44 ++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 10 deletions(-) create mode 100644 distributed/protocol/cudf.py create mode 100644 distributed/protocol/numba.py diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index f75c76e1d8c..1375ced9521 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -153,18 +153,23 @@ async def read(self, deserializers=None): frames = [] msg = {} - gpu_inbound = False - size = () + # For cudf, we would ideally do + # header = recv_future() + # columns [recv_obj(size, cuda=True) for size in header['sizes']] + # So maybe make gpu_inbound an int that has the number of remaining + # gpu_inbound recvs? + + gpu_inbound = 0 + size = [] for i in range(n_frames): if size: + this_size = size.pop() # XXX: when do we get multiple keys here? Non-contiguous? 
- assert len(size) == 1 - size, = size - resp = await self.ep.recv_obj(size, cuda=gpu_inbound) + resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) # prepare for the next (header) recv - size = () - gpu_inbound = False + if gpu_inbound: + gpu_inbound -= 1 else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) @@ -177,10 +182,11 @@ async def read(self, deserializers=None): keys = headers[b'keys'] for key in keys: header = headers[b'headers'][key] - size = header.get(b'lengths', ()) + size = list(header.get(b'lengths', [])) if size: - if header.get(b'is_cuda', False): - gpu_inbound = True + size = size[::-1] + if header.get(b'is_cuda', 0): + gpu_inbound = int(header[b'is_cuda']) break frames.append(frame) diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index a56871b5ad5..67dc66b7d0d 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -69,3 +69,15 @@ def _register_torch(): @dask_deserialize.register_lazy("cupy") def _register_cupy(): from . import cuda + + +@dask_serialize.register_lazy("numba") +@dask_deserialize.register_lazy("numba") +def _register_cupy(): + from . import numba + + +@dask_serialize.register_lazy("cudf") +@dask_deserialize.register_lazy("cudf") +def _register_cupy(): + from . import cudf diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py new file mode 100644 index 00000000000..05f3215fc0f --- /dev/null +++ b/distributed/protocol/cudf.py @@ -0,0 +1,38 @@ +import cudf +from .serialize import dask_serialize, dask_deserialize +from .numba import serialize_numba_ndarray, deserialize_numba_ndarray + + +@dask_serialize.register(cudf.DataFrame) +def serialize_cudf_dataframe(x): + # TODO: does cudf support duplicate columns? 
+ print('hey!') + sub_headers = [] + arrays = [] + + for label, col in x.iteritems(): + header, (frame,) = serialize_numba_ndarray(col.to_gpu_array()) + sub_headers.append(header) + arrays.append(frame) + + header = { + 'lengths': [len(x)] * x.shape[1], + 'is_cuda': True, + 'subheaders': sub_headers, + 'columns': x.columns, # TODO + } + + return header, arrays + + +@dask_deserialize.register(cudf.DataFrame) +def serialize_cudf_dataframe(header, frames): + assert len(frames) == len(header['columns']) + arrays = [] + + for subheader, frame in zip(header['subheaders'], frames): + array = deserialize_numba_ndarray(subheader, [frame]) + arrays.append(array) + + objs = list(zip(header['columns'], arrays)) + return cudf.DataFrame(objs) diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py new file mode 100644 index 00000000000..798619edbdf --- /dev/null +++ b/distributed/protocol/numba.py @@ -0,0 +1,44 @@ +import numba.cuda +from .serialize import dask_serialize, dask_deserialize + + +@dask_serialize.register(numba.cuda.devicearray.DeviceNDArray) +def serialize_numba_ndarray(x): + # TODO: handle non-contiguous + # TODO: handle 2d + # TODO: 0d + + if x.flags['C_CONTIGUOUS'] or x.flags['F_CONTIGUOUS']: + strides = x.strides + if x.ndim > 1: + data = x.ravel() # order='K' + else: + data = x + else: + raise ValueError("Array must be contiguous") + x = numba.ascontiguousarray(x) + strides = x.strides + if x.ndim > 1: + data = x.ravel() + else: + data = x + + dtype = (0, x.dtype.str) + nbytes = data.dtype.itemsize * data.size + + header = x.__cuda_array_interface__.copy() + header['lengths'] = (nbytes,) # one per stride + header['compression'] = (None,) # TODO + header['is_cuda'] = True + header['dtype'] = dtype + return header, [data] + + +@dask_deserialize.register(numba.cuda.devicearray.DeviceNDArray) +def deserialize_numba_ndarray(header, frames): + frame, = frames + # TODO: put this in ucx... 
as a kind of "fixup" + frame.typestr = header['typestr'] + frame.shape = header['shape'] + arr, _ = numba.cuda.devicearray.auto_device(frame) + return arr From fb862bf165869624ea4412a6d769ab0aec4e78ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Feb 2019 14:08:18 -0800 Subject: [PATCH 12/68] zero copy --- distributed/protocol/cudf.py | 52 +++++++++++++++++++++++++++-------- distributed/protocol/numba.py | 2 +- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 05f3215fc0f..66fee05ea68 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -6,20 +6,33 @@ @dask_serialize.register(cudf.DataFrame) def serialize_cudf_dataframe(x): # TODO: does cudf support duplicate columns? - print('hey!') sub_headers = [] - arrays = [] + arrays = [] + null_masks = [] + null_headers = [] + null_counts = {} for label, col in x.iteritems(): - header, (frame,) = serialize_numba_ndarray(col.to_gpu_array()) + header, [frame] = serialize_numba_ndarray(col.data.mem) + header['name'] = label sub_headers.append(header) arrays.append(frame) + if col.null_count: + header, [frame] = serialize_numba_ndarray(col.nullmask.mem) + header['name'] = label + null_headers.append(header) + null_masks.append(frame) + null_counts[label] = col.null_count + + arrays.extend(null_masks) header = { - 'lengths': [len(x)] * x.shape[1], + 'lengths': [len(x)] * len(arrays), 'is_cuda': True, 'subheaders': sub_headers, - 'columns': x.columns, # TODO + 'columns': x.columns, + 'null_counts': null_counts, + 'null_subheaders': null_headers } return header, arrays @@ -27,12 +40,29 @@ def serialize_cudf_dataframe(x): @dask_deserialize.register(cudf.DataFrame) def serialize_cudf_dataframe(header, frames): - assert len(frames) == len(header['columns']) - arrays = [] + # TODO: duplicate columns + + columns = header['columns'] + n_columns = len(header['columns']) + n_masks = len(header['null_subheaders']) - for 
subheader, frame in zip(header['subheaders'], frames): + masks = {} + pairs = [] + + for i in range(n_masks): + subheader = header['null_subheaders'][i] + frame = frames[n_columns + i] + mask = deserialize_numba_ndarray(subheader, [frame]) + masks[subheader['name']] = mask + + for subheader, frame in zip(header['subheaders'], frames[:n_columns]): + name = subheader['name'] array = deserialize_numba_ndarray(subheader, [frame]) - arrays.append(array) - objs = list(zip(header['columns'], arrays)) - return cudf.DataFrame(objs) + if name in masks: + series = cudf.Series.from_masked_array(array, masks[name]) + else: + series = cudf.Series(array) + pairs.append((name, series)) + + return cudf.DataFrame(pairs) diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 798619edbdf..c81e991e60a 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -40,5 +40,5 @@ def deserialize_numba_ndarray(header, frames): # TODO: put this in ucx... as a kind of "fixup" frame.typestr = header['typestr'] frame.shape = header['shape'] - arr, _ = numba.cuda.devicearray.auto_device(frame) + arr, _ = numba.cuda.devicearray.auto_device(frame) return arr From 594028cac2c05e8ec3251f263a68fb3f0c381b76 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 12:12:59 -0800 Subject: [PATCH 13/68] fix registration --- distributed/protocol/__init__.py | 4 ++-- distributed/protocol/numba.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index 67dc66b7d0d..a9e4989bf50 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -73,11 +73,11 @@ def _register_cupy(): @dask_serialize.register_lazy("numba") @dask_deserialize.register_lazy("numba") -def _register_cupy(): +def _register_numba(): from . 
import numba @dask_serialize.register_lazy("cudf") @dask_deserialize.register_lazy("cudf") -def _register_cupy(): +def _register_cudf(): from . import cudf diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 4d85afca08a..2345fcb6a14 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -40,5 +40,5 @@ def deserialize_numba_ndarray(header, frames): # TODO: put this in ucx... as a kind of "fixup" frame.typestr = header['typestr'] frame.shape = header['shape'] - arr, _ = numba.cuda.as_cuda_array(frame) + arr = numba.cuda.as_cuda_array(frame) return arr From 120bc2fbe814f4aca678c807aee8e44286e5515c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 12:31:12 -0800 Subject: [PATCH 14/68] all passing --- distributed/comm/ucx.py | 51 ++++++++++++++++++++++------------- distributed/protocol/cuda.py | 2 +- distributed/protocol/cudf.py | 2 +- distributed/protocol/numba.py | 2 +- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 81bbd20b86d..3684c48a654 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -160,12 +160,12 @@ async def read(self, deserializers=None): # gpu_inbound recvs? gpu_inbound = 0 - size = [] + sizes = [] # TODO: this multi-send for cudf broke things. for i in range(n_frames): - if size: - this_size = size.pop() + if sizes: + this_size = sizes.pop() print("this size", this_size) # XXX: when do we get multiple keys here? Non-contiguous? resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) @@ -175,21 +175,11 @@ async def read(self, deserializers=None): else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) - if type(frame) == memoryview: - if frame[:peek_bytes] == header_start: - # we have a header. Let's see if - # 1. We know the next frame's length (for fast recv) - # 2. We know the next frame's memory destination (GPU or CPU). 
- headers = msgpack.loads(frame, use_list=False) - keys = headers[b'keys'] - for key in keys: - header = headers[b'headers'][key] - size = list(header.get(b'lengths', [])) - if size: - size = size[::-1] - if header.get(b'is_cuda', 0): - gpu_inbound = int(header[b'is_cuda']) - break + if should_peek(frame): + # we have a header. Let's see if + # 1. We know the next frame's length (for fast recv) + # 2. We know the next frame's memory destination (GPU or CPU). + sizes, gpu_inbound = peek(frame) frames.append(frame) @@ -344,4 +334,29 @@ def get_local_address_for(self, loc): return unparse_host_port(local_host, None) +def should_peek(frame): + header_start = b'\x83\xa7headers' + peek_bytes = len(header_start) + + return type(frame) == memoryview and frame[:peek_bytes] == header_start + + +def peek(frame): + headers = msgpack.loads(frame, use_list=False) + keys = headers[b'keys'] + sizes = [] + gpu_inbound = 0 + + for key in keys: + header = headers[b'headers'][key] + sizes = list(header.get(b'lengths', [])) + if sizes: + sizes = sizes[::-1] + if header.get(b'is_cuda', 0): + gpu_inbound = int(header[b'is_cuda']) + break + + return sizes, gpu_inbound + + backends["ucx"] = UCXBackend() diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index be9fdd9ae15..5218f5fe2fb 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -30,7 +30,7 @@ def serialize_cupy_ndarray(x): header = x.__cuda_array_interface__.copy() header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO - header['is_cuda'] = True + header['is_cuda'] = 1 header['dtype'] = dtype return header, [data] diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 7e14622ea04..403a5cd44f1 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -28,7 +28,7 @@ def serialize_cudf_dataframe(x): header = { 'lengths': [len(x)] * len(arrays), - 'is_cuda': True, + 'is_cuda': len(arrays), 'subheaders': 
sub_headers, 'columns': x.columns.tolist(), # TODO: ugh... 'null_counts': null_counts, diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 2345fcb6a14..24df5b0adb8 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -29,7 +29,7 @@ def serialize_numba_ndarray(x): header = x.__cuda_array_interface__.copy() header['lengths'] = (nbytes,) # one per stride header['compression'] = (None,) # TODO - header['is_cuda'] = True + header['is_cuda'] = 1 header['dtype'] = dtype return header, [data] From 2def1373af437f548b97524397379a4128cf4b1d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 13:40:04 -0800 Subject: [PATCH 15/68] cleanup --- distributed/comm/ucx.py | 42 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 3684c48a654..aabae96a573 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -144,29 +144,17 @@ async def read(self, deserializers=None): obj = ucp.get_obj_from_msg(resp) n_frames, = struct.unpack("Q", obj) - # TODO: see if we care about deserializing all headers. - # We could probably do some tricks to make this less expensive, - # (if it's even expensive in the first place) - header_start = b'\x83\xa7headers' - peek_bytes = len(header_start) - frames = [] - msg = {} - - # For cudf, we would ideally do - # header = recv_future() - # columns [recv_obj(size, cuda=True) for size in header['sizes']] - # So maybe make gpu_inbound an int that has the number of remaining - # gpu_inbound recvs? - + # gpu_inbound and sizes are for seeing if we can + # 1. Take a fastpath to recv a known-length object + # 2. Take a fast-fastpath to recv into GPU memory. + # see peek for more. gpu_inbound = 0 sizes = [] - # TODO: this multi-send for cudf broke things. 
for i in range(n_frames): if sizes: this_size = sizes.pop() - print("this size", this_size) # XXX: when do we get multiple keys here? Non-contiguous? resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) # prepare for the next (header) recv @@ -342,6 +330,28 @@ def should_peek(frame): def peek(frame): + """ + Inspect a header for whether we can take a faster recv-path. + + Parameters + ---------- + frame : memoryview + The header frame to inspect. + + Returns + ------- + sizes : list + List of the next sided :meth:`recv_obj` receives to perform. + The recevies should be performed last to first, so use + :func:`list.pop` to get the next receive. + gpu_inbound : int + The number of GPU recvies to do. Decrement this to get + back to regular recvs. + + See Also + -------- + should_peek : check whether it's appropriate to peek. + """ headers = msgpack.loads(frame, use_list=False) keys = headers[b'keys'] sizes = [] From fcb800a86116e471b852c4ff70825b55aead3200 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 13:53:15 -0800 Subject: [PATCH 16/68] move override to the test --- distributed/comm/tests/test_ucx.py | 6 ++++++ distributed/comm/ucx.py | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index a5fcf5353ed..044ff1aad84 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -3,6 +3,7 @@ import pytest import numpy as np +import ucp_py as ucp from distributed.comm import ucx, listen, connect from distributed.comm.registry import backends, get_backend @@ -44,6 +45,11 @@ async def get_comm_pair(listen_addr, listen_args=None, connect_args=None, **kwar async def handle_comm(comm): await q.put(comm) + # Workaround for hanging test in + # pytest distributed/comm/tests/test_ucx.py::test_comm_objs -vs --count=2 + # on the second time through. 
+ ucp._libs.ucp_py.reader_added = 0 + listener = listen(listen_addr, handle_comm, connection_args=listen_args, **kwargs) with listener: comm = await connect( diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index aabae96a573..b98d5f16927 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -246,10 +246,6 @@ async def serve_forever(client_ep, listener_instance): await self.comm_handler(ucx) _ucp_init() - # Workaround for hanging test in - # pytest distributed/comm/tests/test_ucx.py::test_comm_objs -vs --count=2 - # on the second time through. - ucp._libs.ucp_py.reader_added = 0 # XXX: the port handling is probably incorrect. # need to figure out if `server_port=None` is # server_port=13337, or server_port="next free port" From 8745a19e420c505729b5c8764d8682eb5de4741f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 14:09:27 -0800 Subject: [PATCH 17/68] rename --- distributed/comm/tests/test_ucx.py | 8 +++--- distributed/comm/ucx.py | 4 +-- distributed/protocol/__init__.py | 2 +- distributed/protocol/cuda.py | 8 +----- distributed/protocol/cupy.py | 39 ++++++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 14 deletions(-) create mode 100644 distributed/protocol/cupy.py diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 044ff1aad84..5ff4f8e9969 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -2,7 +2,6 @@ import itertools import pytest -import numpy as np import ucp_py as ucp from distributed.comm import ucx, listen, connect @@ -153,6 +152,8 @@ async def client_communicate(key, delay=0): @pytest.mark.asyncio async def test_ping_pong_data(): + np = pytest.importorskip('numpy') + data = np.ones((10, 10)) # TODO: broken for large arrays address = "{}:{}".format(HOST, next(port_counter)) @@ -203,10 +204,10 @@ async def test_ping_pong_cupy(shape): @pytest.mark.asyncio async def test_ping_pong_numba(): + np = 
pytest.importorskip('numpy') numba = pytest.importorskip("numba") - numpy = pytest.importorskip("numpy") + import numba.cuda - import distributed.protocol.numba # noqa address = "{}:{}".format(HOST, next(port_counter)) arr = np.arange(10) @@ -224,7 +225,6 @@ async def test_ping_pong_numba(): @pytest.mark.asyncio async def test_ping_pong_cudf(): cudf = pytest.importorskip("cudf") - import distributed.protocol.cudf # noqa df = cudf.DataFrame({"A": [1, 2, None], "B": [1., 2., None]}) address = "{}:{}".format(HOST, next(port_counter)) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index b98d5f16927..c6f06f49bc2 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -133,7 +133,7 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): msg, serializers=serializers, on_error=on_error ) # TODO: context= nframes = struct.pack("Q", len(frames)) - await self.ep.send_obj(nframes) # send number of frames + await self.ep.send_obj(nframes) for frame in frames: await self.ep.send_obj(frame) @@ -269,7 +269,7 @@ def stop(self): if self.ep: ucp.destroy_ep(self.ep) # if self.listener_instance: - # ucp.stop_listener(self.listener_instance) + # ucp.stop_listener(self.listener_instance) def get_host_port(self): # TODO: TCP raises if this hasn't started yet. diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index a9e4989bf50..06cec6addb6 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -68,7 +68,7 @@ def _register_torch(): @dask_serialize.register_lazy("cupy") @dask_deserialize.register_lazy("cupy") def _register_cupy(): - from . import cuda + from . 
import cupy @dask_serialize.register_lazy("numba") diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index 5218f5fe2fb..cb6efe01b3f 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -4,17 +4,11 @@ import cupy from .serialize import dask_serialize, dask_deserialize -# Some questions -# 1.Do we need *protocol-dependent* serialization? -# I assume we want this kind of serialization only when -# in UCP. -# 2. What does ucp-py need to know about? - @dask_serialize.register(cupy.ndarray) def serialize_cupy_ndarray(x): # TODO: handle non-contiguous - # TODO: handle 2d + # TODO: Handle order='K' ravel # TODO: 0d if x.flags.c_contiguous or x.flags.f_contiguous: diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py new file mode 100644 index 00000000000..cb6efe01b3f --- /dev/null +++ b/distributed/protocol/cupy.py @@ -0,0 +1,39 @@ +""" +Efficient serialization GPU arrays. +""" +import cupy +from .serialize import dask_serialize, dask_deserialize + + +@dask_serialize.register(cupy.ndarray) +def serialize_cupy_ndarray(x): + # TODO: handle non-contiguous + # TODO: Handle order='K' ravel + # TODO: 0d + + if x.flags.c_contiguous or x.flags.f_contiguous: + strides = x.strides + data = x.ravel() # order='K' + else: + x = cupy.ascontiguousarray(x) + strides = x.strides + data = x.ravel() + + dtype = (0, x.dtype.str) + + header = x.__cuda_array_interface__.copy() + header['lengths'] = (x.nbytes,) # one per stride + header['compression'] = (None,) # TODO + header['is_cuda'] = 1 + header['dtype'] = dtype + return header, [data] + + +@dask_deserialize.register(cupy.ndarray) +def deserialize_cupy_array(header, frames): + frame, = frames + # TODO: put this in ucx... 
as a kind of "fixup" + frame.typestr = header['typestr'] + frame.shape = header['shape'] + arr = cupy.asarray(frame) + return arr From 1339a3e6494ffc00548c6fbc4163e4cb44856327 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Feb 2019 14:10:07 -0800 Subject: [PATCH 18/68] remove old tests --- distributed/protocol/tests/test_cuda.py | 33 ------------------------- 1 file changed, 33 deletions(-) delete mode 100644 distributed/protocol/tests/test_cuda.py diff --git a/distributed/protocol/tests/test_cuda.py b/distributed/protocol/tests/test_cuda.py deleted file mode 100644 index e8e95224b0d..00000000000 --- a/distributed/protocol/tests/test_cuda.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - - -cupy = pytest.importorskip("cupy") -from distributed.protocol import serialize, deserialize - - -def test_serialize(): - x = cupy.ones((5000, 50)) - header, frames = serialize(x) - type_ = 'cupy.core.core.ndarray' - _, [type_serialized] = serialize(type_) - - expected_header = { - 'shape': (5000, 50), - 'typestr': " Date: Tue, 26 Feb 2019 14:12:30 -0800 Subject: [PATCH 19/68] todos --- distributed/protocol/cudf.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 403a5cd44f1..85171f07f6b 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -3,9 +3,16 @@ from .numba import serialize_numba_ndarray, deserialize_numba_ndarray +# TODO: +# 1. Just use positions +# a. Fixes duplicate columns +# b. Fixes non-msgpack-serializable names +# 2. cudf.Series +# 3. Serialize the index + + @dask_serialize.register(cudf.DataFrame) def serialize_cudf_dataframe(x): - # TODO: does cudf support duplicate columns? sub_headers = [] arrays = [] null_masks = [] @@ -30,7 +37,9 @@ def serialize_cudf_dataframe(x): 'lengths': [len(x)] * len(arrays), 'is_cuda': len(arrays), 'subheaders': sub_headers, - 'columns': x.columns.tolist(), # TODO: ugh... 
+ # TODO: the header must be msgpack (de)serializable. + # See if we can avoid names, and just use integer positions. + 'columns': x.columns.tolist(), 'null_counts': null_counts, 'null_subheaders': null_headers } @@ -40,8 +49,6 @@ def serialize_cudf_dataframe(x): @dask_deserialize.register(cudf.DataFrame) def serialize_cudf_dataframe(header, frames): - # TODO: duplicate columns - columns = header['columns'] n_columns = len(header['columns']) n_masks = len(header['null_subheaders']) From 8bc2dbb9ae495bc231d3d572ecdd89f7bb2c67ea Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 27 Feb 2019 09:08:04 -0800 Subject: [PATCH 20/68] Send headers --- distributed/comm/ucx.py | 124 ++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 68 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index c6f06f49bc2..fa1ffeb8129 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -100,6 +100,27 @@ class UCX(Comm): The address, prefixed with `ucx://` to use. deserialize : bool, default True Whether to deserialize data in :meth:`distributed.protocol.loads` + + Notes + ----- + The read-write cycle uses the following pattern: + + Each msg is serialized into a number of "data" frames. We prepend these + real frames with two additional frames + + 1. is_gpu: Boolean indicator for whether the frame should be + received into GPU memory. Packed in '?' format. Unpack with + ``?`` format. + 2. frame_size : Unisigned int describing the size of frame (in bytes) + to receive. Packed in 'Q' format, so a length-0 frame is equivalent + to an unsized frame. Unpacked with ``Q``. + + The expected read cycle is + + 1. Read the frame describing number of frames + 2. Read the frame describing whether each data frame is gpu-bound + 3. Read the frame describing whether each data frame is sized + 4. Read all the data frames. 
""" def __init__(self, ep: ucp.ucp_py_ep, @@ -132,7 +153,29 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): frames = await to_frames( msg, serializers=serializers, on_error=on_error ) # TODO: context= + gpu_frames = b''.join([struct.pack("?", hasattr(frame, '__cuda_array_interface__')) + for frame in frames]) + + def sizeof(x): + # I don't think we want to use nbytes, since that falls back + # to sys.getsizeof. + attrs = set(dir(x)) + if 'nbytes' in attrs: + nbytes = x.nbytes + elif {'size', 'dtype'} & attrs: + # numba + nbytes = x.dtype.itemsize * x.size + elif isinstance(x, bytes): + nbytes = len(x) + else: + nbytes = 0 + return struct.pack('Q', nbytes) + + n_data_frames = len(frames) + sized_frames = b''.join(sizeof(x) for x in frames) + frames = [gpu_frames] + [sized_frames] + frames nframes = struct.pack("Q", len(frames)) + await self.ep.send_obj(nframes) for frame in frames: @@ -143,32 +186,24 @@ async def read(self, deserializers=None): resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) n_frames, = struct.unpack("Q", obj) + n_data_frames = n_frames - 2 + + gpu_frame_msg = await self.ep.recv_future() + gpu_frame_msg = gpu_frame_msg.get_obj() + is_gpu = struct.unpack("{}?".format(n_data_frames), gpu_frame_msg) + + sized_frame_msg = await self.ep.recv_future() + sized_frame_msg = sized_frame_msg.get_obj() + sizes = struct.unpack("{}Q".format(n_data_frames), sized_frame_msg) frames = [] - # gpu_inbound and sizes are for seeing if we can - # 1. Take a fastpath to recv a known-length object - # 2. Take a fast-fastpath to recv into GPU memory. - # see peek for more. - gpu_inbound = 0 - sizes = [] - - for i in range(n_frames): - if sizes: - this_size = sizes.pop() - # XXX: when do we get multiple keys here? Non-contiguous? 
- resp = await self.ep.recv_obj(this_size, cuda=bool(gpu_inbound)) - # prepare for the next (header) recv - if gpu_inbound: - gpu_inbound -= 1 + + for i, (is_gpu, size) in enumerate(zip(is_gpu, sizes)): + if size > 0: + resp = await self.ep.recv_obj(size, cuda=is_gpu) else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) - if should_peek(frame): - # we have a header. Let's see if - # 1. We know the next frame's length (for fast recv) - # 2. We know the next frame's memory destination (GPU or CPU). - sizes, gpu_inbound = peek(frame) - frames.append(frame) msg = await from_frames( @@ -318,51 +353,4 @@ def get_local_address_for(self, loc): return unparse_host_port(local_host, None) -def should_peek(frame): - header_start = b'\x83\xa7headers' - peek_bytes = len(header_start) - - return type(frame) == memoryview and frame[:peek_bytes] == header_start - - -def peek(frame): - """ - Inspect a header for whether we can take a faster recv-path. - - Parameters - ---------- - frame : memoryview - The header frame to inspect. - - Returns - ------- - sizes : list - List of the next sided :meth:`recv_obj` receives to perform. - The recevies should be performed last to first, so use - :func:`list.pop` to get the next receive. - gpu_inbound : int - The number of GPU recvies to do. Decrement this to get - back to regular recvs. - - See Also - -------- - should_peek : check whether it's appropriate to peek. 
- """ - headers = msgpack.loads(frame, use_list=False) - keys = headers[b'keys'] - sizes = [] - gpu_inbound = 0 - - for key in keys: - header = headers[b'headers'][key] - sizes = list(header.get(b'lengths', [])) - if sizes: - sizes = sizes[::-1] - if header.get(b'is_cuda', 0): - gpu_inbound = int(header[b'is_cuda']) - break - - return sizes, gpu_inbound - - backends["ucx"] = UCXBackend() From 3e998ce63171c997a8ce8b27c60b7ad95a05c951 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 27 Feb 2019 09:39:24 -0800 Subject: [PATCH 21/68] use nbytes --- distributed/comm/ucx.py | 20 ++------------------ distributed/utils.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index fa1ffeb8129..0fee16af2dd 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -155,25 +155,9 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): ) # TODO: context= gpu_frames = b''.join([struct.pack("?", hasattr(frame, '__cuda_array_interface__')) for frame in frames]) + size_frames = b''.join([struct.pack("Q", nbytes(frame)) for frame in frames]) - def sizeof(x): - # I don't think we want to use nbytes, since that falls back - # to sys.getsizeof. 
- attrs = set(dir(x)) - if 'nbytes' in attrs: - nbytes = x.nbytes - elif {'size', 'dtype'} & attrs: - # numba - nbytes = x.dtype.itemsize * x.size - elif isinstance(x, bytes): - nbytes = len(x) - else: - nbytes = 0 - return struct.pack('Q', nbytes) - - n_data_frames = len(frames) - sized_frames = b''.join(sizeof(x) for x in frames) - frames = [gpu_frames] + [sized_frames] + frames + frames = [gpu_frames] + [size_frames] + frames nframes = struct.pack("Q", len(frames)) await self.ep.send_obj(nframes) diff --git a/distributed/utils.py b/distributed/utils.py index 506af847f10..d9724a4589b 100644 --- a/distributed/utils.py +++ b/distributed/utils.py @@ -1277,12 +1277,18 @@ def nbytes(frame, _bytes_like=(bytes, bytearray)): try: return frame.nbytes except AttributeError: - # XXX: nbytes fails for MemoryPointer. - # Probably time to move away try: - return len(frame) - except TypeError: - return sys.getsizeof(frame) + # TODO: https://github.com/numba/numba/issues/3810 + # numba DeviceNDArary doesn't implement .nbytes + # remove once that's fixed. + return frame.dtype.itemsize * frame.size + except AttributeError: + # XXX: nbytes fails for MemoryPointer. 
+ # Probably time to move away + try: + return len(frame) + except TypeError: + return sys.getsizeof(frame) def PeriodicCallback(callback, callback_time, io_loop=None): From d4b3501ceaed0ea3cd150d323e429676dcd13acc Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 1 Apr 2019 20:23:03 -0700 Subject: [PATCH 22/68] let internal protocol machiner set lengths --- distributed/protocol/cudf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 85171f07f6b..77432bee035 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -34,7 +34,6 @@ def serialize_cudf_dataframe(x): arrays.extend(null_masks) header = { - 'lengths': [len(x)] * len(arrays), 'is_cuda': len(arrays), 'subheaders': sub_headers, # TODO: the header must be msgpack (de)serializable. From b89717e5a180829c77c0a5314a0d3c768807d4cf Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 1 Apr 2019 20:23:16 -0700 Subject: [PATCH 23/68] clean up --- distributed/comm/tests/test_comms.py | 3 +-- distributed/comm/tests/test_ucx.py | 35 +++++++++++++++------------- distributed/comm/ucx.py | 4 ---- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index cdf40d3f2e7..5916731fc36 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -604,12 +604,11 @@ def test_inproc_client_server(): yield check_client_server('inproc://', inproc_check()) yield check_client_server(inproc.new_address(), inproc_check()) - - # # TLS certificate handling # + @gen_test() def test_tls_reject_certificate(): cli_ctx = get_client_ssl_context() diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5ff4f8e9969..3afb229f908 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -179,6 +179,25 @@ def test_ucx_deserialize(): yield 
check_deserialize("tcp://") +@pytest.mark.asyncio +async def test_ping_pong_cudf(): + # if this test appears after cupy an import error arises + # *** ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `CXXABI_1.3.11' + # not found (required by python3.7/site-packages/pyarrow/../../../libarrow.so.12) + cudf = pytest.importorskip('cudf') + + df = cudf.DataFrame({"A": [1, 2, None], "B": [1., 2., None]}) + address = "{}:{}".format(HOST, next(port_counter)) + + com, serv_com = await get_comm_pair(address) + msg = {"op": "ping", 'data': to_serialize(df)} + + await com.write(msg) + result = await serv_com.read() + data2 = result.pop('data') + assert result['op'] == 'ping' + + @pytest.mark.asyncio @pytest.mark.parametrize('shape', [ (100,), @@ -220,19 +239,3 @@ async def test_ping_pong_numba(): result = await serv_com.read() data2 = result.pop('data') assert result['op'] == 'ping' - - -@pytest.mark.asyncio -async def test_ping_pong_cudf(): - cudf = pytest.importorskip("cudf") - - df = cudf.DataFrame({"A": [1, 2, None], "B": [1., 2., None]}) - address = "{}:{}".format(HOST, next(port_counter)) - - com, serv_com = await get_comm_pair(address) - msg = {"op": "ping", 'data': to_serialize(df)} - - await com.write(msg) - result = await serv_com.read() - data2 = result.pop('data') - assert result['op'] == 'ping' diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 0fee16af2dd..4514110fa3b 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -8,11 +8,7 @@ import asyncio import itertools import logging -import sys import struct -import msgpack - -from dask import config from .addressing import parse_host_port, unparse_host_port from .core import Comm, Connector, Listener From 3906d5679e5342259cf487deb764e403d86190fb Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 2 Apr 2019 13:07:08 -0700 Subject: [PATCH 24/68] numba and ucx updates from @tom --- distributed/comm/ucx.py | 16 +++++++++++----- distributed/protocol/numba.py | 18 
+++++++++++++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 4514110fa3b..dd2b72be05a 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -104,10 +104,10 @@ class UCX(Comm): Each msg is serialized into a number of "data" frames. We prepend these real frames with two additional frames - 1. is_gpu: Boolean indicator for whether the frame should be + 1. is_gpus: Boolean indicator for whether the frame should be received into GPU memory. Packed in '?' format. Unpack with ``?`` format. - 2. frame_size : Unisigned int describing the size of frame (in bytes) + 2. frame_size : Unsigned int describing the size of frame (in bytes) to receive. Packed in 'Q' format, so a length-0 frame is equivalent to an unsized frame. Unpacked with ``Q``. @@ -170,7 +170,7 @@ async def read(self, deserializers=None): gpu_frame_msg = await self.ep.recv_future() gpu_frame_msg = gpu_frame_msg.get_obj() - is_gpu = struct.unpack("{}?".format(n_data_frames), gpu_frame_msg) + is_gpus = struct.unpack("{}?".format(n_data_frames), gpu_frame_msg) sized_frame_msg = await self.ep.recv_future() sized_frame_msg = sized_frame_msg.get_obj() @@ -178,9 +178,9 @@ async def read(self, deserializers=None): frames = [] - for i, (is_gpu, size) in enumerate(zip(is_gpu, sizes)): + for i, (is_gpus, size) in enumerate(zip(is_gpus, sizes)): if size > 0: - resp = await self.ep.recv_obj(size, cuda=is_gpu) + resp = await self.ep.recv_obj(size, cuda=is_gpus) else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) @@ -300,6 +300,12 @@ def contact_address(self): host = ensure_concrete_host(host) # TODO: ensure_concrete_host return self.prefix + unparse_host_port(host, port) + @property + def bound_address(self): + # TODO: Does this become part of the base API? Kinda hazy, since + # we exclude in for inproc. 
+ return self.get_host_port() + class UCXBackend(Backend): # I / O diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 24df5b0adb8..b5c7ca5ec86 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -38,7 +38,23 @@ def serialize_numba_ndarray(x): def deserialize_numba_ndarray(header, frames): frame, = frames # TODO: put this in ucx... as a kind of "fixup" + if isinstance(frame, bytes): + import numpy as np + arr2 = np.frombuffer(frame, header['typestr']) + return numba.cuda.to_device(arr2) + frame.typestr = header['typestr'] frame.shape = header['shape'] + + # numba & cupy don't properly roundtrip length-zero arrays. + if frame.shape[0] == 0: + arr = numba.cuda.device_array( + header['shape'], + header['typestr'] + # strides? + # order? + ) + return arr + arr = numba.cuda.as_cuda_array(frame) - return arr + return arr \ No newline at end of file From 4eb58608e9dc6a04acbb9924d9dc154f78222cae Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 2 Apr 2019 13:31:07 -0700 Subject: [PATCH 25/68] remove length settings in cupy/numba protocols --- distributed/protocol/cupy.py | 3 ++- distributed/protocol/numba.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py index cb6efe01b3f..6f01f3375d4 100644 --- a/distributed/protocol/cupy.py +++ b/distributed/protocol/cupy.py @@ -21,8 +21,9 @@ def serialize_cupy_ndarray(x): dtype = (0, x.dtype.str) + # used in the ucx comms for gpu/cpu message passing + # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header['lengths'] = (x.nbytes,) # one per stride header['compression'] = (None,) # TODO header['is_cuda'] = 1 header['dtype'] = dtype diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index b5c7ca5ec86..73c18784eb1 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -26,8 +26,9 @@ def serialize_numba_ndarray(x): dtype = 
(0, x.dtype.str) nbytes = data.dtype.itemsize * data.size + # used in the ucx comms for gpu/cpu message passing + # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header['lengths'] = (nbytes,) # one per stride header['compression'] = (None,) # TODO header['is_cuda'] = 1 header['dtype'] = dtype From 6dcd1928697fd1c4aba0cb42710c751e819da336 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 16 Apr 2019 11:28:48 -0700 Subject: [PATCH 26/68] partially working localcluster -- need to cleanup delete --- distributed/comm/tests/test_ucx.py | 38 ++++++++++++++++++++++++++---- distributed/comm/ucx.py | 15 ++++++++++-- distributed/deploy/local.py | 2 +- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 3afb229f908..3f2ed73d0be 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -4,21 +4,26 @@ import pytest import ucp_py as ucp +from distributed import Client from distributed.comm import ucx, listen, connect from distributed.comm.registry import backends, get_backend from distributed.comm import ucx, parse_address, parse_host_port from distributed.protocol import to_serialize -from distributed.utils_test import gen_test +from distributed.deploy.local import LocalCluster +from distributed.utils_test import gen_test, loop, inc from .test_comms import check_deserialize -ADDRESS = ucx.ADDRESS -HOST, PORT = parse_host_port(ADDRESS.lstrip("ucx://")) -HOST = 'ucx://' + HOST +ucx_addr = ucp.get_address() +scheduler_port = 13337 + +ADDRESS = ucx_addr +HOST = f'ucx://{ucx_addr}:{scheduler_port}' + # Currently having some issues with re-using ports. # Tests just hang. Still debugging. 
-port_counter = itertools.count(PORT) +port_counter = itertools.count(scheduler_port) def test_parse_address(): @@ -239,3 +244,26 @@ async def test_ping_pong_numba(): result = await serv_com.read() data2 = result.pop('data') assert result['op'] == 'ping' + + +def test_ucx_localcluster(loop): + ucx_addr = ucp.get_address() + port = 13337 + env={'UCX_MEMTYPE_CACHE': 'n'} + worker_kwargs = {'env': env} + with LocalCluster(protocol="ucx://", scheduler_port=port, + ip=ucx_addr, + dashboard_address='127.0.0.1:8787', + n_workers=2, + threads_per_worker=1, + processes=False, + # env=env, + ) as c: + with Client(c) as e: + x = e.submit(inc, 1) + x.result() + assert x.key in c.scheduler.tasks + assert any(w.data == {x.key: 2} for w in c.workers) + assert e.loop is c.loop + + diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index dd2b72be05a..cfd6ab87a7c 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -67,7 +67,9 @@ def _parse_host_port(address: str, default_port=None) -> tuple: if address.startswith("ucx://"): _, address = _parse_address(address) - default_port = default_port or 13337 + # if default port is None we select the next port availabe + # ucx-py does not currently support random port assignment + default_port = default_port or next(_PORT_COUNTER) return parse_host_port(address, default_port=default_port) @@ -132,6 +134,7 @@ def __init__(self, ep: ucp.ucp_py_ep, self._host, self._port = _parse_host_port(address, default_port) self._local_addr = None self.deserialize = deserialize + self.comm_flag = None # finalizer? 
@@ -146,6 +149,7 @@ def peer_address(self) -> str: return self.address async def write(self, msg: dict, serializers=None, on_error: str = "message"): + # msg can also be a list of dicts when sending batched messages frames = await to_frames( msg, serializers=serializers, on_error=on_error ) # TODO: context= @@ -189,14 +193,20 @@ async def read(self, deserializers=None): msg = await from_frames( frames, deserialize=self.deserialize, deserializers=deserializers ) + return msg def abort(self): + # breakpoint() if self.ep: ucp.destroy_ep(self.ep) + print(self) + print(self.listener_instance) + print(type(self.listener_instance)) self.ep = None # if self.listener_instance: - # ucp.stop_listener(self.listener_instance) + # ucp.stop_listener(self.listener_instance) + # self.listener_instance = None async def close(self): # TODO: Handle in-flight messages? @@ -225,6 +235,7 @@ async def connect(self, address: str, deserialize=True, **connection_args) -> UC class UCXListener(Listener): + # MAX_LISTENERS 256 in ucx-py prefix = UCXConnector.prefix comm_class = UCXConnector.comm_class encrypted = UCXConnector.encrypted diff --git a/distributed/deploy/local.py b/distributed/deploy/local.py index ececc9fe12c..bfab1b032d9 100644 --- a/distributed/deploy/local.py +++ b/distributed/deploy/local.py @@ -253,7 +253,6 @@ def _start_worker(self, death_timeout=60, **kwargs): kwargs['quiet'] = True else: W = Worker - w = yield W(self.scheduler.address, loop=self.loop, death_timeout=death_timeout, silence_logs=self.silence_logs, **kwargs) @@ -321,6 +320,7 @@ def _close(self, timeout='2s'): timedelta(seconds=parse_timedelta(timeout)), All([self._stop_worker(w) for w in self.workers]), ) + del self.workers[:] try: From 8a7f9e6ab9adb4bcbafef3ef346cd6904f578e87 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 16 Apr 2019 11:33:39 -0700 Subject: [PATCH 27/68] merge with master --- distributed/comm/tests/test_comms.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git 
a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index d0938cfadc3..597a66db455 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -24,16 +24,6 @@ ) from distributed.utils_test import loop # noqa: F401 -<<<<<<< HEAD -from distributed.protocol import (to_serialize, Serialized, serialize, - deserialize) - -from distributed.comm import (tcp, inproc, connect, listen, CommClosedError, - parse_address, parse_host_port, - unparse_host_port, resolve_address, - get_address_host, get_local_address_for, - registry) -======= from distributed.protocol import to_serialize, Serialized, serialize, deserialize from distributed.comm import ( @@ -49,8 +39,6 @@ get_address_host, get_local_address_for, ) ->>>>>>> cb6ed57573ef171988b372843489e12c8e5c5b6b - EXTERNAL_IP4 = get_ip() if has_ipv6(): @@ -499,11 +487,7 @@ def handle_comm(comm): # Check listener properties bound_addr = listener.listen_address bound_scheme, bound_loc = parse_address(bound_addr) -<<<<<<< HEAD - assert bound_scheme in registry.backends -======= assert bound_scheme in ("inproc", "tcp", "tls") ->>>>>>> cb6ed57573ef171988b372843489e12c8e5c5b6b assert bound_scheme == parse_address(addr)[0] if check_listen_addr is not None: From d332b07bc0c137d51cb23a8c66bc8528ac69805d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 17 Apr 2019 08:45:01 -0700 Subject: [PATCH 28/68] yield comm closing and parameterize ucx test --- distributed/comm/tests/test_ucx.py | 52 +++++++++++++++++++++++------- distributed/comm/ucx.py | 5 ++- distributed/core.py | 2 +- distributed/deploy/local.py | 1 - 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 3f2ed73d0be..877f7871580 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -245,25 +245,55 @@ async def test_ping_pong_numba(): data2 = result.pop('data') assert result['op'] == 'ping' 
+@pytest.mark.parametrize('processes', [ + True, + False, +]) +def test_ucx_localcluster(loop, processes): + if processes: + kwargs = {'env': {'UCX_MEMTYPE_CACHE': 'n'}} + else: + kwargs = {} -def test_ucx_localcluster(loop): ucx_addr = ucp.get_address() port = 13337 - env={'UCX_MEMTYPE_CACHE': 'n'} - worker_kwargs = {'env': env} with LocalCluster(protocol="ucx://", scheduler_port=port, ip=ucx_addr, - dashboard_address='127.0.0.1:8787', + dashboard_address=None, n_workers=2, threads_per_worker=1, - processes=False, - # env=env, - ) as c: - with Client(c) as e: + processes=processes, + **kwargs, + ) as cluster: + with Client(cluster) as e: x = e.submit(inc, 1) x.result() - assert x.key in c.scheduler.tasks - assert any(w.data == {x.key: 2} for w in c.workers) - assert e.loop is c.loop + assert x.key in cluster.scheduler.tasks + assert any(w.data == {x.key: 2} for w in cluster.workers) + assert e.loop is cluster.loop + assert len(cluster.scheduler.workers) == 2 + print(cluster.scheduler.workers) +def test_tcp_localcluster(loop): + ucx_addr = '127.0.0.1' + port = 13337 + env={'UCX_MEMTYPE_CACHE': 'n'} + with LocalCluster( + 2, + scheduler_port=port, + ip = ucx_addr, + processes=True, + threads_per_worker=1, + dashboard_address=None, + silence_logs=False, + env=env, + ) as cluster: + print(cluster.scheduler.workers) + # with Client(cluster) as e: + # x = e.submit(inc, 1) + # x.result() + # assert x.key in c.scheduler.tasks + # assert any(w.data == {x.key: 2} for w in c.workers) + # assert e.loop is c.loop + # print(c.scheduler.workers) \ No newline at end of file diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index cfd6ab87a7c..608ed8bf57a 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -210,6 +210,8 @@ def abort(self): async def close(self): # TODO: Handle in-flight messages? 
+ # sleep is currently used to help flush buffer + await asyncio.sleep(1.0) self.abort() def closed(self): @@ -226,8 +228,9 @@ class UCXConnector(Connector): async def connect(self, address: str, deserialize=True, **connection_args) -> UCX: logger.debug("UCXConnector.connect") _ucp_init() - + print(address) ip, port = _parse_host_port(address) + print(f'Connection Established at {ip} {port}') ep = ucp.get_endpoint(ip.encode(), port) return self.comm_class(ep, self.prefix + address, listener_instance=None, diff --git a/distributed/core.py b/distributed/core.py index 7f5f9596bc4..2e4dceb57a7 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -496,7 +496,7 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): def close(self): self.listener.stop() for comm in self._comms: - comm.close() + yield comm.close() for cb in self._ongoing_coroutines: cb.cancel() for i in range(10): diff --git a/distributed/deploy/local.py b/distributed/deploy/local.py index ee5e77bbd33..82d528bc145 100644 --- a/distributed/deploy/local.py +++ b/distributed/deploy/local.py @@ -265,7 +265,6 @@ def _start(self, ip=None, n_workers=0): address += ":" + str(self.scheduler_port) self.scheduler.start(address) - yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)] self.status = "running" From 6fdcd2b8734d43eb88a1fd09023d0262114cbac7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 17 Apr 2019 11:47:17 -0700 Subject: [PATCH 29/68] Use random port when given 0 This should be fixed upstream and should retry if the port is taken Also handle AttributeError on Python 3.6 for asyncio --- distributed/comm/ucx.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 608ed8bf57a..2874f2c8557 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -69,7 +69,8 @@ def _parse_host_port(address: str, default_port=None) -> tuple: # if default port is None we select the next port 
availabe # ucx-py does not currently support random port assignment - default_port = default_port or next(_PORT_COUNTER) + import random + default_port = default_port or random.randint(1024, 65000) return parse_host_port(address, default_port=default_port) @@ -254,7 +255,7 @@ def __init__( if not address.startswith("ucx"): address = "ucx://" + address self.address = address - self.ip, self.port = _parse_host_port(address, default_port=next(_PORT_COUNTER)) + self.ip, self.port = _parse_host_port(address, default_port=0) self.comm_handler = comm_handler self.deserialize = deserialize self.ep = None # type: ucp.ucp_py_ep @@ -284,7 +285,7 @@ async def serve_forever(client_ep, listener_instance): try: loop = asyncio.get_running_loop() - except RuntimeError: + except (RuntimeError, AttributeError): loop = asyncio.get_event_loop() t = loop.create_task(server.coroutine) From c0ffeff5c994092d7fcdc154e22755fb2f4eb33d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 17 Apr 2019 11:48:08 -0700 Subject: [PATCH 30/68] Allow for async def comm.close functions Previously we called `comm.close()`, a coroutine, assuming that this would start running the function. This is true in tornado, but not in asyncio. 
Now we explicitly yield or add them to the IOLoop --- distributed/core.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/distributed/core.py b/distributed/core.py index 2e4dceb57a7..563410de94b 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -495,8 +495,9 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): @gen.coroutine def close(self): self.listener.stop() - for comm in self._comms: - yield comm.close() + + yield [comm.close() for comm in self._comms] + for cb in self._ongoing_coroutines: cb.cancel() for i in range(10): @@ -880,7 +881,8 @@ def collect(self): ) for addr, comms in self.available.items(): for comm in comms: - comm.close() + IOLoop.current().add_callback(comm.close) + # comm.close() comms.clear() if self.open < self.limit: self.event.set() @@ -893,7 +895,8 @@ def remove(self, addr): if addr in self.available: comms = self.available.pop(addr) for comm in comms: - comm.close() + # comm.close() + IOLoop.current().add_callback(comm.close) if addr in self.occupied: comms = self.occupied.pop(addr) for comm in comms: From 44c1d5cc6a867dcaa055518b6c7615863f498ebd Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 17 Apr 2019 11:49:33 -0700 Subject: [PATCH 31/68] Coerce the small bytes header of a message to bytes UCX was producing memoryviews here, and they seemed to be malformed. 
Coercing to bytes seems foolproof and cheap (these should always be small) --- distributed/protocol/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/distributed/protocol/core.py b/distributed/protocol/core.py index 0b5f7eb0fea..4dbc679b1fb 100644 --- a/distributed/protocol/core.py +++ b/distributed/protocol/core.py @@ -192,6 +192,7 @@ def loads_msgpack(header, payload): See Also: dumps_msgpack """ + header = bytes(header) if header: header = msgpack.loads(header, use_list=False, **msgpack_opts) else: From 69250f759d3402db717c0b6bf0c45652478d1123 Mon Sep 17 00:00:00 2001 From: Richard Zamora Date: Wed, 24 Apr 2019 10:20:59 -0700 Subject: [PATCH 32/68] Correcting ucp import statement to reflect ucx-py PR#85 -- Also adding simple error message for failed ucx import --- distributed/comm/__init__.py | 2 +- distributed/comm/ucx.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/distributed/comm/__init__.py b/distributed/comm/__init__.py index b34170773a4..ebcafb08246 100644 --- a/distributed/comm/__init__.py +++ b/distributed/comm/__init__.py @@ -20,7 +20,7 @@ def _register_transports(): try: from . 
import ucx except ImportError: - pass + print("UCX Import Error!") diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 2874f2c8557..ace02b8d6f7 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -16,7 +16,7 @@ from .utils import ensure_concrete_host, to_frames, from_frames from ..utils import ensure_ip, get_ip, get_ipv6, nbytes -import ucp_py as ucp +import ucp logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 From bf6b6b93f32b025825389379af665ae708ee21d6 Mon Sep 17 00:00:00 2001 From: "(Rick) Richard J Zamora" Date: Wed, 24 Apr 2019 15:48:22 -0500 Subject: [PATCH 33/68] Removing print statement --- distributed/comm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/__init__.py b/distributed/comm/__init__.py index ebcafb08246..b34170773a4 100644 --- a/distributed/comm/__init__.py +++ b/distributed/comm/__init__.py @@ -20,7 +20,7 @@ def _register_transports(): try: from . import ucx except ImportError: - print("UCX Import Error!") + pass From 2c9cc2b053220292dc97c3d5a8793902b857b19e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 25 Apr 2019 07:20:58 -0700 Subject: [PATCH 34/68] remove print statements --- distributed/comm/tests/test_ucx.py | 2 +- distributed/comm/ucx.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 877f7871580..f0cdeb5b681 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -296,4 +296,4 @@ def test_tcp_localcluster(loop): # assert x.key in c.scheduler.tasks # assert any(w.data == {x.key: 2} for w in c.workers) # assert e.loop is c.loop - # print(c.scheduler.workers) \ No newline at end of file + # print(c.scheduler.workers) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index ace02b8d6f7..65249e8d9d3 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -201,9 +201,7 @@ def 
abort(self): # breakpoint() if self.ep: ucp.destroy_ep(self.ep) - print(self) - print(self.listener_instance) - print(type(self.listener_instance)) + logger.debug("Destroyed UCX endpoint") self.ep = None # if self.listener_instance: # ucp.stop_listener(self.listener_instance) @@ -227,11 +225,9 @@ class UCXConnector(Connector): client = ... # TODO: add a client here? async def connect(self, address: str, deserialize=True, **connection_args) -> UCX: - logger.debug("UCXConnector.connect") + logger.debug("UCXConnector.connect: %s", address) _ucp_init() - print(address) ip, port = _parse_host_port(address) - print(f'Connection Established at {ip} {port}') ep = ucp.get_endpoint(ip.encode(), port) return self.comm_class(ep, self.prefix + address, listener_instance=None, From c5d669dee909207aea7f842af2bb921fa8bfb77d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 25 Apr 2019 08:26:44 -0700 Subject: [PATCH 35/68] skip ucx tests if not available --- distributed/comm/tests/test_ucx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index f0cdeb5b681..bda31b5e6d0 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -2,7 +2,7 @@ import itertools import pytest -import ucp_py as ucp +ucp = pytest.importorskip('ucp') from distributed import Client from distributed.comm import ucx, listen, connect From f2925bfa505c55e3aedaf1928eab5bc22b2f2ae7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 25 Apr 2019 11:55:57 -0700 Subject: [PATCH 36/68] cleanup ucx comm tests --- distributed/comm/tests/test_ucx.py | 37 ++++++++++++------------------ distributed/comm/ucx.py | 3 --- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index bda31b5e6d0..f0c0732ff63 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -15,15 +15,11 
@@ from .test_comms import check_deserialize -ucx_addr = ucp.get_address() -scheduler_port = 13337 - -ADDRESS = ucx_addr -HOST = f'ucx://{ucx_addr}:{scheduler_port}' +HOST = ucp.get_address() # Currently having some issues with re-using ports. # Tests just hang. Still debugging. -port_counter = itertools.count(scheduler_port) +port_counter = itertools.count(13337) def test_parse_address(): @@ -32,7 +28,6 @@ def test_parse_address(): def test_parse_host_port(): - assert ucx._parse_host_port("10.33.225.160") == ("10.33.225.160", 13337) assert ucx._parse_host_port("10.33.225.160:13337") == ("10.33.225.160", 13337) assert ucx._parse_host_port("10.33.225.160:13338") == ("10.33.225.160", 13338) @@ -84,7 +79,7 @@ async def test_ping_pong(): @pytest.mark.asyncio async def test_comm_objs(): - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) comm, serv_com = await get_comm_pair(address) assert comm.peer_address == address @@ -256,23 +251,21 @@ def test_ucx_localcluster(loop, processes): kwargs = {} ucx_addr = ucp.get_address() - port = 13337 - with LocalCluster(protocol="ucx://", scheduler_port=port, - ip=ucx_addr, - dashboard_address=None, - n_workers=2, - threads_per_worker=1, - processes=processes, - **kwargs, - ) as cluster: - with Client(cluster) as e: - x = e.submit(inc, 1) + with LocalCluster(protocol="ucx://", + ip=ucx_addr, + dashboard_address=None, + n_workers=2, + threads_per_worker=1, + processes=processes, + **kwargs, + ) as cluster: + with Client(cluster) as client: + x = client.submit(inc, 1) x.result() assert x.key in cluster.scheduler.tasks - assert any(w.data == {x.key: 2} for w in cluster.workers) - assert e.loop is cluster.loop + if not processes: + assert any(w.data == {x.key: 2} for w in cluster.workers) assert len(cluster.scheduler.workers) == 2 - print(cluster.scheduler.workers) def test_tcp_localcluster(loop): diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 
65249e8d9d3..03db707ce33 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -21,12 +21,9 @@ logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 PORT = 13337 -IP = ucp.get_address() -DEFAULT_ADDRESS = f"ucx://{IP}:{PORT}" # set in ~/.dask/config.yaml # or DASK_DISTRIBUTED__COMM__UCXADDRESS -ADDRESS = DEFAULT_ADDRESS _PORT_COUNTER = itertools.count(PORT) _INITIALIZED = False From c78eb515b07945cd09f915be4cc1ad166811834f Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 25 Apr 2019 11:56:17 -0700 Subject: [PATCH 37/68] black --- distributed/comm/__init__.py | 2 +- distributed/comm/tests/test_comms.py | 3 +- distributed/comm/tests/test_ucx.py | 73 +++++++++++++--------------- distributed/comm/ucx.py | 39 +++++++-------- distributed/protocol/cuda.py | 12 ++--- distributed/protocol/cudf.py | 28 +++++------ distributed/protocol/cupy.py | 10 ++-- distributed/protocol/numba.py | 21 ++++---- distributed/protocol/utils.py | 2 +- 9 files changed, 95 insertions(+), 95 deletions(-) diff --git a/distributed/comm/__init__.py b/distributed/comm/__init__.py index b34170773a4..e0615b38c7a 100644 --- a/distributed/comm/__init__.py +++ b/distributed/comm/__init__.py @@ -17,11 +17,11 @@ def _register_transports(): from . import inproc from . import tcp + try: from . 
import ucx except ImportError: pass - _register_transports() diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index 597a66db455..e7df99a02f8 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -532,7 +532,7 @@ def client_communicate(key, delay=0): @gen_test() def test_ucx_client_server(): pytest.importorskip("distributed.comm.ucx") - yield check_client_server('ucx://10.33.225.160') + yield check_client_server("ucx://10.33.225.160") def tcp_eq(expected_host, expected_port=None): @@ -636,6 +636,7 @@ def test_inproc_client_server(): yield check_client_server("inproc://", inproc_check()) yield check_client_server(inproc.new_address(), inproc_check()) + # # TLS certificate handling # diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index f0c0732ff63..c5534a16c0c 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -2,7 +2,8 @@ import itertools import pytest -ucp = pytest.importorskip('ucp') + +ucp = pytest.importorskip("ucp") from distributed import Client from distributed.comm import ucx, listen, connect @@ -84,11 +85,11 @@ async def test_comm_objs(): assert comm.peer_address == address scheme, loc = parse_address(comm.peer_address) - assert scheme == 'ucx' + assert scheme == "ucx" assert comm.peer_address == address scheme, loc = parse_address(serv_com.peer_address) - assert scheme == 'ucx' + assert scheme == "ucx" def test_ucx_specific(): @@ -152,7 +153,7 @@ async def client_communicate(key, delay=0): @pytest.mark.asyncio async def test_ping_pong_data(): - np = pytest.importorskip('numpy') + np = pytest.importorskip("numpy") data = np.ones((10, 10)) # TODO: broken for large arrays @@ -162,7 +163,7 @@ async def test_ping_pong_data(): await com.write(msg) result = await serv_com.read() result["op"] = "pong" - data2 = result.pop('data') + data2 = result.pop("data") np.testing.assert_array_equal(data2, data) await 
serv_com.write(result) @@ -184,38 +185,35 @@ async def test_ping_pong_cudf(): # if this test appears after cupy an import error arises # *** ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `CXXABI_1.3.11' # not found (required by python3.7/site-packages/pyarrow/../../../libarrow.so.12) - cudf = pytest.importorskip('cudf') + cudf = pytest.importorskip("cudf") - df = cudf.DataFrame({"A": [1, 2, None], "B": [1., 2., None]}) + df = cudf.DataFrame({"A": [1, 2, None], "B": [1.0, 2.0, None]}) address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - msg = {"op": "ping", 'data': to_serialize(df)} + msg = {"op": "ping", "data": to_serialize(df)} await com.write(msg) result = await serv_com.read() - data2 = result.pop('data') - assert result['op'] == 'ping' + data2 = result.pop("data") + assert result["op"] == "ping" @pytest.mark.asyncio -@pytest.mark.parametrize('shape', [ - (100,), - (10, 10) -]) +@pytest.mark.parametrize("shape", [(100,), (10, 10)]) async def test_ping_pong_cupy(shape): - cupy = pytest.importorskip('cupy') + cupy = pytest.importorskip("cupy") address = "{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) arr = cupy.random.random(shape) - msg = {"op": "ping", 'data': to_serialize(arr)} + msg = {"op": "ping", "data": to_serialize(arr)} await com.write(msg) result = await serv_com.read() - data2 = result.pop('data') + data2 = result.pop("data") - assert result['op'] == 'ping' + assert result["op"] == "ping" cupy.testing.assert_array_equal(arr, data2) await com.close() await serv_com.close() @@ -223,7 +221,7 @@ async def test_ping_pong_cupy(shape): @pytest.mark.asyncio async def test_ping_pong_numba(): - np = pytest.importorskip('numpy') + np = pytest.importorskip("numpy") numba = pytest.importorskip("numba") import numba.cuda @@ -233,32 +231,31 @@ async def test_ping_pong_numba(): arr = numba.cuda.to_device(arr) com, serv_com = await get_comm_pair(address) - msg = {"op": 
"ping", 'data': to_serialize(arr)} + msg = {"op": "ping", "data": to_serialize(arr)} await com.write(msg) result = await serv_com.read() - data2 = result.pop('data') - assert result['op'] == 'ping' + data2 = result.pop("data") + assert result["op"] == "ping" + -@pytest.mark.parametrize('processes', [ - True, - False, -]) +@pytest.mark.parametrize("processes", [True, False]) def test_ucx_localcluster(loop, processes): if processes: - kwargs = {'env': {'UCX_MEMTYPE_CACHE': 'n'}} + kwargs = {"env": {"UCX_MEMTYPE_CACHE": "n"}} else: kwargs = {} ucx_addr = ucp.get_address() - with LocalCluster(protocol="ucx://", - ip=ucx_addr, - dashboard_address=None, - n_workers=2, - threads_per_worker=1, - processes=processes, - **kwargs, - ) as cluster: + with LocalCluster( + protocol="ucx://", + ip=ucx_addr, + dashboard_address=None, + n_workers=2, + threads_per_worker=1, + processes=processes, + **kwargs, + ) as cluster: with Client(cluster) as client: x = client.submit(inc, 1) x.result() @@ -269,13 +266,13 @@ def test_ucx_localcluster(loop, processes): def test_tcp_localcluster(loop): - ucx_addr = '127.0.0.1' + ucx_addr = "127.0.0.1" port = 13337 - env={'UCX_MEMTYPE_CACHE': 'n'} + env = {"UCX_MEMTYPE_CACHE": "n"} with LocalCluster( 2, scheduler_port=port, - ip = ucx_addr, + ip=ucx_addr, processes=True, threads_per_worker=1, dashboard_address=None, diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 03db707ce33..7ed1d80e291 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -67,6 +67,7 @@ def _parse_host_port(address: str, default_port=None) -> tuple: # if default port is None we select the next port availabe # ucx-py does not currently support random port assignment import random + default_port = default_port or random.randint(1024, 65000) return parse_host_port(address, default_port=default_port) @@ -119,10 +120,9 @@ class UCX(Comm): 4. Read all the data frames. 
""" - def __init__(self, ep: ucp.ucp_py_ep, - address: str, - listener_instance, - deserialize=True): + def __init__( + self, ep: ucp.ucp_py_ep, address: str, listener_instance, deserialize=True + ): logger.info("UCX.__init__ %s %s", address, listener_instance) self.ep = ep assert address.startswith("ucx") @@ -151,9 +151,13 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): frames = await to_frames( msg, serializers=serializers, on_error=on_error ) # TODO: context= - gpu_frames = b''.join([struct.pack("?", hasattr(frame, '__cuda_array_interface__')) - for frame in frames]) - size_frames = b''.join([struct.pack("Q", nbytes(frame)) for frame in frames]) + gpu_frames = b"".join( + [ + struct.pack("?", hasattr(frame, "__cuda_array_interface__")) + for frame in frames + ] + ) + size_frames = b"".join([struct.pack("Q", nbytes(frame)) for frame in frames]) frames = [gpu_frames] + [size_frames] + frames nframes = struct.pack("Q", len(frames)) @@ -201,8 +205,8 @@ def abort(self): logger.debug("Destroyed UCX endpoint") self.ep = None # if self.listener_instance: - # ucp.stop_listener(self.listener_instance) - # self.listener_instance = None + # ucp.stop_listener(self.listener_instance) + # self.listener_instance = None async def close(self): # TODO: Handle in-flight messages? 
@@ -226,9 +230,9 @@ async def connect(self, address: str, deserialize=True, **connection_args) -> UC _ucp_init() ip, port = _parse_host_port(address) ep = ucp.get_endpoint(ip.encode(), port) - return self.comm_class(ep, self.prefix + address, - listener_instance=None, - deserialize=deserialize) + return self.comm_class( + ep, self.prefix + address, listener_instance=None, deserialize=deserialize + ) class UCXListener(Listener): @@ -238,11 +242,7 @@ class UCXListener(Listener): encrypted = UCXConnector.encrypted def __init__( - self, - address: str, - comm_handler: None, - deserialize=False, - **connection_args, + self, address: str, comm_handler: None, deserialize=False, **connection_args ): logger.debug("UCXListener.__init__") if not address.startswith("ucx"): @@ -262,8 +262,9 @@ def __init__( def start(self): async def serve_forever(client_ep, listener_instance): - ucx = UCX(client_ep, self.address, listener_instance, - deserialize=self.deserialize) + ucx = UCX( + client_ep, self.address, listener_instance, deserialize=self.deserialize + ) self.listener_instance = listener_instance if self.comm_handler: await self.comm_handler(ucx) diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index cb6efe01b3f..ba435f897f7 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -22,10 +22,10 @@ def serialize_cupy_ndarray(x): dtype = (0, x.dtype.str) header = x.__cuda_array_interface__.copy() - header['lengths'] = (x.nbytes,) # one per stride - header['compression'] = (None,) # TODO - header['is_cuda'] = 1 - header['dtype'] = dtype + header["lengths"] = (x.nbytes,) # one per stride + header["compression"] = (None,) # TODO + header["is_cuda"] = 1 + header["dtype"] = dtype return header, [data] @@ -33,7 +33,7 @@ def serialize_cupy_ndarray(x): def deserialize_cupy_array(header, frames): frame, = frames # TODO: put this in ucx... 
as a kind of "fixup" - frame.typestr = header['typestr'] - frame.shape = header['shape'] + frame.typestr = header["typestr"] + frame.shape = header["shape"] arr = cupy.asarray(frame) return arr diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index 77432bee035..cf3a172044f 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -21,12 +21,12 @@ def serialize_cudf_dataframe(x): for label, col in x.iteritems(): header, [frame] = serialize_numba_ndarray(col.data.mem) - header['name'] = label + header["name"] = label sub_headers.append(header) arrays.append(frame) if col.null_count: header, [frame] = serialize_numba_ndarray(col.nullmask.mem) - header['name'] = label + header["name"] = label null_headers.append(header) null_masks.append(frame) null_counts[label] = col.null_count @@ -34,13 +34,13 @@ def serialize_cudf_dataframe(x): arrays.extend(null_masks) header = { - 'is_cuda': len(arrays), - 'subheaders': sub_headers, + "is_cuda": len(arrays), + "subheaders": sub_headers, # TODO: the header must be msgpack (de)serializable. # See if we can avoid names, and just use integer positions. 
- 'columns': x.columns.tolist(), - 'null_counts': null_counts, - 'null_subheaders': null_headers + "columns": x.columns.tolist(), + "null_counts": null_counts, + "null_subheaders": null_headers, } return header, arrays @@ -48,21 +48,21 @@ def serialize_cudf_dataframe(x): @dask_deserialize.register(cudf.DataFrame) def serialize_cudf_dataframe(header, frames): - columns = header['columns'] - n_columns = len(header['columns']) - n_masks = len(header['null_subheaders']) + columns = header["columns"] + n_columns = len(header["columns"]) + n_masks = len(header["null_subheaders"]) masks = {} pairs = [] for i in range(n_masks): - subheader = header['null_subheaders'][i] + subheader = header["null_subheaders"][i] frame = frames[n_columns + i] mask = deserialize_numba_ndarray(subheader, [frame]) - masks[subheader['name']] = mask + masks[subheader["name"]] = mask - for subheader, frame in zip(header['subheaders'], frames[:n_columns]): - name = subheader['name'] + for subheader, frame in zip(header["subheaders"], frames[:n_columns]): + name = subheader["name"] array = deserialize_numba_ndarray(subheader, [frame]) if name in masks: diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py index 6f01f3375d4..745de450bb2 100644 --- a/distributed/protocol/cupy.py +++ b/distributed/protocol/cupy.py @@ -24,9 +24,9 @@ def serialize_cupy_ndarray(x): # used in the ucx comms for gpu/cpu message passing # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header['compression'] = (None,) # TODO - header['is_cuda'] = 1 - header['dtype'] = dtype + header["compression"] = (None,) # TODO + header["is_cuda"] = 1 + header["dtype"] = dtype return header, [data] @@ -34,7 +34,7 @@ def serialize_cupy_ndarray(x): def deserialize_cupy_array(header, frames): frame, = frames # TODO: put this in ucx... 
as a kind of "fixup" - frame.typestr = header['typestr'] - frame.shape = header['shape'] + frame.typestr = header["typestr"] + frame.shape = header["shape"] arr = cupy.asarray(frame) return arr diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 73c18784eb1..6c3f9da4d83 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -8,7 +8,7 @@ def serialize_numba_ndarray(x): # TODO: handle 2d # TODO: 0d - if x.flags['C_CONTIGUOUS'] or x.flags['F_CONTIGUOUS']: + if x.flags["C_CONTIGUOUS"] or x.flags["F_CONTIGUOUS"]: strides = x.strides if x.ndim > 1: data = x.ravel() # order='K' @@ -29,9 +29,9 @@ def serialize_numba_ndarray(x): # used in the ucx comms for gpu/cpu message passing # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header['compression'] = (None,) # TODO - header['is_cuda'] = 1 - header['dtype'] = dtype + header["compression"] = (None,) # TODO + header["is_cuda"] = 1 + header["dtype"] = dtype return header, [data] @@ -41,21 +41,22 @@ def deserialize_numba_ndarray(header, frames): # TODO: put this in ucx... as a kind of "fixup" if isinstance(frame, bytes): import numpy as np - arr2 = np.frombuffer(frame, header['typestr']) + + arr2 = np.frombuffer(frame, header["typestr"]) return numba.cuda.to_device(arr2) - frame.typestr = header['typestr'] - frame.shape = header['shape'] + frame.typestr = header["typestr"] + frame.shape = header["shape"] # numba & cupy don't properly roundtrip length-zero arrays. if frame.shape[0] == 0: arr = numba.cuda.device_array( - header['shape'], - header['typestr'] + header["shape"], + header["typestr"] # strides? # order? 
) return arr arr = numba.cuda.as_cuda_array(frame) - return arr \ No newline at end of file + return arr diff --git a/distributed/protocol/utils.py b/distributed/protocol/utils.py index 38856d2be4f..1e395f31814 100644 --- a/distributed/protocol/utils.py +++ b/distributed/protocol/utils.py @@ -55,7 +55,7 @@ def merge_frames(header, frames): if not frames: return frames - if any(x.__class__.__name__ == 'MemoryPointer' for x in frames): + if any(x.__class__.__name__ == "MemoryPointer" for x in frames): # XXX return frames From 477a91349cdde471b762fcfa8c0919d4d69ea264 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 25 Apr 2019 13:20:44 -0700 Subject: [PATCH 38/68] downgrade logging to debug level --- distributed/comm/ucx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 7ed1d80e291..00d600ff8bd 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -123,7 +123,7 @@ class UCX(Comm): def __init__( self, ep: ucp.ucp_py_ep, address: str, listener_instance, deserialize=True ): - logger.info("UCX.__init__ %s %s", address, listener_instance) + logger.debug("UCX.__init__ %s %s", address, listener_instance) self.ep = ep assert address.startswith("ucx") self.address = address From 52f2f8c32e7862724acb9a43675e0414de7ec2ed Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 26 Apr 2019 06:39:20 -0700 Subject: [PATCH 39/68] raised CommClosedError on closed UCX comm --- distributed/comm/ucx.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 00d600ff8bd..d9636ef6f10 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -11,7 +11,7 @@ import struct from .addressing import parse_host_port, unparse_host_port -from .core import Comm, Connector, Listener +from .core import Comm, Connector, Listener, CommClosedError from .registry import Backend, backends from .utils 
import ensure_concrete_host, to_frames, from_frames from ..utils import ensure_ip, get_ip, get_ipv6, nbytes @@ -124,7 +124,7 @@ def __init__( self, ep: ucp.ucp_py_ep, address: str, listener_instance, deserialize=True ): logger.debug("UCX.__init__ %s %s", address, listener_instance) - self.ep = ep + self._ep = ep assert address.startswith("ucx") self.address = address self.listener_instance = listener_instance @@ -200,22 +200,28 @@ async def read(self, deserializers=None): def abort(self): # breakpoint() - if self.ep: - ucp.destroy_ep(self.ep) + if self._ep: + ucp.destroy_ep(self._ep) logger.debug("Destroyed UCX endpoint") - self.ep = None + self._ep = None # if self.listener_instance: # ucp.stop_listener(self.listener_instance) # self.listener_instance = None + @property + def ep(self): + if self._ep: + return self._ep + else: + raise CommClosedError("UCX Endpoint is closed") + async def close(self): # TODO: Handle in-flight messages? # sleep is currently used to help flush buffer - await asyncio.sleep(1.0) self.abort() def closed(self): - return self.ep is None + return self._ep is None class UCXConnector(Connector): @@ -251,7 +257,7 @@ def __init__( self.ip, self.port = _parse_host_port(address, default_port=0) self.comm_handler = comm_handler self.deserialize = deserialize - self.ep = None # type: ucp.ucp_py_ep + self._ep = None # type: ucp.ucp_py_ep self.listener_instance = None # type: ucp.ListenerFuture self._task = None @@ -290,8 +296,8 @@ def stop(self): if self._task: self._task.cancel() - if self.ep: - ucp.destroy_ep(self.ep) + if self._ep: + ucp.destroy_ep(self._ep) # if self.listener_instance: # ucp.stop_listener(self.listener_instance) From 4a30f172f5a4c3891125d33218adcc08061824f6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 29 Apr 2019 12:10:21 -0700 Subject: [PATCH 40/68] Clean up UCX tests --- distributed/comm/tests/test_comms.py | 7 +++++-- distributed/comm/tests/test_ucx.py | 13 +++++++------ distributed/comm/ucx.py | 6 +++--- 3 files 
changed, 15 insertions(+), 11 deletions(-) diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index e7df99a02f8..d8a73a048ab 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -26,6 +26,7 @@ from distributed.protocol import to_serialize, Serialized, serialize, deserialize +from distributed.comm.registry import backends from distributed.comm import ( tcp, inproc, @@ -487,7 +488,7 @@ def handle_comm(comm): # Check listener properties bound_addr = listener.listen_address bound_scheme, bound_loc = parse_address(bound_addr) - assert bound_scheme in ("inproc", "tcp", "tls") + assert bound_scheme in backends assert bound_scheme == parse_address(addr)[0] if check_listen_addr is not None: @@ -532,7 +533,9 @@ def client_communicate(key, delay=0): @gen_test() def test_ucx_client_server(): pytest.importorskip("distributed.comm.ucx") - yield check_client_server("ucx://10.33.225.160") + import ucp + addr = ucp.get_address() + yield check_client_server("ucx://" + addr) def tcp_eq(expected_host, expected_port=None): diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index c5534a16c0c..179ec328b6e 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -61,7 +61,7 @@ async def handle_comm(comm): @pytest.mark.asyncio async def test_ping_pong(): - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) msg = {"op": "ping"} await com.write(msg) @@ -102,7 +102,7 @@ def test_ucx_specific(): # 3. Test peer_address # 4. 
Test cleanup async def f(): - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) async def handle_comm(comm): # XXX: failures here don't fail the build yet @@ -157,7 +157,7 @@ async def test_ping_pong_data(): data = np.ones((10, 10)) # TODO: broken for large arrays - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) msg = {"op": "ping", "data": to_serialize(data)} await com.write(msg) @@ -188,7 +188,7 @@ async def test_ping_pong_cudf(): cudf = pytest.importorskip("cudf") df = cudf.DataFrame({"A": [1, 2, None], "B": [1.0, 2.0, None]}) - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) msg = {"op": "ping", "data": to_serialize(df)} @@ -203,7 +203,7 @@ async def test_ping_pong_cudf(): @pytest.mark.parametrize("shape", [(100,), (10, 10)]) async def test_ping_pong_cupy(shape): cupy = pytest.importorskip("cupy") - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) arr = cupy.random.random(shape) @@ -225,7 +225,7 @@ async def test_ping_pong_numba(): numba = pytest.importorskip("numba") import numba.cuda - address = "{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, next(port_counter)) arr = np.arange(10) arr = numba.cuda.to_device(arr) @@ -254,6 +254,7 @@ def test_ucx_localcluster(loop, processes): n_workers=2, threads_per_worker=1, processes=processes, + loop=loop, **kwargs, ) as cluster: with Client(cluster) as client: diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index d9636ef6f10..81c5cd611d6 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -91,7 +91,7 @@ class UCX(Comm): Parameters ---------- - ep : ucp.ucp_py_ep + ep : ucp.Endpoint 
The UCP endpoint. address : str The address, prefixed with `ucx://` to use. @@ -121,7 +121,7 @@ class UCX(Comm): """ def __init__( - self, ep: ucp.ucp_py_ep, address: str, listener_instance, deserialize=True + self, ep: ucp.Endpoint, address: str, listener_instance, deserialize=True ): logger.debug("UCX.__init__ %s %s", address, listener_instance) self._ep = ep @@ -257,7 +257,7 @@ def __init__( self.ip, self.port = _parse_host_port(address, default_port=0) self.comm_handler = comm_handler self.deserialize = deserialize - self._ep = None # type: ucp.ucp_py_ep + self._ep = None # type: ucp.Endpoint self.listener_instance = None # type: ucp.ListenerFuture self._task = None From 25b132764c87fdea39ef6f7b7666ae70f15d1ead Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 29 Apr 2019 15:21:04 -0700 Subject: [PATCH 41/68] Add larger cupy array into test Currently this fails. An array slightly smaller than this is fine --- distributed/comm/tests/test_ucx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 179ec328b6e..86b4a8ccaff 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -200,7 +200,7 @@ async def test_ping_pong_cudf(): @pytest.mark.asyncio -@pytest.mark.parametrize("shape", [(100,), (10, 10)]) +@pytest.mark.parametrize("shape", [(100,), (10, 10), (4_947,)]) async def test_ping_pong_cupy(shape): cupy = pytest.importorskip("cupy") address = "ucx://{}:{}".format(HOST, next(port_counter)) From ff5ca77c492d140a81487d8a8ea8e38d4f2399e3 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 29 Apr 2019 16:05:41 -0700 Subject: [PATCH 42/68] Send all frame metadata as a single message This should reduce back and forth a bit --- distributed/comm/ucx.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 81c5cd611d6..89ce8ca64c5 100644 --- 
a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -159,10 +159,11 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): ) size_frames = b"".join([struct.pack("Q", nbytes(frame)) for frame in frames]) - frames = [gpu_frames] + [size_frames] + frames nframes = struct.pack("Q", len(frames)) - await self.ep.send_obj(nframes) + meta = b''.join([nframes, gpu_frames, size_frames]) + + await self.ep.send_obj(meta) for frame in frames: await self.ep.send_obj(frame) @@ -171,16 +172,13 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): async def read(self, deserializers=None): resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) - n_frames, = struct.unpack("Q", obj) - n_data_frames = n_frames - 2 + nframes, = struct.unpack("Q", obj[:8]) - gpu_frame_msg = await self.ep.recv_future() - gpu_frame_msg = gpu_frame_msg.get_obj() - is_gpus = struct.unpack("{}?".format(n_data_frames), gpu_frame_msg) + gpu_frame_msg = obj[8:8 + nframes] + is_gpus = struct.unpack("{}?".format(nframes), gpu_frame_msg) - sized_frame_msg = await self.ep.recv_future() - sized_frame_msg = sized_frame_msg.get_obj() - sizes = struct.unpack("{}Q".format(n_data_frames), sized_frame_msg) + sized_frame_msg = obj[8 + nframes:] + sizes = struct.unpack("{}Q".format(nframes), sized_frame_msg) frames = [] From f0c51a5fd839015f9902a86b2807ae03acf1ff72 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 30 Apr 2019 06:08:00 -0700 Subject: [PATCH 43/68] add environment variables at ucx import time --- distributed/comm/tests/test_ucx.py | 9 ++++----- distributed/comm/ucx.py | 5 +++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 86b4a8ccaff..f859e490cc5 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -200,7 +200,7 @@ async def test_ping_pong_cudf(): @pytest.mark.asyncio 
-@pytest.mark.parametrize("shape", [(100,), (10, 10), (4_947,)]) +@pytest.mark.parametrize("shape", [(100,), (10, 10), (4947,)]) async def test_ping_pong_cupy(shape): cupy = pytest.importorskip("cupy") address = "ucx://{}:{}".format(HOST, next(port_counter)) @@ -209,8 +209,7 @@ async def test_ping_pong_cupy(shape): arr = cupy.random.random(shape) msg = {"op": "ping", "data": to_serialize(arr)} - await com.write(msg) - result = await serv_com.read() + _, result = await asyncio.gather(com.write(msg), serv_com.read()) data2 = result.pop("data") assert result["op"] == "ping" @@ -248,8 +247,8 @@ def test_ucx_localcluster(loop, processes): ucx_addr = ucp.get_address() with LocalCluster( - protocol="ucx://", - ip=ucx_addr, + protocol="ucx", + interface='ib0', dashboard_address=None, n_workers=2, threads_per_worker=1, diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 89ce8ca64c5..56d8d842e6b 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -18,6 +18,11 @@ import ucp +import os + +os.environ.setdefault('UCX_RNDV_SCHEME', 'put_zcopy') +os.environ.setdefault('UCX_MEMTYPE_CACHE', 'n') + logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 PORT = 13337 From ce46bc3ae35bf35ae5f38d64049f1d097ccd6fac Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 30 Apr 2019 06:08:26 -0700 Subject: [PATCH 44/68] black --- distributed/comm/tests/test_comms.py | 1 + distributed/comm/tests/test_ucx.py | 2 +- distributed/comm/ucx.py | 10 +++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index d8a73a048ab..05f2d631626 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -534,6 +534,7 @@ def client_communicate(key, delay=0): def test_ucx_client_server(): pytest.importorskip("distributed.comm.ucx") import ucp + addr = ucp.get_address() yield check_client_server("ucx://" + addr) diff --git 
a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index f859e490cc5..9c6e072e791 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -248,7 +248,7 @@ def test_ucx_localcluster(loop, processes): ucx_addr = ucp.get_address() with LocalCluster( protocol="ucx", - interface='ib0', + interface="ib0", dashboard_address=None, n_workers=2, threads_per_worker=1, diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 56d8d842e6b..9157d662663 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -20,8 +20,8 @@ import os -os.environ.setdefault('UCX_RNDV_SCHEME', 'put_zcopy') -os.environ.setdefault('UCX_MEMTYPE_CACHE', 'n') +os.environ.setdefault("UCX_RNDV_SCHEME", "put_zcopy") +os.environ.setdefault("UCX_MEMTYPE_CACHE", "n") logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 @@ -166,7 +166,7 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): nframes = struct.pack("Q", len(frames)) - meta = b''.join([nframes, gpu_frames, size_frames]) + meta = b"".join([nframes, gpu_frames, size_frames]) await self.ep.send_obj(meta) @@ -179,10 +179,10 @@ async def read(self, deserializers=None): obj = ucp.get_obj_from_msg(resp) nframes, = struct.unpack("Q", obj[:8]) - gpu_frame_msg = obj[8:8 + nframes] + gpu_frame_msg = obj[8 : 8 + nframes] is_gpus = struct.unpack("{}?".format(nframes), gpu_frame_msg) - sized_frame_msg = obj[8 + nframes:] + sized_frame_msg = obj[8 + nframes :] sizes = struct.unpack("{}Q".format(nframes), sized_frame_msg) frames = [] From ecdf3cbd9916657fd636f841560845bb34554367 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 30 Apr 2019 06:21:13 -0700 Subject: [PATCH 45/68] add comments about byte offsets --- distributed/comm/ucx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 9157d662663..0f4f5bf5c86 100644 --- a/distributed/comm/ucx.py +++ 
b/distributed/comm/ucx.py @@ -177,12 +177,12 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): async def read(self, deserializers=None): resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) - nframes, = struct.unpack("Q", obj[:8]) + nframes, = struct.unpack("Q", obj[:8]) # first eight bytes for number of frames - gpu_frame_msg = obj[8 : 8 + nframes] + gpu_frame_msg = obj[8 : 8 + nframes] # next nframes bytes for if they're GPU frames is_gpus = struct.unpack("{}?".format(nframes), gpu_frame_msg) - sized_frame_msg = obj[8 + nframes :] + sized_frame_msg = obj[8 + nframes :] # then the rest for frame sizes sizes = struct.unpack("{}Q".format(nframes), sized_frame_msg) frames = [] From f55776627a592f1bba8b51f1109ea6285d068c20 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 2 May 2019 13:10:47 -0700 Subject: [PATCH 46/68] Add comm.close call to IOLoop This is important if comm.close is defined as an async function rather than a tornado coroutine --- distributed/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/core.py b/distributed/core.py index 563410de94b..8395004a339 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -900,7 +900,7 @@ def remove(self, addr): if addr in self.occupied: comms = self.occupied.pop(addr) for comm in comms: - comm.close() + IOLoop.current().add_callback(comm.close) if self.open < self.limit: self.event.set() From ec3f77ff8113619ef1bb7abfa4c73ab9eedb06f8 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 9 May 2019 16:36:07 -0700 Subject: [PATCH 47/68] set UCX_TLS default environment variable --- distributed/comm/ucx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 0f4f5bf5c86..8ac39034064 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -22,6 +22,7 @@ os.environ.setdefault("UCX_RNDV_SCHEME", "put_zcopy") os.environ.setdefault("UCX_MEMTYPE_CACHE", "n") 
+os.environ.setdefault("UCX_TLS", "rc,cuda_copy") logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 From b21a954c46b827d9152ff4d83479f8000dc0d0b2 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 17 May 2019 07:23:22 -0700 Subject: [PATCH 48/68] await endpoint --- distributed/comm/ucx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 8ac39034064..a5a15a67db0 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -239,7 +239,7 @@ async def connect(self, address: str, deserialize=True, **connection_args) -> UC logger.debug("UCXConnector.connect: %s", address) _ucp_init() ip, port = _parse_host_port(address) - ep = ucp.get_endpoint(ip.encode(), port) + ep = await ucp.get_endpoint(ip.encode(), port) return self.comm_class( ep, self.prefix + address, listener_instance=None, deserialize=deserialize ) From b319a37fec40137be07e82953b615a53d75fdc70 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 23 May 2019 07:06:48 -0700 Subject: [PATCH 49/68] Add --protocol flag to CLI --- distributed/cli/dask_scheduler.py | 8 ++++++++ distributed/cli/dask_worker.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/distributed/cli/dask_scheduler.py b/distributed/cli/dask_scheduler.py index 3668be684d0..93eea043312 100755 --- a/distributed/cli/dask_scheduler.py +++ b/distributed/cli/dask_scheduler.py @@ -39,6 +39,12 @@ default=None, help="Preferred network interface like 'eth0' or 'ib0'", ) +@click.option( + "--protocol", + type=str, + default=None, + help="Protocol like tcp, tls, or ucx", +) @click.option( "--tls-ca-file", type=pem_file_option_type, @@ -126,6 +132,7 @@ def main( pid_file, scheduler_file, interface, + protocol, local_directory, preload, preload_argv, @@ -195,6 +202,7 @@ def del_pid_file(): host=host, port=port, interface=interface, + protocol=protocol, dashboard_address=dashboard_address if _bokeh else None, service_kwargs={"bokeh": {"prefix": 
bokeh_prefix}}, ) diff --git a/distributed/cli/dask_worker.py b/distributed/cli/dask_worker.py index 439bdaf4a62..0cbcb45e1e3 100755 --- a/distributed/cli/dask_worker.py +++ b/distributed/cli/dask_worker.py @@ -104,6 +104,12 @@ @click.option( "--interface", type=str, default=None, help="Network interface like 'eth0' or 'ib0'" ) +@click.option( + "--protocol", + type=str, + default=None, + help="Protocol like tcp, tls, or ucx", +) @click.option("--nthreads", type=int, default=0, help="Number of threads per process.") @click.option( "--nprocs", @@ -195,6 +201,7 @@ def main( local_directory, scheduler_file, interface, + protocol, death_timeout, preload, preload_argv, @@ -336,6 +343,7 @@ def del_pid_file(): security=sec, contact_address=contact_address, interface=interface, + protocol=protocol, host=host, port=port, dashboard_address=dashboard_address if bokeh else None, From 9ba163f438901c52ee3cb82527747e0bf09d1489 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 09:22:45 -0700 Subject: [PATCH 50/68] Create separate cuda serialization family Now we can still move GPU objects on non-UCX comms --- distributed/comm/tests/test_ucx.py | 5 +- distributed/comm/ucx.py | 6 +-- distributed/protocol/__init__.py | 13 ++--- distributed/protocol/cuda.py | 71 +++++++++++-------------- distributed/protocol/cudf.py | 6 +-- distributed/protocol/cupy.py | 6 +-- distributed/protocol/numba.py | 6 +-- distributed/protocol/tests/test_cupy.py | 13 +++++ 8 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 distributed/protocol/tests/test_cupy.py diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 9c6e072e791..1f809a24b77 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -238,6 +238,7 @@ async def test_ping_pong_numba(): assert result["op"] == "ping" +@pytest.mark.skip(reason="hangs") @pytest.mark.parametrize("processes", [True, False]) def test_ucx_localcluster(loop, 
processes): if processes: @@ -261,7 +262,7 @@ def test_ucx_localcluster(loop, processes): x.result() assert x.key in cluster.scheduler.tasks if not processes: - assert any(w.data == {x.key: 2} for w in cluster.workers) + assert any(w.data == {x.key: 2} for w in cluster.workers.values()) assert len(cluster.scheduler.workers) == 2 @@ -279,7 +280,7 @@ def test_tcp_localcluster(loop): silence_logs=False, env=env, ) as cluster: - print(cluster.scheduler.workers) + pass # with Client(cluster) as e: # x = e.submit(inc, 1) # x.result() diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index a5a15a67db0..fd9f39d74c9 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -152,11 +152,11 @@ def peer_address(self) -> str: # We need the port? Or the tag? return self.address - async def write(self, msg: dict, serializers=None, on_error: str = "message"): + async def write(self, msg: dict, serializers=('cuda', 'dask', 'pickle', 'error'), on_error: str = "message"): # msg can also be a list of dicts when sending batched messages frames = await to_frames( msg, serializers=serializers, on_error=on_error - ) # TODO: context= + ) gpu_frames = b"".join( [ struct.pack("?", hasattr(frame, "__cuda_array_interface__")) @@ -175,7 +175,7 @@ async def write(self, msg: dict, serializers=None, on_error: str = "message"): await self.ep.send_obj(frame) return sum(map(nbytes, frames)) - async def read(self, deserializers=None): + async def read(self, deserializers=('cuda', 'dask', 'pickle', 'error')): resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) nframes, = struct.unpack("Q", obj[:8]) # first eight bytes for number of frames diff --git a/distributed/protocol/__init__.py b/distributed/protocol/__init__.py index ef459b14fb1..3f98436f4b9 100644 --- a/distributed/protocol/__init__.py +++ b/distributed/protocol/__init__.py @@ -4,6 +4,7 @@ from .compression import compressions, default_compression from .core import dumps, loads, maybe_compress, 
decompress, msgpack +from .cuda import cuda_serialize, cuda_deserialize from .serialize import ( serialize, deserialize, @@ -68,19 +69,19 @@ def _register_torch(): from . import torch -@dask_serialize.register_lazy("cupy") -@dask_deserialize.register_lazy("cupy") +@cuda_serialize.register_lazy("cupy") +@cuda_deserialize.register_lazy("cupy") def _register_cupy(): from . import cupy -@dask_serialize.register_lazy("numba") -@dask_deserialize.register_lazy("numba") +@cuda_serialize.register_lazy("numba") +@cuda_deserialize.register_lazy("numba") def _register_numba(): from . import numba -@dask_serialize.register_lazy("cudf") -@dask_deserialize.register_lazy("cudf") +@cuda_serialize.register_lazy("cudf") +@cuda_deserialize.register_lazy("cudf") def _register_cudf(): from . import cudf diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index ba435f897f7..ff681ee046d 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -1,39 +1,32 @@ -""" -Efficient serialization GPU arrays. -""" -import cupy -from .serialize import dask_serialize, dask_deserialize - - -@dask_serialize.register(cupy.ndarray) -def serialize_cupy_ndarray(x): - # TODO: handle non-contiguous - # TODO: Handle order='K' ravel - # TODO: 0d - - if x.flags.c_contiguous or x.flags.f_contiguous: - strides = x.strides - data = x.ravel() # order='K' - else: - x = cupy.ascontiguousarray(x) - strides = x.strides - data = x.ravel() - - dtype = (0, x.dtype.str) - - header = x.__cuda_array_interface__.copy() - header["lengths"] = (x.nbytes,) # one per stride - header["compression"] = (None,) # TODO - header["is_cuda"] = 1 - header["dtype"] = dtype - return header, [data] - - -@dask_deserialize.register(cupy.ndarray) -def deserialize_cupy_array(header, frames): - frame, = frames - # TODO: put this in ucx... as a kind of "fixup" - frame.typestr = header["typestr"] - frame.shape = header["shape"] - arr = cupy.asarray(frame) - return arr +import dask + +from . 
import pickle +from .serialize import register_serialization_family +from dask.utils import typename + +cuda_serialize = dask.utils.Dispatch("cuda_serialize") +cuda_deserialize = dask.utils.Dispatch("cuda_deserialize") + + +def cuda_dumps(x): + type_name = typename(type(x)) + try: + dumps = cuda_serialize.dispatch(type(x)) + except TypeError: + raise NotImplementedError(type_name) + + header, frames = dumps(x) + + header["type"] = type_name + header["type-serialized"] = pickle.dumps(type(x)) + header["serializer"] = "cuda" + return header, frames + + +def cuda_loads(header, frames): + typ = pickle.loads(header["type-serialized"]) + loads = cuda_deserialize.dispatch(typ) + return loads(header, frames) + + +register_serialization_family("cuda", cuda_dumps, cuda_loads) diff --git a/distributed/protocol/cudf.py b/distributed/protocol/cudf.py index cf3a172044f..018596b1560 100644 --- a/distributed/protocol/cudf.py +++ b/distributed/protocol/cudf.py @@ -1,5 +1,5 @@ import cudf -from .serialize import dask_serialize, dask_deserialize +from .cuda import cuda_serialize, cuda_deserialize from .numba import serialize_numba_ndarray, deserialize_numba_ndarray @@ -11,7 +11,7 @@ # 3. Serialize the index -@dask_serialize.register(cudf.DataFrame) +@cuda_serialize.register(cudf.DataFrame) def serialize_cudf_dataframe(x): sub_headers = [] arrays = [] @@ -46,7 +46,7 @@ def serialize_cudf_dataframe(x): return header, arrays -@dask_deserialize.register(cudf.DataFrame) +@cuda_deserialize.register(cudf.DataFrame) def serialize_cudf_dataframe(header, frames): columns = header["columns"] n_columns = len(header["columns"]) diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py index 745de450bb2..e51f0b11880 100644 --- a/distributed/protocol/cupy.py +++ b/distributed/protocol/cupy.py @@ -2,10 +2,10 @@ Efficient serialization GPU arrays. 
""" import cupy -from .serialize import dask_serialize, dask_deserialize +from .cuda import cuda_serialize, cuda_deserialize -@dask_serialize.register(cupy.ndarray) +@cuda_serialize.register(cupy.ndarray) def serialize_cupy_ndarray(x): # TODO: handle non-contiguous # TODO: Handle order='K' ravel @@ -30,7 +30,7 @@ def serialize_cupy_ndarray(x): return header, [data] -@dask_deserialize.register(cupy.ndarray) +@cuda_deserialize.register(cupy.ndarray) def deserialize_cupy_array(header, frames): frame, = frames # TODO: put this in ucx... as a kind of "fixup" diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index 6c3f9da4d83..ce8baee7cc5 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -1,8 +1,8 @@ import numba.cuda -from .serialize import dask_serialize, dask_deserialize +from .cuda import cuda_serialize, cuda_deserialize -@dask_serialize.register(numba.cuda.devicearray.DeviceNDArray) +@cuda_serialize.register(numba.cuda.devicearray.DeviceNDArray) def serialize_numba_ndarray(x): # TODO: handle non-contiguous # TODO: handle 2d @@ -35,7 +35,7 @@ def serialize_numba_ndarray(x): return header, [data] -@dask_deserialize.register(numba.cuda.devicearray.DeviceNDArray) +@cuda_deserialize.register(numba.cuda.devicearray.DeviceNDArray) def deserialize_numba_ndarray(header, frames): frame, = frames # TODO: put this in ucx... 
as a kind of "fixup" diff --git a/distributed/protocol/tests/test_cupy.py b/distributed/protocol/tests/test_cupy.py new file mode 100644 index 00000000000..3ea8e9d9239 --- /dev/null +++ b/distributed/protocol/tests/test_cupy.py @@ -0,0 +1,13 @@ +from distributed.protocol import serialize, deserialize +import pytest + +cupy = pytest.importorskip('cupy') + +def test_serialize_cupy(): + x = cupy.arange(100) + header, frames = serialize(x, serializers=('cuda', 'dask', 'pickle')) + y = deserialize(header, frames, deserializers=('cuda', 'dask', 'pickle', + 'error')) + + assert (x == y).all() + From 2b2b459948b5da88790967519eae0978d5a9032c Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 12:54:03 -0700 Subject: [PATCH 51/68] workers -> workers.values() --- distributed/deploy/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/deploy/local.py b/distributed/deploy/local.py index e5a73c05071..41390b2661c 100644 --- a/distributed/deploy/local.py +++ b/distributed/deploy/local.py @@ -200,7 +200,7 @@ def __repr__(self): return "LocalCluster(%r, workers=%d, ncores=%d)" % ( self.scheduler_address, len(self.workers), - sum(w.ncores for w in self.workers), + sum(w.ncores for w in self.workers.values()), ) From 7532265ec26c27a51d5af67533450d4aebe762a6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 12:55:12 -0700 Subject: [PATCH 52/68] lint --- distributed/cli/dask_scheduler.py | 5 +---- distributed/cli/dask_worker.py | 5 +---- distributed/comm/tests/test_ucx.py | 4 ++-- distributed/comm/ucx.py | 17 +++++++++++------ distributed/protocol/tests/test_cupy.py | 9 ++++----- 5 files changed, 19 insertions(+), 21 deletions(-) diff --git a/distributed/cli/dask_scheduler.py b/distributed/cli/dask_scheduler.py index 93eea043312..cd7b57f4a29 100755 --- a/distributed/cli/dask_scheduler.py +++ b/distributed/cli/dask_scheduler.py @@ -40,10 +40,7 @@ help="Preferred network interface like 'eth0' or 'ib0'", ) 
@click.option( - "--protocol", - type=str, - default=None, - help="Protocol like tcp, tls, or ucx", + "--protocol", type=str, default=None, help="Protocol like tcp, tls, or ucx" ) @click.option( "--tls-ca-file", diff --git a/distributed/cli/dask_worker.py b/distributed/cli/dask_worker.py index 0cbcb45e1e3..2eefe7cd9df 100755 --- a/distributed/cli/dask_worker.py +++ b/distributed/cli/dask_worker.py @@ -105,10 +105,7 @@ "--interface", type=str, default=None, help="Network interface like 'eth0' or 'ib0'" ) @click.option( - "--protocol", - type=str, - default=None, - help="Protocol like tcp, tls, or ucx", + "--protocol", type=str, default=None, help="Protocol like tcp, tls, or ucx" ) @click.option("--nthreads", type=int, default=0, help="Number of threads per process.") @click.option( diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 1f809a24b77..f821fbc42b2 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -8,10 +8,10 @@ from distributed import Client from distributed.comm import ucx, listen, connect from distributed.comm.registry import backends, get_backend -from distributed.comm import ucx, parse_address, parse_host_port +from distributed.comm import ucx, parse_address from distributed.protocol import to_serialize from distributed.deploy.local import LocalCluster -from distributed.utils_test import gen_test, loop, inc +from distributed.utils_test import gen_test, loop, inc # noqa: 401 from .test_comms import check_deserialize diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index fd9f39d74c9..568d9a9b7a8 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -152,11 +152,14 @@ def peer_address(self) -> str: # We need the port? Or the tag? 
return self.address - async def write(self, msg: dict, serializers=('cuda', 'dask', 'pickle', 'error'), on_error: str = "message"): + async def write( + self, + msg: dict, + serializers=("cuda", "dask", "pickle", "error"), + on_error: str = "message", + ): # msg can also be a list of dicts when sending batched messages - frames = await to_frames( - msg, serializers=serializers, on_error=on_error - ) + frames = await to_frames(msg, serializers=serializers, on_error=on_error) gpu_frames = b"".join( [ struct.pack("?", hasattr(frame, "__cuda_array_interface__")) @@ -175,12 +178,14 @@ async def write(self, msg: dict, serializers=('cuda', 'dask', 'pickle', 'error') await self.ep.send_obj(frame) return sum(map(nbytes, frames)) - async def read(self, deserializers=('cuda', 'dask', 'pickle', 'error')): + async def read(self, deserializers=("cuda", "dask", "pickle", "error")): resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) nframes, = struct.unpack("Q", obj[:8]) # first eight bytes for number of frames - gpu_frame_msg = obj[8 : 8 + nframes] # next nframes bytes for if they're GPU frames + gpu_frame_msg = obj[ + 8 : 8 + nframes + ] # next nframes bytes for if they're GPU frames is_gpus = struct.unpack("{}?".format(nframes), gpu_frame_msg) sized_frame_msg = obj[8 + nframes :] # then the rest for frame sizes diff --git a/distributed/protocol/tests/test_cupy.py b/distributed/protocol/tests/test_cupy.py index 3ea8e9d9239..26940597f81 100644 --- a/distributed/protocol/tests/test_cupy.py +++ b/distributed/protocol/tests/test_cupy.py @@ -1,13 +1,12 @@ from distributed.protocol import serialize, deserialize import pytest -cupy = pytest.importorskip('cupy') +cupy = pytest.importorskip("cupy") + def test_serialize_cupy(): x = cupy.arange(100) - header, frames = serialize(x, serializers=('cuda', 'dask', 'pickle')) - y = deserialize(header, frames, deserializers=('cuda', 'dask', 'pickle', - 'error')) + header, frames = serialize(x, serializers=("cuda", "dask", 
"pickle")) + y = deserialize(header, frames, deserializers=("cuda", "dask", "pickle", "error")) assert (x == y).all() - From 8babb8e7b603e3d40c9005c2270f81859dc0002f Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 14:06:29 -0700 Subject: [PATCH 53/68] allow cupy arrays as frames (perhaps not the best idea, but it allows a sensible test until we have a device-memoryview) --- distributed/protocol/cupy.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py index e51f0b11880..63324e8a280 100644 --- a/distributed/protocol/cupy.py +++ b/distributed/protocol/cupy.py @@ -34,7 +34,10 @@ def serialize_cupy_ndarray(x): def deserialize_cupy_array(header, frames): frame, = frames # TODO: put this in ucx... as a kind of "fixup" - frame.typestr = header["typestr"] - frame.shape = header["shape"] + try: + frame.typestr = header["typestr"] + frame.shape = header["shape"] + except AttributeError: + pass arr = cupy.asarray(frame) return arr From 6696466fa809d313c6426655ba6086f20cfde2f2 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 14:07:02 -0700 Subject: [PATCH 54/68] numba fixed an issue upstream. remove a hack --- distributed/utils.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/distributed/utils.py b/distributed/utils.py index cecd6d0f47f..55508a4c574 100644 --- a/distributed/utils.py +++ b/distributed/utils.py @@ -1316,18 +1316,7 @@ def nbytes(frame, _bytes_like=(bytes, bytearray)): try: return frame.nbytes except AttributeError: - try: - # TODO: https://github.com/numba/numba/issues/3810 - # numba DeviceNDArary doesn't implement .nbytes - # remove once that's fixed. - return frame.dtype.itemsize * frame.size - except AttributeError: - # XXX: nbytes fails for MemoryPointer. 
- # Probably time to move away - try: - return len(frame) - except TypeError: - return sys.getsizeof(frame) + return len(frame) def PeriodicCallback(callback, callback_time, io_loop=None): From 23d0b0dff1839f8264bb7c11ab1b2e614bb19912 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 14:25:17 -0700 Subject: [PATCH 55/68] Don't special-case MemoryPointer objects Lets just not split them into pieces in the first place hopefully? --- distributed/protocol/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/distributed/protocol/utils.py b/distributed/protocol/utils.py index 61dc94ee6af..caf4bb8833b 100644 --- a/distributed/protocol/utils.py +++ b/distributed/protocol/utils.py @@ -68,10 +68,6 @@ def merge_frames(header, frames): if not frames: return frames - if any(x.__class__.__name__ == "MemoryPointer" for x in frames): - # XXX - return frames - assert sum(lengths) == sum(map(nbytes, frames)) if all(len(f) == l for f, l in zip(frames, lengths)): @@ -94,7 +90,10 @@ def merge_frames(header, frames): L.append(mv[:l]) frames.append(mv[l:]) l = 0 - out.append(b"".join(map(ensure_bytes, L))) + if len(L) == 1: # no work necessary + out.extend(L) + else: + out.append(b"".join(map(ensure_bytes, L))) return out From 4c94e8d18d8ca543033785262855259ec70c6637 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 14:40:48 -0700 Subject: [PATCH 56/68] Handle compression in dask_dumps, not sub serializers --- distributed/comm/tests/test_ucx.py | 19 +++++++++++++++++++ distributed/protocol/cuda.py | 1 + distributed/protocol/cupy.py | 1 - distributed/protocol/numba.py | 1 - 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index f821fbc42b2..5c3879c54b3 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -218,6 +218,25 @@ async def test_ping_pong_cupy(shape): await serv_com.close() 
+@pytest.mark.slow +@pytest.mark.asyncio +async def test_large_cupy(): + cupy = pytest.importorskip("cupy") + address = "ucx://{}:{}".format(HOST, next(port_counter)) + com, serv_com = await get_comm_pair(address) + + arr = cupy.ones(1000000000, dtype='u1') + msg = {"op": "ping", "data": to_serialize(arr)} + + _, result = await asyncio.gather(com.write(msg), serv_com.read()) + data2 = result.pop("data") + + assert result["op"] == "ping" + assert len(data2) == len(arr) + await com.close() + await serv_com.close() + + @pytest.mark.asyncio async def test_ping_pong_numba(): np = pytest.importorskip("numpy") diff --git a/distributed/protocol/cuda.py b/distributed/protocol/cuda.py index ff681ee046d..13be1d75bb8 100644 --- a/distributed/protocol/cuda.py +++ b/distributed/protocol/cuda.py @@ -20,6 +20,7 @@ def cuda_dumps(x): header["type"] = type_name header["type-serialized"] = pickle.dumps(type(x)) header["serializer"] = "cuda" + header["compression"] = (None,) * len(frames) # no compression for gpu data return header, frames diff --git a/distributed/protocol/cupy.py b/distributed/protocol/cupy.py index 63324e8a280..13c0348a821 100644 --- a/distributed/protocol/cupy.py +++ b/distributed/protocol/cupy.py @@ -24,7 +24,6 @@ def serialize_cupy_ndarray(x): # used in the ucx comms for gpu/cpu message passing # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header["compression"] = (None,) # TODO header["is_cuda"] = 1 header["dtype"] = dtype return header, [data] diff --git a/distributed/protocol/numba.py b/distributed/protocol/numba.py index ce8baee7cc5..18405ffebe0 100644 --- a/distributed/protocol/numba.py +++ b/distributed/protocol/numba.py @@ -29,7 +29,6 @@ def serialize_numba_ndarray(x): # used in the ucx comms for gpu/cpu message passing # 'lengths' set by dask header = x.__cuda_array_interface__.copy() - header["compression"] = (None,) # TODO header["is_cuda"] = 1 header["dtype"] = dtype return header, [data] From 
5e4940930bbc1ff3bdf9794a800bf78adbb474c3 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 14:47:57 -0700 Subject: [PATCH 57/68] add failing test for large messages in UCX --- distributed/comm/tests/test_ucx.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5c3879c54b3..0f7bbc2dda1 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -220,12 +220,16 @@ async def test_ping_pong_cupy(shape): @pytest.mark.slow @pytest.mark.asyncio -async def test_large_cupy(): +@pytest.mark.parametrize("n", [ + 1_000_000_000, + pytest.param(2_500_000_000, marks=[pytest.mark.xfail(reason='integer type in ucx-py')]), +]) +async def test_large_cupy(n): cupy = pytest.importorskip("cupy") address = "ucx://{}:{}".format(HOST, next(port_counter)) com, serv_com = await get_comm_pair(address) - arr = cupy.ones(1000000000, dtype='u1') + arr = cupy.ones(n, dtype='u1') msg = {"op": "ping", "data": to_serialize(arr)} _, result = await asyncio.gather(com.write(msg), serv_com.read()) From 1457011f2d217b67c29d5165e57be41e39b22699 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 27 May 2019 15:08:46 -0700 Subject: [PATCH 58/68] cleanup close comments --- distributed/core.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/distributed/core.py b/distributed/core.py index 1bc7cf83c48..3133bb58b10 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -487,9 +487,7 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): pdb.set_trace() raise finally: - yield comm.close() # TODO: why do we need this now? - # ^ Good question :) comm.close can be a - # coroutine, in which case this isn't doing anything. 
+ yield comm.close() assert comm.closed() @gen.coroutine @@ -907,7 +905,6 @@ def collect(self): for addr, comms in self.available.items(): for comm in comms: IOLoop.current().add_callback(comm.close) - # comm.close() comms.clear() if self.open < self.limit: self.event.set() @@ -920,7 +917,6 @@ def remove(self, addr): if addr in self.available: comms = self.available.pop(addr) for comm in comms: - # comm.close() IOLoop.current().add_callback(comm.close) if addr in self.occupied: comms = self.occupied.pop(addr) From d89e654c171714ec92bc6a2b0c0e642f0369ca03 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 28 May 2019 10:07:42 -0700 Subject: [PATCH 59/68] Cleanup UCX comm --- distributed/comm/addressing.py | 4 +- distributed/comm/tests/test_ucx.py | 48 +++-------- distributed/comm/ucx.py | 123 ++++++++--------------------- 3 files changed, 51 insertions(+), 124 deletions(-) diff --git a/distributed/comm/addressing.py b/distributed/comm/addressing.py index 3d79befe0f1..f134b5bb258 100644 --- a/distributed/comm/addressing.py +++ b/distributed/comm/addressing.py @@ -72,6 +72,8 @@ def _default(): raise ValueError("missing port number in address %r" % (address,)) return default_port + if '://' in address: + _, address = address.split('://') if address.startswith("["): # IPv6 notation: '[addr]:port' or '[addr]'. # The address may contain multiple colons. @@ -101,7 +103,7 @@ def unparse_host_port(host, port=None): """ if ":" in host and not host.startswith("["): host = "[%s]" % host - if port: + if port is not None: return "%s:%s" % (host, port) else: return host diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 0f7bbc2dda1..8582ea56ac3 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -18,20 +18,6 @@ HOST = ucp.get_address() -# Currently having some issues with re-using ports. -# Tests just hang. Still debugging. 
-port_counter = itertools.count(13337) - - -def test_parse_address(): - result = ucx._parse_address("ucx://10.33.225.160") - assert result == ("ucx", "10.33.225.160") - - -def test_parse_host_port(): - assert ucx._parse_host_port("10.33.225.160:13337") == ("10.33.225.160", 13337) - assert ucx._parse_host_port("10.33.225.160:13338") == ("10.33.225.160", 13338) - def test_registered(): assert "ucx" in backends @@ -39,7 +25,7 @@ def test_registered(): assert isinstance(backend, ucx.UCXBackend) -async def get_comm_pair(listen_addr, listen_args=None, connect_args=None, **kwargs): +async def get_comm_pair(listen_addr='ucx://' + HOST, listen_args=None, connect_args=None, **kwargs): q = asyncio.queues.Queue() async def handle_comm(comm): @@ -61,8 +47,7 @@ async def handle_comm(comm): @pytest.mark.asyncio async def test_ping_pong(): - address = "ucx://{}:{}".format(HOST, next(port_counter)) - com, serv_com = await get_comm_pair(address) + com, serv_com = await get_comm_pair() msg = {"op": "ping"} await com.write(msg) result = await serv_com.read() @@ -80,17 +65,16 @@ async def test_ping_pong(): @pytest.mark.asyncio async def test_comm_objs(): - address = "ucx://{}:{}".format(HOST, next(port_counter)) - comm, serv_com = await get_comm_pair(address) + comm, serv_comm = await get_comm_pair() - assert comm.peer_address == address scheme, loc = parse_address(comm.peer_address) assert scheme == "ucx" - assert comm.peer_address == address - scheme, loc = parse_address(serv_com.peer_address) + scheme, loc = parse_address(serv_comm.peer_address) assert scheme == "ucx" + assert comm.peer_address == serv_comm.local_address + def test_ucx_specific(): """ @@ -102,7 +86,7 @@ def test_ucx_specific(): # 3. Test peer_address # 4. 
Test cleanup async def f(): - address = "ucx://{}:{}".format(HOST, next(port_counter)) + address = "ucx://{}:{}".format(HOST, 0) async def handle_comm(comm): # XXX: failures here don't fail the build yet @@ -156,9 +140,8 @@ async def test_ping_pong_data(): np = pytest.importorskip("numpy") data = np.ones((10, 10)) - # TODO: broken for large arrays - address = "ucx://{}:{}".format(HOST, next(port_counter)) - com, serv_com = await get_comm_pair(address) + + com, serv_com = await get_comm_pair() msg = {"op": "ping", "data": to_serialize(data)} await com.write(msg) result = await serv_com.read() @@ -188,9 +171,8 @@ async def test_ping_pong_cudf(): cudf = pytest.importorskip("cudf") df = cudf.DataFrame({"A": [1, 2, None], "B": [1.0, 2.0, None]}) - address = "ucx://{}:{}".format(HOST, next(port_counter)) - com, serv_com = await get_comm_pair(address) + com, serv_com = await get_comm_pair() msg = {"op": "ping", "data": to_serialize(df)} await com.write(msg) @@ -203,8 +185,7 @@ async def test_ping_pong_cudf(): @pytest.mark.parametrize("shape", [(100,), (10, 10), (4947,)]) async def test_ping_pong_cupy(shape): cupy = pytest.importorskip("cupy") - address = "ucx://{}:{}".format(HOST, next(port_counter)) - com, serv_com = await get_comm_pair(address) + com, serv_com = await get_comm_pair() arr = cupy.random.random(shape) msg = {"op": "ping", "data": to_serialize(arr)} @@ -226,8 +207,7 @@ async def test_ping_pong_cupy(shape): ]) async def test_large_cupy(n): cupy = pytest.importorskip("cupy") - address = "ucx://{}:{}".format(HOST, next(port_counter)) - com, serv_com = await get_comm_pair(address) + com, serv_com = await get_comm_pair() arr = cupy.ones(n, dtype='u1') msg = {"op": "ping", "data": to_serialize(arr)} @@ -247,12 +227,10 @@ async def test_ping_pong_numba(): numba = pytest.importorskip("numba") import numba.cuda - address = "ucx://{}:{}".format(HOST, next(port_counter)) - arr = np.arange(10) arr = numba.cuda.to_device(arr) - com, serv_com = await 
get_comm_pair(address) + com, serv_com = await get_comm_pair() msg = {"op": "ping", "data": to_serialize(arr)} await com.write(msg) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 568d9a9b7a8..e78fc1f4670 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -26,11 +26,6 @@ logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 -PORT = 13337 - -# set in ~/.dask/config.yaml -# or DASK_DISTRIBUTED__COMM__UCXADDRESS -_PORT_COUNTER = itertools.count(PORT) _INITIALIZED = False @@ -43,50 +38,6 @@ def _ucp_init(): _INITIALIZED = True -# ---------------------------------------------------------------------------- -# Addressing -# TODO: Parts of these should probably be moved to `comm/addressing.py` -# ---------------------------------------------------------------------------- - - -def _parse_address(addr: str, strict=False) -> tuple: - """ - >>> _parse_address("ucx://10.33.225.160") - """ - if not addr.startswith("ucx://"): - raise ValueError("Invalid url scheme {}".format(addr)) - - proto, address = addr.split("://", 1) - return proto, address - - -def _parse_host_port(address: str, default_port=None) -> tuple: - """ - Parse an endpoint address given in the form "host:port". 
- - >>> _parse_host_port("10.33.225.160:13337") - ("10.33.225.160", 13337) - """ - if address.startswith("ucx://"): - _, address = _parse_address(address) - - # if default port is None we select the next port availabe - # ucx-py does not currently support random port assignment - import random - - default_port = default_port or random.randint(1024, 65000) - return parse_host_port(address, default_port=default_port) - - -def _unparse_host_port(host, port=None): - return unparse_host_port(host, port) - - -def get_endpoint_address(endpoint): - # TODO: ucx-py: 18 - pass - - # ---------------------------------------------------------------------------- # Comm Interface # ---------------------------------------------------------------------------- @@ -127,20 +78,18 @@ class UCX(Comm): """ def __init__( - self, ep: ucp.Endpoint, address: str, listener_instance, deserialize=True + self, ep: ucp.Endpoint, local_addr: str, peer_addr: str, deserialize=True ): - logger.debug("UCX.__init__ %s %s", address, listener_instance) + Comm.__init__(self) self._ep = ep - assert address.startswith("ucx") - self.address = address - self.listener_instance = listener_instance - default_port = next(_PORT_COUNTER) - self._host, self._port = _parse_host_port(address, default_port) - self._local_addr = None + if local_addr: + assert local_addr.startswith("ucx") + assert peer_addr.startswith("ucx") + self._local_addr = local_addr + self._peer_addr = peer_addr self.deserialize = deserialize self.comm_flag = None - - # finalizer? + logger.debug("UCX.__init__ %s", self) @property def local_address(self) -> str: @@ -148,9 +97,7 @@ def local_address(self) -> str: @property def peer_address(self) -> str: - # XXX: This isn't quite for the server (from UCXListener). - # We need the port? Or the tag? 
- return self.address + return self._peer_addr async def write( self, @@ -193,9 +140,9 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): frames = [] - for i, (is_gpus, size) in enumerate(zip(is_gpus, sizes)): + for i, (is_gpu, size) in enumerate(zip(is_gpus, sizes)): if size > 0: - resp = await self.ep.recv_obj(size, cuda=is_gpus) + resp = await self.ep.recv_obj(size, cuda=is_gpu) else: resp = await self.ep.recv_future() frame = ucp.get_obj_from_msg(resp) @@ -208,14 +155,10 @@ async def read(self, deserializers=("cuda", "dask", "pickle", "error")): return msg def abort(self): - # breakpoint() if self._ep: ucp.destroy_ep(self._ep) logger.debug("Destroyed UCX endpoint") self._ep = None - # if self.listener_instance: - # ucp.stop_listener(self.listener_instance) - # self.listener_instance = None @property def ep(self): @@ -238,16 +181,13 @@ class UCXConnector(Connector): comm_class = UCX encrypted = False - client = ... # TODO: add a client here? - async def connect(self, address: str, deserialize=True, **connection_args) -> UCX: logger.debug("UCXConnector.connect: %s", address) _ucp_init() - ip, port = _parse_host_port(address) + ip, port = parse_host_port(address) ep = await ucp.get_endpoint(ip.encode(), port) - return self.comm_class( - ep, self.prefix + address, listener_instance=None, deserialize=deserialize - ) + return self.comm_class( ep, local_addr=None, peer_addr=self.prefix + + address, deserialize=deserialize) class UCXListener(Listener): @@ -259,15 +199,14 @@ class UCXListener(Listener): def __init__( self, address: str, comm_handler: None, deserialize=False, **connection_args ): - logger.debug("UCXListener.__init__") if not address.startswith("ucx"): address = "ucx://" + address - self.address = address - self.ip, self.port = _parse_host_port(address, default_port=0) + self.ip, self._input_port = parse_host_port(address, default_port=0) self.comm_handler = comm_handler self.deserialize = deserialize self._ep = None # type: 
ucp.Endpoint self.listener_instance = None # type: ucp.ListenerFuture + self.ucp_server = None self._task = None # XXX: The init may be required to take args like @@ -275,21 +214,29 @@ def __init__( self.connection_args = connection_args self._task = None + @property + def port(self): + return self.ucp_server.port + + @property + def address(self): + return 'ucx://' + self.ip + ':' + str(self.port) + def start(self): async def serve_forever(client_ep, listener_instance): ucx = UCX( - client_ep, self.address, listener_instance, deserialize=self.deserialize + client_ep, + local_addr=self.address, + peer_addr=self.address, # TODO: https://github.com/Akshay-Venkatesh/ucx-py/issues/111 + deserialize=self.deserialize ) self.listener_instance = listener_instance if self.comm_handler: await self.comm_handler(ucx) _ucp_init() - # XXX: the port handling is probably incorrect. - # need to figure out if `server_port=None` is - # server_port=13337, or server_port="next free port" - server = ucp.start_listener( - serve_forever, listener_port=self.port, is_coroutine=True + self.ucp_server = ucp.start_listener( + serve_forever, listener_port=self._input_port, is_coroutine=True ) try: @@ -297,7 +244,7 @@ async def serve_forever(client_ep, listener_instance): except (RuntimeError, AttributeError): loop = asyncio.get_event_loop() - t = loop.create_task(server.coroutine) + t = loop.create_task(self.ucp_server.coroutine) self._task = t def stop(self): @@ -316,7 +263,7 @@ def get_host_port(self): @property def listen_address(self): - return self.prefix + _unparse_host_port(*self.get_host_port()) + return self.prefix + unparse_host_port(*self.get_host_port()) @property def contact_address(self): @@ -344,14 +291,14 @@ def get_listener(self, loc, handle_comm, deserialize, **connection_args): # This duplicates BaseTCPBackend def get_address_host(self, loc): - return _parse_host_port(loc)[0] + return parse_host_port(loc)[0] def get_address_host_port(self, loc): - return _parse_host_port(loc) 
+ return parse_host_port(loc) def resolve_address(self, loc): host, port = parse_host_port(loc) - return _unparse_host_port(ensure_ip(host), port) + return unparse_host_port(ensure_ip(host), port) def get_local_address_for(self, loc): host, port = parse_host_port(loc) From 23e4b4ce4c16dece2625046e81bf50375b9501fb Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 09:24:29 -0700 Subject: [PATCH 60/68] add logging for the preload module --- distributed/preloading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/distributed/preloading.py b/distributed/preloading.py index 0f08f60f71c..a5e67c1611a 100644 --- a/distributed/preloading.py +++ b/distributed/preloading.py @@ -100,6 +100,7 @@ def _import_modules(names, file_dir=None): import_module(name) module = sys.modules[name] + logger.info("Import preload module: %s", name) result_modules[name] = { attrname: getattr(module, attrname, None) for attrname in ("dask_setup", "dask_teardown") @@ -137,6 +138,7 @@ def preload_modules(names, parameter=None, file_dir=None, argv=None): dask_setup.callback(parameter, *context.args, **context.params) else: dask_setup(parameter) + logger.info("Run preload setup function: %s", name) if interface["dask_teardown"]: atexit.register(interface["dask_teardown"], parameter) From a9478e4186d4bf0c70176ae01e0cd1dc8cb89916 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 09:40:43 -0700 Subject: [PATCH 61/68] Verify that we use cuda to serialize --- distributed/comm/ucx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index e78fc1f4670..09cf1fdad91 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -105,6 +105,8 @@ async def write( serializers=("cuda", "dask", "pickle", "error"), on_error: str = "message", ): + if serializers is None: + serializers = ("cuda", "dask", "pickle", "error") # msg can also be a list of dicts when sending batched messages frames = await 
to_frames(msg, serializers=serializers, on_error=on_error) gpu_frames = b"".join( @@ -126,6 +128,8 @@ async def write( return sum(map(nbytes, frames)) async def read(self, deserializers=("cuda", "dask", "pickle", "error")): + if deserializers is None: + deserializers = ("cuda", "dask", "pickle", "error") resp = await self.ep.recv_future() obj = ucp.get_obj_from_msg(resp) nframes, = struct.unpack("Q", obj[:8]) # first eight bytes for number of frames From 0a7105132ba9d45630dc9b1313397b8a9bc23cd7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 10:21:02 -0700 Subject: [PATCH 62/68] black --- distributed/comm/addressing.py | 4 ++-- distributed/comm/tests/test_ucx.py | 19 +++++++++++++------ distributed/comm/ucx.py | 14 +++++++++----- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/distributed/comm/addressing.py b/distributed/comm/addressing.py index f134b5bb258..d707adb84ac 100644 --- a/distributed/comm/addressing.py +++ b/distributed/comm/addressing.py @@ -72,8 +72,8 @@ def _default(): raise ValueError("missing port number in address %r" % (address,)) return default_port - if '://' in address: - _, address = address.split('://') + if "://" in address: + _, address = address.split("://") if address.startswith("["): # IPv6 notation: '[addr]:port' or '[addr]'. # The address may contain multiple colons. 
diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 8582ea56ac3..5aeb77ad992 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -25,7 +25,9 @@ def test_registered(): assert isinstance(backend, ucx.UCXBackend) -async def get_comm_pair(listen_addr='ucx://' + HOST, listen_args=None, connect_args=None, **kwargs): +async def get_comm_pair( + listen_addr="ucx://" + HOST, listen_args=None, connect_args=None, **kwargs +): q = asyncio.queues.Queue() async def handle_comm(comm): @@ -201,15 +203,20 @@ async def test_ping_pong_cupy(shape): @pytest.mark.slow @pytest.mark.asyncio -@pytest.mark.parametrize("n", [ - 1_000_000_000, - pytest.param(2_500_000_000, marks=[pytest.mark.xfail(reason='integer type in ucx-py')]), -]) +@pytest.mark.parametrize( + "n", + [ + 1_000_000_000, + pytest.param( + 2_500_000_000, marks=[pytest.mark.xfail(reason="integer type in ucx-py")] + ), + ], +) async def test_large_cupy(n): cupy = pytest.importorskip("cupy") com, serv_com = await get_comm_pair() - arr = cupy.ones(n, dtype='u1') + arr = cupy.ones(n, dtype="u1") msg = {"op": "ping", "data": to_serialize(arr)} _, result = await asyncio.gather(com.write(msg), serv_com.read()) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 09cf1fdad91..f5d9ceb2caa 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -78,7 +78,7 @@ class UCX(Comm): """ def __init__( - self, ep: ucp.Endpoint, local_addr: str, peer_addr: str, deserialize=True + self, ep: ucp.Endpoint, local_addr: str, peer_addr: str, deserialize=True ): Comm.__init__(self) self._ep = ep @@ -190,8 +190,12 @@ async def connect(self, address: str, deserialize=True, **connection_args) -> UC _ucp_init() ip, port = parse_host_port(address) ep = await ucp.get_endpoint(ip.encode(), port) - return self.comm_class( ep, local_addr=None, peer_addr=self.prefix + - address, deserialize=deserialize) + return self.comm_class( + ep, + local_addr=None, 
+ peer_addr=self.prefix + address, + deserialize=deserialize, + ) class UCXListener(Listener): @@ -224,7 +228,7 @@ def port(self): @property def address(self): - return 'ucx://' + self.ip + ':' + str(self.port) + return "ucx://" + self.ip + ":" + str(self.port) def start(self): async def serve_forever(client_ep, listener_instance): @@ -232,7 +236,7 @@ async def serve_forever(client_ep, listener_instance): client_ep, local_addr=self.address, peer_addr=self.address, # TODO: https://github.com/Akshay-Venkatesh/ucx-py/issues/111 - deserialize=self.deserialize + deserialize=self.deserialize, ) self.listener_instance = listener_instance if self.comm_handler: From 51d4c03345876df431764bbe1d2232bcf9eb3958 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 10:23:48 -0700 Subject: [PATCH 63/68] Fix test_comm test --- distributed/comm/tests/test_comms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/distributed/comm/tests/test_comms.py b/distributed/comm/tests/test_comms.py index 05f2d631626..e761deeab86 100644 --- a/distributed/comm/tests/test_comms.py +++ b/distributed/comm/tests/test_comms.py @@ -154,7 +154,6 @@ def test_unparse_host_port(): assert f("[::1]", 123) == "[::1]:123" assert f("127.0.0.1") == "127.0.0.1" - assert f("127.0.0.1", 0) == "127.0.0.1" assert f("127.0.0.1", None) == "127.0.0.1" assert f("127.0.0.1", "*") == "127.0.0.1:*" From 3a28ea0b810bb2b4c38a33081feafd5afd5cb2f2 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 10:25:43 -0700 Subject: [PATCH 64/68] flake8 --- distributed/comm/tests/test_ucx.py | 1 - distributed/comm/ucx.py | 1 - 2 files changed, 2 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5aeb77ad992..5417a8fd439 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -1,5 +1,4 @@ import asyncio -import itertools import pytest diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 
f5d9ceb2caa..53eb94c3cc9 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -6,7 +6,6 @@ .. _UCX: https://github.com/openucx/ucx """ import asyncio -import itertools import logging import struct From 3a0e25d4a5ce679f3d1b24a05ab6ecc0e108cc68 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 29 May 2019 11:48:53 -0700 Subject: [PATCH 65/68] py35 compat --- distributed/comm/tests/test_ucx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 5417a8fd439..30789a8b5b3 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -205,9 +205,9 @@ async def test_ping_pong_cupy(shape): @pytest.mark.parametrize( "n", [ - 1_000_000_000, + int(1e9), pytest.param( - 2_500_000_000, marks=[pytest.mark.xfail(reason="integer type in ucx-py")] + int(2.5e9), marks=[pytest.mark.xfail(reason="integer type in ucx-py")] ), ], ) From 90f8eebafde36b203711d16405b9a67092722824 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 30 May 2019 07:09:56 -0700 Subject: [PATCH 66/68] _ucp_init -> ucp.init --- distributed/comm/ucx.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 53eb94c3cc9..b87b6f0293c 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -1,7 +1,7 @@ """ :ref:`UCX`_ based communications for distributed. -See :ref:`communcations` for more. +See :ref:`communications` for more. .. 
_UCX: https://github.com/openucx/ucx """ @@ -26,16 +26,6 @@ logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 -_INITIALIZED = False - - -def _ucp_init(): - global _INITIALIZED - - if not _INITIALIZED: - ucp.init() - _INITIALIZED = True - # ---------------------------------------------------------------------------- # Comm Interface @@ -186,7 +176,7 @@ class UCXConnector(Connector): async def connect(self, address: str, deserialize=True, **connection_args) -> UCX: logger.debug("UCXConnector.connect: %s", address) - _ucp_init() + ucp.init() ip, port = parse_host_port(address) ep = await ucp.get_endpoint(ip.encode(), port) return self.comm_class( @@ -241,7 +231,7 @@ async def serve_forever(client_ep, listener_instance): if self.comm_handler: await self.comm_handler(ucx) - _ucp_init() + ucp.init() self.ucp_server = ucp.start_listener( serve_forever, listener_port=self._input_port, is_coroutine=True ) From 850b9ad4a6b47ad07bf9f659636bade86a95c50d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 30 May 2019 07:10:38 -0700 Subject: [PATCH 67/68] remove _frames suffix --- distributed/comm/ucx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index b87b6f0293c..4c3d42a3099 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -98,17 +98,17 @@ async def write( serializers = ("cuda", "dask", "pickle", "error") # msg can also be a list of dicts when sending batched messages frames = await to_frames(msg, serializers=serializers, on_error=on_error) - gpu_frames = b"".join( + is_gpus = b"".join( [ struct.pack("?", hasattr(frame, "__cuda_array_interface__")) for frame in frames ] ) - size_frames = b"".join([struct.pack("Q", nbytes(frame)) for frame in frames]) + sizes = b"".join([struct.pack("Q", nbytes(frame)) for frame in frames]) nframes = struct.pack("Q", len(frames)) - meta = b"".join([nframes, gpu_frames, size_frames]) + meta = b"".join([nframes, is_gpus, sizes]) await 
self.ep.send_obj(meta) From 88afada4d11a3cf53bdb73c1fd2f71e366580da6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 30 May 2019 07:14:09 -0700 Subject: [PATCH 68/68] Remove unnecessary comments --- distributed/comm/tests/test_ucx.py | 1 - distributed/comm/ucx.py | 2 -- distributed/core.py | 3 --- 3 files changed, 6 deletions(-) diff --git a/distributed/comm/tests/test_ucx.py b/distributed/comm/tests/test_ucx.py index 30789a8b5b3..55a2f4ec82c 100644 --- a/distributed/comm/tests/test_ucx.py +++ b/distributed/comm/tests/test_ucx.py @@ -90,7 +90,6 @@ async def f(): address = "ucx://{}:{}".format(HOST, 0) async def handle_comm(comm): - # XXX: failures here don't fail the build yet msg = await comm.read() msg["op"] = "pong" await comm.write(msg) diff --git a/distributed/comm/ucx.py b/distributed/comm/ucx.py index 4c3d42a3099..3f3f0bfe943 100644 --- a/distributed/comm/ucx.py +++ b/distributed/comm/ucx.py @@ -206,8 +206,6 @@ def __init__( self.ucp_server = None self._task = None - # XXX: The init may be required to take args like - # {'require_encryption': None, 'ssl_context': None} self.connection_args = connection_args self._task = None diff --git a/distributed/core.py b/distributed/core.py index 3133bb58b10..79c726eed6d 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -467,9 +467,6 @@ def handle_stream(self, comm, extra=None, every_cycle=[]): if op == "close-stream": closed = True break - # XXX: getting a KeyError here. Our - # not stream_handlers. - # It's /pdb handler = self.stream_handlers[op] handler(**merge(extra, msg)) else: