From 56fe96c87a5c9d502ed94bfb7a5bd39ee88c9f9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 4 Mar 2020 16:18:07 +0100 Subject: [PATCH 01/13] Add scheduler_options to JobQueueCluster. --- dask_jobqueue/core.py | 57 +++++++++++++++++------ dask_jobqueue/tests/test_jobqueue_core.py | 18 ++++++- docs/source/changelog.rst | 4 ++ docs/source/interactive.rst | 8 ++-- 4 files changed, 70 insertions(+), 17 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index e6bc55d8..b971649c 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -69,8 +69,12 @@ Whether or not to run this cluster object with the async/await syntax security : Security A dask.distributed security object if you're using TLS/SSL - dashboard_address : str or int - An address like ":8787" on which to host the Scheduler's dashboard + scheduler_options : dict + Used to pass additional arguments to Dask Scheduler. For example use + ``scheduler_options={'dasboard_address': ':12435'}`` to specify which + port the web dashboard should use or ``scheduler_options={'host': 'your-host'}`` + to specify the host the Dask scheduler should run on. See + :class:`distributed.Scheduler` for more details. """.strip() @@ -420,13 +424,15 @@ def __init__( silence_logs="error", name=None, asynchronous=False, - # Scheduler keywords - interface=None, + # Scheduler-only keywords + dashboard_address=None, host=None, + scheduler_options=None, + # Options for both scheduler and workers + interface=None, protocol="tcp://", - dashboard_address=":8787", - config_name=None, # Job keywords + config_name=None, **kwargs ): self.status = "created" @@ -447,22 +453,47 @@ def __init__( ) ) + if dashboard_address is not None: + raise ValueError( + "Please pass 'dashboard_address' through 'scheduler_options': use\n" + 'cluster = {0}(..., scheduler_options={{"dashboard_address": ":12345"}}) rather than\n' + 'cluster = {0}(..., dashboard_address="12435")'.format( + self.__class__.__name__ + ) + ) + + if host is not None: + raise ValueError( + "Please pass 'host' through 'scheduler_options': use\n" + 'cluster = {0}(..., scheduler_options={{"host": "your-host"}}) rather than\n' + 'cluster = {0}(..., host="your-host")'.format(self.__class__.__name__) + ) + default_config_name = self.job_cls.default_config_name() if config_name is None: config_name = default_config_name if interface is None: interface = dask.config.get("jobqueue.%s.interface" % config_name) + if scheduler_options is None: + scheduler_options = {} + + default_scheduler_options = { + "protocol": protocol, + "dashboard_address": ":8787", + "security": security, + } + # scheduler_options overrides parameters common to both workers and scheduler + scheduler_options = dict(default_scheduler_options, **scheduler_options) + + # Use the same network interface as the workers if scheduler ip has not + # been set through scheduler_options via 'host' or 'interface' + if "host" not in scheduler_options and "interface" not in scheduler_options: + scheduler_options["interface"] = interface scheduler = { "cls": Scheduler, # Use local scheduler for now - "options": { - "protocol": protocol, - "interface": interface, - "host": host, - "dashboard_address": dashboard_address, - "security": security, - }, + "options": scheduler_options, } kwargs["config_name"] = config_name diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index ebbf0c1e..ac4ddb54 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -84,7 +84,7 @@ def test_forward_ip(): cores=8, memory="28GB", name="dask-worker", - host=ip, + scheduler_options={"host": ip}, ) as cluster: assert cluster.scheduler.ip == ip @@ -268,3 +268,19 @@ def test_default_number_of_worker_processes(Cluster): with Cluster(cores=6, memory="4GB") as cluster: assert " --nprocs 3" in cluster.job_script() assert " --nthreads 2" in cluster.job_script() + + +@pytest.mark.parametrize( + "Cluster", + [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster, OARCluster], +) +def test_scheduler_options(Cluster): + net_if_addrs = psutil.net_if_addrs() + interface = list(net_if_addrs.keys())[0] + port = 8804 + with Cluster( + cores=1, memory="1GB", scheduler_options={"interface": interface, "port": port} + ) as cluster: + scheduler_options = cluster.scheduler_spec["options"] + assert scheduler_options["interface"] == interface + assert scheduler_options["port"] == port diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 66a9fa87..825eb71f 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -15,6 +15,10 @@ Development version process and only threads, i.e. ``proccesses=1``, ``threads_per_process=cores``. - fix bug (forgotten async def) in ``OARCluster._submit_job`` (:pr:`380`). - ``LSFCluster``: switch to ``use_stdin=True`` (:pr:`388`). +- all cluster classes: add ``scheduler_options`` allows to pass parameters to + the Dask scheduler. For example ``scheduler_options={'interface': 'eth0', + dashboard_addresses=':12435')`` (:pr:`384`). Breaking change: ``port`` and + ``dashboard_addresses`` has to be passed through ``scheduler_options``. 0.7.0 / 2019-10-09 ------------------ diff --git a/docs/source/interactive.rst b/docs/source/interactive.rst index 4be452c3..72789853 100644 --- a/docs/source/interactive.rst +++ b/docs/source/interactive.rst @@ -129,10 +129,12 @@ dashboard is valuable to help you understand the state of your computation and cluster. Typically, the dashboard is served on a separate port from Jupyter, and so can -be used whether you choose to use Jupyter or not. If you want to open up a +be used whether you choose to use Jupyter or not. If you want to open up a connection to see the dashboard you can do so with SSH Tunneling as described -above. The dashboard's default port is at ``8787``, and is configurable with -the ``dashboard_address=`` keyword to the Dask Jobqueue cluster objects. +above. The dashboard's default port is at ``8787``, and is configurable by +using the ``scheduler_options`` parameter in the Dask Jobqueue cluster object. +For example ``scheduler_options={'dashboard_address': ':12435'}`` would use +12435 for the web dasboard port. However, Jupyter is also able to proxy the dashboard connection through the Jupyter server, allowing you to access the dashboard at From bb9de9584baa56657dc40471c4d447915df70e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 10 Mar 2020 07:16:28 +0100 Subject: [PATCH 02/13] Do the test pass if I remove the "host" part? --- dask_jobqueue/core.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index b971649c..73b9214f 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -15,7 +15,7 @@ from distributed.deploy.spec import ProcessInterface, SpecCluster from distributed.deploy.local import nprocesses_nthreads from distributed.scheduler import Scheduler -from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface +from distributed.utils import format_bytes, parse_bytes, tmpfile logger = logging.getLogger(__name__) @@ -206,9 +206,6 @@ def __init__( if interface: extra = extra + ["--interface", interface] - kwargs.setdefault("host", get_ip_interface(interface)) - else: - kwargs.setdefault("host", "") # Keep information on process, cores, and memory, for use in subclasses self.worker_memory = parse_bytes(memory) if memory is not None else None From 21b24873b267de4353f86501a247a8e976b3f5dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 16 Mar 2020 17:59:12 +0100 Subject: [PATCH 03/13] Test case of different interface on login and compute nodes. --- ci/slurm/Dockerfile | 2 ++ ci/slurm/docker-compose.yml | 28 ++++++++++++++++++++++++++++ ci/slurm/start-slurm.sh | 8 ++++++++ dask_jobqueue/tests/test_slurm.py | 19 +++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index 6c6c2439..be2c8320 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -1,5 +1,7 @@ FROM giovtorres/slurm-docker-cluster +RUN yum install -y iproute + RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash miniconda.sh -f -b -p /opt/anaconda && \ /opt/anaconda/bin/conda clean -tipy && \ diff --git a/ci/slurm/docker-compose.yml b/ci/slurm/docker-compose.yml index 414ebfea..93e429a4 100644 --- a/ci/slurm/docker-compose.yml +++ b/ci/slurm/docker-compose.yml @@ -12,6 +12,8 @@ services: MYSQL_PASSWORD: password volumes: - var_lib_mysql:/var/lib/mysql + networks: + common-network: slurmdbd: build: . @@ -26,6 +28,8 @@ services: - "6819" depends_on: - mysql + networks: + common-network: slurmctld: build: . @@ -42,6 +46,11 @@ services: - "6817" depends_on: - "slurmdbd" + networks: + common-network: + ipv4_address: 172.16.238.10 + cap_add: + - NET_ADMIN c1: build: . @@ -57,6 +66,11 @@ services: - "6818" depends_on: - "slurmctld" + networks: + common-network: + ipv4_address: 172.16.238.11 + cap_add: + - NET_ADMIN c2: build: . @@ -72,6 +86,11 @@ services: - "6818" depends_on: - "slurmctld" + networks: + common-network: + ipv4_address: 172.16.238.12 + cap_add: + - NET_ADMIN volumes: etc_munge: @@ -79,3 +98,12 @@ volumes: slurm_jobdir: var_lib_mysql: var_log_slurm: + +networks: + common-network: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.16.238.0/24 + gateway: 172.16.238.254 diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index 5ba34471..ea7e0476 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -1,6 +1,14 @@ #!/bin/bash docker-compose up --build -d + +# On some clusters the login node does not have the same interface as the +# compute nodes. The next three lines allow to test this edge case by adding +# separate interfaces on the worker and on the scheduler nodes. +docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler +docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker +docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker + while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] do echo "Waiting for SLURM cluster to become ready"; diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index 6b26ffc0..dec9fd97 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -193,3 +193,22 @@ def test_config_name_slurm_takes_custom_config(): with dask.config.set({"jobqueue.slurm-config-name": conf}): with SLURMCluster(config_name="slurm-config-name") as cluster: assert cluster.job_name == "myname" + + +@pytest.mark.env("slurm") +def test_different_interfaces_on_scheduler_and_workers(loop): + with SLURMCluster( + walltime="00:02:00", + cores=1, + memory="2GB", + interface="eth0:worker", + scheduler_options={"interface": "eth0:scheduler"}, + loop=loop, + ) as cluster: + cluster.scale(jobs=1) + with Client(cluster) as client: + future = client.submit(lambda x: x + 1, 10) + + client.wait_for_workers(1) + + assert future.result(QUEUE_WAIT) == 11 From 7acaae8c21eed04b58bc1250c9bccd554296fa92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 16 Mar 2020 19:07:39 +0100 Subject: [PATCH 04/13] debug --- ci/slurm/start-slurm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index ea7e0476..e2bde4e4 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -9,6 +9,10 @@ docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker +docker exec slurmctld ip addr +docker exec c1 ip addr +docker exec c2 ip addr + while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] do echo "Waiting for SLURM cluster to become ready"; From e30a0fa88abf33521607f97fcdefb235964adbf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 09:16:23 +0100 Subject: [PATCH 05/13] debug --- .travis.yml | 26 +++++++++++++------------- ci/slurm/Dockerfile | 4 ++-- ci/slurm/start-slurm.sh | 9 ++++++--- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/.travis.yml b/.travis.yml index 86a834f6..eafec75c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,19 +9,19 @@ services: - docker matrix: include: - - python: "3.6" - env: - - OS=ubuntu-14.04 - - JOBQUEUE=sge - - python: "3.6" - env: - - OS=ubuntu-14.04 - # JOBQUEUE=none is for tests that do not need a cluster to run - - JOBQUEUE=none - - python: "3.6" - env: - - OS=ubuntu-14.04 - - JOBQUEUE=pbs + # - python: "3.6" + # env: + # - OS=ubuntu-14.04 + # - JOBQUEUE=sge + # - python: "3.6" + # env: + # - OS=ubuntu-14.04 + # # JOBQUEUE=none is for tests that do not need a cluster to run + # - JOBQUEUE=none + # - python: "3.6" + # env: + # - OS=ubuntu-14.04 + # - JOBQUEUE=pbs - python: "3.6" env: - OS=ubuntu-14.04 diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index be2c8320..d12427e0 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -7,8 +7,8 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH -RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio -RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps +RUN conda install --yes -c conda-forge python=3.7 dask distributed flake8 pytest pytest-asyncio +# RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps ENV LC_ALL en_US.UTF-8 diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index e2bde4e4..7171eee6 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -9,9 +9,12 @@ docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker -docker exec slurmctld ip addr -docker exec c1 ip addr -docker exec c2 ip addr +for c in slurmctld c1 c2; do + echo $i + docker exec $c ip addr + docker exec -it $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' + echo '------------------------------------------------------------' +done while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] do From 2d20ad524d92f32efbc2069a286b0370631dcc40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 09:34:13 +0100 Subject: [PATCH 06/13] More debug --- ci/slurm.sh | 2 +- dask_jobqueue/tests/test_slurm.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/slurm.sh b/ci/slurm.sh index 065bfa02..2118e240 100644 --- a/ci/slurm.sh +++ b/ci/slurm.sh @@ -18,7 +18,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s" + docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s -k different" } function jobqueue_after_script { diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index dec9fd97..f94c6fbf 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -197,6 +197,13 @@ def test_config_name_slurm_takes_custom_config(): @pytest.mark.env("slurm") def test_different_interfaces_on_scheduler_and_workers(loop): + import socket + + print() + print("host:", socket.gethostname()) + import psutil + + print("interfaces:", psutil.net_if_addrs().keys()) with SLURMCluster( walltime="00:02:00", cores=1, From 82c84f7b40f654e887c42c6d117773a568df21a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 09:43:09 +0100 Subject: [PATCH 07/13] Still more debug --- ci/slurm.sh | 13 +++++++++++++ ci/slurm/start-slurm.sh | 7 ------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ci/slurm.sh b/ci/slurm.sh index 2118e240..26e2c3c7 100644 --- a/ci/slurm.sh +++ b/ci/slurm.sh @@ -11,13 +11,26 @@ function jobqueue_before_install { docker ps -a docker images + debug +} + +function debug { + for c in slurmctld c1 c2; do + echo '------------------------------------------------------------' + echo $c + docker exec $c ip addr + docker exec -it $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' + echo '------------------------------------------------------------' + done } function jobqueue_install { + debug docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pip install -e ." } function jobqueue_script { + debug docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s -k different" } diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index 7171eee6..ea7e0476 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -9,13 +9,6 @@ docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker -for c in slurmctld c1 c2; do - echo $i - docker exec $c ip addr - docker exec -it $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' - echo '------------------------------------------------------------' -done - while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] do echo "Waiting for SLURM cluster to become ready"; From deb47126f7184e3e0f71b9bba0b75ff7e5ff0f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 09:53:36 +0100 Subject: [PATCH 08/13] this should work. --- ci/slurm/start-slurm.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index ea7e0476..b0dbc5f7 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -2,16 +2,16 @@ docker-compose up --build -d -# On some clusters the login node does not have the same interface as the -# compute nodes. The next three lines allow to test this edge case by adding -# separate interfaces on the worker and on the scheduler nodes. -docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler -docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker -docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker - while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] do echo "Waiting for SLURM cluster to become ready"; sleep 2 done echo "SLURM properly configured" + +# On some clusters the login node does not have the same interface as the +# compute nodes. The next three lines allow to test this edge case by adding +# separate interfaces on the worker and on the scheduler nodes. +docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler +docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker +docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker From 8bfaee929a29395f1c17ba635e1cb79215fede91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 13:12:36 +0100 Subject: [PATCH 09/13] Back to normal. --- .travis.yml | 26 +++++++++++++------------- ci/slurm.sh | 11 ++++------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/.travis.yml b/.travis.yml index eafec75c..86a834f6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,19 +9,19 @@ services: - docker matrix: include: - # - python: "3.6" - # env: - # - OS=ubuntu-14.04 - # - JOBQUEUE=sge - # - python: "3.6" - # env: - # - OS=ubuntu-14.04 - # # JOBQUEUE=none is for tests that do not need a cluster to run - # - JOBQUEUE=none - # - python: "3.6" - # env: - # - OS=ubuntu-14.04 - # - JOBQUEUE=pbs + - python: "3.6" + env: + - OS=ubuntu-14.04 + - JOBQUEUE=sge + - python: "3.6" + env: + - OS=ubuntu-14.04 + # JOBQUEUE=none is for tests that do not need a cluster to run + - JOBQUEUE=none + - python: "3.6" + env: + - OS=ubuntu-14.04 + - JOBQUEUE=pbs - python: "3.6" env: - OS=ubuntu-14.04 diff --git a/ci/slurm.sh b/ci/slurm.sh index 26e2c3c7..b856858c 100644 --- a/ci/slurm.sh +++ b/ci/slurm.sh @@ -11,27 +11,24 @@ function jobqueue_before_install { docker ps -a docker images - debug + show_network_interfaces } -function debug { +function show_network_interfaces { for c in slurmctld c1 c2; do echo '------------------------------------------------------------' - echo $c - docker exec $c ip addr + echo docker container: $c docker exec -it $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' echo '------------------------------------------------------------' done } function jobqueue_install { - debug docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pip install -e ." } function jobqueue_script { - debug - docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s -k different" + docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s" } function jobqueue_after_script { From 6c6d01f2e7863ca552cc1bd64acd10ce0b345626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 15:44:31 +0100 Subject: [PATCH 10/13] Cleanup --- ci/slurm/Dockerfile | 4 ++-- ci/slurm/docker-compose.yml | 9 ++++----- dask_jobqueue/tests/test_slurm.py | 7 ------- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index d12427e0..be2c8320 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -7,8 +7,8 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH -RUN conda install --yes -c conda-forge python=3.7 dask distributed flake8 pytest pytest-asyncio -# RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps +RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio +RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps ENV LC_ALL en_US.UTF-8 diff --git a/ci/slurm/docker-compose.yml b/ci/slurm/docker-compose.yml index 93e429a4..4764533f 100644 --- a/ci/slurm/docker-compose.yml +++ b/ci/slurm/docker-compose.yml @@ -48,7 +48,7 @@ services: - "slurmdbd" networks: common-network: - ipv4_address: 172.16.238.10 + ipv4_address: 10.1.1.10 cap_add: - NET_ADMIN @@ -68,7 +68,7 @@ services: - "slurmctld" networks: common-network: - ipv4_address: 172.16.238.11 + ipv4_address: 10.1.1.11 cap_add: - NET_ADMIN @@ -88,7 +88,7 @@ services: - "slurmctld" networks: common-network: - ipv4_address: 172.16.238.12 + ipv4_address: 10.1.1.12 cap_add: - NET_ADMIN @@ -105,5 +105,4 @@ networks: ipam: driver: default config: - - subnet: 172.16.238.0/24 - gateway: 172.16.238.254 + - subnet: 10.1.1.0/24 diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index f94c6fbf..dec9fd97 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -197,13 +197,6 @@ def test_config_name_slurm_takes_custom_config(): @pytest.mark.env("slurm") def test_different_interfaces_on_scheduler_and_workers(loop): - import socket - - print() - print("host:", socket.gethostname()) - import psutil - - print("interfaces:", psutil.net_if_addrs().keys()) with SLURMCluster( walltime="00:02:00", cores=1, From 0a9d7c0e66a076ebbabfd7e5c182048ceec84850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 16:08:42 +0100 Subject: [PATCH 11/13] Add more tests. --- dask_jobqueue/tests/test_jobqueue_core.py | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index ac4ddb54..bc9c94e0 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -278,9 +278,68 @@ def test_scheduler_options(Cluster): net_if_addrs = psutil.net_if_addrs() interface = list(net_if_addrs.keys())[0] port = 8804 + with Cluster( cores=1, memory="1GB", scheduler_options={"interface": interface, "port": port} ) as cluster: scheduler_options = cluster.scheduler_spec["options"] assert scheduler_options["interface"] == interface assert scheduler_options["port"] == port + + +@pytest.mark.parametrize( + "Cluster", + [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster, OARCluster], +) +def test_scheduler_options_interface(Cluster): + net_if_addrs = psutil.net_if_addrs() + scheduler_interface = list(net_if_addrs.keys())[0] + worker_interface = "worker-interface" + scheduler_host = socket.gethostname() + + with Cluster(cores=1, memory="1GB", interface=scheduler_interface) as cluster: + scheduler_options = cluster.scheduler_spec["options"] + worker_options = cluster.new_spec["options"] + assert scheduler_options["interface"] == scheduler_interface + assert worker_options["interface"] == scheduler_interface + + with Cluster( + cores=1, + memory="1GB", + interface=worker_interface, + scheduler_options={"interface": scheduler_interface}, + ) as cluster: + scheduler_options = cluster.scheduler_spec["options"] + worker_options = cluster.new_spec["options"] + assert scheduler_options["interface"] == scheduler_interface + assert worker_options["interface"] == worker_interface + + with Cluster( + cores=1, + memory="1GB", + interface=worker_interface, + scheduler_options={"host": scheduler_host}, + ) as cluster: + scheduler_options = cluster.scheduler_spec["options"] + assert scheduler_options.get("interface") is None + assert scheduler_options["host"] == scheduler_host + assert worker_options["interface"] == worker_interface + + +@pytest.mark.parametrize( + "Cluster", + [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster, OARCluster], +) +def test_cluster_error_scheduler_arguments_should_use_scheduler_options(Cluster): + scheduler_host = socket.gethostname() + message_template = "pass {!r} through 'scheduler_options'" + + message = message_template.format("host") + with pytest.raises(ValueError, match=message): + with Cluster(cores=1, memory="1GB", host=scheduler_host): + pass + + message = message_template.format("dashboard_address") + with pytest.raises(ValueError, match=message): + with Cluster(cores=1, memory="1GB", dashboard_address=":8787"): + pass From b203326daa1aad1ebac7cfdae9c62571824f9fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 16:12:14 +0100 Subject: [PATCH 12/13] Fix forgotten IP addresses. --- ci/slurm/start-slurm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/slurm/start-slurm.sh b/ci/slurm/start-slurm.sh index b0dbc5f7..b369f3aa 100755 --- a/ci/slurm/start-slurm.sh +++ b/ci/slurm/start-slurm.sh @@ -12,6 +12,6 @@ echo "SLURM properly configured" # On some clusters the login node does not have the same interface as the # compute nodes. The next three lines allow to test this edge case by adding # separate interfaces on the worker and on the scheduler nodes. -docker exec slurmctld ip addr add 172.16.238.20/24 dev eth0 label eth0:scheduler -docker exec c1 ip addr add 172.16.238.21/24 dev eth0 label eth0:worker -docker exec c2 ip addr add 172.16.238.22/24 dev eth0 label eth0:worker +docker exec slurmctld ip addr add 10.1.1.20/24 dev eth0 label eth0:scheduler +docker exec c1 ip addr add 10.1.1.21/24 dev eth0 label eth0:worker +docker exec c2 ip addr add 10.1.1.22/24 dev eth0 label eth0:worker From d2158103c7fc9884f7a76afd40f867d96c1adedf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Mar 2020 16:36:06 +0100 Subject: [PATCH 13/13] Tweak doc. --- dask_jobqueue/core.py | 6 +++++- docs/source/changelog.rst | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index 73b9214f..994bc886 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -30,7 +30,11 @@ By default, ``process ~= sqrt(cores)`` so that the number of processes and the number of threads per process is roughly the same. interface : str - Network interface like 'eth0' or 'ib0'. + Network interface like 'eth0' or 'ib0'. This will be used both for the + Dask scheduler and the Dask workers interface. If you need a different + interface for the Dask scheduler you can pass it through + the ``scheduler_options`` argument: + ``interface=your_worker_interface, scheduler_options={'interface': your_scheduler_interface}``. nanny : bool Whether or not to start a nanny process local_directory : str diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 825eb71f..d31f47a7 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -17,8 +17,9 @@ Development version - ``LSFCluster``: switch to ``use_stdin=True`` (:pr:`388`). - all cluster classes: add ``scheduler_options`` allows to pass parameters to the Dask scheduler. For example ``scheduler_options={'interface': 'eth0', - dashboard_addresses=':12435')`` (:pr:`384`). Breaking change: ``port`` and - ``dashboard_addresses`` has to be passed through ``scheduler_options``. + dashboard_addresses=':12435')`` (:pr:`384`). Breaking change: using ``port`` + or ``dashboard_addresses`` arguments raises an error. They have to be passed + through ``scheduler_options``. 0.7.0 / 2019-10-09 ------------------