Skip to content

Using different interfaces for scheduler and workers #382

Description

@kathoef

I am currently trying to set up a jobqueue cluster in an HPC environment in which the network interfaces available on the compute nodes (where the Dask workers live) and on the login nodes (where the scheduler lives) are not the same.

Available network interfaces

  • on the compute nodes there are lo, ib0, but no ib1 and no eth{0,1}
  • whereas on the login nodes there are lo, ib1, eth0, and eth1

Starting a jobqueue cluster with a basic workflow of

import dask_jobqueue
import dask.distributed as dask_distributed
jobqueue_cluster = dask_jobqueue.SLURMCluster(cores=6, memory='24GB',
                                              project='esmtst', queue='devel',
                                              interface='ib0')
client = dask_distributed.Client(jobqueue_cluster)
jobqueue_cluster.scale(jobs=1)

causes the following error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-b93ee7f2ce8e> in <module>
----> 1 jobqueue_cluster = dask_jobqueue.SLURMCluster(cores=6, memory='24GB',
      2                                               project='esmtst', queue='devel',
      3                                               interface='ib0')

/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, n_workers, job_cls, loop, security, silence_logs, name, asynchronous, interface, host, protocol, dashboard_address, config_name, **kwargs)
    446             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
    447
--> 448         self._dummy_job  # trigger property to ensure that the job is valid
    449
    450         super().__init__(

/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in _dummy_job(self)
    473         except AttributeError:
    474             address = "tcp://<insert-scheduler-address-here>:8786"
--> 475         return self.job_cls(
    476             address or "tcp://<insert-scheduler-address-here>:8786",
    477             name="name",

/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/slurm.py in __init__(self, queue, project, walltime, job_cpu, job_mem, job_extra, config_name, *args, **kwargs)
     39             job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
     40
---> 41         super().__init__(*args, config_name=config_name, **kwargs)
     42
     43         header_lines = []

/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, scheduler, name, cores, memory, processes, nanny, interface, death_timeout, local_directory, extra, env_extra, header_skip, log_directory, shebang, python, job_name, config_name, **kwargs)
    195         if interface:
    196             extra = extra + ["--interface", interface]
--> 197             kwargs.setdefault("host", get_ip_interface(interface))
    198         else:
    199             kwargs.setdefault("host", "")

/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/distributed/utils.py in get_ip_interface(ifname)
    181     if ifname not in net_if_addrs:
    182         allowed_ifnames = list(net_if_addrs.keys())
--> 183         raise ValueError(
    184             "{!r} is not a valid network interface. "
    185             "Valid network interfaces are: {}".format(ifname, allowed_ifnames)

ValueError: 'ib0' is not a valid network interface. Valid network interfaces are: ['lo', 'ib1', 'eth0', 'eth1']

The workaround suggested in #207 (comment) doesn't work, because Dask distributed does not accept interface and host being set at the same time.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions