I am currently trying to set up a jobqueue cluster in an HPC environment in which the available network interfaces on the compute nodes (where Dask workers live) and the login nodes (where the scheduler lives) are not the same.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-b93ee7f2ce8e> in <module>
----> 1 jobqueue_cluster = dask_jobqueue.SLURMCluster(cores=6, memory='24GB',
2 project='esmtst', queue='devel',
3 interface='ib0')
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, n_workers, job_cls, loop, security, silence_logs, name, asynchronous, interface, host, protocol, dashboard_address, config_name, **kwargs)
446 worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
447
--> 448 self._dummy_job # trigger property to ensure that the job is valid
449
450 super().__init__(
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in _dummy_job(self)
473 except AttributeError:
474 address = "tcp://<insert-scheduler-address-here>:8786"
--> 475 return self.job_cls(
476 address or "tcp://<insert-scheduler-address-here>:8786",
477 name="name",
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/slurm.py in __init__(self, queue, project, walltime, job_cpu, job_mem, job_extra, config_name, *args, **kwargs)
39 job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
40
---> 41 super().__init__(*args, config_name=config_name, **kwargs)
42
43 header_lines = []
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, scheduler, name, cores, memory, processes, nanny, interface, death_timeout, local_directory, extra, env_extra, header_skip, log_directory, shebang, python, job_name, config_name, **kwargs)
195 if interface:
196 extra = extra + ["--interface", interface]
--> 197 kwargs.setdefault("host", get_ip_interface(interface))
198 else:
199 kwargs.setdefault("host", "")
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/distributed/utils.py in get_ip_interface(ifname)
181 if ifname not in net_if_addrs:
182 allowed_ifnames = list(net_if_addrs.keys())
--> 183 raise ValueError(
184 "{!r} is not a valid network interface. "
185 "Valid network interfaces are: {}".format(ifname, allowed_ifnames)
ValueError: 'ib0' is not a valid network interface. Valid network interfaces are: ['lo', 'ib1', 'eth0', 'eth1']
I am currently trying to set up a jobqueue cluster in an HPC environment in which the available network interfaces on the compute nodes (where Dask workers live) and the login nodes (where the scheduler lives) are not the same.
Available network interfaces
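- Compute nodes: `lo`, `ib0`, but no `ib1` and no `eth{0,1}`
- Login nodes: `lo`, `ib1`, `eth0`, and `eth1`

Starting a jobqueue cluster with a basic workflow of

    jobqueue_cluster = dask_jobqueue.SLURMCluster(cores=6, memory='24GB',
                                                  project='esmtst', queue='devel',
                                                  interface='ib0')

causes the following error: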
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-b93ee7f2ce8e> in <module>
----> 1 jobqueue_cluster = dask_jobqueue.SLURMCluster(cores=6, memory='24GB',
2 project='esmtst', queue='devel',
3 interface='ib0')
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, n_workers, job_cls, loop, security, silence_logs, name, asynchronous, interface, host, protocol, dashboard_address, config_name, **kwargs)
446 worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
447
--> 448 self._dummy_job # trigger property to ensure that the job is valid
449
450 super().__init__(
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in _dummy_job(self)
473 except AttributeError:
474 address = "tcp://<insert-scheduler-address-here>:8786"
--> 475 return self.job_cls(
476 address or "tcp://<insert-scheduler-address-here>:8786",
477 name="name",
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/slurm.py in __init__(self, queue, project, walltime, job_cpu, job_mem, job_extra, config_name, *args, **kwargs)
39 job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
40
---> 41 super().__init__(*args, config_name=config_name, **kwargs)
42
43 header_lines = []
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/dask_jobqueue/core.py in __init__(self, scheduler, name, cores, memory, processes, nanny, interface, death_timeout, local_directory, extra, env_extra, header_skip, log_directory, shebang, python, job_name, config_name, **kwargs)
195 if interface:
196 extra = extra + ["--interface", interface]
--> 197 kwargs.setdefault("host", get_ip_interface(interface))
198 else:
199 kwargs.setdefault("host", "")
/p/project/cesmtst/hoeflich1/miniconda3/envs/Dask-jobqueue_v2020.02.10/lib/python3.8/site-packages/distributed/utils.py in get_ip_interface(ifname)
181 if ifname not in net_if_addrs:
182 allowed_ifnames = list(net_if_addrs.keys())
--> 183 raise ValueError(
184 "{!r} is not a valid network interface. "
185 "Valid network interfaces are: {}".format(ifname, allowed_ifnames)
ValueError: 'ib0' is not a valid network interface. Valid network interfaces are: ['lo', 'ib1', 'eth0', 'eth1']

The workaround suggested in #207 (comment) doesn't work, because Dask distributed doesn't like `interface` and `host` to be set at the same time.