From cfdb97f6037c9341fb27f4fd33a8c6e2285459e3 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 5 Aug 2019 16:54:43 -0600 Subject: [PATCH 001/109] Add basic Job class, and PBS implementation This is the first step towards rewriting with SpecCluster I mostly copied the implementations from the Cluster classes, but then removed the cluster bits --- dask_jobqueue/__init__.py | 3 +- dask_jobqueue/job.py | 330 ++++++++++++++++++++++++++++++++ dask_jobqueue/pbs.py | 71 +++++++ dask_jobqueue/tests/test_job.py | 19 ++ 4 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 dask_jobqueue/job.py create mode 100644 dask_jobqueue/tests/test_job.py diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 4cd6a49e..864fcba2 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,8 +1,9 @@ # flake8: noqa from . import config from .core import JobQueueCluster +from .job import Job from .moab import MoabCluster -from .pbs import PBSCluster +from .pbs import PBSCluster, PBSJob from .slurm import SLURMCluster from .sge import SGECluster from .lsf import LSFCluster diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py new file mode 100644 index 00000000..f8fc0637 --- /dev/null +++ b/dask_jobqueue/job.py @@ -0,0 +1,330 @@ +import sys +from contextlib import contextmanager + +import dask +from distributed.deploy.spec import ProcessInterface + +import logging +import math +import os +import re +import shlex +import subprocess +import sys +from collections import OrderedDict +from contextlib import contextmanager + +import six + +import dask +import docrep +from .deploy import ClusterManager +from distributed import LocalCluster +from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface + +logger = logging.getLogger(__name__) + + +class Job(ProcessInterface): + """ Base class to launch Dask workers on Job queues + + This class should not be used directly, use inherited class appropriate for + your 
queueing system (e.g. PBScluster or SLURMCluster) + + Parameters + ---------- + name : str + Name of Dask workers. + cores : int + Total number of cores per job + memory: str + Total amount of memory per job + processes : int + Number of processes per job + interface : str + Network interface like 'eth0' or 'ib0'. + death_timeout : float + Seconds to wait for a scheduler before closing workers + local_directory : str + Dask worker local directory for file spilling. + extra : list + Additional arguments to pass to `dask-worker` + env_extra : list + Other commands to add to script before launching worker. + log_directory : str + Directory to use for job scheduler logs. + shebang : str + Path to desired interpreter for your batch submission script. + python : str + Python executable used to launch Dask workers. + config_name : str + Section to use from jobqueue.yaml configuration file. + kwargs : dict + Additional keyword arguments to pass to `LocalCluster` + + Attributes + ---------- + submit_command: str + Abstract attribute for job scheduler submit command, + should be overridden + cancel_command: str + Abstract attribute for job scheduler cancel command, + should be overridden + + See Also + -------- + PBSCluster + SLURMCluster + SGECluster + OARCluster + LSFCluster + MoabCluster + """ + + _script_template = """ +%(shebang)s + +%(job_header)s + +%(env_header)s + +%(worker_command)s +""".lstrip() + + # Following class attributes should be overridden by extending classes. + submit_command = None + cancel_command = None + job_id_regexp = r"(?P<job_id>\d+)" + + def __init__( + self, + scheduler=None, + name=None, + job_name=None, + cores=None, + memory=None, + processes=None, + interface=None, + death_timeout=None, + local_directory=None, + extra=None, + env_extra=None, + log_directory=None, + shebang=None, + python=sys.executable, + config_name=None, + **kwargs + ): + # """ + # This initializer should be considered as Abstract, and never used directly. 
+ # """ + self.scheduler = scheduler + self.job_id = None + + super().__init__() + if config_name is None: + config_name = getattr(type(self), "config_name", None) + + if config_name is None: + raise NotImplementedError( + "JobQueueCluster is an abstract class that should not be instantiated." + ) + + if job_name is None: + job_name = dask.config.get("jobqueue.%s.name" % config_name) + if cores is None: + cores = dask.config.get("jobqueue.%s.cores" % config_name) + if memory is None: + memory = dask.config.get("jobqueue.%s.memory" % config_name) + if processes is None: + processes = dask.config.get("jobqueue.%s.processes" % config_name) + if interface is None: + interface = dask.config.get("jobqueue.%s.interface" % config_name) + if death_timeout is None: + death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name) + if local_directory is None: + local_directory = dask.config.get( + "jobqueue.%s.local-directory" % config_name + ) + if extra is None: + extra = dask.config.get("jobqueue.%s.extra" % config_name) + if env_extra is None: + env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name) + if log_directory is None: + log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name) + if shebang is None: + shebang = dask.config.get("jobqueue.%s.shebang" % config_name) + + if cores is None: + raise ValueError( + "You must specify how many cores to use per job like ``cores=8``" + ) + + if memory is None: + raise ValueError( + "You must specify how much memory to use per job like ``memory='24 GB'``" + ) + + # This attribute should be overridden + self.job_header = None + + if interface: + extra += ["--interface", interface] + kwargs.setdefault("host", get_ip_interface(interface)) + else: + kwargs.setdefault("host", "") + + # Keep information on process, cores, and memory, for use in subclasses + self.worker_memory = parse_bytes(memory) if memory is not None else None + self.worker_processes = processes + self.worker_cores = cores + 
self.name = name + + self.shebang = shebang + + self._env_header = "\n".join(env_extra) + + # dask-worker command line build + dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict( + python=python + ) + command_args = [dask_worker_command, self.scheduler] + command_args += ["--nthreads", self.worker_process_threads] + if processes is not None and processes > 1: + command_args += ["--nprocs", processes] + + command_args += ["--memory-limit", self.worker_process_memory] + command_args += ["--name", "%s--${JOB_ID}--" % name] + + if death_timeout is not None: + command_args += ["--death-timeout", death_timeout] + if local_directory is not None: + command_args += ["--local-directory", local_directory] + if extra is not None: + command_args += extra + + self._command_template = " ".join(map(str, command_args)) + + self.log_directory = log_directory + if self.log_directory is not None: + if not os.path.exists(self.log_directory): + os.makedirs(self.log_directory) + + def job_script(self): + """ Construct a job submission script """ + pieces = { + "shebang": self.shebang, + "job_header": self.job_header, + "env_header": self._env_header, + "worker_command": self._command_template, + } + return self._script_template % pieces + + @contextmanager + def job_file(self): + """ Write job submission script to temporary file """ + with tmpfile(extension="sh") as fn: + with open(fn, "w") as f: + logger.debug("writing job script: \n%s", self.job_script()) + f.write(self.job_script()) + yield fn + + def _submit_job(self, script_filename): + # Should we make this async friendly? 
+ return self._call(shlex.split(self.submit_command) + [script_filename]) + + @property + def worker_process_threads(self): + return int(self.worker_cores / self.worker_processes) + + @property + def worker_process_memory(self): + mem = format_bytes(self.worker_memory / self.worker_processes) + mem = mem.replace(" ", "") + return mem + + async def start(self): + """ Start workers and point them to our local scheduler """ + logger.debug("Starting job: %s", self.name) + + with self.job_file() as fn: + out = self._submit_job(fn) + job = self._job_id_from_submit_output(out) + if not job: + raise ValueError("Unable to parse jobid from output of %s" % out) + self.job_id = job + + await super().start() + + def _job_id_from_submit_output(self, out): + match = re.search(self.job_id_regexp, out) + if match is None: + msg = ( + "Could not parse job id from submission command " + "output.\nJob id regexp is {!r}\nSubmission command " + "output is:\n{}".format(self.job_id_regexp, out) + ) + raise ValueError(msg) + + job_id = match.groupdict().get("job_id") + if job_id is None: + msg = ( + "You need to use a 'job_id' named group in your regexp, e.g. " + "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: " + "{!r}".format(self.job_id_regexp) + ) + raise ValueError(msg) + + return job_id + + async def close(self): + logger.debug("Stopping job: %s", self.name) + if self.job_id: + self._call(shlex.split(self.cancel_command) + [self.job_id]) + + def _call(self, cmd, **kwargs): + """ Call a command using subprocess.Popen. + + This centralizes calls out to the command line, providing consistent + outputs, logging, and an opportunity to go asynchronous in the future. + + Parameters + ---------- + cmd: List(str)) + A command, each of which is a list of strings to hand to + subprocess.Popen + + Examples + -------- + >>> self._call(['ls', '/foo']) + + Returns + ------- + The stdout produced by the command, as string. 
+ + Raises + ------ + RuntimeError if the command exits with a non-zero exit code + """ + cmd_str = " ".join(cmd) + logger.debug( + "Executing the following command to command line\n{}".format(cmd_str) + ) + + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs + ) + + out, err = proc.communicate() + if six.PY3: + out, err = out.decode(), err.decode() + if proc.returncode != 0: + raise RuntimeError( + "Command exited with non-zero exit code.\n" + "Exit code: {}\n" + "Command:\n{}\n" + "stdout:\n{}\n" + "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err) + ) + return out + diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 54459430..db75f988 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -7,6 +7,7 @@ import dask from .core import JobQueueCluster, docstrings +from .job import Job logger = logging.getLogger(__name__) @@ -142,3 +143,73 @@ def pbs_format_bytes_ceil(n): if n >= 10 * 1024: return "%dkB" % math.ceil(n / 1024) return "%dB" % n + + +class PBSJob(Job): + submit_command = "qsub" + cancel_command = "qdel" + config_name = "pbs" + + + def __init__( + self, + *args, + queue=None, + project=None, + resource_spec=None, + walltime=None, + job_extra=None, + config_name="pbs", + **kwargs + ): + if queue is None: + queue = dask.config.get("jobqueue.%s.queue" % config_name) + if resource_spec is None: + resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name) + if walltime is None: + walltime = dask.config.get("jobqueue.%s.walltime" % config_name) + if job_extra is None: + job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) + if project is None: + project = dask.config.get( + "jobqueue.%s.project" % config_name + ) or os.environ.get("PBS_ACCOUNT") + + # Instantiate args and parameters from parent abstract class + super().__init__(*args, config_name=config_name, **kwargs) + + # Try to find a project name from environment variable + project = project or 
os.environ.get("PBS_ACCOUNT") + + header_lines = [] + # PBS header build + if self.name is not None: + header_lines.append("#PBS -N %s" % self.name) + if queue is not None: + header_lines.append("#PBS -q %s" % queue) + if project is not None: + header_lines.append("#PBS -A %s" % project) + if resource_spec is None: + # Compute default resources specifications + resource_spec = "select=1:ncpus=%d" % self.worker_cores + memory_string = pbs_format_bytes_ceil(self.worker_memory) + resource_spec += ":mem=" + memory_string + logger.info( + "Resource specification for PBS not set, initializing it to %s" + % resource_spec + ) + if resource_spec is not None: + header_lines.append("#PBS -l %s" % resource_spec) + if walltime is not None: + header_lines.append("#PBS -l walltime=%s" % walltime) + if self.log_directory is not None: + header_lines.append("#PBS -e %s/" % self.log_directory) + header_lines.append("#PBS -o %s/" % self.log_directory) + header_lines.extend(["#PBS %s" % arg for arg in job_extra]) + header_lines.append("JOB_ID=${PBS_JOBID%%.*}") + + # Declare class attribute that shall be overridden + self.job_header = "\n".join(header_lines) + + logger.debug("Job script: \n %s" % self.job_script()) + diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py new file mode 100644 index 00000000..48ddee16 --- /dev/null +++ b/dask_jobqueue/tests/test_job.py @@ -0,0 +1,19 @@ +from dask_jobqueue import PBSJob +from dask.distributed import Scheduler, Client +from distributed.utils_test import cleanup +import pytest + + +def test_basic(): + job = PBSJob(scheduler="127.0.0.1:12345") + assert "127.0.0.1:12345" in job.job_script() + + +@pytest.mark.env("pbs") +@pytest.mark.asyncio +async def test_live(): + async with Scheduler(port=0) as s: + async with PBSJob(s.address, name="foo") as job: + async with Client(s.address, asynchronous=True) as client: + await client.wait_for_workers(1) + assert list(s.workers.values())[0].name == "foo" From 
7fde0005db3297f44467fc1d46ac27a25c748835 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 5 Aug 2019 17:17:03 -0700 Subject: [PATCH 002/109] add pytest-asyncio and fix pbs test --- ci/none.sh | 2 +- ci/pbs/Dockerfile | 2 +- ci/sge/Dockerfile-master | 2 +- ci/sge/Dockerfile-slave | 2 +- ci/slurm/Dockerfile | 2 +- dask_jobqueue/tests/test_job.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/none.sh b/ci/none.sh index c826f30c..6fbba1aa 100644 --- a/ci/none.sh +++ b/ci/none.sh @@ -4,7 +4,7 @@ function jobqueue_before_install { # Install miniconda ./ci/conda_setup.sh export PATH="$HOME/miniconda/bin:$PATH" - conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 pytest docrep + conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 pytest docrep pytest-asyncio # black only available for python 3 if [[ "$TRAVIS_PYTHON_VERSION" =~ ^[3-9].+ ]]; then pip install black diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile index 1013c91e..47fe2770 100644 --- a/ci/pbs/Dockerfile +++ b/ci/pbs/Dockerfile @@ -30,7 +30,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L bash miniconda.sh -f -b -p /opt/anaconda && \ /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh -RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep +RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio # Copy entrypoint and other needed scripts COPY ./*.sh / diff --git a/ci/sge/Dockerfile-master b/ci/sge/Dockerfile-master index d596fd60..c6f47340 100644 --- a/ci/sge/Dockerfile-master +++ b/ci/sge/Dockerfile-master @@ -10,7 +10,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH ARG PYTHON_VERSION -RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest && conda clean -tipy +RUN 
conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy COPY ./*.sh / COPY ./*.txt / diff --git a/ci/sge/Dockerfile-slave b/ci/sge/Dockerfile-slave index d97647cf..10e51d2e 100644 --- a/ci/sge/Dockerfile-slave +++ b/ci/sge/Dockerfile-slave @@ -10,7 +10,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH ARG PYTHON_VERSION -RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest && conda clean -tipy +RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy COPY ./setup-slave.sh / COPY ./*.sh / diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index 814cf792..e2bb7ad8 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -5,7 +5,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH -RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep +RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio ENV LC_ALL en_US.UTF-8 diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 48ddee16..d06aa132 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -5,7 +5,7 @@ def test_basic(): - job = PBSJob(scheduler="127.0.0.1:12345") + job = PBSJob(scheduler="127.0.0.1:12345", cores=1, memory="1 GB") assert "127.0.0.1:12345" in job.job_script() From a49ac2a414873bc486efdf5be18ac876ae8a9d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 6 Aug 2019 17:15:37 +0200 Subject: [PATCH 003/109] Add SGEJob with passing test. Fix flake8 as well. 
--- dask_jobqueue/__init__.py | 2 +- dask_jobqueue/job.py | 10 ------ dask_jobqueue/pbs.py | 2 -- dask_jobqueue/sge.py | 61 +++++++++++++++++++++++++++++++++ dask_jobqueue/tests/test_job.py | 25 +++++++++++--- 5 files changed, 83 insertions(+), 17 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 864fcba2..6faa8404 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -5,7 +5,7 @@ from .moab import MoabCluster from .pbs import PBSCluster, PBSJob from .slurm import SLURMCluster -from .sge import SGECluster +from .sge import SGECluster, SGEJob from .lsf import LSFCluster from .oar import OARCluster from .htcondor import HTCondorCluster diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index f8fc0637..1a4b56fc 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -5,21 +5,12 @@ from distributed.deploy.spec import ProcessInterface import logging -import math import os import re import shlex import subprocess -import sys -from collections import OrderedDict -from contextlib import contextmanager - import six -import dask -import docrep -from .deploy import ClusterManager -from distributed import LocalCluster from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface logger = logging.getLogger(__name__) @@ -327,4 +318,3 @@ def _call(self, cmd, **kwargs): "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err) ) return out - diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index db75f988..86141dc7 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -150,7 +150,6 @@ class PBSJob(Job): cancel_command = "qdel" config_name = "pbs" - def __init__( self, *args, @@ -212,4 +211,3 @@ def __init__( self.job_header = "\n".join(header_lines) logger.debug("Job script: \n %s" % self.job_script()) - diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index a9a8679a..490b7c56 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -5,6 +5,7 @@ import dask 
from .core import JobQueueCluster, docstrings +from .job import Job logger = logging.getLogger(__name__) @@ -108,3 +109,63 @@ def __init__( self.job_header = header_template % config logger.debug("Job script: \n %s" % self.job_script()) + + +class SGEJob(Job): + submit_command = "qsub" + cancel_command = "qdel" + + def __init__( + self, + *args, + queue=None, + project=None, + resource_spec=None, + walltime=None, + job_extra=None, + config_name="sge", + **kwargs + ): + if queue is None: + queue = dask.config.get("jobqueue.%s.queue" % config_name) + if project is None: + project = dask.config.get("jobqueue.%s.project" % config_name) + if resource_spec is None: + resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name) + if walltime is None: + walltime = dask.config.get("jobqueue.%s.walltime" % config_name) + if job_extra is None: + job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) + + super().__init__(config_name=config_name, **kwargs) + + header_lines = [] + if self.name is not None: + header_lines.append("#$ -N %(name)s") + if queue is not None: + header_lines.append("#$ -q %(queue)s") + if project is not None: + header_lines.append("#$ -P %(project)s") + if resource_spec is not None: + header_lines.append("#$ -l %(resource_spec)s") + if walltime is not None: + header_lines.append("#$ -l h_rt=%(walltime)s") + if self.log_directory is not None: + header_lines.append("#$ -e %(log_directory)s/") + header_lines.append("#$ -o %(log_directory)s/") + header_lines.extend(["#$ -cwd", "#$ -j y"]) + header_lines.extend(["#$ %s" % arg for arg in job_extra]) + header_template = "\n".join(header_lines) + + config = { + "name": self.name, + "queue": queue, + "project": project, + "processes": self.worker_processes, + "walltime": walltime, + "resource_spec": resource_spec, + "log_directory": self.log_directory, + } + self.job_header = header_template % config + + logger.debug("Job script: \n %s" % self.job_script()) diff --git 
a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index d06aa132..87bfc65f 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,6 +1,5 @@ -from dask_jobqueue import PBSJob +from dask_jobqueue import PBSJob, SGEJob from dask.distributed import Scheduler, Client -from distributed.utils_test import cleanup import pytest @@ -13,7 +12,25 @@ def test_basic(): @pytest.mark.asyncio async def test_live(): async with Scheduler(port=0) as s: - async with PBSJob(s.address, name="foo") as job: + async with PBSJob( + scheduler=s.address, name="foo", cores=1, memory="1GB" + ) as job: async with Client(s.address, asynchronous=True) as client: await client.wait_for_workers(1) - assert list(s.workers.values())[0].name == "foo" + worker_name = list(s.workers.values())[0].name + assert worker_name.startswith("foo") + assert job.job_id in worker_name + + +@pytest.mark.env("sge") +@pytest.mark.asyncio +async def test_live_sge(): + async with Scheduler(port=0) as s: + async with SGEJob( + scheduler=s.address, name="foo", cores=1, memory="1GB" + ) as job: + async with Client(s.address, asynchronous=True) as client: + await client.wait_for_workers(1) + worker_name = list(s.workers.values())[0].name + assert worker_name.startswith("foo") + assert job.job_id in worker_name From 86261f4a318a58d932a2d7bcb9f8c1ec6f83aec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 6 Aug 2019 17:28:59 +0200 Subject: [PATCH 004/109] Rewrite the test to be dask 2.2.0 compatible. 
--- dask_jobqueue/tests/test_job.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 87bfc65f..02abdf99 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -12,25 +12,23 @@ def test_basic(): @pytest.mark.asyncio async def test_live(): async with Scheduler(port=0) as s: - async with PBSJob( - scheduler=s.address, name="foo", cores=1, memory="1GB" - ) as job: - async with Client(s.address, asynchronous=True) as client: - await client.wait_for_workers(1) - worker_name = list(s.workers.values())[0].name - assert worker_name.startswith("foo") - assert job.job_id in worker_name + job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB") + job = await job + async with Client(s.address, asynchronous=True) as client: + await client.wait_for_workers(1) + worker_name = list(s.workers.values())[0].name + assert worker_name.startswith("foo") + assert job.job_id in worker_name @pytest.mark.env("sge") @pytest.mark.asyncio async def test_live_sge(): async with Scheduler(port=0) as s: - async with SGEJob( - scheduler=s.address, name="foo", cores=1, memory="1GB" - ) as job: - async with Client(s.address, asynchronous=True) as client: - await client.wait_for_workers(1) - worker_name = list(s.workers.values())[0].name - assert worker_name.startswith("foo") - assert job.job_id in worker_name + job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB") + job = await job + async with Client(s.address, asynchronous=True) as client: + await client.wait_for_workers(1) + worker_name = list(s.workers.values())[0].name + assert worker_name.startswith("foo") + assert job.job_id in worker_name From 3fdfd5723a0685343173f6511fbe522968606b27 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 12:12:44 -0600 Subject: [PATCH 005/109] Add basic JobQueueCluster SpecCluster implementation --- dask_jobqueue/job.py 
| 43 ++++++++++++++++++++++++++++++++- dask_jobqueue/tests/test_job.py | 25 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 1a4b56fc..037d5752 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -2,7 +2,8 @@ from contextlib import contextmanager import dask -from distributed.deploy.spec import ProcessInterface +from distributed.deploy.spec import ProcessInterface, SpecCluster +from distributed.scheduler import Scheduler import logging import os @@ -318,3 +319,43 @@ def _call(self, cmd, **kwargs): "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err) ) return out + + +def JobQueueCluster( + *args, + Job : Job = None, + n_workers=0, + # Cluster keywords + loop=None, + security=None, + silence_logs=False, + name=None, + asynchronous=False, + # Scheduler keywords + interface=None, + protocol="tcp://", + dashboard_address=":8787", + # Job keywords + **kwargs + ): + if Job is None: + raise ValueError("You must provide a Job type like PBSJob, SLURMJob, " + "or SGEJob with the Job= argument.") + + scheduler = { + "cls": Scheduler, # Use local scheduler for now + "options": { + "protocol" : protocol, + "interface": interface, + "dashboard_address": dashboard_address, + "security": security, + } + } + kwargs["interface"] = interface + kwargs["protocol"] = protocol + kwargs["security"] = security + worker = {"cls": Job, "options": kwargs} + + return SpecCluster(scheduler=scheduler, worker=worker, loop=loop, + silence_logs=silence_logs, + asynchronous=asynchronous, name=name) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 02abdf99..6dcc2305 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,4 +1,5 @@ from dask_jobqueue import PBSJob, SGEJob +from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client import pytest @@ -21,6 +22,18 @@ async def test_live(): assert 
job.job_id in worker_name +@pytest.mark.env("pbs") +@pytest.mark.asyncio +async def test_pbs_cluster(): + async with JobQueueCluster(cores=1, memory="1GB", Job=PBSJob, + asynchronous=True) as cluster: + cluster.scale(2) + await cluster + assert len(cluster.workers) == 2 + assert all(isinstance(w, PBSJob) for w in cluster.workers.values()) + assert all(w.status == "running" for w in cluster.workers.values()) + + @pytest.mark.env("sge") @pytest.mark.asyncio async def test_live_sge(): @@ -32,3 +45,15 @@ async def test_live_sge(): worker_name = list(s.workers.values())[0].name assert worker_name.startswith("foo") assert job.job_id in worker_name + + +@pytest.mark.env("sge") +@pytest.mark.asyncio +async def test_sge_cluster(): + async with JobQueueCluster(cores=1, memory="1GB", Job=SGEJob, + asynchronous=True) as cluster: + cluster.scale(2) + await cluster + assert len(cluster.workers) == 2 + assert all(isinstance(w, SGEJob) for w in cluster.workers.values()) + assert all(w.status == "running" for w in cluster.workers.values()) From 91ad132714d16b587273acbb4250469e06677c90 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 12:13:45 -0600 Subject: [PATCH 006/109] black --- dask_jobqueue/job.py | 53 +++++++++++++++++++-------------- dask_jobqueue/tests/test_job.py | 10 ++++--- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 037d5752..4f037ac7 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -322,40 +322,47 @@ def _call(self, cmd, **kwargs): def JobQueueCluster( - *args, - Job : Job = None, - n_workers=0, - # Cluster keywords - loop=None, - security=None, - silence_logs=False, - name=None, - asynchronous=False, - # Scheduler keywords - interface=None, - protocol="tcp://", - dashboard_address=":8787", - # Job keywords - **kwargs - ): + *args, + Job: Job = None, + n_workers=0, + # Cluster keywords + loop=None, + security=None, + silence_logs=False, + name=None, + 
asynchronous=False, + # Scheduler keywords + interface=None, + protocol="tcp://", + dashboard_address=":8787", + # Job keywords + **kwargs +): if Job is None: - raise ValueError("You must provide a Job type like PBSJob, SLURMJob, " - "or SGEJob with the Job= argument.") + raise ValueError( + "You must provide a Job type like PBSJob, SLURMJob, " + "or SGEJob with the Job= argument." + ) scheduler = { "cls": Scheduler, # Use local scheduler for now "options": { - "protocol" : protocol, + "protocol": protocol, "interface": interface, "dashboard_address": dashboard_address, "security": security, - } + }, } kwargs["interface"] = interface kwargs["protocol"] = protocol kwargs["security"] = security worker = {"cls": Job, "options": kwargs} - return SpecCluster(scheduler=scheduler, worker=worker, loop=loop, - silence_logs=silence_logs, - asynchronous=asynchronous, name=name) + return SpecCluster( + scheduler=scheduler, + worker=worker, + loop=loop, + silence_logs=silence_logs, + asynchronous=asynchronous, + name=name, + ) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 6dcc2305..71cf2edf 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -25,8 +25,9 @@ async def test_live(): @pytest.mark.env("pbs") @pytest.mark.asyncio async def test_pbs_cluster(): - async with JobQueueCluster(cores=1, memory="1GB", Job=PBSJob, - asynchronous=True) as cluster: + async with JobQueueCluster( + cores=1, memory="1GB", Job=PBSJob, asynchronous=True + ) as cluster: cluster.scale(2) await cluster assert len(cluster.workers) == 2 @@ -50,8 +51,9 @@ async def test_live_sge(): @pytest.mark.env("sge") @pytest.mark.asyncio async def test_sge_cluster(): - async with JobQueueCluster(cores=1, memory="1GB", Job=SGEJob, - asynchronous=True) as cluster: + async with JobQueueCluster( + cores=1, memory="1GB", Job=SGEJob, asynchronous=True + ) as cluster: cluster.scale(2) await cluster assert len(cluster.workers) == 2 From 
d67e4845f83337696a862f2d228b9dc53b4ed08b Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 12:17:40 -0600 Subject: [PATCH 007/109] Add test for initial scale --- dask_jobqueue/job.py | 10 +++++++--- dask_jobqueue/tests/test_job.py | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 4f037ac7..f73bf5a8 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -322,9 +322,8 @@ def _call(self, cmd, **kwargs): def JobQueueCluster( - *args, - Job: Job = None, n_workers=0, + Job: Job = None, # Cluster keywords loop=None, security=None, @@ -358,7 +357,7 @@ def JobQueueCluster( kwargs["security"] = security worker = {"cls": Job, "options": kwargs} - return SpecCluster( + cluster = SpecCluster( scheduler=scheduler, worker=worker, loop=loop, @@ -366,3 +365,8 @@ def JobQueueCluster( asynchronous=asynchronous, name=name, ) + + if n_workers: + cluster.scale(n_workers) + + return cluster diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 71cf2edf..80792d31 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -26,8 +26,9 @@ async def test_live(): @pytest.mark.asyncio async def test_pbs_cluster(): async with JobQueueCluster( - cores=1, memory="1GB", Job=PBSJob, asynchronous=True + 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True ) as cluster: + assert len(cluster.workers) == 1 cluster.scale(2) await cluster assert len(cluster.workers) == 2 @@ -52,8 +53,9 @@ async def test_live_sge(): @pytest.mark.asyncio async def test_sge_cluster(): async with JobQueueCluster( - cores=1, memory="1GB", Job=SGEJob, asynchronous=True + 1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True ) as cluster: + assert len(cluster.workers) == 1 cluster.scale(2) await cluster assert len(cluster.workers) == 2 From 146555f576b3eb74285621406c0730ff9bd2c633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 6 Aug 
2019 23:05:50 +0200 Subject: [PATCH 008/109] Fix name / job_name. --- dask_jobqueue/job.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index f73bf5a8..f34e944f 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -92,7 +92,6 @@ def __init__( self, scheduler=None, name=None, - job_name=None, cores=None, memory=None, processes=None, @@ -122,8 +121,8 @@ def __init__( "JobQueueCluster is an abstract class that should not be instantiated." ) - if job_name is None: - job_name = dask.config.get("jobqueue.%s.name" % config_name) + if name is None: + name = dask.config.get("jobqueue.%s.name" % config_name) if cores is None: cores = dask.config.get("jobqueue.%s.cores" % config_name) if memory is None: From 83f832abcc57ff9afc22c313bc1b0846dc1a21e2 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 17:15:32 -0700 Subject: [PATCH 009/109] add names to cluster tests --- dask_jobqueue/tests/test_job.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 80792d31..e9d4bba1 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -26,7 +26,7 @@ async def test_live(): @pytest.mark.asyncio async def test_pbs_cluster(): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True + 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo", ) as cluster: assert len(cluster.workers) == 1 cluster.scale(2) @@ -38,7 +38,7 @@ async def test_pbs_cluster(): @pytest.mark.env("sge") @pytest.mark.asyncio -async def test_live_sge(): +async def test_sge(): async with Scheduler(port=0) as s: job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB") job = await job @@ -53,7 +53,7 @@ async def test_live_sge(): @pytest.mark.asyncio async def test_sge_cluster(): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=SGEJob, 
asynchronous=True + 1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo", ) as cluster: assert len(cluster.workers) == 1 cluster.scale(2) From e520e214bdb2b54744dda845e48c3c80626b969d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 17:58:27 -0700 Subject: [PATCH 010/109] Add echo into job script template --- dask_jobqueue/htcondor.py | 13 ------------- dask_jobqueue/job.py | 8 +++++--- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index b751773e..5130bde3 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -53,7 +53,6 @@ class HTCondorCluster(JobQueueCluster): submit_command = "condor_submit -queue 1 -file" cancel_command = "condor_rm" - job_id_regexp = r"(?P\d+\.\d+)" # condor sets argv[0] of the executable to "condor_exec.exe", which confuses # Python (can't find its libs), so we have to go through the shell. @@ -139,18 +138,6 @@ def job_script(self): "executable": self.executable, } - def _job_id_from_submit_output(self, out): - cluster_id_regexp = r"submitted to cluster (\d+)" - match = re.search(cluster_id_regexp, out) - if match is None: - msg = ( - "Could not parse cluster id from submission command output.\n" - "Cluster id regexp is {!r}\n" - "Submission command output is:\n{}".format(cluster_id_regexp, out) - ) - raise ValueError(msg) - return "%s.0" % match.group(1) - def _double_up_quotes(instr): return instr.replace("'", "''").replace('"', '""') diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index f34e944f..59eacb83 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -26,7 +26,7 @@ class Job(ProcessInterface): Parameters ---------- name : str - Name of Dask workers. + Name of Dask worker. 
cores : int Total number of cores per job memory: str @@ -80,13 +80,15 @@ class Job(ProcessInterface): %(env_header)s +echo "Job ID: $JOB_ID" + %(worker_command)s """.lstrip() # Following class attributes should be overridden by extending classes. submit_command = None cancel_command = None - job_id_regexp = r"(?P\d+)" + job_id_regexp = r"Job ID:\s*(?P.*)" def __init__( self, @@ -185,7 +187,7 @@ def __init__( command_args += ["--nprocs", processes] command_args += ["--memory-limit", self.worker_process_memory] - command_args += ["--name", "%s--${JOB_ID}--" % name] + command_args += ["--name", str(name)] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] From c07133d2c554abf41bd36c95f0a2376b2a5cd99e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 19:32:17 -0700 Subject: [PATCH 011/109] Revert "Add echo into job script template" This reverts commit e520e214bdb2b54744dda845e48c3c80626b969d. --- dask_jobqueue/htcondor.py | 13 +++++++++++++ dask_jobqueue/job.py | 8 +++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 5130bde3..b751773e 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -53,6 +53,7 @@ class HTCondorCluster(JobQueueCluster): submit_command = "condor_submit -queue 1 -file" cancel_command = "condor_rm" + job_id_regexp = r"(?P\d+\.\d+)" # condor sets argv[0] of the executable to "condor_exec.exe", which confuses # Python (can't find its libs), so we have to go through the shell. 
@@ -138,6 +139,18 @@ def job_script(self): "executable": self.executable, } + def _job_id_from_submit_output(self, out): + cluster_id_regexp = r"submitted to cluster (\d+)" + match = re.search(cluster_id_regexp, out) + if match is None: + msg = ( + "Could not parse cluster id from submission command output.\n" + "Cluster id regexp is {!r}\n" + "Submission command output is:\n{}".format(cluster_id_regexp, out) + ) + raise ValueError(msg) + return "%s.0" % match.group(1) + def _double_up_quotes(instr): return instr.replace("'", "''").replace('"', '""') diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 59eacb83..f34e944f 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -26,7 +26,7 @@ class Job(ProcessInterface): Parameters ---------- name : str - Name of Dask worker. + Name of Dask workers. cores : int Total number of cores per job memory: str @@ -80,15 +80,13 @@ class Job(ProcessInterface): %(env_header)s -echo "Job ID: $JOB_ID" - %(worker_command)s """.lstrip() # Following class attributes should be overridden by extending classes. 
submit_command = None cancel_command = None - job_id_regexp = r"Job ID:\s*(?P.*)" + job_id_regexp = r"(?P\d+)" def __init__( self, @@ -187,7 +185,7 @@ def __init__( command_args += ["--nprocs", processes] command_args += ["--memory-limit", self.worker_process_memory] - command_args += ["--name", str(name)] + command_args += ["--name", "%s--${JOB_ID}--" % name] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] From 3ddd0b2ae709372c5c45a5097c3df452559b7dbc Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 19:40:35 -0700 Subject: [PATCH 012/109] improve job-name --- dask_jobqueue/job.py | 13 ++++++++----- dask_jobqueue/pbs.py | 4 ++-- dask_jobqueue/sge.py | 6 +++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index f34e944f..48603110 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -103,6 +103,7 @@ def __init__( log_directory=None, shebang=None, python=sys.executable, + job_name="dask-worker", config_name=None, **kwargs ): @@ -121,8 +122,8 @@ def __init__( "JobQueueCluster is an abstract class that should not be instantiated." 
) - if name is None: - name = dask.config.get("jobqueue.%s.name" % config_name) + if job_name is None: + job_name = dask.config.get("jobqueue.%s.name" % config_name) if cores is None: cores = dask.config.get("jobqueue.%s.cores" % config_name) if memory is None: @@ -170,6 +171,7 @@ def __init__( self.worker_processes = processes self.worker_cores = cores self.name = name + self.job_name = job_name self.shebang = shebang @@ -185,7 +187,7 @@ def __init__( command_args += ["--nprocs", processes] command_args += ["--memory-limit", self.worker_process_memory] - command_args += ["--name", "%s--${JOB_ID}--" % name] + command_args += ["--name", str(name)] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] @@ -236,7 +238,7 @@ def worker_process_memory(self): async def start(self): """ Start workers and point them to our local scheduler """ - logger.debug("Starting job: %s", self.name) + logger.debug("Starting worker: %s", self.name) with self.job_file() as fn: out = self._submit_job(fn) @@ -245,6 +247,7 @@ async def start(self): raise ValueError("Unable to parse jobid from output of %s" % out) self.job_id = job + logger.debug("Starting job: %s", self.job_id) await super().start() def _job_id_from_submit_output(self, out): @@ -269,7 +272,7 @@ def _job_id_from_submit_output(self, out): return job_id async def close(self): - logger.debug("Stopping job: %s", self.name) + logger.debug("Stopping worker: %s job: %s", self.name, self.job_id) if self.job_id: self._call(shlex.split(self.cancel_command) + [self.job_id]) diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 86141dc7..728e045d 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -182,8 +182,8 @@ def __init__( header_lines = [] # PBS header build - if self.name is not None: - header_lines.append("#PBS -N %s" % self.name) + if self.job_name is not None: + header_lines.append("#PBS -N %s" % self.job_name) if queue is not None: header_lines.append("#PBS -q %s" % queue) if 
project is not None: diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 490b7c56..04db64d6 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -140,8 +140,8 @@ def __init__( super().__init__(config_name=config_name, **kwargs) header_lines = [] - if self.name is not None: - header_lines.append("#$ -N %(name)s") + if self.job_name is not None: + header_lines.append("#$ -N %(job-name)s") if queue is not None: header_lines.append("#$ -q %(queue)s") if project is not None: @@ -158,7 +158,7 @@ def __init__( header_template = "\n".join(header_lines) config = { - "name": self.name, + "job-name": self.job_name, "queue": queue, "project": project, "processes": self.worker_processes, From 6f81aec57d6695baf1af057ac8b624e068466257 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 19:40:49 -0700 Subject: [PATCH 013/109] black --- dask_jobqueue/tests/test_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index e9d4bba1..8297fcc1 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -26,7 +26,7 @@ async def test_live(): @pytest.mark.asyncio async def test_pbs_cluster(): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo", + 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo" ) as cluster: assert len(cluster.workers) == 1 cluster.scale(2) @@ -53,7 +53,7 @@ async def test_sge(): @pytest.mark.asyncio async def test_sge_cluster(): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo", + 1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo" ) as cluster: assert len(cluster.workers) == 1 cluster.scale(2) From 370ea47fe94df468edbc9c01cf1a77c03258370e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 20:16:16 -0700 Subject: [PATCH 014/109] cleanup jobname --- dask_jobqueue/job.py | 2 +- 
dask_jobqueue/tests/test_job.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 48603110..66b158fb 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -103,7 +103,7 @@ def __init__( log_directory=None, shebang=None, python=sys.executable, - job_name="dask-worker", + job_name=None, config_name=None, **kwargs ): diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 8297fcc1..49563508 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -17,9 +17,7 @@ async def test_live(): job = await job async with Client(s.address, asynchronous=True) as client: await client.wait_for_workers(1) - worker_name = list(s.workers.values())[0].name - assert worker_name.startswith("foo") - assert job.job_id in worker_name + assert list(s.workers.values())[0].name == "foo" @pytest.mark.env("pbs") From e7bc03b610a2555fae2c04f1199e0b8fa5d1b821 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 6 Aug 2019 20:17:46 -0700 Subject: [PATCH 015/109] get debug information for tests in sge --- ci/sge.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/sge.sh b/ci/sge.sh index 1673e9b3..1f244283 100644 --- a/ci/sge.sh +++ b/ci/sge.sh @@ -17,7 +17,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E sge" + docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E sge" } function jobqueue_after_script { From 9cdac926d42a9769518fc486e6282a234866098e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 07:01:17 -0700 Subject: [PATCH 016/109] pass through `*args` --- dask_jobqueue/sge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 04db64d6..f50fd1cf 100644 --- a/dask_jobqueue/sge.py +++ 
b/dask_jobqueue/sge.py @@ -137,7 +137,7 @@ def __init__( if job_extra is None: job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) - super().__init__(config_name=config_name, **kwargs) + super().__init__(*args, config_name=config_name, **kwargs) header_lines = [] if self.job_name is not None: From 09e4d6fde8632731aa410a28f667b32cce76ef30 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 07:15:33 -0700 Subject: [PATCH 017/109] cleanup sge test --- dask_jobqueue/tests/test_job.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 49563508..0b104765 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -11,7 +11,7 @@ def test_basic(): @pytest.mark.env("pbs") @pytest.mark.asyncio -async def test_live(): +async def test_pbs_job(): async with Scheduler(port=0) as s: job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB") job = await job @@ -36,15 +36,13 @@ async def test_pbs_cluster(): @pytest.mark.env("sge") @pytest.mark.asyncio -async def test_sge(): +async def test_sge_job(): async with Scheduler(port=0) as s: job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB") job = await job async with Client(s.address, asynchronous=True) as client: await client.wait_for_workers(1) - worker_name = list(s.workers.values())[0].name - assert worker_name.startswith("foo") - assert job.job_id in worker_name + assert list(s.workers.values())[0].name == "foo" @pytest.mark.env("sge") From f5c86ebe5372a3258d81dd5bcd59db75c0510974 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 07:32:33 -0700 Subject: [PATCH 018/109] test scale down --- dask_jobqueue/tests/test_job.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 0b104765..7673dace 100644 --- a/dask_jobqueue/tests/test_job.py +++ 
b/dask_jobqueue/tests/test_job.py @@ -57,3 +57,10 @@ async def test_sge_cluster(): assert len(cluster.workers) == 2 assert all(isinstance(w, SGEJob) for w in cluster.workers.values()) assert all(w.status == "running" for w in cluster.workers.values()) + + cluster.scale(1) + await cluster + start = time() + while len(cluster.scheduler.workers) != 1: + await asyncio.sleep(0.1) + assert time() < start + 5 From 4b7182632fec2f0c813267d1d043428a79222c98 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 07:35:51 -0700 Subject: [PATCH 019/109] parametrize tests --- dask_jobqueue/tests/test_job.py | 39 +++++++++------------------------ 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 7673dace..cbb98b53 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -9,47 +9,28 @@ def test_basic(): assert "127.0.0.1:12345" in job.job_script() -@pytest.mark.env("pbs") -@pytest.mark.asyncio -async def test_pbs_job(): - async with Scheduler(port=0) as s: - job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB") - job = await job - async with Client(s.address, asynchronous=True) as client: - await client.wait_for_workers(1) - assert list(s.workers.values())[0].name == "foo" - - -@pytest.mark.env("pbs") -@pytest.mark.asyncio -async def test_pbs_cluster(): - async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo" - ) as cluster: - assert len(cluster.workers) == 1 - cluster.scale(2) - await cluster - assert len(cluster.workers) == 2 - assert all(isinstance(w, PBSJob) for w in cluster.workers.values()) - assert all(w.status == "running" for w in cluster.workers.values()) +job_params = [ + pytest.param(SGEJob, marks=[pytest.mark.env("sge")]), + pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]), +] -@pytest.mark.env("sge") +@pytest.mark.parametrize("Job", job_params) @pytest.mark.asyncio -async def 
test_sge_job(): +async def test_job(Job): async with Scheduler(port=0) as s: - job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB") + job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB") job = await job async with Client(s.address, asynchronous=True) as client: await client.wait_for_workers(1) assert list(s.workers.values())[0].name == "foo" -@pytest.mark.env("sge") +@pytest.mark.parametrize("Job", job_params) @pytest.mark.asyncio -async def test_sge_cluster(): +async def test_cluster(Job): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo" + 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" ) as cluster: assert len(cluster.workers) == 1 cluster.scale(2) From 6b554ab7a203ce4d659c5545933833165a8522ac Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 13:39:45 -0600 Subject: [PATCH 020/109] cleanup tests --- dask_jobqueue/tests/test_job.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index cbb98b53..fcc98926 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,6 +1,9 @@ +from time import time + from dask_jobqueue import PBSJob, SGEJob from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client + import pytest @@ -36,12 +39,11 @@ async def test_cluster(Job): cluster.scale(2) await cluster assert len(cluster.workers) == 2 - assert all(isinstance(w, SGEJob) for w in cluster.workers.values()) + assert all(isinstance(w, Job) for w in cluster.workers.values()) assert all(w.status == "running" for w in cluster.workers.values()) cluster.scale(1) await cluster - start = time() while len(cluster.scheduler.workers) != 1: await asyncio.sleep(0.1) assert time() < start + 5 From cada8a8027effed2ca3b370ad24afb7e9c6066ef Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 7 Aug 2019 13:39:51 -0600 Subject: 
[PATCH 021/109] close job with weakref.finalize --- dask_jobqueue/job.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 66b158fb..3deda7c8 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -1,16 +1,16 @@ -import sys from contextlib import contextmanager - -import dask -from distributed.deploy.spec import ProcessInterface, SpecCluster -from distributed.scheduler import Scheduler - import logging import os import re import shlex import subprocess import six +import sys +import weakref + +import dask +from distributed.deploy.spec import ProcessInterface, SpecCluster +from distributed.scheduler import Scheduler from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface @@ -247,6 +247,8 @@ async def start(self): raise ValueError("Unable to parse jobid from output of %s" % out) self.job_id = job + weakref.finalize(self, self._close_job, job) + logger.debug("Starting job: %s", self.job_id) await super().start() @@ -273,10 +275,15 @@ def _job_id_from_submit_output(self, out): async def close(self): logger.debug("Stopping worker: %s job: %s", self.name, self.job_id) - if self.job_id: - self._call(shlex.split(self.cancel_command) + [self.job_id]) + self._close_job(self.job_id) + + @classmethod + def _close_job(cls, job_id): + if job_id: + cls._call(shlex.split(cls.cancel_command) + [job_id]) - def _call(self, cmd, **kwargs): + @staticmethod + def _call(cmd, **kwargs): """ Call a command using subprocess.Popen. 
This centralizes calls out to the command line, providing consistent From 1d70a19feed475cbc0f276ac93df6082bda51a1d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 10 Aug 2019 08:34:37 -0600 Subject: [PATCH 022/109] Add SLURMJob --- dask_jobqueue/__init__.py | 2 +- dask_jobqueue/slurm.py | 99 +++++++++++++++++++++++++++++++++ dask_jobqueue/tests/test_job.py | 5 +- 3 files changed, 104 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 6faa8404..ed6d4de8 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -4,7 +4,7 @@ from .job import Job from .moab import MoabCluster from .pbs import PBSCluster, PBSJob -from .slurm import SLURMCluster +from .slurm import SLURMCluster, SLURMJob from .sge import SGECluster, SGEJob from .lsf import LSFCluster from .oar import OARCluster diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 7b051972..2d617267 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -6,6 +6,7 @@ import dask from .core import JobQueueCluster, docstrings +from .job import Job logger = logging.getLogger(__name__) @@ -145,3 +146,101 @@ def slurm_format_bytes_ceil(n): if n >= 1024: return "%dK" % math.ceil(n / 1024) return "1K" % n + + +class SLURMJob(Job): + __doc__ = docstrings.with_indents( + """ Launch Dask on a SLURM cluster + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#SBATCH -p` option. + project : str + Accounting string associated with each worker job. Passed to `#SBATCH -A` option. + walltime : str + Walltime for each worker job. + job_cpu : int + Number of cpu to book in SLURM, if None, defaults to worker `threads * processes` + job_mem : str + Amount of memory to request in SLURM. If None, defaults to worker + processes * memory + job_extra : list + List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix. 
+ %(JobQueueCluster.parameters)s + + Examples + -------- + """, + 4, + ) + + # Override class variables + submit_command = "sbatch" + cancel_command = "scancel" + + def __init__( + self, + *args, + queue=None, + project=None, + walltime=None, + job_cpu=None, + job_mem=None, + job_extra=None, + config_name="slurm", + **kwargs + ): + if queue is None: + queue = dask.config.get("jobqueue.%s.queue" % config_name) + if project is None: + project = dask.config.get("jobqueue.%s.project" % config_name) + if walltime is None: + walltime = dask.config.get("jobqueue.%s.walltime" % config_name) + if job_cpu is None: + job_cpu = dask.config.get("jobqueue.%s.job-cpu" % config_name) + if job_mem is None: + job_mem = dask.config.get("jobqueue.%s.job-mem" % config_name) + if job_extra is None: + job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) + + super().__init__(*args, config_name=config_name, **kwargs) + + # Always ask for only one task + header_lines = [] + # SLURM header build + if self.job_name is not None: + header_lines.append("#SBATCH -J %s" % self.job_name) + if self.log_directory is not None: + header_lines.append( + "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.job_name or "worker") + ) + header_lines.append( + "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.job_name or "worker") + ) + if queue is not None: + header_lines.append("#SBATCH -p %s" % queue) + if project is not None: + header_lines.append("#SBATCH -A %s" % project) + + # Init resources, always 1 task, + # and then number of cpu is processes * threads if not set + header_lines.append("#SBATCH -n 1") + header_lines.append( + "#SBATCH --cpus-per-task=%d" % (job_cpu or self.worker_cores) + ) + # Memory + memory = job_mem + if job_mem is None: + memory = slurm_format_bytes_ceil(self.worker_memory) + if memory is not None: + header_lines.append("#SBATCH --mem=%s" % memory) + + if walltime is not None: + header_lines.append("#SBATCH -t %s" % walltime) + 
header_lines.extend(["#SBATCH %s" % arg for arg in job_extra]) + + header_lines.append("\nJOB_ID=${SLURM_JOB_ID%;*}") + + # Declare class attribute that shall be overridden + self.job_header = "\n".join(header_lines) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index fcc98926..1a4340cb 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,6 +1,7 @@ +import asyncio from time import time -from dask_jobqueue import PBSJob, SGEJob +from dask_jobqueue import PBSJob, SGEJob, SLURMJob from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client @@ -15,6 +16,7 @@ def test_basic(): job_params = [ pytest.param(SGEJob, marks=[pytest.mark.env("sge")]), pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]), + pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]), ] @@ -43,6 +45,7 @@ async def test_cluster(Job): assert all(w.status == "running" for w in cluster.workers.values()) cluster.scale(1) + start = time() await cluster while len(cluster.scheduler.workers) != 1: await asyncio.sleep(0.1) From 2545ebcd6a9b0aafe6b537485092cf2fa6ba3c42 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 10 Aug 2019 09:05:42 -0600 Subject: [PATCH 023/109] Add test for adaptive --- dask_jobqueue/tests/test_job.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 1a4340cb..a25bef27 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -50,3 +50,28 @@ async def test_cluster(Job): while len(cluster.scheduler.workers) != 1: await asyncio.sleep(0.1) assert time() < start + 5 + + +@pytest.mark.parametrize("Job", job_params) +@pytest.mark.asyncio +async def test_adapt(Job): + async with JobQueueCluster( + 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" + ) as cluster: + cluster.adapt(minimum=0, maximum=4, interval="10ms") + + start = time() + while 
len(cluster.scheduler.workers): + await asyncio.sleep(0.050) + assert time() < start + 5 + + async with Client(cluster, asynchronous=True) as client: + future = client.submit(lambda: 0) + await client.wait_for_workers(1) + + del future + + start = time() + while len(cluster.scheduler.workers): + await asyncio.sleep(0.050) + assert time() < start + 5 From dae702dcff20f048cfe1f774b042458852e5a3dc Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 10 Aug 2019 09:20:14 -0600 Subject: [PATCH 024/109] fixup adaptive tests --- dask_jobqueue/tests/test_job.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index a25bef27..d314227c 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -58,14 +58,17 @@ async def test_adapt(Job): async with JobQueueCluster( 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" ) as cluster: - cluster.adapt(minimum=0, maximum=4, interval="10ms") + async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + cluster.adapt(minimum=0, maximum=4, interval="10ms") - start = time() - while len(cluster.scheduler.workers): - await asyncio.sleep(0.050) - assert time() < start + 5 + start = time() + while len(cluster.scheduler.workers): + await asyncio.sleep(0.050) + assert time() < start + 10 + assert not cluster.worker_spec + assert not cluster.workers - async with Client(cluster, asynchronous=True) as client: future = client.submit(lambda: 0) await client.wait_for_workers(1) @@ -74,4 +77,6 @@ async def test_adapt(Job): start = time() while len(cluster.scheduler.workers): await asyncio.sleep(0.050) - assert time() < start + 5 + assert time() < start + 10 + assert not cluster.worker_spec + assert not cluster.workers From 043888b4eb7d2579349b13f6228f973930542808 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 10 Aug 2019 09:38:43 -0600 Subject: [PATCH 
025/109] black --- dask_jobqueue/slurm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 2d617267..ec0b6f04 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -213,10 +213,12 @@ def __init__( header_lines.append("#SBATCH -J %s" % self.job_name) if self.log_directory is not None: header_lines.append( - "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.job_name or "worker") + "#SBATCH -e %s/%s-%%J.err" + % (self.log_directory, self.job_name or "worker") ) header_lines.append( - "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.job_name or "worker") + "#SBATCH -o %s/%s-%%J.out" + % (self.log_directory, self.job_name or "worker") ) if queue is not None: header_lines.append("#SBATCH -p %s" % queue) From f3974bb5eac426c8a1c79b70ddb3759a8865baaf Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 10 Aug 2019 09:41:29 -0600 Subject: [PATCH 026/109] close job in test_job --- dask_jobqueue/tests/test_job.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index d314227c..a76f70b5 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -30,6 +30,13 @@ async def test_job(Job): await client.wait_for_workers(1) assert list(s.workers.values())[0].name == "foo" + await job.close() + + start = time() + while len(s.workers): + await asyncio.sleep(0.1) + assert time() < start + 5 + @pytest.mark.parametrize("Job", job_params) @pytest.mark.asyncio From abcb4e1101fefa99b462d10df143a31f36493350 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 11 Aug 2019 13:32:18 -0600 Subject: [PATCH 027/109] Also wait for workers in test --- dask_jobqueue/tests/test_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index a76f70b5..6ab335ac 100644 --- a/dask_jobqueue/tests/test_job.py +++ 
b/dask_jobqueue/tests/test_job.py @@ -70,7 +70,7 @@ async def test_adapt(Job): cluster.adapt(minimum=0, maximum=4, interval="10ms") start = time() - while len(cluster.scheduler.workers): + while len(cluster.scheduler.workers) or cluster.workers: await asyncio.sleep(0.050) assert time() < start + 10 assert not cluster.worker_spec @@ -82,7 +82,7 @@ async def test_adapt(Job): del future start = time() - while len(cluster.scheduler.workers): + while len(cluster.scheduler.workers) or cluster.workers: await asyncio.sleep(0.050) assert time() < start + 10 assert not cluster.worker_spec From 5a709c0bbb77795545a9fe29630f5ad79d58dcc6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 13 Aug 2019 14:58:24 -0400 Subject: [PATCH 028/109] add dask/distributed git master to CI --- ci/pbs/Dockerfile | 1 + ci/sge/Dockerfile-master | 1 + ci/sge/Dockerfile-slave | 1 + ci/slurm/Dockerfile | 1 + 4 files changed, 4 insertions(+) diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile index 47fe2770..b3423e88 100644 --- a/ci/pbs/Dockerfile +++ b/ci/pbs/Dockerfile @@ -31,6 +31,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio +RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps # Copy entrypoint and other needed scripts COPY ./*.sh / diff --git a/ci/sge/Dockerfile-master b/ci/sge/Dockerfile-master index c6f47340..d6e486d2 100644 --- a/ci/sge/Dockerfile-master +++ b/ci/sge/Dockerfile-master @@ -11,6 +11,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L ENV PATH /opt/anaconda/bin:$PATH ARG PYTHON_VERSION RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy +RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps COPY ./*.sh / COPY ./*.txt / diff --git 
a/ci/sge/Dockerfile-slave b/ci/sge/Dockerfile-slave index 10e51d2e..777547ef 100644 --- a/ci/sge/Dockerfile-slave +++ b/ci/sge/Dockerfile-slave @@ -11,6 +11,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L ENV PATH /opt/anaconda/bin:$PATH ARG PYTHON_VERSION RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy +RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps COPY ./setup-slave.sh / COPY ./*.sh / diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index e2bb7ad8..1a8cc112 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -6,6 +6,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio +RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps ENV LC_ALL en_US.UTF-8 From 9b0b0f8c9d4562991bb4cf8b66b5e3e300a1ff04 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 19 Aug 2019 07:26:11 -0600 Subject: [PATCH 029/109] Relax test_slurm.py to not check jobs attributes --- dask_jobqueue/tests/test_slurm.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index fee6ed02..9d9eb262 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -25,7 +25,7 @@ def test_header(): assert "#SBATCH --mem=27G" in cluster.job_header assert "#SBATCH -t 00:02:00" in cluster.job_header assert "#SBATCH -p" not in cluster.job_header - assert "#SBATCH -A" not in cluster.job_header + # assert "#SBATCH -A" not in cluster.job_header with SLURMCluster( queue="regular", @@ -51,7 +51,7 @@ def test_header(): assert "#SBATCH -n 1" in cluster.job_header assert "#SBATCH -t " in cluster.job_header assert "#SBATCH -p" 
not in cluster.job_header - assert "#SBATCH -A" not in cluster.job_header + # assert "#SBATCH -A" not in cluster.job_header def test_job_script(): @@ -68,7 +68,7 @@ def test_job_script(): assert "#SBATCH --mem=27G" in job_script assert "#SBATCH -t 00:02:00" in job_script assert "#SBATCH -p" not in job_script - assert "#SBATCH -A" not in job_script + # assert "#SBATCH -A" not in job_script assert "export " not in job_script @@ -97,7 +97,7 @@ def test_job_script(): assert "#SBATCH --mem=27G" in job_script assert "#SBATCH -t 00:02:00" in job_script assert "#SBATCH -p" not in job_script - assert "#SBATCH -A" not in job_script + # assert "#SBATCH -A" not in job_script assert 'export LANG="en_US.utf8"' in job_script assert 'export LANGUAGE="en_US.utf8"' in job_script @@ -125,13 +125,10 @@ def test_basic(loop): cluster.scale(2) start = time() - while not (cluster.pending_jobs or cluster.running_jobs): - sleep(0.100) - assert time() < start + QUEUE_WAIT + client.wait_for_workers(2) future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - assert cluster.running_jobs workers = list(client.scheduler_info()["workers"].values()) w = workers[0] @@ -141,7 +138,7 @@ def test_basic(loop): cluster.scale(0) start = time() - while cluster.running_jobs: + while client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT @@ -161,27 +158,17 @@ def test_adaptive(loop): future = client.submit(lambda x: x + 1, 10) start = time() - while not (cluster.pending_jobs or cluster.running_jobs): - sleep(0.100) - assert time() < start + QUEUE_WAIT + client.wait_for_workers(1) assert future.result(QUEUE_WAIT) == 11 - start = time() - processes = cluster.worker_processes - while len(client.scheduler_info()["workers"]) != processes: - sleep(0.1) - assert time() < start + QUEUE_WAIT - del future start = time() - while cluster.running_jobs: + while client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT - assert 
cluster.finished_jobs - def test_config_name_slurm_takes_custom_config(): conf = { From 1443435fd8257ea15e7d932d34a6cad4caf629cc Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 20 Aug 2019 16:25:57 -0600 Subject: [PATCH 030/109] Replace SLURMCluster with new variant --- dask_jobqueue/job.py | 108 +++++++++++++++------------ dask_jobqueue/slurm.py | 120 ++---------------------------- dask_jobqueue/tests/__init__.py | 2 +- dask_jobqueue/tests/test_job.py | 28 +++---- dask_jobqueue/tests/test_slurm.py | 4 +- 5 files changed, 85 insertions(+), 177 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 3deda7c8..9903db85 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -330,52 +330,66 @@ def _call(cmd, **kwargs): return out -def JobQueueCluster( - n_workers=0, - Job: Job = None, - # Cluster keywords - loop=None, - security=None, - silence_logs=False, - name=None, - asynchronous=False, - # Scheduler keywords - interface=None, - protocol="tcp://", - dashboard_address=":8787", - # Job keywords - **kwargs -): - if Job is None: - raise ValueError( - "You must provide a Job type like PBSJob, SLURMJob, " - "or SGEJob with the Job= argument." +class JobQueueCluster(SpecCluster): + + def __init__( + self, + n_workers=0, + Job: Job = None, + # Cluster keywords + loop=None, + security=None, + silence_logs=False, + name=None, + asynchronous=False, + # Scheduler keywords + interface=None, + protocol="tcp://", + dashboard_address=":8787", + # Job keywords + **kwargs + ): + if Job is None: + raise ValueError( + "You must provide a Job type like PBSJob, SLURMJob, " + "or SGEJob with the Job= argument." 
+ ) + + + scheduler = { + "cls": Scheduler, # Use local scheduler for now + "options": { + "protocol": protocol, + "interface": interface, + "dashboard_address": dashboard_address, + "security": security, + }, + } + kwargs["interface"] = interface + kwargs["protocol"] = protocol + kwargs["security"] = security + worker = {"cls": Job, "options": kwargs} + self.example_job = Job("tcp://scheduler:8786", name="name", **kwargs) + + super().__init__( + scheduler=scheduler, + worker=worker, + loop=loop, + silence_logs=silence_logs, + asynchronous=asynchronous, + name=name, ) - scheduler = { - "cls": Scheduler, # Use local scheduler for now - "options": { - "protocol": protocol, - "interface": interface, - "dashboard_address": dashboard_address, - "security": security, - }, - } - kwargs["interface"] = interface - kwargs["protocol"] = protocol - kwargs["security"] = security - worker = {"cls": Job, "options": kwargs} - - cluster = SpecCluster( - scheduler=scheduler, - worker=worker, - loop=loop, - silence_logs=silence_logs, - asynchronous=asynchronous, - name=name, - ) - - if n_workers: - cluster.scale(n_workers) - - return cluster + if n_workers: + self.scale(n_workers) + + @property + def job_header(self): + return self.example_job.job_header + + def job_script(self): + return self.example_job.job_script() + + @property + def name(self): + return self.example_job.job_name diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index ec0b6f04..a0a97550 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -1,128 +1,17 @@ from __future__ import absolute_import, division, print_function +import functools import logging import math import dask -from .core import JobQueueCluster, docstrings -from .job import Job +from .core import docstrings +from .job import Job, JobQueueCluster logger = logging.getLogger(__name__) -class SLURMCluster(JobQueueCluster): - __doc__ = docstrings.with_indents( - """ Launch Dask on a SLURM cluster - - Parameters - ---------- - 
queue : str - Destination queue for each worker job. Passed to `#SBATCH -p` option. - project : str - Accounting string associated with each worker job. Passed to `#SBATCH -A` option. - walltime : str - Walltime for each worker job. - job_cpu : int - Number of cpu to book in SLURM, if None, defaults to worker `threads * processes` - job_mem : str - Amount of memory to request in SLURM. If None, defaults to worker - processes * memory - job_extra : list - List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix. - %(JobQueueCluster.parameters)s - - Examples - -------- - >>> from dask_jobqueue import SLURMCluster - >>> cluster = SLURMCluster(processes=6, cores=24, memory="120GB", - env_extra=['export LANG="en_US.utf8"', - 'export LANGUAGE="en_US.utf8"', - 'export LC_ALL="en_US.utf8"']) - >>> cluster.scale(10) # this may take a few seconds to launch - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. 
- - >>> cluster.adapt() - """, - 4, - ) - - # Override class variables - submit_command = "sbatch" - cancel_command = "scancel" - - def __init__( - self, - queue=None, - project=None, - walltime=None, - job_cpu=None, - job_mem=None, - job_extra=None, - config_name="slurm", - **kwargs - ): - if queue is None: - queue = dask.config.get("jobqueue.%s.queue" % config_name) - if project is None: - project = dask.config.get("jobqueue.%s.project" % config_name) - if walltime is None: - walltime = dask.config.get("jobqueue.%s.walltime" % config_name) - if job_cpu is None: - job_cpu = dask.config.get("jobqueue.%s.job-cpu" % config_name) - if job_mem is None: - job_mem = dask.config.get("jobqueue.%s.job-mem" % config_name) - if job_extra is None: - job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) - - super(SLURMCluster, self).__init__(config_name=config_name, **kwargs) - - # Always ask for only one task - header_lines = [] - # SLURM header build - if self.name is not None: - header_lines.append("#SBATCH -J %s" % self.name) - if self.log_directory is not None: - header_lines.append( - "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.name or "worker") - ) - header_lines.append( - "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.name or "worker") - ) - if queue is not None: - header_lines.append("#SBATCH -p %s" % queue) - if project is not None: - header_lines.append("#SBATCH -A %s" % project) - - # Init resources, always 1 task, - # and then number of cpu is processes * threads if not set - header_lines.append("#SBATCH -n 1") - header_lines.append( - "#SBATCH --cpus-per-task=%d" % (job_cpu or self.worker_cores) - ) - # Memory - memory = job_mem - if job_mem is None: - memory = slurm_format_bytes_ceil(self.worker_memory) - if memory is not None: - header_lines.append("#SBATCH --mem=%s" % memory) - - if walltime is not None: - header_lines.append("#SBATCH -t %s" % walltime) - header_lines.extend(["#SBATCH %s" % arg for arg in job_extra]) - - 
header_lines.append("JOB_ID=${SLURM_JOB_ID%;*}") - - # Declare class attribute that shall be overridden - self.job_header = "\n".join(header_lines) - - logger.debug("Job script: \n %s" % self.job_script()) - - def slurm_format_bytes_ceil(n): """ Format bytes as text. @@ -246,3 +135,6 @@ def __init__( # Declare class attribute that shall be overridden self.job_header = "\n".join(header_lines) + + +SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob) diff --git a/dask_jobqueue/tests/__init__.py b/dask_jobqueue/tests/__init__.py index 9d193036..7c7408c7 100644 --- a/dask_jobqueue/tests/__init__.py +++ b/dask_jobqueue/tests/__init__.py @@ -1,3 +1,3 @@ from __future__ import absolute_import, division, print_function -QUEUE_WAIT = 15 # seconds +QUEUE_WAIT = 60 # seconds diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 6ab335ac..392fdb09 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -44,19 +44,21 @@ async def test_cluster(Job): async with JobQueueCluster( 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" ) as cluster: - assert len(cluster.workers) == 1 - cluster.scale(2) - await cluster - assert len(cluster.workers) == 2 - assert all(isinstance(w, Job) for w in cluster.workers.values()) - assert all(w.status == "running" for w in cluster.workers.values()) - - cluster.scale(1) - start = time() - await cluster - while len(cluster.scheduler.workers) != 1: - await asyncio.sleep(0.1) - assert time() < start + 5 + async with Client(cluster, asynchronous=True) as client: + assert len(cluster.workers) == 1 + cluster.scale(2) + await cluster + assert len(cluster.workers) == 2 + assert all(isinstance(w, Job) for w in cluster.workers.values()) + assert all(w.status == "running" for w in cluster.workers.values()) + await client.wait_for_workers(2) + + cluster.scale(1) + start = time() + await cluster + while len(cluster.scheduler.workers) > 1: + await asyncio.sleep(0.1) + assert 
time() < start + 10 @pytest.mark.parametrize("Job", job_params) diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index 9d9eb262..4aeac472 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -117,7 +117,7 @@ def test_basic(loop): cores=2, processes=1, memory="2GB", - job_extra=["-D /"], + # job_extra=["-D /"], loop=loop, ) as cluster: with Client(cluster) as client: @@ -150,7 +150,7 @@ def test_adaptive(loop): cores=2, processes=1, memory="2GB", - job_extra=["-D /"], + # job_extra=["-D /"], loop=loop, ) as cluster: cluster.adapt() From 29dfccbcea50e06dfe440b40b2023452ff65a14a Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 20 Aug 2019 17:54:14 -0600 Subject: [PATCH 031/109] update tests --- dask_jobqueue/job.py | 3 +-- dask_jobqueue/tests/test_jobqueue_core.py | 4 ++-- dask_jobqueue/tests/test_slurm.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 9903db85..7f6daf47 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -355,7 +355,6 @@ def __init__( "or SGEJob with the Job= argument." 
) - scheduler = { "cls": Scheduler, # Use local scheduler for now "options": { @@ -391,5 +390,5 @@ def job_script(self): return self.example_job.job_script() @property - def name(self): + def job_name(self): return self.example_job.job_name diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 5a94663e..222276ff 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -104,7 +104,7 @@ def test_forward_ip(): @pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster] + "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] ) @pytest.mark.parametrize( "qsub_return_string", @@ -125,7 +125,7 @@ def test_job_id_from_qsub(Cluster, qsub_return_string): @pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster] + "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] ) def test_job_id_error_handling(Cluster): # non-matching regexp diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py index db9a0e9b..6b26ffc0 100644 --- a/dask_jobqueue/tests/test_slurm.py +++ b/dask_jobqueue/tests/test_slurm.py @@ -192,4 +192,4 @@ def test_config_name_slurm_takes_custom_config(): with dask.config.set({"jobqueue.slurm-config-name": conf}): with SLURMCluster(config_name="slurm-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" From 152d8e4350eca2e84592a4d41a769c14153cc32d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 20 Aug 2019 19:38:39 -0700 Subject: [PATCH 032/109] remove SLURMCluster from another test --- dask_jobqueue/tests/test_jobqueue_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 222276ff..61a2272b 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ 
b/dask_jobqueue/tests/test_jobqueue_core.py @@ -63,7 +63,7 @@ def test_shebang_settings(Cluster): @pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster] + "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] ) def test_repr(Cluster): with Cluster( From 87d33f45a8f7acb76a2832174bb965d3a870af61 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 21 Aug 2019 16:00:22 -0700 Subject: [PATCH 033/109] move around functions in slurm.py --- dask_jobqueue/slurm.py | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index ea1faade..d3ed18b7 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -10,31 +10,6 @@ logger = logging.getLogger(__name__) -def slurm_format_bytes_ceil(n): - """ Format bytes as text. - - SLURM expects KiB, MiB or Gib, but names it KB, MB, GB. SLURM does not handle Bytes, only starts at KB. - - >>> slurm_format_bytes_ceil(1) - '1K' - >>> slurm_format_bytes_ceil(1234) - '2K' - >>> slurm_format_bytes_ceil(12345678) - '13M' - >>> slurm_format_bytes_ceil(1234567890) - '2G' - >>> slurm_format_bytes_ceil(15000000000) - '14G' - """ - if n >= (1024 ** 3): - return "%dG" % math.ceil(n / (1024 ** 3)) - if n >= (1024 ** 2): - return "%dM" % math.ceil(n / (1024 ** 2)) - if n >= 1024: - return "%dK" % math.ceil(n / 1024) - return "1K" % n - - class SLURMJob(Job): __doc__ = docstrings.with_indents( """ Launch Dask on a SLURM cluster @@ -136,3 +111,28 @@ def __init__( SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob) + + +def slurm_format_bytes_ceil(n): + """ Format bytes as text. + + SLURM expects KiB, MiB or Gib, but names it KB, MB, GB. SLURM does not handle Bytes, only starts at KB. 
+ + >>> slurm_format_bytes_ceil(1) + '1K' + >>> slurm_format_bytes_ceil(1234) + '2K' + >>> slurm_format_bytes_ceil(12345678) + '13M' + >>> slurm_format_bytes_ceil(1234567890) + '2G' + >>> slurm_format_bytes_ceil(15000000000) + '14G' + """ + if n >= (1024 ** 3): + return "%dG" % math.ceil(n / (1024 ** 3)) + if n >= (1024 ** 2): + return "%dM" % math.ceil(n / (1024 ** 2)) + if n >= 1024: + return "%dK" % math.ceil(n / 1024) + return "1K" % n From c5ac57355b31ce4cae2055dc358887c722f4cd18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 22 Aug 2019 14:12:24 +0200 Subject: [PATCH 034/109] SGECluster now uses SpecCluster. With some quick test fixes. --- dask_jobqueue/sge.py | 108 ++------------------------------ dask_jobqueue/tests/test_sge.py | 36 ++++++----- 2 files changed, 25 insertions(+), 119 deletions(-) diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 0abd49e9..5ee873e8 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -1,114 +1,13 @@ import logging +import functools import dask -from .core import JobQueueCluster, docstrings -from .job import Job +from .job import Job, JobQueueCluster logger = logging.getLogger(__name__) -class SGECluster(JobQueueCluster): - __doc__ = docstrings.with_indents( - """ Launch Dask on a SGE cluster - - .. note:: - If you want a specific amount of RAM, both ``memory`` and ``resource_spec`` - must be specified. The exact syntax of ``resource_spec`` is defined by your - GridEngine system administrator. The amount of ``memory`` requested should - match the ``resource_spec``, so that Dask's memory management system can - perform accurately. - - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#$ -q` option. - project : str - Accounting string associated with each worker job. Passed to `#$ -A` option. - resource_spec : str - Request resources and specify job placement. Passed to `#$ -l` option. - walltime : str - Walltime for each worker job. 
- job_extra : list - List of other SGE options, for example -w e. Each option will be - prepended with the #$ prefix. - %(JobQueueCluster.parameters)s - - Examples - -------- - >>> from dask_jobqueue import SGECluster - >>> cluster = SGECluster(queue='regular') - >>> cluster.scale(10) # this may take a few seconds to launch - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. - - >>> cluster.adapt() - """, - 4, - ) - - # Override class variables - submit_command = "qsub -terse" - cancel_command = "qdel" - - def __init__( - self, - queue=None, - project=None, - resource_spec=None, - walltime=None, - job_extra=None, - config_name="sge", - **kwargs - ): - if queue is None: - queue = dask.config.get("jobqueue.%s.queue" % config_name) - if project is None: - project = dask.config.get("jobqueue.%s.project" % config_name) - if resource_spec is None: - resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name) - if walltime is None: - walltime = dask.config.get("jobqueue.%s.walltime" % config_name) - if job_extra is None: - job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) - - super().__init__(config_name=config_name, **kwargs) - - header_lines = [] - if self.name is not None: - header_lines.append("#$ -N %(name)s") - if queue is not None: - header_lines.append("#$ -q %(queue)s") - if project is not None: - header_lines.append("#$ -P %(project)s") - if resource_spec is not None: - header_lines.append("#$ -l %(resource_spec)s") - if walltime is not None: - header_lines.append("#$ -l h_rt=%(walltime)s") - if self.log_directory is not None: - header_lines.append("#$ -e %(log_directory)s/") - header_lines.append("#$ -o %(log_directory)s/") - header_lines.extend(["#$ -cwd", "#$ -j y"]) - header_lines.extend(["#$ %s" % arg for arg in job_extra]) - header_template = "\n".join(header_lines) - - config = { - "name": 
self.name, - "queue": queue, - "project": project, - "processes": self.worker_processes, - "walltime": walltime, - "resource_spec": resource_spec, - "log_directory": self.log_directory, - } - self.job_header = header_template % config - - logger.debug("Job script: \n %s" % self.job_script()) - - class SGEJob(Job): submit_command = "qsub" cancel_command = "qdel" @@ -167,3 +66,6 @@ def __init__( self.job_header = header_template % config logger.debug("Job script: \n %s" % self.job_script()) + + +SGECluster = functools.partial(JobQueueCluster, Job=SGEJob) diff --git a/dask_jobqueue/tests/test_sge.py b/dask_jobqueue/tests/test_sge.py index 0d8dac75..980de436 100644 --- a/dask_jobqueue/tests/test_sge.py +++ b/dask_jobqueue/tests/test_sge.py @@ -19,13 +19,13 @@ def test_basic(loop): cluster.scale(2) start = time() - while not (cluster.pending_jobs or cluster.running_jobs): + while not client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - assert cluster.running_jobs + assert len(client.scheduler_info()["workers"]) > 0 workers = list(client.scheduler_info()["workers"].values()) w = workers[0] @@ -35,7 +35,7 @@ def test_basic(loop): cluster.scale(0) start = time() - while cluster.running_jobs: + while client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT @@ -65,7 +65,7 @@ def test_config_name_sge_takes_custom_config(): with dask.config.set({"jobqueue.sge-config-name": conf}): with SGECluster(config_name="sge-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" def test_job_script(tmpdir): @@ -107,19 +107,23 @@ def test_complex_cancel_command(loop): with SGECluster( walltime="00:02:00", cores=1, processes=1, memory="2GB", loop=loop ) as cluster: - username = "root" - cluster.cancel_command = "qdel -u {}".format(username) + with Client(cluster) as client: + username = "root" + 
cluster.cancel_command = "qdel -u {}".format(username) - cluster.scale(2) + cluster.scale(2) - start = time() - while not cluster.running_jobs: - sleep(0.100) - assert time() < start + QUEUE_WAIT + start = time() + while not client.scheduler_info()["workers"]: + sleep(0.100) + assert time() < start + QUEUE_WAIT - cluster.stop_all_jobs() + # TODO: Is there a replacement for .stop_all_jobs? stop_all_jobs + # does make sure that the pending jobs get qdeled. + # cluster.stop_all_jobs() + cluster.scale(0) - start = time() - while cluster.running_jobs: - sleep(0.100) - assert time() < start + QUEUE_WAIT + start = time() + while client.scheduler_info()["workers"]: + sleep(0.100) + assert time() < start + QUEUE_WAIT From f3847e080211d1fbbcd7c2403ca8e168de8e2acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 22 Aug 2019 17:02:35 +0200 Subject: [PATCH 035/109] Two more fixes now that ._job_id_from_submit_output has moved to the job level. --- dask_jobqueue/tests/test_jobqueue_core.py | 56 +++++++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 61a2272b..159583c5 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -16,6 +16,8 @@ OARCluster, ) +from dask_jobqueue.sge import SGEJob + def test_errors(): with pytest.raises(NotImplementedError) as info: @@ -62,12 +64,15 @@ def test_shebang_settings(Cluster): assert job_script.startswith(default_shebang) -@pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] -) +@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]) def test_repr(Cluster): with Cluster( - walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker" + # TODO name -> job_name could be a problem ... 
+ walltime="00:02:00", + processes=4, + cores=8, + memory="28GB", + job_name="dask-worker", ) as cluster: cluster_repr = repr(cluster) assert cluster.__class__.__name__ in cluster_repr @@ -103,9 +108,7 @@ def test_forward_ip(): assert cluster.local_cluster.scheduler.ip == default_ip -@pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] -) +@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster]) @pytest.mark.parametrize( "qsub_return_string", [ @@ -117,17 +120,34 @@ def test_forward_ip(): "{job_id}", ], ) -def test_job_id_from_qsub(Cluster, qsub_return_string): +def test_job_id_from_qsub_legacy(Cluster, qsub_return_string): original_job_id = "654321" qsub_return_string = qsub_return_string.format(job_id=original_job_id) with Cluster(cores=1, memory="1GB") as cluster: assert original_job_id == cluster._job_id_from_submit_output(qsub_return_string) +@pytest.mark.parametrize("Job", [SGEJob]) @pytest.mark.parametrize( - "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster] + "qsub_return_string", + [ + "{job_id}.admin01", + "Request {job_id}.asdf was sumbitted to queue: standard.", + "sbatch: Submitted batch job {job_id}", + "{job_id};cluster", + "Job <{job_id}> is submitted to default queue .", + "{job_id}", + ], ) -def test_job_id_error_handling(Cluster): +def test_job_id_from_qsub(Job, qsub_return_string): + original_job_id = "654321" + qsub_return_string = qsub_return_string.format(job_id=original_job_id) + job = Job(cores=1, memory="1GB") + assert original_job_id == job._job_id_from_submit_output(qsub_return_string) + + +@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster]) +def test_job_id_error_handling_legacy(Cluster): # non-matching regexp with Cluster(cores=1, memory="1GB") as cluster: with pytest.raises(ValueError, match="Could not parse job id"): @@ -142,6 +162,22 @@ def test_job_id_error_handling(Cluster): cluster._job_id_from_submit_output(return_string) 
+@pytest.mark.parametrize("Job", [SGEJob]) +def test_job_id_error_handling(Job): + # non-matching regexp + job = Job(cores=1, memory="1GB") + with pytest.raises(ValueError, match="Could not parse job id"): + return_string = "there is no number here" + job._job_id_from_submit_output(return_string) + + # no job_id named group in the regexp + job = Job(cores=1, memory="1GB") + with pytest.raises(ValueError, match="You need to use a 'job_id' named group"): + return_string = "Job <12345> submitted to ." + job.job_id_regexp = r"(\d+)" + job._job_id_from_submit_output(return_string) + + def test_log_directory(tmpdir): shutil.rmtree(tmpdir.strpath, ignore_errors=True) with PBSCluster(cores=1, memory="1GB"): From 1ff904d97d3a65b4b1c467c3c6d7de215fedf581 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 22 Aug 2019 15:31:44 -0700 Subject: [PATCH 036/109] silence logs by default --- dask_jobqueue/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 7f6daf47..a20f57c5 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -339,7 +339,7 @@ def __init__( # Cluster keywords loop=None, security=None, - silence_logs=False, + silence_logs="error", name=None, asynchronous=False, # Scheduler keywords From afe8e9e4c5d661cb7b0e34cf5b215e7131475404 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 22 Aug 2019 15:32:08 -0700 Subject: [PATCH 037/109] bump requirements --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ba31c0ce..0834a6ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -dask>=2 -distributed>=2.1 +dask>=2.3 +distributed>=2.3 docrep From 95f00ac84c4c919019dc86cc7224b569a1b47013 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 12:46:22 -0700 Subject: [PATCH 038/109] add config_name to JobQueueCluster --- dask_jobqueue/job.py | 7 +++++++ dask_jobqueue/sge.py | 2 
+- dask_jobqueue/slurm.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index a20f57c5..ebdfc7d8 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -346,6 +346,7 @@ def __init__( interface=None, protocol="tcp://", dashboard_address=":8787", + config_name=None, # Job keywords **kwargs ): @@ -355,6 +356,10 @@ def __init__( "or SGEJob with the Job= argument." ) + if config_name: + if interface is None: + interface = dask.config.get("jobqueue.%s.interface" % config_name) + scheduler = { "cls": Scheduler, # Use local scheduler for now "options": { @@ -364,6 +369,8 @@ def __init__( "security": security, }, } + if config_name: + kwargs["config_name"] = config_name kwargs["interface"] = interface kwargs["protocol"] = protocol kwargs["security"] = security diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 5ee873e8..79ced212 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -68,4 +68,4 @@ def __init__( logger.debug("Job script: \n %s" % self.job_script()) -SGECluster = functools.partial(JobQueueCluster, Job=SGEJob) +SGECluster = functools.partial(JobQueueCluster, Job=SGEJob, config_name="sge") diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index d3ed18b7..b5082c5b 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -110,7 +110,7 @@ def __init__( self.job_header = "\n".join(header_lines) -SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob) +SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob, config_name="slurm") def slurm_format_bytes_ceil(n): From 191186100fc9a5b62eea4bad2b07f9632fc6e03c Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 13:22:53 -0700 Subject: [PATCH 039/109] Move repr functionality upstream to Cluster.__repr__ https://github.com/dask/distributed/pull/2995 --- dask_jobqueue/tests/test_jobqueue_core.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git 
a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 159583c5..8f301295 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -64,23 +64,6 @@ def test_shebang_settings(Cluster): assert job_script.startswith(default_shebang) -@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]) -def test_repr(Cluster): - with Cluster( - # TODO name -> job_name could be a problem ... - walltime="00:02:00", - processes=4, - cores=8, - memory="28GB", - job_name="dask-worker", - ) as cluster: - cluster_repr = repr(cluster) - assert cluster.__class__.__name__ in cluster_repr - assert "cores=0" in cluster_repr - assert "memory=0 B" in cluster_repr - assert "workers=0" in cluster_repr - - @pytest.mark.parametrize( "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster] ) From 716dd40cf0b05ca0e559c75aebc8fb6d0d2fa328 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 13:39:41 -0700 Subject: [PATCH 040/109] move pbs and moab --- dask_jobqueue/job.py | 2 + dask_jobqueue/moab.py | 46 ++------- dask_jobqueue/pbs.py | 115 +--------------------- dask_jobqueue/tests/test_jobqueue_core.py | 23 ++--- dask_jobqueue/tests/test_pbs.py | 8 +- 5 files changed, 30 insertions(+), 164 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index ebdfc7d8..a7db2e12 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -344,6 +344,7 @@ def __init__( asynchronous=False, # Scheduler keywords interface=None, + host=None, protocol="tcp://", dashboard_address=":8787", config_name=None, @@ -365,6 +366,7 @@ def __init__( "options": { "protocol": protocol, "interface": interface, + "host": host, "dashboard_address": dashboard_address, "security": security, }, diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py index 0114ac91..3c6f5bc5 100644 --- a/dask_jobqueue/moab.py +++ b/dask_jobqueue/moab.py @@ -1,45 +1,13 @@ -from 
.core import docstrings -from .pbs import PBSCluster +import functools +from .job import JobQueueCluster +from .pbs import PBSJob -class MoabCluster(PBSCluster): - __doc__ = docstrings.with_indents( - """Launch Dask on a Moab cluster - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#PBS -q` option. - project : str - Accounting string associated with each worker job. Passed to - `#PBS -A` option. - resource_spec : str - Request resources and specify job placement. Passed to `#PBS -l` option. - walltime : str - Walltime for each worker job. - job_extra : list - List of other PBS options, for example -j oe. Each option will be prepended with the #PBS prefix. - %(JobQueueCluster.parameters)s - - Examples - -------- - >>> import os - >>> from dask_jobqueue import MoabCluster - >>> cluster = MoabCluster(processes=6, cores=6, project='gfdl_m', - memory='96G', resource_spec='96G', - job_extra=['-d /home/First.Last', '-M none'], - local_directory=os.getenv('TMPDIR', '/tmp')) - >>> cluster.scale(60) # submit enough jobs to deploy 10 workers - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. 
- - >>> cluster.adapt() - """, - 4, - ) +class MoabJob(PBSJob): submit_command = "msub" cancel_command = "canceljob" scheduler_name = "moab" + + +MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name='pbs') diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 1314d618..383e70a3 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -1,123 +1,15 @@ +import functools import logging import math import os import dask -from .core import JobQueueCluster, docstrings -from .job import Job +from .job import Job, JobQueueCluster logger = logging.getLogger(__name__) -class PBSCluster(JobQueueCluster): - __doc__ = docstrings.with_indents( - """ Launch Dask on a PBS cluster - - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#PBS -q` option. - project : str - Accounting string associated with each worker job. Passed to - `#PBS -A` option. - resource_spec : str - Request resources and specify job placement. Passed to `#PBS -l` - option. - walltime : str - Walltime for each worker job. - job_extra : list - List of other PBS options, for example -j oe. Each option will be prepended with the #PBS prefix. - %(JobQueueCluster.parameters)s - - Examples - -------- - >>> from dask_jobqueue import PBSCluster - >>> cluster = PBSCluster(queue='regular', project='DaskOnPBS', cores=12) - >>> cluster.scale(10) # this may take a few seconds to launch - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. - - >>> cluster.adapt() - - It is a good practice to define local_directory to your PBS system scratch directory: - - >>> cluster = PBSCluster(queue='regular', project='DaskOnPBS', - ... local_directory='$TMPDIR', - ... 
cores=24, processes=6, memory='100GB') - """, - 4, - ) - - # Override class variables - submit_command = "qsub" - cancel_command = "qdel" - - def __init__( - self, - queue=None, - project=None, - resource_spec=None, - walltime=None, - job_extra=None, - config_name="pbs", - **kwargs - ): - if queue is None: - queue = dask.config.get("jobqueue.%s.queue" % config_name) - if resource_spec is None: - resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name) - if walltime is None: - walltime = dask.config.get("jobqueue.%s.walltime" % config_name) - if job_extra is None: - job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) - if project is None: - project = dask.config.get( - "jobqueue.%s.project" % config_name - ) or os.environ.get("PBS_ACCOUNT") - - # Instantiate args and parameters from parent abstract class - super().__init__(config_name=config_name, **kwargs) - - # Try to find a project name from environment variable - project = project or os.environ.get("PBS_ACCOUNT") - - header_lines = [] - # PBS header build - if self.name is not None: - header_lines.append("#PBS -N %s" % self.name) - if queue is not None: - header_lines.append("#PBS -q %s" % queue) - if project is not None: - header_lines.append("#PBS -A %s" % project) - if resource_spec is None: - # Compute default resources specifications - resource_spec = "select=1:ncpus=%d" % self.worker_cores - memory_string = pbs_format_bytes_ceil(self.worker_memory) - resource_spec += ":mem=" + memory_string - logger.info( - "Resource specification for PBS not set, initializing it to %s" - % resource_spec - ) - if resource_spec is not None: - header_lines.append("#PBS -l %s" % resource_spec) - if walltime is not None: - header_lines.append("#PBS -l walltime=%s" % walltime) - if self.log_directory is not None: - header_lines.append("#PBS -e %s/" % self.log_directory) - header_lines.append("#PBS -o %s/" % self.log_directory) - header_lines.extend(["#PBS %s" % arg for arg in job_extra]) - 
header_lines.append("JOB_ID=${PBS_JOBID%%.*}") - - # Declare class attribute that shall be overridden - self.job_header = "\n".join(header_lines) - - logger.debug("Job script: \n %s" % self.job_script()) - - def pbs_format_bytes_ceil(n): """ Format bytes as text. @@ -209,3 +101,6 @@ def __init__( self.job_header = "\n".join(header_lines) logger.debug("Job script: \n %s" % self.job_script()) + + +PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name='pbs') diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 8f301295..2020bcb5 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -30,11 +30,11 @@ def test_command_template(): with PBSCluster(cores=2, memory="4GB") as cluster: assert ( "%s -m distributed.cli.dask_worker" % (sys.executable) - in cluster._command_template + in cluster.example_job._command_template ) - assert " --nthreads 2" in cluster._command_template - assert " --memory-limit " in cluster._command_template - assert " --name " in cluster._command_template + assert " --nthreads 2" in cluster.example_job._command_template + assert " --memory-limit " in cluster.example_job._command_template + assert " --name " in cluster.example_job._command_template with PBSCluster( cores=2, @@ -43,9 +43,9 @@ def test_command_template(): local_directory="/scratch", extra=["--preload", "mymodule"], ) as cluster: - assert " --death-timeout 60" in cluster._command_template - assert " --local-directory /scratch" in cluster._command_template - assert " --preload mymodule" in cluster._command_template + assert " --death-timeout 60" in cluster.example_job._command_template + assert " --local-directory /scratch" in cluster.example_job._command_template + assert " --preload mymodule" in cluster.example_job._command_template @pytest.mark.parametrize( @@ -82,16 +82,16 @@ def test_forward_ip(): name="dask-worker", host=ip, ) as cluster: - assert 
cluster.local_cluster.scheduler.ip == ip + assert cluster.scheduler.ip == ip default_ip = socket.gethostbyname("") with PBSCluster( walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker" ) as cluster: - assert cluster.local_cluster.scheduler.ip == default_ip + assert cluster.scheduler.ip == default_ip -@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster]) +@pytest.mark.parametrize("Cluster", [LSFCluster]) @pytest.mark.parametrize( "qsub_return_string", [ @@ -129,7 +129,7 @@ def test_job_id_from_qsub(Job, qsub_return_string): assert original_job_id == job._job_id_from_submit_output(qsub_return_string) -@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster]) +@pytest.mark.parametrize("Cluster", [LSFCluster]) def test_job_id_error_handling_legacy(Cluster): # non-matching regexp with Cluster(cores=1, memory="1GB") as cluster: @@ -170,6 +170,7 @@ def test_log_directory(tmpdir): assert os.path.exists(tmpdir.strpath) +@pytest.mark.skip def test_jobqueue_cluster_call(tmpdir): cluster = PBSCluster(cores=1, memory="1GB") diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index f7abb53f..1e2f4de8 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -22,7 +22,6 @@ def test_header(Cluster): assert "#PBS -l walltime=00:02:00" in cluster.job_header assert "#PBS -q" not in cluster.job_header assert "#PBS -A" not in cluster.job_header - assert "--name dask-worker--${JOB_ID}--" in cluster.job_script() with Cluster( queue="regular", @@ -388,7 +387,7 @@ def test_config_name_pbs_takes_custom_config(): with dask.config.set({"jobqueue.pbs-config-name": conf}): with PBSCluster(config_name="pbs-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" def test_informative_errors(): @@ -401,6 +400,7 @@ def test_informative_errors(): assert "cores" in str(info.value) -def test_adapt(loop): - with PBSCluster(loop, cores=1, memory="1 
GB") as cluster: +@pytest.mark.asyncio +async def test_adapt(loop): + async with PBSCluster(cores=1, memory="1 GB", asynchronous=True) as cluster: cluster.adapt() From 1f206815c19d3c0001debeb1172cab99ef692d7b Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 16:03:18 -0700 Subject: [PATCH 041/109] move over LSF --- dask_jobqueue/lsf.py | 20 ++++++++++++-------- dask_jobqueue/tests/test_jobqueue_core.py | 4 ++-- dask_jobqueue/tests/test_lsf.py | 3 +-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index fd8d20af..95042a8d 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -1,15 +1,17 @@ +import functools import logging import math import os import dask -from .core import JobQueueCluster, docstrings +from .core import docstrings +from .job import Job, JobQueueCluster logger = logging.getLogger(__name__) -class LSFCluster(JobQueueCluster): +class LSFJob(Job): __doc__ = docstrings.with_indents( """ Launch Dask on a LSF cluster @@ -32,12 +34,12 @@ class LSFCluster(JobQueueCluster): lsf_units : str Unit system for large units in resource usage set by the LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster. - %(JobQueueCluster.parameters)s + %(Job.parameters)s Examples -------- >>> from dask_jobqueue import LSFCluster - >>> cluster = LSFcluster(queue='general', project='DaskonLSF', + >>> cluster = LSFCluster(queue='general', project='DaskonLSF', ... 
cores=15, memory='25GB') >>> cluster.scale(10) # this may take a few seconds to launch @@ -51,13 +53,12 @@ class LSFCluster(JobQueueCluster): """, 4, ) - - # Override class variables submit_command = "bsub <" cancel_command = "bkill" def __init__( self, + *args, queue=None, project=None, ncpus=None, @@ -84,12 +85,12 @@ def __init__( lsf_units = dask.config.get("jobqueue.%s.lsf-units" % config_name) # Instantiate args and parameters from parent abstract class - super().__init__(config_name=config_name, **kwargs) + super().__init__(*args, config_name=config_name, **kwargs) header_lines = [] # LSF header build if self.name is not None: - header_lines.append("#BSUB -J %s" % self.name) + header_lines.append("#BSUB -J %s" % self.job_name) if self.log_directory is not None: header_lines.append( "#BSUB -e %s/%s-%%J.err" % (self.log_directory, self.name or "worker") @@ -196,3 +197,6 @@ def lsf_detect_units(): "default unit of %s." % unit ) return unit + + +LSFCluster = functools.partial(JobQueueCluster, Job=LSFJob, config_name="lsf") diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 2020bcb5..929140ec 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -91,7 +91,7 @@ def test_forward_ip(): assert cluster.scheduler.ip == default_ip -@pytest.mark.parametrize("Cluster", [LSFCluster]) +@pytest.mark.parametrize("Cluster", []) @pytest.mark.parametrize( "qsub_return_string", [ @@ -129,7 +129,7 @@ def test_job_id_from_qsub(Job, qsub_return_string): assert original_job_id == job._job_id_from_submit_output(qsub_return_string) -@pytest.mark.parametrize("Cluster", [LSFCluster]) +@pytest.mark.parametrize("Cluster", []) def test_job_id_error_handling_legacy(Cluster): # non-matching regexp with Cluster(cores=1, memory="1GB") as cluster: diff --git a/dask_jobqueue/tests/test_lsf.py b/dask_jobqueue/tests/test_lsf.py index bd3ca5e7..7915a1bb 100644 --- 
a/dask_jobqueue/tests/test_lsf.py +++ b/dask_jobqueue/tests/test_lsf.py @@ -25,7 +25,6 @@ def test_header(): assert "#BSUB -W 00:02" in cluster.job_header assert "#BSUB -q" not in cluster.job_header assert "#BSUB -P" not in cluster.job_header - assert "--name dask-worker--${JOB_ID}--" in cluster.job_script() with LSFCluster( queue="general", @@ -249,7 +248,7 @@ def test_config_name_lsf_takes_custom_config(): with dask.config.set({"jobqueue.lsf-config-name": conf}): with LSFCluster(config_name="lsf-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" def test_informative_errors(): From 172ecd43917f2f35a6d51d1fcb825267521425f1 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 16:11:01 -0700 Subject: [PATCH 042/109] Add header_skip option to Job constructor --- dask_jobqueue/__init__.py | 4 ++-- dask_jobqueue/job.py | 11 ++++++++++- dask_jobqueue/tests/test_job.py | 11 ++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index ed6d4de8..d88ec417 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -2,11 +2,11 @@ from . import config from .core import JobQueueCluster from .job import Job -from .moab import MoabCluster +from .moab import MoabCluster, MoabJob from .pbs import PBSCluster, PBSJob from .slurm import SLURMCluster, SLURMJob from .sge import SGECluster, SGEJob -from .lsf import LSFCluster +from .lsf import LSFCluster, LSFJob from .oar import OARCluster from .htcondor import HTCondorCluster diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index a7db2e12..1e70bfaa 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -43,6 +43,9 @@ class Job(ProcessInterface): Additional arguments to pass to `dask-worker` env_extra : list Other commands to add to script before launching worker. + header_skip : list + Lines to skip in the header. 
+ Header lines matching this text will be removed log_directory : str Directory to use for job scheduler logs. shebang : str @@ -100,6 +103,7 @@ def __init__( local_directory=None, extra=None, env_extra=None, + header_skip=None, log_directory=None, shebang=None, python=sys.executable, @@ -142,6 +146,8 @@ def __init__( extra = dask.config.get("jobqueue.%s.extra" % config_name) if env_extra is None: env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name) + if header_skip is None: + header_skip = dask.config.get("jobqueue.%s.header-skip" % config_name, ()) if log_directory is None: log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name) if shebang is None: @@ -176,6 +182,7 @@ def __init__( self.shebang = shebang self._env_header = "\n".join(env_extra) + self.header_skip = set(header_skip) # dask-worker command line build dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict( @@ -205,9 +212,11 @@ def __init__( def job_script(self): """ Construct a job submission script """ + header = "\n".join([line for line in self.job_header.split("\n") if not any(skip + in line for skip in self.header_skip)]) pieces = { "shebang": self.shebang, - "job_header": self.job_header, + "job_header": header, "env_header": self._env_header, "worker_command": self._command_template, } diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 392fdb09..c7669b80 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,7 +1,7 @@ import asyncio from time import time -from dask_jobqueue import PBSJob, SGEJob, SLURMJob +from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client @@ -17,6 +17,7 @@ def test_basic(): pytest.param(SGEJob, marks=[pytest.mark.env("sge")]), pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]), pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]), + pytest.param(LSFJob, 
marks=[pytest.mark.env("lsf")]), ] @@ -89,3 +90,11 @@ async def test_adapt(Job): assert time() < start + 10 assert not cluster.worker_spec assert not cluster.workers + + +def test_header_lines_skip(): + job = PBSJob(cores=1, memory="1GB", job_name="foobar") + assert "foobar" in job.job_script() + + job = PBSJob(cores=1, memory="1GB", job_name="foobar", header_skip=["-N"]) + assert "foobar" not in job.job_script() From 9bf286a4a71e4515e9bc756a2b1be4047930a573 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 16:54:25 -0700 Subject: [PATCH 043/109] simplify bsub management in lsf --- dask_jobqueue/lsf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 95042a8d..12126ee2 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -53,7 +53,7 @@ class LSFJob(Job): """, 4, ) - submit_command = "bsub <" + submit_command = "bsub" cancel_command = "bkill" def __init__( @@ -134,10 +134,6 @@ def __init__( logger.debug("Job script: \n %s" % self.job_script()) - def _submit_job(self, script_filename): - piped_cmd = [self.submit_command + " " + script_filename + " 2> /dev/null"] - return self._call(piped_cmd, shell=True) - def lsf_format_bytes_ceil(n, lsf_units="mb"): """ Format bytes as text From f91bec220b986c4a734eff61c18d535f31ee72ae Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 18:00:57 -0700 Subject: [PATCH 044/109] add nanny keyword --- dask_jobqueue/job.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 1e70bfaa..a448b7fd 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -33,6 +33,8 @@ class Job(ProcessInterface): Total amount of memory per job processes : int Number of processes per job + nanny : bool + Whether or not to start a nanny process interface : str Network interface like 'eth0' or 'ib0'. 
death_timeout : float @@ -54,8 +56,6 @@ class Job(ProcessInterface): Python executable used to launch Dask workers. config_name : str Section to use from jobqueue.yaml configuration file. - kwargs : dict - Additional keyword arguments to pass to `LocalCluster` Attributes ---------- @@ -98,6 +98,7 @@ def __init__( cores=None, memory=None, processes=None, + nanny=True, interface=None, death_timeout=None, local_directory=None, @@ -195,6 +196,7 @@ def __init__( command_args += ["--memory-limit", self.worker_process_memory] command_args += ["--name", str(name)] + command_args += ["--nanny" if nanny else "--no-nanny"] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] From 6db1366bc36d39d0527ad5fd7fc04fbaab422cde Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 25 Aug 2019 18:01:04 -0700 Subject: [PATCH 045/109] remove docstring wrapping in LSF for now --- dask_jobqueue/lsf.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 12126ee2..5d030102 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -12,8 +12,9 @@ class LSFJob(Job): - __doc__ = docstrings.with_indents( - """ Launch Dask on a LSF cluster + """ Launch Dask on a LSF cluster + + See also the docstring for Job for more parameters Parameters ---------- @@ -34,7 +35,6 @@ class LSFJob(Job): lsf_units : str Unit system for large units in resource usage set by the LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster. - %(Job.parameters)s Examples -------- @@ -50,9 +50,7 @@ class LSFJob(Job): kill workers based on load. 
>>> cluster.adapt() - """, - 4, - ) + """ submit_command = "bsub" cancel_command = "bkill" From f523cb58a1a1db6c9bbc4e59798f380a6162e889 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 07:21:06 -0700 Subject: [PATCH 046/109] black + flake8 --- dask_jobqueue/job.py | 10 +++++++--- dask_jobqueue/lsf.py | 2 +- dask_jobqueue/moab.py | 2 +- dask_jobqueue/pbs.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index a448b7fd..3fe2a431 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -214,8 +214,13 @@ def __init__( def job_script(self): """ Construct a job submission script """ - header = "\n".join([line for line in self.job_header.split("\n") if not any(skip - in line for skip in self.header_skip)]) + header = "\n".join( + [ + line + for line in self.job_header.split("\n") + if not any(skip in line for skip in self.header_skip) + ] + ) pieces = { "shebang": self.shebang, "job_header": header, @@ -342,7 +347,6 @@ def _call(cmd, **kwargs): class JobQueueCluster(SpecCluster): - def __init__( self, n_workers=0, diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 5d030102..31170f0c 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -5,7 +5,6 @@ import dask -from .core import docstrings from .job import Job, JobQueueCluster logger = logging.getLogger(__name__) @@ -51,6 +50,7 @@ class LSFJob(Job): >>> cluster.adapt() """ + submit_command = "bsub" cancel_command = "bkill" diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py index 3c6f5bc5..a6fe664e 100644 --- a/dask_jobqueue/moab.py +++ b/dask_jobqueue/moab.py @@ -10,4 +10,4 @@ class MoabJob(PBSJob): scheduler_name = "moab" -MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name='pbs') +MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name="pbs") diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 383e70a3..e387cbfd 100644 --- a/dask_jobqueue/pbs.py 
+++ b/dask_jobqueue/pbs.py @@ -103,4 +103,4 @@ def __init__( logger.debug("Job script: \n %s" % self.job_script()) -PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name='pbs') +PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name="pbs") From 00352ac7763c0c6111488e82fe93ff343097d66d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 08:16:15 -0700 Subject: [PATCH 047/109] cleanup pbs testing --- dask_jobqueue/tests/test_pbs.py | 44 +++++++++------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 1e2f4de8..f1d3400d 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -112,11 +112,7 @@ def test_basic(loop): with Client(cluster) as client: cluster.scale(2) - - start = time() - while not (cluster.pending_jobs or cluster.running_jobs): - sleep(0.100) - assert time() < start + QUEUE_WAIT + client.wait_for_workers(2) future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 @@ -130,11 +126,11 @@ def test_basic(loop): cluster.scale(0) start = time() - while cluster.running_jobs: + while client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT - assert not cluster.running_jobs + assert not cluster.workers and not cluster.worker_spec @pytest.mark.env("pbs") @@ -151,15 +147,11 @@ def test_scale_cores_memory(loop): with Client(cluster) as client: cluster.scale(cores=2) - - start = time() - while not (cluster.pending_jobs or cluster.running_jobs): - sleep(0.100) - assert time() < start + QUEUE_WAIT + client.wait_for_workers(2) future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - assert cluster.running_jobs + assert cluster.workers workers = list(client.scheduler_info()["workers"].values()) w = workers[0] @@ -169,11 +161,11 @@ def test_scale_cores_memory(loop): cluster.scale(memory="0GB") start = time() - 
while cluster.running_jobs: + while client.scheduler_info()["workers"]: sleep(0.100) assert time() < start + QUEUE_WAIT - assert not cluster.running_jobs + assert not cluster.workers @pytest.mark.env("pbs") @@ -194,11 +186,11 @@ def test_basic_scale_edge_cases(loop): # Wait to see what happens sleep(0.2) start = time() - while cluster.pending_jobs or cluster.running_jobs: + while cluster.workers or client.scheduler_info()["workers"]: sleep(0.1) assert time() < start + QUEUE_WAIT - assert not (cluster.pending_jobs or cluster.running_jobs) + assert not cluster.workers @pytest.mark.env("pbs") @@ -226,12 +218,10 @@ def test_adaptive(loop): del future start = time() - while cluster.pending_jobs or cluster.running_jobs: + while client.scheduler_info()["workers"] or cluster.workers: sleep(0.100) assert time() < start + QUEUE_WAIT - assert cluster.finished_jobs - @pytest.mark.env("pbs") def test_adaptive_grouped(loop): @@ -246,19 +236,11 @@ def test_adaptive_grouped(loop): ) as cluster: cluster.adapt(minimum=1) # at least 1 worker with Client(cluster) as client: - start = time() - while not (cluster.pending_jobs or cluster.running_jobs): - sleep(0.100) - assert time() < start + QUEUE_WAIT + client.wait_for_workers(1) future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - start = time() - while not cluster.running_jobs: - sleep(0.100) - assert time() < start + QUEUE_WAIT - start = time() processes = cluster.worker_processes while len(client.scheduler_info()["workers"]) != processes: @@ -291,12 +273,10 @@ def test_adaptive_cores_mem(loop): del future start = time() - while cluster.pending_jobs or cluster.running_jobs: + while cluster.workers: sleep(0.100) assert time() < start + QUEUE_WAIT - assert cluster.finished_jobs - @pytest.mark.env("pbs") def test_scale_grouped(loop): From 1b0e212e7d4fe0fedf60ba37ac78d2df16f7942d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 13:20:22 -0700 Subject: [PATCH 048/109] typo --- 
dask_jobqueue/tests/test_pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index f1d3400d..4eaeae8b 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -186,7 +186,7 @@ def test_basic_scale_edge_cases(loop): # Wait to see what happens sleep(0.2) start = time() - while cluster.workers or client.scheduler_info()["workers"]: + while cluster.workers: sleep(0.1) assert time() < start + QUEUE_WAIT From 5eda238bb9680400eabb53ea5bad11b7b46ec23a Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 13:38:23 -0700 Subject: [PATCH 049/109] move Cluster.example_job down to a property --- dask_jobqueue/job.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 3fe2a431..6e82b710 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -391,8 +391,9 @@ def __init__( kwargs["interface"] = interface kwargs["protocol"] = protocol kwargs["security"] = security + self._kwargs = kwargs + self._Job = Job worker = {"cls": Job, "options": kwargs} - self.example_job = Job("tcp://scheduler:8786", name="name", **kwargs) super().__init__( scheduler=scheduler, @@ -406,6 +407,10 @@ def __init__( if n_workers: self.scale(n_workers) + @property + def example_job(self): + return self._Job(self.scheduler.address or "tcp://scheduler:8786", name="name", **self._kwargs) + @property def job_header(self): return self.example_job.job_header From e10ce606d9ac78344692f2aa73510ab155f653e4 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 13:45:51 -0700 Subject: [PATCH 050/109] verify that we can create a Job on instantiation --- dask_jobqueue/job.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 6e82b710..58411e61 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -394,6 +394,7 @@ def 
__init__( self._kwargs = kwargs self._Job = Job worker = {"cls": Job, "options": kwargs} + self.example_job super().__init__( scheduler=scheduler, @@ -409,7 +410,11 @@ def __init__( @property def example_job(self): - return self._Job(self.scheduler.address or "tcp://scheduler:8786", name="name", **self._kwargs) + try: + address = self.scheduler.address + except AttributeError: + address = "tcp://scheduler:8786" + return self._Job(address or "tcp://scheduler:8786", name="name", **self._kwargs) @property def job_header(self): From 5b8cb14be50173eaf7b80f072c81a49658e2a304 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 15:11:50 -0700 Subject: [PATCH 051/109] cleanup pbs tests --- dask_jobqueue/tests/test_pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 4eaeae8b..f6e835a8 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -147,7 +147,7 @@ def test_scale_cores_memory(loop): with Client(cluster) as client: cluster.scale(cores=2) - client.wait_for_workers(2) + client.wait_for_workers(1) future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 From 50340d5c57fe668c60a46f777adbdd223e2a4eff Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 26 Aug 2019 15:13:45 -0700 Subject: [PATCH 052/109] make hanging slurm test verbose --- ci/slurm.sh | 2 +- dask_jobqueue/tests/test_job.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/slurm.sh b/ci/slurm.sh index 8f12aadf..a0cb1bc0 100644 --- a/ci/slurm.sh +++ b/ci/slurm.sh @@ -18,7 +18,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm" + docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm -s" } function jobqueue_after_script { diff --git 
a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index c7669b80..9a6cf606 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -25,18 +25,26 @@ def test_basic(): @pytest.mark.asyncio async def test_job(Job): async with Scheduler(port=0) as s: + print(1) job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB") + print(2) job = await job + print(3) async with Client(s.address, asynchronous=True) as client: + print(4) await client.wait_for_workers(1) + print(5) assert list(s.workers.values())[0].name == "foo" + print(6) await job.close() + print(7) start = time() while len(s.workers): await asyncio.sleep(0.1) assert time() < start + 5 + print(8) @pytest.mark.parametrize("Job", job_params) From 75392348e48bbd4291c6d35cafcf8a6553b8a085 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 27 Aug 2019 14:29:32 -0700 Subject: [PATCH 053/109] Add LocalJob for testing --- dask_jobqueue/__init__.py | 1 + dask_jobqueue/job.py | 11 +++---- dask_jobqueue/jobqueue.yaml | 15 ++++++++++ dask_jobqueue/local.py | 53 +++++++++++++++++++++++++++++++++ dask_jobqueue/tests/test_job.py | 5 ++-- 5 files changed, 76 insertions(+), 9 deletions(-) create mode 100644 dask_jobqueue/local.py diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index d88ec417..3659631c 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,5 +1,6 @@ # flake8: noqa from . 
import config +from .local import LocalJob, LocalCluster from .core import JobQueueCluster from .job import Job from .moab import MoabCluster, MoabJob diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 58411e61..de7fa833 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -4,7 +4,6 @@ import re import shlex import subprocess -import six import sys import weakref @@ -80,9 +79,7 @@ class Job(ProcessInterface): %(shebang)s %(job_header)s - %(env_header)s - %(worker_command)s """.lstrip() @@ -182,7 +179,7 @@ def __init__( self.shebang = shebang - self._env_header = "\n".join(env_extra) + self._env_header = "\n".join(filter(None, env_extra)) self.header_skip = set(header_skip) # dask-worker command line build @@ -329,12 +326,12 @@ def _call(cmd, **kwargs): ) proc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, **kwargs ) out, err = proc.communicate() - if six.PY3: - out, err = out.decode(), err.decode() + out, err = out.decode(), err.decode() + if proc.returncode != 0: raise RuntimeError( "Command exited with non-zero exit code.\n" diff --git a/dask_jobqueue/jobqueue.yaml b/dask_jobqueue/jobqueue.yaml index eb1d3b49..d2174258 100644 --- a/dask_jobqueue/jobqueue.yaml +++ b/dask_jobqueue/jobqueue.yaml @@ -161,3 +161,18 @@ jobqueue: log-directory: null shebang: "#!/usr/bin/env condor_submit" + local: + name: dask-worker + # Dask worker options + cores: null # Total number of cores per job + memory: null # Total amount of memory per job + processes: 1 # Number of Python processes per job + + interface: null # Network interface to use like eth0 or ib0 + death-timeout: 60 # Number of seconds to wait if a worker can not find a scheduler + local-directory: null # Location of fast local storage like /scratch or $TMPDIR + + extra: [] + env-extra: [] + job-extra: [] + log-directory: null diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py new file 
mode 100644 index 00000000..9c227891 --- /dev/null +++ b/dask_jobqueue/local.py @@ -0,0 +1,53 @@ +import functools +import logging +import os +import subprocess + +from .job import Job, JobQueueCluster + +logger = logging.getLogger(__name__) + + +class LocalJob(Job): + """ This is mostly used for testing. It runs locally. """ + + config_name = "local" + + def __init__( + self, + *args, + queue=None, + project=None, + resource_spec=None, + walltime=None, + job_extra=None, + config_name="local", + **kwargs + ): + # Instantiate args and parameters from parent abstract class + super().__init__(*args, config_name=config_name, shebang="", **kwargs) + + # Declare class attribute that shall be overridden + header_lines = [] + self.job_header = "\n".join(header_lines) + + logger.debug("Job script: \n %s" % self.job_script()) + + def _submit_job(self, script_filename): + # Should we make this async friendly? + with open(script_filename) as f: + text = f.read().strip().split() + self.process = subprocess.Popen( + text, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + self.process.stderr.readline() # make sure that we start + return str(self.process.pid) + + @classmethod + def _close_job(self, job_id): + os.kill(int(job_id), 9) + # from distributed.utils_test import terminate_process + # terminate_process(self.process) + + +LocalCluster = functools.partial(JobQueueCluster, Job=LocalJob, config_name="local") diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 9a6cf606..26b3f0ba 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,7 +1,7 @@ import asyncio from time import time -from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob +from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client @@ -18,6 +18,7 @@ def test_basic(): pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]), pytest.param(SLURMJob, 
marks=[pytest.mark.env("slurm")]), pytest.param(LSFJob, marks=[pytest.mark.env("lsf")]), + LocalJob, ] @@ -43,7 +44,7 @@ async def test_job(Job): start = time() while len(s.workers): await asyncio.sleep(0.1) - assert time() < start + 5 + assert time() < start + 10 print(8) From 01481bf4507583b5ec44ebef964398956ef81842 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 28 Aug 2019 08:50:43 -0700 Subject: [PATCH 054/109] Add empty jobs to fill out worker spec with many processes --- dask_jobqueue/job.py | 20 ++++++++++++++++++++ dask_jobqueue/tests/test_job.py | 26 +++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index de7fa833..9821c6bc 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -5,6 +5,7 @@ import shlex import subprocess import sys +import toolz import weakref import dask @@ -405,6 +406,21 @@ def __init__( if n_workers: self.scale(n_workers) + def new_worker_spec(self): + spec = super().new_worker_spec() + nprocs = self.new_spec["options"]["processes"] + if nprocs >= 1: + [(name, value)] = spec.items() + value = value.copy() + value["options"] = toolz.assoc(value["options"], "name", name) + name = str(name) + + spec = {name + "-0": value} + for i in range(1, nprocs): + spec[name + "-" + str(i)] = {"cls": EmptyJob} + + return spec + @property def example_job(self): try: @@ -423,3 +439,7 @@ def job_script(self): @property def job_name(self): return self.example_job.job_name + + +class EmptyJob(ProcessInterface): + pass diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 26b3f0ba..9123f906 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,7 +1,7 @@ import asyncio from time import time -from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob +from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob, LocalCluster from dask_jobqueue.job import JobQueueCluster from 
dask.distributed import Scheduler, Client @@ -107,3 +107,27 @@ def test_header_lines_skip(): job = PBSJob(cores=1, memory="1GB", job_name="foobar", header_skip=["-N"]) assert "foobar" not in job.job_script() + + +@pytest.mark.asyncio +async def test_nprocs(): + async with LocalCluster( + cores=2, memory="4GB", processes=2, asynchronous=True + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + cluster.scale(cores=2) + assert len(cluster.worker_spec) == 2 # two workers + await cluster + await client.wait_for_workers(2) + + assert set(cluster.workers) == { + ws.name for ws in cluster.scheduler.workers.values() + } + + cluster.scale(cores=1) + await cluster + await asyncio.sleep(0.2) + assert len(cluster.scheduler.workers) == 2 # they're still one group + + # this fails + # assert len(cluster.workers) == len(cluster.worker_spec) == 2 From 3f81a997eddd99c21d771104a729266e5664acd6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 28 Aug 2019 15:16:45 -0700 Subject: [PATCH 055/109] cleanup new_worker_spec --- dask_jobqueue/job.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 9821c6bc..950079ba 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -407,8 +407,10 @@ def __init__( self.scale(n_workers) def new_worker_spec(self): - spec = super().new_worker_spec() - nprocs = self.new_spec["options"]["processes"] + spec = {self._i: self.new_spec} + self._i += 1 + + nprocs = self.new_spec.get("options", {}).get("processes", 1) if nprocs >= 1: [(name, value)] = spec.items() value = value.copy() From 5c97395dd9804d44a56ffbeb6bde7304a6e0344c Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 28 Aug 2019 15:29:39 -0700 Subject: [PATCH 056/109] add distributed master to CI for none --- ci/none.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/none.sh b/ci/none.sh index 6aeb3afd..c5ea74ab 100644 --- a/ci/none.sh +++ b/ci/none.sh @@ -5,6 +5,7 @@ function 
jobqueue_before_install { ./ci/conda_setup.sh export PATH="$HOME/miniconda/bin:$PATH" conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest docrep pytest-asyncio + pip install git+https://github.com/dask/distributed@master --upgrade --no-deps } function jobqueue_install { From 44f01206b7c0c0c3c8adea5af49b21cfd9e8cf98 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 28 Aug 2019 17:11:12 -0700 Subject: [PATCH 057/109] Update multi-job work for upstream PR See https://github.com/dask/distributed/pull/3013 --- dask_jobqueue/job.py | 22 ++++------------------ dask_jobqueue/tests/test_job.py | 12 ++++-------- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 950079ba..e4dafb57 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -392,7 +392,10 @@ def __init__( self._kwargs = kwargs self._Job = Job worker = {"cls": Job, "options": kwargs} - self.example_job + if "processes" in kwargs and kwargs["processes"] > 1: + worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] + + self.example_job # trigger property to ensure that the job is valid super().__init__( scheduler=scheduler, @@ -406,23 +409,6 @@ def __init__( if n_workers: self.scale(n_workers) - def new_worker_spec(self): - spec = {self._i: self.new_spec} - self._i += 1 - - nprocs = self.new_spec.get("options", {}).get("processes", 1) - if nprocs >= 1: - [(name, value)] = spec.items() - value = value.copy() - value["options"] = toolz.assoc(value["options"], "name", name) - name = str(name) - - spec = {name + "-0": value} - for i in range(1, nprocs): - spec[name + "-" + str(i)] = {"cls": EmptyJob} - - return spec - @property def example_job(self): try: diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 9123f906..c6874ec1 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -114,20 +114,16 @@ async def 
test_nprocs(): async with LocalCluster( cores=2, memory="4GB", processes=2, asynchronous=True ) as cluster: + s = cluster.scheduler async with Client(cluster, asynchronous=True) as client: cluster.scale(cores=2) - assert len(cluster.worker_spec) == 2 # two workers await cluster await client.wait_for_workers(2) - - assert set(cluster.workers) == { - ws.name for ws in cluster.scheduler.workers.values() - } + assert len(cluster.workers) == 1 # two workers, one job + assert len(s.workers) == 2 + assert cluster.plan == {ws.name for ws in s.workers.values()} cluster.scale(cores=1) await cluster await asyncio.sleep(0.2) assert len(cluster.scheduler.workers) == 2 # they're still one group - - # this fails - # assert len(cluster.workers) == len(cluster.worker_spec) == 2 From 9d3e181362d2f99a10c06e2739ea8ee0863c4b86 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 29 Aug 2019 09:33:07 -0700 Subject: [PATCH 058/109] remove errant shell=True keyword --- dask_jobqueue/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index e4dafb57..a429ab19 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -327,7 +327,7 @@ def _call(cmd, **kwargs): ) proc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, **kwargs + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs ) out, err = proc.communicate() From 2ac776c2ea8e1eac07c31fa135af5340b085f050 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 29 Aug 2019 11:51:33 -0700 Subject: [PATCH 059/109] relax Cluster name test, add status --- dask_jobqueue/job.py | 1 + dask_jobqueue/tests/test_jobqueue_core.py | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index a429ab19..8cc70eb9 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -364,6 +364,7 @@ def __init__( # Job keywords **kwargs ): + self.status = "created" if Job is None: 
raise ValueError( "You must provide a Job type like PBSJob, SLURMJob, " diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index 51db8bb5..af289fd6 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -200,12 +200,11 @@ def test_jobqueue_cluster_call(tmpdir): [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster, OARCluster], ) def test_cluster_has_cores_and_memory(Cluster): - cls_name = Cluster.__name__ + r"\(" - with pytest.raises(ValueError, match=cls_name + r"cores=\d, memory='\d+GB'"): + with pytest.raises(ValueError, match=r"cores=\d, memory='\d+GB'"): Cluster() - with pytest.raises(ValueError, match=cls_name + r"cores=\d, memory='1GB'"): + with pytest.raises(ValueError, match=r"cores=\d, memory='1GB'"): Cluster(memory="1GB") - with pytest.raises(ValueError, match=cls_name + r"cores=4, memory='\d+GB'"): + with pytest.raises(ValueError, match=r"cores=4, memory='\d+GB'"): Cluster(cores=4) From 875ce5fc7578a7ee103051978bccca63c871a1b9 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 29 Aug 2019 12:03:25 -0700 Subject: [PATCH 060/109] copy over cores=, memory= error message from master --- dask_jobqueue/job.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 8cc70eb9..eab69877 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -5,7 +5,6 @@ import shlex import subprocess import sys -import toolz import weakref import dask @@ -152,14 +151,12 @@ def __init__( if shebang is None: shebang = dask.config.get("jobqueue.%s.shebang" % config_name) - if cores is None: - raise ValueError( - "You must specify how many cores to use per job like ``cores=8``" - ) - - if memory is None: + if cores is None or memory is None: raise ValueError( - "You must specify how much memory to use per job like ``memory='24 GB'``" + "You must specify how much cores and memory per job 
you want to use, for example:\n" + "cluster = {}(cores={}, memory={!r})".format( + self.__class__.__name__, cores or 8, memory or "24GB" + ) ) # This attribute should be overridden From fbbfc4b12c4a051b2d72842c9a82452f2c80a6b1 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 29 Aug 2019 13:36:37 -0700 Subject: [PATCH 061/109] add -s to pbs test --- ci/pbs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pbs.sh b/ci/pbs.sh index 2229d812..0132a49a 100644 --- a/ci/pbs.sh +++ b/ci/pbs.sh @@ -19,7 +19,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E pbs" + docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E pbs" } function jobqueue_after_script { From a92d0ff7189e951e76837c64cf54028a58d22140 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 29 Aug 2019 15:32:32 -0600 Subject: [PATCH 062/109] ignore Runtime Errors when closing jobs --- dask_jobqueue/job.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index eab69877..c4de6230 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -8,6 +8,7 @@ import weakref import dask +from dask.utils import ignoring from distributed.deploy.spec import ProcessInterface, SpecCluster from distributed.scheduler import Scheduler @@ -291,7 +292,8 @@ async def close(self): @classmethod def _close_job(cls, job_id): if job_id: - cls._call(shlex.split(cls.cancel_command) + [job_id]) + with ignoring(RuntimeError): # deleting job when job already gone + cls._call(shlex.split(cls.cancel_command) + [job_id]) @staticmethod def _call(cmd, **kwargs): From 0a338f74439bd1982718c6703977f6e713d7e078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 30 Aug 2019 16:54:46 +0200 Subject: [PATCH 063/109] Friday afternoon semi random attempt. 
--- ci/slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/slurm.sh b/ci/slurm.sh index a0cb1bc0..065bfa02 100644 --- a/ci/slurm.sh +++ b/ci/slurm.sh @@ -18,7 +18,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm -s" + docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s" } function jobqueue_after_script { From db5c397d18b71e987d9e0f14dcca54cb0d492906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 30 Aug 2019 17:35:50 +0200 Subject: [PATCH 064/109] Same attempt with PBS. Does two half-random attempts make a complete random attempt? --- ci/pbs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/pbs.sh b/ci/pbs.sh index 0132a49a..e0f8d7d3 100644 --- a/ci/pbs.sh +++ b/ci/pbs.sh @@ -19,7 +19,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E pbs" + docker exec -it -u pbsuser pbs_master /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs" } function jobqueue_after_script { From b73ccb0d6a439aca531630ffd3ac5f831e5c9898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 30 Aug 2019 20:11:31 +0200 Subject: [PATCH 065/109] dask-worker could not start because run as pbsuser in /. Could not create the /dask-worker folder. 
--- ci/pbs.sh | 2 +- dask_jobqueue/local.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/pbs.sh b/ci/pbs.sh index e0f8d7d3..5fb068df 100644 --- a/ci/pbs.sh +++ b/ci/pbs.sh @@ -19,7 +19,7 @@ function jobqueue_install { } function jobqueue_script { - docker exec -it -u pbsuser pbs_master /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs" + docker exec -it -u pbsuser pbs_master /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs" } function jobqueue_after_script { diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 9c227891..1c269d0c 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -40,6 +40,9 @@ def _submit_job(self, script_filename): self.process = subprocess.Popen( text, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + # TODO this should raise if self.process.returncode != 0. Refactor + # Job._call to be able to return process (so that we can return self.process.pid below) + self.process.stderr.readline() # make sure that we start return str(self.process.pid) From 4ea85e834c259215de6de256eb44f5d35dd64670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 30 Aug 2019 20:27:46 +0200 Subject: [PATCH 066/109] Remove .runnin_jobs. 
--- dask_jobqueue/tests/test_pbs.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index f6e835a8..68d9913e 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -116,7 +116,7 @@ def test_basic(loop): future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - assert cluster.running_jobs + # assert cluster.running_jobs workers = list(client.scheduler_info()["workers"].values()) w = workers[0] @@ -294,9 +294,10 @@ def test_scale_grouped(loop): cluster.scale(4) # Start 2 jobs start = time() - while len(cluster.running_jobs) != 2: - sleep(0.100) - assert time() < start + QUEUE_WAIT + # TODO: Is there a replacement to check for number of jobs (rather than workers) + # while len(cluster.running_jobs) != 2: + # sleep(0.100) + # assert time() < start + QUEUE_WAIT while len(list(client.scheduler_info()["workers"].values())) != 4: sleep(0.100) @@ -304,7 +305,7 @@ def test_scale_grouped(loop): future = client.submit(lambda x: x + 1, 10) assert future.result(QUEUE_WAIT) == 11 - assert cluster.running_jobs + # assert cluster.running_jobs workers = list(client.scheduler_info()["workers"].values()) w = workers[0] @@ -315,23 +316,29 @@ def test_scale_grouped(loop): cluster.scale(1) # Should leave 2 workers, 1 job start = time() - while len(cluster.running_jobs) != 1: + # TODO + # while len(cluster.running_jobs) != 1: + # sleep(0.100) + # assert time() < start + QUEUE_WAIT + + # assert len(cluster.running_jobs) == 1 + # workers = list(client.scheduler_info()["workers"].values()) + while len(cluster.scheduler_info()['workers']) != 2: sleep(0.100) assert time() < start + QUEUE_WAIT - assert len(cluster.running_jobs) == 1 - workers = list(client.scheduler_info()["workers"].values()) - assert len(workers) == 2 - cluster.scale(0) start = time() - while cluster.running_jobs: + # while cluster.running_jobs: + # 
sleep(0.100) + # assert time() < start + QUEUE_WAIT + + # assert not cluster.running_jobs + while len(cluster.scheduler_info()['workers']) != 0: sleep(0.100) assert time() < start + QUEUE_WAIT - assert not cluster.running_jobs - def test_config(loop): with dask.config.set( From 0789ed9e115652900fc3555e24ac49d34242c154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 30 Aug 2019 20:43:45 +0200 Subject: [PATCH 067/109] Fix cluster -> client --- dask_jobqueue/tests/test_pbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 68d9913e..39a0b933 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -323,7 +323,7 @@ def test_scale_grouped(loop): # assert len(cluster.running_jobs) == 1 # workers = list(client.scheduler_info()["workers"].values()) - while len(cluster.scheduler_info()['workers']) != 2: + while len(client.scheduler_info()['workers']) != 2: sleep(0.100) assert time() < start + QUEUE_WAIT @@ -335,7 +335,7 @@ def test_scale_grouped(loop): # assert time() < start + QUEUE_WAIT # assert not cluster.running_jobs - while len(cluster.scheduler_info()['workers']) != 0: + while len(client.scheduler_info()['workers']) != 0: sleep(0.100) assert time() < start + QUEUE_WAIT From 2215227a059ce8853f37345f404d93b8395a1833 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 30 Aug 2019 19:53:51 -0700 Subject: [PATCH 068/109] get worker_processes from example job --- dask_jobqueue/tests/test_pbs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 39a0b933..b876427c 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -210,7 +210,7 @@ def test_adaptive(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.worker_processes + processes = 
cluster.example_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT @@ -242,7 +242,7 @@ def test_adaptive_grouped(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.worker_processes + processes = cluster.example_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT @@ -265,7 +265,7 @@ def test_adaptive_cores_mem(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.worker_processes + processes = cluster.example_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT From 6de262eb7b7dbceb0a758302a39a5693f0524037 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 30 Aug 2019 19:54:10 -0700 Subject: [PATCH 069/109] black --- dask_jobqueue/tests/test_pbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index b876427c..80911ea2 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -323,7 +323,7 @@ def test_scale_grouped(loop): # assert len(cluster.running_jobs) == 1 # workers = list(client.scheduler_info()["workers"].values()) - while len(client.scheduler_info()['workers']) != 2: + while len(client.scheduler_info()["workers"]) != 2: sleep(0.100) assert time() < start + QUEUE_WAIT @@ -335,7 +335,7 @@ def test_scale_grouped(loop): # assert time() < start + QUEUE_WAIT # assert not cluster.running_jobs - while len(client.scheduler_info()['workers']) != 0: + while len(client.scheduler_info()["workers"]) != 0: sleep(0.100) assert time() < start + QUEUE_WAIT From f64c3e8870208d33335c9aaa89294983b62accd7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 30 Aug 2019 20:28:55 -0700 Subject: [PATCH 070/109] Move over Oar --- dask_jobqueue/oar.py | 16 +++++++++++----- 
dask_jobqueue/tests/test_oar.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 39dc7dda..eb2196e9 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -1,14 +1,16 @@ +import functools import logging import shlex import dask -from .core import JobQueueCluster, docstrings +from .core import docstrings +from .job import JobQueueCluster, Job logger = logging.getLogger(__name__) -class OARCluster(JobQueueCluster): +class OARJob(Job): __doc__ = docstrings.with_indents( """ Launch Dask on a OAR cluster @@ -49,6 +51,7 @@ class OARCluster(JobQueueCluster): def __init__( self, + *args, queue=None, project=None, resource_spec=None, @@ -68,11 +71,11 @@ def __init__( if job_extra is None: job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name) - super().__init__(config_name=config_name, **kwargs) + super().__init__(*args, config_name=config_name, **kwargs) header_lines = [] - if self.name is not None: - header_lines.append("#OAR -n %s" % self.name) + if self.job_name is not None: + header_lines.append("#OAR -n %s" % self.job_name) if queue is not None: header_lines.append("#OAR -q %s" % queue) if project is not None: @@ -121,3 +124,6 @@ def _submit_job(self, fn): oarsub_command = " ".join([self.submit_command] + oarsub_options) oarsub_command_split = shlex.split(oarsub_command) + [inline_script] return self._call(oarsub_command_split) + + +OARCluster = functools.partial(JobQueueCluster, Job=OARJob, config_name="oar") diff --git a/dask_jobqueue/tests/test_oar.py b/dask_jobqueue/tests/test_oar.py index b7eaaa7e..5035852b 100644 --- a/dask_jobqueue/tests/test_oar.py +++ b/dask_jobqueue/tests/test_oar.py @@ -110,4 +110,4 @@ def test_config_name_oar_takes_custom_config(): with dask.config.set({"jobqueue.oar-config-name": conf}): with OARCluster(config_name="oar-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" From 
860b85168f2379ee63608d5e5da293bc0381674d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 30 Aug 2019 20:32:02 -0700 Subject: [PATCH 071/109] move over htcondor --- dask_jobqueue/htcondor.py | 13 +++++++++---- dask_jobqueue/tests/test_htcondor.py | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 7b1efcba..942775a8 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -1,3 +1,4 @@ +import functools import logging import re import shlex @@ -5,12 +6,13 @@ import dask from distributed.utils import parse_bytes -from .core import JobQueueCluster, docstrings +from .core import docstrings +from .job import JobQueueCluster, Job logger = logging.getLogger(__name__) -class HTCondorCluster(JobQueueCluster): +class HTCondorJob(Job): __doc__ = docstrings.with_indents( """ Launch Dask on an HTCondor cluster with a shared file system @@ -57,7 +59,7 @@ class HTCondorCluster(JobQueueCluster): # Python (can't find its libs), so we have to go through the shell. 
executable = "/bin/sh" - def __init__(self, disk=None, job_extra=None, config_name="htcondor", **kwargs): + def __init__(self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs): if disk is None: disk = dask.config.get("jobqueue.%s.disk" % config_name) if disk is None: @@ -71,7 +73,7 @@ def __init__(self, disk=None, job_extra=None, config_name="htcondor", **kwargs): self.job_extra = job_extra # Instantiate args and parameters from parent abstract class - super().__init__(config_name=config_name, **kwargs) + super().__init__(*args, config_name=config_name, **kwargs) env_extra = kwargs.get("env_extra", None) if env_extra is None: @@ -220,3 +222,6 @@ def quote_environment(env): entries.append("%s=%s" % (k, qv)) return " ".join(entries) + + +HTCondorCluster = functools.partial(JobQueueCluster, Job=HTCondorJob, config_name="htcondor") diff --git a/dask_jobqueue/tests/test_htcondor.py b/dask_jobqueue/tests/test_htcondor.py index 4f386680..024473ad 100644 --- a/dask_jobqueue/tests/test_htcondor.py +++ b/dask_jobqueue/tests/test_htcondor.py @@ -13,9 +13,9 @@ def test_header(): with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster: - assert cluster.job_header_dict["MY.DaskWorkerCores"] == 1 - assert cluster.job_header_dict["MY.DaskWorkerDisk"] == 100000000 - assert cluster.job_header_dict["MY.DaskWorkerMemory"] == 100000000 + assert cluster.example_job.job_header_dict["MY.DaskWorkerCores"] == 1 + assert cluster.example_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000 + assert cluster.example_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000 def test_job_script(): @@ -98,4 +98,4 @@ def test_config_name_htcondor_takes_custom_config(): with dask.config.set({"jobqueue.htcondor-config-name": conf}): with HTCondorCluster(config_name="htcondor-config-name") as cluster: - assert cluster.name == "myname" + assert cluster.job_name == "myname" From 512b76676b08156dbf42663c609d0310dec4cd9d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: 
Fri, 30 Aug 2019 20:37:23 -0700 Subject: [PATCH 072/109] remove old core and deploy code --- dask_jobqueue/core.py | 484 +--------------------- dask_jobqueue/deploy/__init__.py | 2 - dask_jobqueue/deploy/cluster_manager.py | 392 ------------------ dask_jobqueue/htcondor.py | 8 +- dask_jobqueue/job.py | 4 - dask_jobqueue/tests/test_jobqueue_core.py | 1 + 6 files changed, 9 insertions(+), 882 deletions(-) delete mode 100644 dask_jobqueue/deploy/__init__.py delete mode 100644 dask_jobqueue/deploy/cluster_manager.py diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index 5f6e6e1c..80d91c88 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -1,85 +1,13 @@ -import logging -import math -import os -import re -import shlex -import subprocess -import sys -from collections import OrderedDict -from contextlib import contextmanager - -import dask import docrep -from .deploy import ClusterManager -from distributed import LocalCluster -from distributed.diagnostics.plugin import SchedulerPlugin -from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface -logger = logging.getLogger(__name__) docstrings = docrep.DocstringProcessor() -def _job_id_from_worker_name(name): - """ utility to parse the job ID from the worker name - - template: 'prefix--jobid--suffix' - """ - _, job_id, _ = name.split("--") - return job_id - - -class JobQueuePlugin(SchedulerPlugin): - def __init__(self): - self.pending_jobs = OrderedDict() - self.running_jobs = OrderedDict() - self.finished_jobs = OrderedDict() - self.all_workers = {} - - def add_worker(self, scheduler, worker=None, name=None, **kwargs): - """ Run when a new worker enters the cluster""" - logger.debug("adding worker %s", worker) - w = scheduler.workers[worker] - job_id = _job_id_from_worker_name(w.name) - logger.debug("job id for new worker: %s", job_id) - self.all_workers[worker] = (w.name, job_id) - - # if this is the first worker for this job, move job to running - if job_id not in 
self.running_jobs: - logger.debug("%s is a new job or restarting worker", job_id) - if job_id in self.pending_jobs: - logger.debug("%s is a new job, adding to running_jobs", job_id) - self.running_jobs[job_id] = self.pending_jobs.pop(job_id) - elif job_id in self.finished_jobs: - logger.warning( - "Worker %s restart in Job %s. " "This can be due to memory issue.", - w, - job_id, - ) - self.running_jobs[job_id] = self.finished_jobs.pop(job_id) - else: - logger.error("Unknown job_id: %s for worker %s", job_id, w) - self.running_jobs[job_id] = {} - - # add worker to dict of workers in this job - self.running_jobs[job_id][w.name] = w - - def remove_worker(self, scheduler=None, worker=None, **kwargs): - """ Run when a worker leaves the cluster""" - logger.debug("removing worker %s", worker) - name, job_id = self.all_workers[worker] - logger.debug("removing worker name (%s) and job_id (%s)", name, job_id) - - # remove worker from this job - self.running_jobs[job_id].pop(name, None) - - # once there are no more workers, move this job to finished_jobs - if not self.running_jobs[job_id]: - logger.debug("that was the last worker for job %s", job_id) - self.finished_jobs[job_id] = self.running_jobs.pop(job_id) +# TODO: remove this class after we figure out docstrings @docstrings.get_sectionsf("JobQueueCluster") -class JobQueueCluster(ClusterManager): +class JobQueueCluster: """ Base class to launch Dask Clusters for Job queues This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster @@ -144,411 +72,3 @@ class JobQueueCluster(ClusterManager): %(worker_command)s """.lstrip() - - # Following class attributes should be overridden by extending classes. 
- submit_command = None - cancel_command = None - job_id_regexp = r"(?P\d+)" - - def __init__( - self, - name=None, - cores=None, - memory=None, - processes=None, - interface=None, - death_timeout=None, - local_directory=None, - extra=None, - env_extra=None, - log_directory=None, - shebang=None, - python=sys.executable, - config_name=None, - **kwargs - ): - """ """ - # """ - # This initializer should be considered as Abstract, and never used directly. - # """ - super().__init__() - - if config_name is None: - raise NotImplementedError( - "JobQueueCluster is an abstract class that should not be instantiated." - ) - - if name is None: - name = dask.config.get("jobqueue.%s.name" % config_name) - if cores is None: - cores = dask.config.get("jobqueue.%s.cores" % config_name) - if memory is None: - memory = dask.config.get("jobqueue.%s.memory" % config_name) - if processes is None: - processes = dask.config.get("jobqueue.%s.processes" % config_name) - if interface is None: - interface = dask.config.get("jobqueue.%s.interface" % config_name) - if death_timeout is None: - death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name) - if local_directory is None: - local_directory = dask.config.get( - "jobqueue.%s.local-directory" % config_name - ) - if extra is None: - extra = dask.config.get("jobqueue.%s.extra" % config_name) - if env_extra is None: - env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name) - if log_directory is None: - log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name) - if shebang is None: - shebang = dask.config.get("jobqueue.%s.shebang" % config_name) - - if cores is None or memory is None: - raise ValueError( - "You must specify how much cores and memory per job you want to use, for example:\n" - "cluster = {}(cores={}, memory={!r})".format( - self.__class__.__name__, cores or 8, memory or "24GB" - ) - ) - - # This attribute should be overridden - self.job_header = None - - if interface: - extra += 
["--interface", interface] - kwargs.setdefault("host", get_ip_interface(interface)) - else: - kwargs.setdefault("host", "") - - # Bokeh diagnostics server should listen on all interfaces - kwargs.setdefault("dashboard_address", ("", 8787)) - self.local_cluster = LocalCluster(n_workers=0, **kwargs) - - # Keep information on process, cores, and memory, for use in subclasses - self.worker_memory = parse_bytes(memory) if memory is not None else None - self.worker_processes = processes - self.worker_cores = cores - self.name = name - - # plugin for tracking job status - self._scheduler_plugin = JobQueuePlugin() - self.local_cluster.scheduler.add_plugin(self._scheduler_plugin) - - self._adaptive = None - - self.shebang = shebang - - self._env_header = "\n".join(env_extra) - - # dask-worker command line build - dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict( - python=python - ) - command_args = [dask_worker_command, self.scheduler.address] - command_args += ["--nthreads", self.worker_process_threads] - if processes is not None and processes > 1: - command_args += ["--nprocs", processes] - - command_args += ["--memory-limit", self.worker_process_memory] - command_args += ["--name", "%s--${JOB_ID}--" % name] - - if death_timeout is not None: - command_args += ["--death-timeout", death_timeout] - if local_directory is not None: - command_args += ["--local-directory", local_directory] - if extra is not None: - command_args += extra - - self._command_template = " ".join(map(str, command_args)) - - self.log_directory = log_directory - if self.log_directory is not None: - if not os.path.exists(self.log_directory): - os.makedirs(self.log_directory) - - def __repr__(self): - running_workers = self._count_active_workers() - running_cores = running_workers * self.worker_process_threads - total_jobs = len(self.pending_jobs) + len(self.running_jobs) - total_workers = total_jobs * self.worker_processes - running_memory = running_workers * self.worker_memory / 
self.worker_processes - - return ( - self.__class__.__name__ - + "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)" - % ( - running_cores, - format_bytes(running_memory), - running_workers, - total_workers, - len(self.running_jobs), - total_jobs, - ) - ) - - @property - def pending_jobs(self): - """ Jobs pending in the queue """ - return self._scheduler_plugin.pending_jobs - - @property - def running_jobs(self): - """ Jobs with currently active workers """ - return self._scheduler_plugin.running_jobs - - @property - def finished_jobs(self): - """ Jobs that have finished """ - return self._scheduler_plugin.finished_jobs - - @property - def worker_process_threads(self): - return int(self.worker_cores / self.worker_processes) - - @property - def worker_process_memory(self): - mem = format_bytes(self.worker_memory / self.worker_processes) - mem = mem.replace(" ", "") - return mem - - @property - def jobqueue_worker_spec(self): - """ single worker process info needed for scaling on cores or memory """ - return { - "cores": self.worker_process_threads, - "memory": self.worker_process_memory, - } - - @property - def workers(self): - """ workers currently connected to the scheduler """ - return self.scheduler.workers - - def job_script(self): - """ Construct a job submission script """ - pieces = { - "shebang": self.shebang, - "job_header": self.job_header, - "env_header": self._env_header, - "worker_command": self._command_template, - } - return self._script_template % pieces - - @contextmanager - def job_file(self): - """ Write job submission script to temporary file """ - with tmpfile(extension="sh") as fn: - with open(fn, "w") as f: - logger.debug("writing job script: \n%s", self.job_script()) - f.write(self.job_script()) - yield fn - - def _submit_job(self, script_filename): - return self._call(shlex.split(self.submit_command) + [script_filename]) - - def start_workers(self, n=1): - """ Start workers and point them to our local scheduler """ - logger.debug("starting 
%s workers", n) - num_jobs = int(math.ceil(n / self.worker_processes)) - for _ in range(num_jobs): - with self.job_file() as fn: - out = self._submit_job(fn) - job = self._job_id_from_submit_output(out) - if not job: - raise ValueError("Unable to parse jobid from output of %s" % out) - logger.debug("started job: %s", job) - self.pending_jobs[job] = {} - - @property - def scheduler(self): - """ The scheduler of this cluster """ - return self.local_cluster.scheduler - - def _call(self, cmd, **kwargs): - """ Call a command using subprocess.Popen. - - This centralizes calls out to the command line, providing consistent - outputs, logging, and an opportunity to go asynchronous in the future. - - Parameters - ---------- - cmd: List(str)) - A command, each of which is a list of strings to hand to - subprocess.Popen - - Examples - -------- - >>> self._call(['ls', '/foo']) - - Returns - ------- - The stdout produced by the command, as string. - - Raises - ------ - RuntimeError if the command exits with a non-zero exit code - """ - cmd_str = " ".join(cmd) - logger.debug( - "Executing the following command to command line\n{}".format(cmd_str) - ) - - proc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs - ) - - out, err = proc.communicate() - out, err = out.decode(), err.decode() - if proc.returncode != 0: - raise RuntimeError( - "Command exited with non-zero exit code.\n" - "Exit code: {}\n" - "Command:\n{}\n" - "stdout:\n{}\n" - "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err) - ) - return out - - def stop_workers(self, workers): - """ Stop a list of workers""" - logger.debug("Stopping workers: %s", workers) - if not workers: - return - jobs = self._del_pending_jobs() # stop pending jobs too - for w in workers: - if isinstance(w, dict): - jobs.append(_job_id_from_worker_name(w["name"])) - else: - jobs.append(_job_id_from_worker_name(w.name)) - self.stop_jobs(jobs) - - def stop_jobs(self, jobs): - """ Stop a list of jobs""" - 
logger.debug("Stopping jobs: %s", jobs) - if jobs: - jobs = list(jobs) - self._call(shlex.split(self.cancel_command) + list(set(jobs))) - - # if any of these jobs were pending, we should remove those now - for job_id in jobs: - if job_id in self.pending_jobs: - del self.pending_jobs[job_id] - - def scale_up(self, n, **kwargs): - """ Brings total worker count up to ``n`` """ - active_and_pending = self._count_active_and_pending_workers() - if n >= active_and_pending: - logger.debug("Scaling up to %d workers.", n) - self.start_workers(n - active_and_pending) - else: - # scale_up should not be called if n < active + pending jobs - logger.warning( - "JobQueueCluster.scale_up was called with a" - " number of workers lower that what is already" - " running or pending" - ) - - def _count_active_and_pending_workers(self): - active_and_pending = ( - self._count_active_workers() + self._count_pending_workers() - ) - logger.debug("Found %d active/pending workers.", active_and_pending) - assert len(self.scheduler.workers) <= active_and_pending - return active_and_pending - - def _count_active_workers(self): - active_workers = sum([len(j) for j in self.running_jobs.values()]) - assert len(self.scheduler.workers) == active_workers - return active_workers - - def _count_pending_workers(self): - return self.worker_processes * len(self.pending_jobs) - - def scale_down(self, workers, n=None): - """ Close the workers with the given addresses """ - if n is None: - # Adaptive currently calls directly scale_down, we need to handle this - # Need to only keep active workers minus those adaptive wants to stop - n = self._count_active_workers() - len(workers) - logger.debug("Scaling down to %d Workers: %s", n, workers) - active_and_pending = self._count_active_and_pending_workers() - n_to_close = active_and_pending - n - if n_to_close < 0: - logger.warning( - "JobQueueCluster.scale_down was called with" - " a number of worker greater than what is" - " already running or pending." 
- ) - elif n_to_close <= self._count_pending_workers(): - # We only need to kill some pending jobs, - to_kill = int(n_to_close / self.worker_processes) - jobs = list(self.pending_jobs.keys())[-to_kill:] - logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs) - self.stop_jobs(jobs) - else: - worker_states = [] - for w in workers: - try: - # Get the actual WorkerState - worker_states.append(self.scheduler.workers[w]) - except KeyError: - logger.debug("worker %s is already gone", w) - self.stop_workers(worker_states) - - def stop_all_jobs(self): - """ Stops all running and pending jobs """ - jobs = self._del_pending_jobs() - jobs += list(self.running_jobs.keys()) - self.stop_jobs(set(jobs)) - - def close(self, **kwargs): - """ Stops all running and pending jobs and stops scheduler """ - self.stop_all_jobs() - return self.local_cluster.close(**kwargs) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - self.local_cluster.__exit__(type, value, traceback) - - def _del_pending_jobs(self): - jobs = list(self.pending_jobs.keys()) - logger.debug("Deleting pending jobs %s" % jobs) - for job_id in jobs: - del self.pending_jobs[job_id] - return jobs - - def _job_id_from_submit_output(self, out): - match = re.search(self.job_id_regexp, out) - if match is None: - msg = ( - "Could not parse job id from submission command " - "output.\nJob id regexp is {!r}\nSubmission command " - "output is:\n{}".format(self.job_id_regexp, out) - ) - raise ValueError(msg) - - job_id = match.groupdict().get("job_id") - if job_id is None: - msg = ( - "You need to use a 'job_id' named group in your regexp, e.g. " - "r'(?P\\d+)', in your regexp. 
Your regexp was: " - "{!r}".format(self.job_id_regexp) - ) - raise ValueError(msg) - - return job_id - - @staticmethod - def worker_key(worker_state): - return _job_id_from_worker_name(worker_state.name) - - @property - def scheduler_comm(self): - return self.local_cluster.scheduler_comm - - @property - def scheduler_info(self): - return self.local_cluster.scheduler_info diff --git a/dask_jobqueue/deploy/__init__.py b/dask_jobqueue/deploy/__init__.py deleted file mode 100644 index c9e11c0f..00000000 --- a/dask_jobqueue/deploy/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# flake8: noqa -from .cluster_manager import ClusterManager diff --git a/dask_jobqueue/deploy/cluster_manager.py b/dask_jobqueue/deploy/cluster_manager.py deleted file mode 100644 index 6910c82f..00000000 --- a/dask_jobqueue/deploy/cluster_manager.py +++ /dev/null @@ -1,392 +0,0 @@ -import logging -import math - -from tornado import gen - -from distributed.deploy.adaptive import Adaptive -from distributed.utils import ( - log_errors, - ignoring, - parse_bytes, - PeriodicCallback, - format_bytes, - format_dashboard_link, -) - -logger = logging.getLogger(__name__) - - -class ClusterManager: - """ Intermediate Cluster object that should lead to a real ClusterManager - - This tries to improve upstream Cluster object and underlines needs for - better decoupling between ClusterManager and Scheduler object - - This currently expects a local Scheduler defined on the object, but should - eventually only rely on RPC calls on remote or local scheduler. - It provides common methods and an IPython widget display. - - Clusters inheriting from this class should provide the following: - - 1. A local ``Scheduler`` object at ``.scheduler``. In the future, just - a URL to local or remote scheduler. - 2. 
scale_up and scale_down methods as defined below:: - - def scale_up(self, n: int): - ''' Brings total worker count up to ``n`` ''' - - def scale_down(self, workers: List[str], n: int): - ''' Close the workers with the given addresses or remove pending - workers to match n running workers. - ''' - 3. Optionally worker_key: Callable(WorkerState): - ''' Callable mapping a WorkerState object to a group, see - Scheduler.workers_to_close - ''' - 4. jobqueue_worker_spec dict attribute if scale(cores=...) or scale(memory=...) - can be used by users. - jobqueue_worker_spec = {'cores': 4, 'memory': '16 GB'} - - This will provide a general ``scale`` method as well as an IPython widget - for display. - - Things the will need to change for the complete Cluster Manager Design: - - ClusterManager: - - Use it's own event loop, or the notebook one. - - Connect to a local or remote Scheduler through RPC, and then - communicate with it. - - Ability to start a local or remote scheduler. - - Ability to work with different worker pools: in scale, adaptive, - jobqueue_worker_spec... - - Scheduler - - Provide some remote methods: - - retire_workers(n: int): close enough workers ot have only n - running at the end. Return the closed workers. - - status of connected worker, e.g. scheduler_info() - - Examples - -------- - - >>> from distributed.deploy import Cluster - >>> class MyCluster(cluster): - ... def scale_up(self, n): - ... ''' Bring the total worker count up to n ''' - ... pass - ... def scale_down(self, workers, n=None): - ... ''' Close the workers with the given addresses ''' - ... 
pass - - >>> cluster = MyCluster() - >>> cluster.scale(5) # scale manually - >>> cluster.adapt(minimum=1, maximum=100) # scale automatically - >>> cluster.scale(cores=100) # scale manually to cores nb - """ - - def __init__(self, adaptive_options={}): - self._target_scale = 0 - self._adaptive_options = adaptive_options - self._adaptive_options.setdefault("worker_key", self.worker_key) - - def adapt( - self, - minimum_cores=None, - maximum_cores=None, - minimum_memory=None, - maximum_memory=None, - **kwargs - ): - """ Turn on adaptivity - For keyword arguments see dask.distributed.Adaptive - Instead of minimum and maximum parameters which apply to the number of - worker, If Cluster object implements jobqueue_worker_spec attribute, one can - use the following parameters: - Parameters - ---------- - minimum_cores: int - Minimum number of cores for the cluster - maximum_cores: int - Maximum number of cores for the cluster - minimum_memory: str - Minimum amount of memory for the cluster - maximum_memory: str - Maximum amount of memory for the cluster - Examples - -------- - >>> cluster.adapt(minimum=0, maximum=10, interval='500ms') - >>> cluster.adapt(minimum_cores=24, maximum_cores=96) - >>> cluster.adapt(minimum_memory='60 GB', maximum_memory= '1 TB') - """ - with ignoring(AttributeError): - self._adaptive.stop() - if not hasattr(self, "_adaptive_options"): - self._adaptive_options = {} - if "minimum" not in kwargs: - if minimum_cores is not None: - kwargs["minimum"] = self._get_nb_workers_from_cores(minimum_cores) - elif minimum_memory is not None: - kwargs["minimum"] = self._get_nb_workers_from_memory(minimum_memory) - if "maximum" not in kwargs: - if maximum_cores is not None: - kwargs["maximum"] = self._get_nb_workers_from_cores(maximum_cores) - elif maximum_memory is not None: - kwargs["maximum"] = self._get_nb_workers_from_memory(maximum_memory) - self._adaptive_options.update(kwargs) - try: - self._adaptive = Adaptive(self.scheduler, self, 
**self._adaptive_options) - except Exception: - self._adaptive = Adaptive(self, **self._adaptive_options) - return self._adaptive - - @property - def scheduler_address(self): - return self.scheduler.address - - @property - def dashboard_link(self): - host = self.scheduler.address.split("://")[1].split(":")[0] - port = self.scheduler.services["dashboard"].port - return format_dashboard_link(host, port) - - @gen.coroutine - def _scale(self, n=None, cores=None, memory=None): - """ Asynchronously called scale method - - This allows to do every operation with a coherent context - """ - with log_errors(): - if [n, cores, memory].count(None) != 2: - raise ValueError( - "One and only one of n, cores, memory kwargs" - " should be used, n={}, cores={}, memory={}" - " provided.".format(n, cores, memory) - ) - if n is None: - if cores is not None: - n = self._get_nb_workers_from_cores(cores) - elif memory is not None: - n = self._get_nb_workers_from_memory(memory) - - # here we rely on a ClusterManager attribute to retrieve the - # active and pending workers - if n == self._target_scale: - pass - elif n > self._target_scale: - self.scale_up(n) - else: - # TODO to_close may be empty if some workers are pending - # This may not be useful to call scheduler methods in this case - # Scheduler interface here may need to be modified - to_close = self.scheduler.workers_to_close( - n=len(self.scheduler.workers) - n, minimum=n, key=self.worker_key - ) - logger.debug("Closing workers: %s", to_close) - # Should be an RPC call here - yield self.scheduler.retire_workers(workers=to_close) - # To close may be empty if just asking to remove pending - # workers, so we should also give a target number - self.scale_down(to_close, n) - self._target_scale = n - - def scale(self, n=None, cores=None, memory=None): - """ Scale cluster to n workers or to the given number of cores or - memory - number of cores and memory are converted into number of workers using - jobqueue_worker_spec attribute. 
- Parameters - ---------- - n: int - Target number of workers - cores: int - Target number of cores - memory: str - Target amount of available memory - Example - ------- - >>> cluster.scale(10) # scale cluster to ten workers - >>> cluster.scale(cores=100) # scale cluster to 100 cores - >>> cluster.scale(memory='1 TB') # scale cluster to 1 TB memory - See Also - -------- - Cluster.scale_up - Cluster.scale_down - Cluster.jobqueue_worker_spec - """ - # TODO we should not rely on scheduler loop here, self should have its - # own loop - self.scheduler.loop.add_callback(self._scale, n, cores, memory) - - def _widget_status(self): - workers = len(self.scheduler.workers) - cores = sum(ws.nthreads for ws in self.scheduler.workers.values()) - memory = sum(ws.memory_limit for ws in self.scheduler.workers.values()) - memory = format_bytes(memory) - text = """ -
- - - - - -
Workers %d
Cores %d
Memory %s
-
-""" % ( - workers, - cores, - memory, - ) - return text - - def _widget(self): - """ Create IPython widget for display within a notebook """ - try: - return self._cached_widget - except AttributeError: - pass - - from ipywidgets import ( - Layout, - VBox, - HBox, - IntText, - Button, - HTML, - Accordion, - Text, - ) - - layout = Layout(width="150px") - - if "dashboard" in self.scheduler.services: - link = self.dashboard_link - link = '

Dashboard: %s

\n' % ( - link, - link, - ) - else: - link = "" - - title = "

%s

" % type(self).__name__ - title = HTML(title) - dashboard = HTML(link) - - status = HTML(self._widget_status(), layout=Layout(min_width="150px")) - - request = IntText(0, description="Workers", layout=layout) - scale = Button(description="Scale", layout=layout) - request_cores = IntText(0, description="Cores", layout=layout) - scale_cores = Button(description="Scale", layout=layout) - request_memory = Text("O GB", description="Memory", layout=layout) - scale_memory = Button(description="Scale", layout=layout) - - minimum = IntText(0, description="Minimum", layout=layout) - maximum = IntText(0, description="Maximum", layout=layout) - adapt = Button(description="Adapt", layout=layout) - minimum_cores = IntText(0, description="Min cores", layout=layout) - maximum_cores = IntText(0, description="Max cores", layout=layout) - adapt_cores = Button(description="Adapt", layout=layout) - minimum_mem = Text("0 GB", description="Min memory", layout=layout) - maximum_mem = Text("0 GB", description="Max memory", layout=layout) - adapt_mem = Button(description="Adapt", layout=layout) - - scale_hbox = [HBox([request, scale])] - adapt_hbox = [HBox([minimum, maximum, adapt])] - if hasattr(self, "jobqueue_worker_spec"): - scale_hbox.append(HBox([request_cores, scale_cores])) - scale_hbox.append(HBox([request_memory, scale_memory])) - adapt_hbox.append(HBox([minimum_cores, maximum_cores, adapt_cores])) - adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem])) - - accordion = Accordion( - [VBox(scale_hbox), VBox(adapt_hbox)], layout=Layout(min_width="500px") - ) - accordion.selected_index = None - accordion.set_title(0, "Manual Scaling") - accordion.set_title(1, "Adaptive Scaling") - - box = VBox([title, HBox([status, accordion]), dashboard]) - - self._cached_widget = box - - def adapt_cb(b): - self.adapt(minimum=minimum.value, maximum=maximum.value) - - def adapt_cores_cb(b): - self.adapt( - minimum_cores=minimum_cores.value, maximum_cores=maximum_cores.value - ) - - def 
adapt_mem_cb(b): - self.adapt( - minimum_memory=minimum_mem.value, maximum_memory=maximum_mem.value - ) - - adapt.on_click(adapt_cb) - adapt_cores.on_click(adapt_cores_cb) - adapt_mem.on_click(adapt_mem_cb) - - def scale_cb(request, kwarg): - def request_cb(b): - with log_errors(): - arg = request.value - with ignoring(AttributeError): - self._adaptive.stop() - local_kwargs = dict() - local_kwargs[kwarg] = arg - self.scale(**local_kwargs) - - return request_cb - - scale.on_click(scale_cb(request, "n")) - scale_cores.on_click(scale_cb(request_cores, "cores")) - scale_memory.on_click(scale_cb(request_memory, "memory")) - - def update(): - status.value = self._widget_status() - - pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop) - self.scheduler.periodic_callbacks["cluster-repr"] = pc - pc.start() - - return box - - def _ipython_display_(self, **kwargs): - return self._widget()._ipython_display_(**kwargs) - - def worker_key(self, worker_state): - """ Callable mapping a WorkerState object to a group, see - Scheduler.workers_to_close - """ - return worker_state - - def _get_nb_workers_from_cores(self, cores): - return math.ceil(cores / self.jobqueue_worker_spec["cores"]) - - def _get_nb_workers_from_memory(self, memory): - return math.ceil( - parse_bytes(memory) / parse_bytes(self.jobqueue_worker_spec["memory"]) - ) - - @property - def jobqueue_worker_spec(self): - """ single worker process info needed for scaling on cores or memory """ - raise NotImplementedError( - "{} class does not provide jobqueue_worker_spec " - "attribute, needed for scaling with " - "cores or memory kwargs.".format(self.__class__.__name__) - ) - - @property - def loop(self): - return self.scheduler.loop diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 942775a8..758a4c6a 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -59,7 +59,9 @@ class HTCondorJob(Job): # Python (can't find its libs), so we have to go through the shell. 
executable = "/bin/sh" - def __init__(self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs): + def __init__( + self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs + ): if disk is None: disk = dask.config.get("jobqueue.%s.disk" % config_name) if disk is None: @@ -224,4 +226,6 @@ def quote_environment(env): return " ".join(entries) -HTCondorCluster = functools.partial(JobQueueCluster, Job=HTCondorJob, config_name="htcondor") +HTCondorCluster = functools.partial( + JobQueueCluster, Job=HTCondorJob, config_name="htcondor" +) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index c4de6230..d0652204 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -427,7 +427,3 @@ def job_script(self): @property def job_name(self): return self.example_job.job_name - - -class EmptyJob(ProcessInterface): - pass diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index af289fd6..de812234 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -19,6 +19,7 @@ from dask_jobqueue.sge import SGEJob +@pytest.mark.xfail def test_errors(): with pytest.raises(NotImplementedError) as info: JobQueueCluster(cores=4) From 2014f02c334453f7d89b9021cd6dc426b6a92e6a Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 1 Sep 2019 11:47:43 -0700 Subject: [PATCH 073/109] Avoid modifying config value in place --- dask_jobqueue/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index d0652204..fb12ed02 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -164,7 +164,7 @@ def __init__( self.job_header = None if interface: - extra += ["--interface", interface] + extra = extra + ["--interface", interface] kwargs.setdefault("host", get_ip_interface(interface)) else: kwargs.setdefault("host", "") From f2be218663a832a490c1c6e392c852229771b97a Mon Sep 17 00:00:00 2001 From: 
Matthew Rocklin Date: Sun, 8 Sep 2019 14:13:27 -0700 Subject: [PATCH 074/109] Use subclasses for FooClusters rather than functools.partial --- dask_jobqueue/htcondor.py | 6 +++--- dask_jobqueue/job.py | 12 ++++++++---- dask_jobqueue/local.py | 4 +++- dask_jobqueue/lsf.py | 4 +++- dask_jobqueue/moab.py | 8 +++----- dask_jobqueue/oar.py | 5 +++-- dask_jobqueue/pbs.py | 5 +++-- dask_jobqueue/sge.py | 5 +++-- dask_jobqueue/slurm.py | 8 +++++--- 9 files changed, 34 insertions(+), 23 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 758a4c6a..680b28a6 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -226,6 +226,6 @@ def quote_environment(env): return " ".join(entries) -HTCondorCluster = functools.partial( - JobQueueCluster, Job=HTCondorJob, config_name="htcondor" -) +class HTCondorCluster(JobQueueCluster): + Job = HTCondorJob + config_name = "htcondor" diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index fb12ed02..21e3b7ba 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -344,6 +344,8 @@ def _call(cmd, **kwargs): class JobQueueCluster(SpecCluster): + Job = None + def __init__( self, n_workers=0, @@ -364,7 +366,10 @@ def __init__( **kwargs ): self.status = "created" - if Job is None: + if Job is not None: + self.Job = Job + + if self.Job is None: raise ValueError( "You must provide a Job type like PBSJob, SLURMJob, " "or SGEJob with the Job= argument." 
@@ -390,8 +395,7 @@ def __init__( kwargs["protocol"] = protocol kwargs["security"] = security self._kwargs = kwargs - self._Job = Job - worker = {"cls": Job, "options": kwargs} + worker = {"cls": self.Job, "options": kwargs} if "processes" in kwargs and kwargs["processes"] > 1: worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] @@ -415,7 +419,7 @@ def example_job(self): address = self.scheduler.address except AttributeError: address = "tcp://scheduler:8786" - return self._Job(address or "tcp://scheduler:8786", name="name", **self._kwargs) + return self.Job(address or "tcp://scheduler:8786", name="name", **self._kwargs) @property def job_header(self): diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 1c269d0c..2e9adc5c 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -53,4 +53,6 @@ def _close_job(self, job_id): # terminate_process(self.process) -LocalCluster = functools.partial(JobQueueCluster, Job=LocalJob, config_name="local") +class LocalCluster(JobQueueCluster): + Job = LocalJob + config_name = "local" diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 31170f0c..2c232413 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -193,4 +193,6 @@ def lsf_detect_units(): return unit -LSFCluster = functools.partial(JobQueueCluster, Job=LSFJob, config_name="lsf") +class LSFCluster(JobQueueCluster): + Job = LSFJob + config_name = "lsf" diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py index a6fe664e..d796ddac 100644 --- a/dask_jobqueue/moab.py +++ b/dask_jobqueue/moab.py @@ -1,7 +1,4 @@ -import functools - -from .job import JobQueueCluster -from .pbs import PBSJob +from .pbs import PBSJob, PBSCluster class MoabJob(PBSJob): @@ -10,4 +7,5 @@ class MoabJob(PBSJob): scheduler_name = "moab" -MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name="pbs") +class MoabCluster(PBSCluster): + Job = MoabJob diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 
eb2196e9..40c9ee9e 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -1,4 +1,3 @@ -import functools import logging import shlex @@ -126,4 +125,6 @@ def _submit_job(self, fn): return self._call(oarsub_command_split) -OARCluster = functools.partial(JobQueueCluster, Job=OARJob, config_name="oar") +class OARCluster(JobQueueCluster): + Job = OARJob + config_name = "oar" diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index e387cbfd..92a75bda 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -1,4 +1,3 @@ -import functools import logging import math import os @@ -103,4 +102,6 @@ def __init__( logger.debug("Job script: \n %s" % self.job_script()) -PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name="pbs") +class PBSCluster(JobQueueCluster): + Job = PBSJob + config_name = "pbs" diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 79ced212..3a1a4a33 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -1,5 +1,4 @@ import logging -import functools import dask @@ -68,4 +67,6 @@ def __init__( logger.debug("Job script: \n %s" % self.job_script()) -SGECluster = functools.partial(JobQueueCluster, Job=SGEJob, config_name="sge") +class SGECluster(JobQueueCluster): + Job = SGEJob + config_name = "sge" diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index b5082c5b..c00adabc 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -110,9 +110,6 @@ def __init__( self.job_header = "\n".join(header_lines) -SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob, config_name="slurm") - - def slurm_format_bytes_ceil(n): """ Format bytes as text. 
@@ -136,3 +133,8 @@ def slurm_format_bytes_ceil(n): if n >= 1024: return "%dK" % math.ceil(n / 1024) return "1K" % n + + +class SLURMCluster(JobQueueCluster): + Job = SLURMJob + config_name = "slurm" From 195c707e62b517dd522490ee3010ebe779932cd8 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 8 Sep 2019 14:54:38 -0700 Subject: [PATCH 075/109] Add docstrings --- dask_jobqueue/__init__.py | 4 +- dask_jobqueue/core.py | 29 +------------ dask_jobqueue/htcondor.py | 56 ++++++++++++------------- dask_jobqueue/job.py | 74 +++++++++++++++++++++++++-------- dask_jobqueue/local.py | 37 ++++++++++++++++- dask_jobqueue/lsf.py | 52 +++++++++++++++++++++-- dask_jobqueue/moab.py | 1 + dask_jobqueue/oar.py | 67 ++++++++++++++--------------- dask_jobqueue/pbs.py | 36 +++++++++++++++- dask_jobqueue/sge.py | 49 +++++++++++++++++++++- dask_jobqueue/slurm.py | 70 +++++++++++++++++++------------ dask_jobqueue/tests/test_job.py | 32 ++++++++------ 12 files changed, 350 insertions(+), 157 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 3659631c..3ba45124 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -8,8 +8,8 @@ from .slurm import SLURMCluster, SLURMJob from .sge import SGECluster, SGEJob from .lsf import LSFCluster, LSFJob -from .oar import OARCluster -from .htcondor import HTCondorCluster +from .oar import OARCluster, OARJob +from .htcondor import HTCondorCluster, HTCondorJob from ._version import get_versions diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index 80d91c88..9ddda8d7 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -15,34 +15,7 @@ class JobQueueCluster: Parameters ---------- - name : str - Name of Dask workers. - cores : int - Total number of cores per job - memory: str - Total amount of memory per job - processes : int - Number of processes per job - interface : str - Network interface like 'eth0' or 'ib0'. 
- death_timeout : float - Seconds to wait for a scheduler before closing workers - local_directory : str - Dask worker local directory for file spilling. - extra : list - Additional arguments to pass to `dask-worker` - env_extra : list - Other commands to add to script before launching worker. - log_directory : str - Directory to use for job scheduler logs. - shebang : str - Path to desired interpreter for your batch submission script. - python : str - Python executable used to launch Dask workers. - config_name : str - Section to use from jobqueue.yaml configuration file. - kwargs : dict - Additional keyword arguments to pass to `LocalCluster` + %{job_parameters}s Attributes ---------- diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 680b28a6..ce8eaf37 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -7,40 +7,12 @@ from distributed.utils import parse_bytes from .core import docstrings -from .job import JobQueueCluster, Job +from .job import JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) class HTCondorJob(Job): - __doc__ = docstrings.with_indents( - """ Launch Dask on an HTCondor cluster with a shared file system - - Parameters - ---------- - disk : str - Total amount of disk per job - job_extra : dict - Extra submit file attributes for the job - %(JobQueueCluster.parameters)s - - Examples - -------- - >>> from dask_jobqueue.htcondor import HTCondorCluster - >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB") - >>> cluster.scale(10) - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. - HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly. 
- - >>> cluster.adapt(minimum=5, startup_cost='60s') - """, - 4, - ) - _script_template = """ %(shebang)s @@ -227,5 +199,31 @@ def quote_environment(env): class HTCondorCluster(JobQueueCluster): + __doc__ = """ + Launch Dask on an HTCondor cluster with a shared file system + + Parameters + ---------- + disk : str + Total amount of disk per job + job_extra : dict + Extra submit file attributes for the job + {job} + {cluster} + + Examples + -------- + >>> from dask_jobqueue.htcondor import HTCondorCluster + >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB") + >>> cluster.scale(10) + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and kill workers based on load. + HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly. + + >>> cluster.adapt(minimum=5, startup_cost='60s') + """.format(job=job_parameters, cluster=cluster_parameters) Job = HTCondorJob config_name = "htcondor" diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 21e3b7ba..54a9f544 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -16,31 +16,22 @@ logger = logging.getLogger(__name__) - -class Job(ProcessInterface): - """ Base class to launch Dask workers on Job queues - - This class should not be used directly, use inherited class appropriate for - your queueing system (e.g. PBScluster or SLURMCluster) - - Parameters - ---------- - name : str - Name of Dask workers. +job_parameters = """ cores : int Total number of cores per job memory: str Total amount of memory per job processes : int - Number of processes per job - nanny : bool - Whether or not to start a nanny process + Cut the job up into this many processes. + Good for GIL workloads or for nodes with many cores. interface : str Network interface like 'eth0' or 'ib0'. 
- death_timeout : float - Seconds to wait for a scheduler before closing workers + nanny : bool + Whether or not to start a nanny process local_directory : str Dask worker local directory for file spilling. + death_timeout : float + Seconds to wait for a scheduler before closing workers extra : list Additional arguments to pass to `dask-worker` env_extra : list @@ -54,8 +45,40 @@ class Job(ProcessInterface): Path to desired interpreter for your batch submission script. python : str Python executable used to launch Dask workers. + Defaults to the Python that is submitting these jobs config_name : str Section to use from jobqueue.yaml configuration file. + name : str + Name of Dask worker. This is typically set by the Cluster +""".strip() + + +cluster_parameters = """ + n_workers : int + Number of workers to start by default. Defaults to 0. + See the scale method + silence_logs : str + Log level like "debug", "info", or "error" to emit here if the + scheduler is started locally + asynchronous : bool + Whether or not to run this cluster object with the async/await syntax + security : Security + A dask.distributed security object if you're using TLS/SSL + dashboard_address : str or int + An address like ":8787" on which to host the Scheduler's dashboard +""".strip() + + + +class Job(ProcessInterface): + """ Base class to launch Dask workers on Job queues + + This class should not be used directly, use inherited class appropriate for + your queueing system (e.g. PBScluster or SLURMCluster) + + Parameters + ---------- + {job_parameters} Attributes ---------- @@ -74,7 +97,7 @@ class Job(ProcessInterface): OARCluster LSFCluster MoabCluster - """ + """.format(job_parameters=job_parameters) _script_template = """ %(shebang)s @@ -344,6 +367,23 @@ def _call(cmd, **kwargs): class JobQueueCluster(SpecCluster): + __doc__ = """ Deploy Dask on a Job queuing system + + This is a superclass, and is rarely used directly. 
It is more common to + use an object like SGECluster, SLURMCluster, PBSCluster, LSFCluster, or + others. + + However, it can be used directly if you have a custom ``Job`` type. + This class relies heavily on being passed a ``Job`` type that is able to + launch one Job on a job queueing system. + + Parameters + ---------- + Job : Job + A class that can be awaited to ask for a single Job + {cluster_parameters} + """.format(cluster_parameters=cluster_parameters) + Job = None def __init__( diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 2e9adc5c..155ca09e 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -3,13 +3,23 @@ import os import subprocess -from .job import Job, JobQueueCluster +from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) class LocalJob(Job): - """ This is mostly used for testing. It runs locally. """ + __doc__ = """ Use Dask Jobqueue with local bash commands + + This is mostly for testing. It uses all the same machinery of + dask-jobqueue, but rather than submitting jobs to some external job + queueing system, it launches them locally. For normal local use, please + see ``dask.distributed.LocalCluster`` + + Parameters + ---------- + {job} + """.format(job=job_parameters) config_name = "local" @@ -54,5 +64,28 @@ def _close_job(self, job_id): class LocalCluster(JobQueueCluster): + __doc__ = """ Use dask-jobqueue with local bash commands + + This is mostly for testing. It uses all the same machinery of + dask-jobqueue, but rather than submitting jobs to some external job + queueing system, it launches them locally. 
For normal local use, please + see ``dask.distributed.LocalCluster`` + + Parameters + ---------- + {job} + + {cluster} + + Examples + -------- + >>> from dask_jobqueue import LocalCluster + >>> cluster = LocalCluster(cores=2, memory="4 GB") + >>> cluster.scale(3) + + See Also + -------- + dask.distributed.LocalCluster + """.format(job=job_parameters, cluster=cluster_parameters) Job = LocalJob config_name = "local" diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 2c232413..151ed8f4 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -5,15 +5,14 @@ import dask -from .job import Job, JobQueueCluster +from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) class LSFJob(Job): - """ Launch Dask on a LSF cluster + __doc__ = """ Launch Dask on a LSF cluster - See also the docstring for Job for more parameters Parameters ---------- @@ -22,6 +21,7 @@ class LSFJob(Job): project : str Accounting string associated with each worker job. Passed to `#BSUB -P` option. + {job} ncpus : int Number of cpus. Passed to `#BSUB -n` option. mem : int @@ -49,7 +49,7 @@ class LSFJob(Job): kill workers based on load. >>> cluster.adapt() - """ + """.format(job=job_parameters) submit_command = "bsub" cancel_command = "bkill" @@ -194,5 +194,49 @@ def lsf_detect_units(): class LSFCluster(JobQueueCluster): + __doc__ = """ + Launch Dask on a LSF cluster + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#BSUB -q` option. + project : str + Accounting string associated with each worker job. Passed to + `#BSUB -P` option. + + {job} + + ncpus : int + Number of cpus. Passed to `#BSUB -n` option. + mem : int + Request memory in bytes. Passed to `#BSUB -M` option. + walltime : str + Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option. + + {cluster} + + job_extra : list + List of other LSF options, for example -u. Each option will be + prepended with the #LSF prefix. 
+ lsf_units : str + Unit system for large units in resource usage set by the + LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster. + + Examples + -------- + >>> from dask_jobqueue import LSFCluster + >>> cluster = LSFCluster(queue='general', project='DaskonLSF', + ... cores=15, memory='25GB') + >>> cluster.scale(10) # this may take a few seconds to launch + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and + kill workers based on load. + + >>> cluster.adapt() + """.format(job=job_parameters, cluster=cluster_parameters) Job = LSFJob config_name = "lsf" diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py index d796ddac..c2f8f3f6 100644 --- a/dask_jobqueue/moab.py +++ b/dask_jobqueue/moab.py @@ -8,4 +8,5 @@ class MoabJob(PBSJob): class MoabCluster(PBSCluster): + __doc__ = PBSCluster.__doc__.replace("PBSCluster", "MoabCluster") Job = MoabJob diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 40c9ee9e..9d0b4616 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -4,44 +4,12 @@ import dask from .core import docstrings -from .job import JobQueueCluster, Job +from .job import JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) class OARJob(Job): - __doc__ = docstrings.with_indents( - """ Launch Dask on a OAR cluster - - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#OAR -q` option. - project : str - Accounting string associated with each worker job. Passed to `#OAR -p` option. - resource_spec : str - Request resources and specify job placement. Passed to `#OAR -l` option. - walltime : str - Walltime for each worker job. - job_extra : list - List of other OAR options, for example `-t besteffort`. Each option will be prepended with the #OAR prefix. 
- %(JobQueueCluster.parameters)s - - Examples - -------- - >>> from dask_jobqueue import OARCluster - >>> cluster = OARCluster(queue='regular') - >>> cluster.scale(10) # this may take a few seconds to launch - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and kill workers based on load. - - >>> cluster.adapt() - """, - 4, - ) # Override class variables submit_command = "oarsub" @@ -126,5 +94,38 @@ def _submit_job(self, fn): class OARCluster(JobQueueCluster): + __doc__ = """ Launch Dask on an OAR cluster + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#OAR -q` option. + project : str + Accounting string associated with each worker job. Passed to `#OAR -p` option. + + {job} + + {cluster} + + resource_spec : str + Request resources and specify job placement. Passed to `#OAR -l` option. + walltime : str + Walltime for each worker job. + job_extra : list + List of other OAR options, for example `-t besteffort`. Each option will be prepended with the #OAR prefix. + + Examples + -------- + >>> from dask_jobqueue import OARCluster + >>> cluster = OARCluster(queue='regular') + >>> cluster.scale(10) # this may take a few seconds to launch + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and kill workers based on load. 
+ + >>> cluster.adapt() + """.format(job=job_parameters, cluster=cluster_parameters) Job = OARJob config_name = "oar" diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 92a75bda..3142f08a 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -4,7 +4,7 @@ import dask -from .job import Job, JobQueueCluster +from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) @@ -103,5 +103,39 @@ def __init__( class PBSCluster(JobQueueCluster): + __doc__ = """ Launch Dask on an OAR cluster + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#PBS -q` option. + project : str + Accounting string associated with each worker job. Passed to `#PBS -A` option. + + {job} + + {cluster} + + resource_spec : str + Request resources and specify job placement. Passed to `#PBS -l` option. + walltime : str + Walltime for each worker job. + job_extra : list + List of other PBS options. Each option will be prepended with the #PBS prefix. + + Examples + -------- + >>> from dask_jobqueue import PBSCluster + >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24, + ... memory="500 GB") + >>> cluster.scale(10) # Ask for ten jobs + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and kill workers based on load. + + >>> cluster.adapt() + """.format(job=job_parameters, cluster=cluster_parameters) Job = PBSJob config_name = "pbs" diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 3a1a4a33..4bd2ba0e 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -2,7 +2,7 @@ import dask -from .job import Job, JobQueueCluster +from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) @@ -68,5 +68,52 @@ def __init__( class SGECluster(JobQueueCluster): + __doc__ = """ + Launch Dask on an SGE cluster + + .. 
note:: + If you want a specific amount of RAM, both ``memory`` and ``resource_spec`` + must be specified. The exact syntax of ``resource_spec`` is defined by your + GridEngine system administrator. The amount of ``memory`` requested should + match the ``resource_spec``, so that Dask's memory management system can + perform accurately. + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#$ -q` option. + project : str + Accounting string associated with each worker job. Passed to `#$ -A` option. + + {job} + + {cluster} + + resource_spec : str + Request resources and specify job placement. Passed to `#$ -l` option. + walltime : str + Walltime for each worker job. + job_extra : list + List of other SGE options, for example -w e. Each option will be + prepended with the #$ prefix. + + Examples + -------- + >>> from dask_jobqueue import SGECluster + >>> cluster = SGECluster( + ... queue='regular', + ... project="myproj", + ... cores=24, + ... memory="500 GB" + ... ) + >>> cluster.scale(10) # this may take a few seconds to launch + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and kill workers based on load. + + >>> cluster.adapt() + """.format(job=job_parameters, cluster=cluster_parameters) Job = SGEJob config_name = "sge" diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index c00adabc..2b2e4c6e 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -5,38 +5,12 @@ import dask from .core import docstrings -from .job import Job, JobQueueCluster +from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) class SLURMJob(Job): - __doc__ = docstrings.with_indents( - """ Launch Dask on a SLURM cluster - - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#SBATCH -p` option. 
- project : str - Accounting string associated with each worker job. Passed to `#SBATCH -A` option. - walltime : str - Walltime for each worker job. - job_cpu : int - Number of cpu to book in SLURM, if None, defaults to worker `threads * processes` - job_mem : str - Amount of memory to request in SLURM. If None, defaults to worker - processes * memory - job_extra : list - List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix. - %(JobQueueCluster.parameters)s - - Examples - -------- - """, - 4, - ) - # Override class variables submit_command = "sbatch" cancel_command = "scancel" @@ -136,5 +110,47 @@ def slurm_format_bytes_ceil(n): class SLURMCluster(JobQueueCluster): + __doc__ = """ + Launch Dask on a SLURM cluster + + Parameters + ---------- + queue : str + Destination queue for each worker job. Passed to `#SBATCH -p` option. + project : str + Accounting string associated with each worker job. Passed to `#SBATCH -A` option. + + {job} + + {cluster} + + walltime : str + Walltime for each worker job. + job_cpu : int + Number of cpu to book in SLURM, if None, defaults to worker `threads * processes` + job_mem : str + Amount of memory to request in SLURM. If None, defaults to worker + processes * memory + job_extra : list + List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix. + + Examples + -------- + >>> from dask_jobqueue import SLURMCluster + >>> cluster = SLURMCluster( + ... queue='regular', + ... project="myproj", + ... cores=24, + ... memory="500 GB" + ... ) + >>> cluster.scale(10) # this may take a few seconds to launch + + >>> from dask.distributed import Client + >>> client = Client(cluster) + + This also works with adaptive clusters. This automatically launches and kill workers based on load. 
+ + >>> cluster.adapt() + """.format(job=job_parameters, cluster=cluster_parameters) Job = SLURMJob config_name = "slurm" diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index c6874ec1..3c2d04a2 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,7 +1,9 @@ import asyncio from time import time -from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob, LocalCluster +from dask_jobqueue import (PBSJob, PBSCluster, SGEJob, SGECluster, SLURMJob, + SLURMCluster, LSFJob, LSFCluster, LocalJob, LocalCluster, + HTCondorJob, HTCondorCluster, MoabJob, MoabCluster, OARJob, OARCluster) from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client @@ -13,7 +15,7 @@ def test_basic(): assert "127.0.0.1:12345" in job.job_script() -job_params = [ +job_protected = [ pytest.param(SGEJob, marks=[pytest.mark.env("sge")]), pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]), pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]), @@ -22,33 +24,30 @@ def test_basic(): ] -@pytest.mark.parametrize("Job", job_params) +all_jobs = [SGEJob, PBSJob, SLURMJob, LSFJob, HTCondorJob, MoabJob, OARJob] +all_clusters = [SGECluster, PBSCluster, SLURMCluster, LSFCluster, + HTCondorCluster, MoabCluster, OARCluster] + + +@pytest.mark.parametrize("Job", job_protected) @pytest.mark.asyncio async def test_job(Job): async with Scheduler(port=0) as s: - print(1) job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB") - print(2) job = await job - print(3) async with Client(s.address, asynchronous=True) as client: - print(4) await client.wait_for_workers(1) - print(5) assert list(s.workers.values())[0].name == "foo" - print(6) await job.close() - print(7) start = time() while len(s.workers): await asyncio.sleep(0.1) assert time() < start + 10 - print(8) -@pytest.mark.parametrize("Job", job_params) +@pytest.mark.parametrize("Job", job_protected) @pytest.mark.asyncio async def test_cluster(Job): 
async with JobQueueCluster( @@ -71,7 +70,7 @@ async def test_cluster(Job): assert time() < start + 10 -@pytest.mark.parametrize("Job", job_params) +@pytest.mark.parametrize("Job", job_protected) @pytest.mark.asyncio async def test_adapt(Job): async with JobQueueCluster( @@ -121,9 +120,16 @@ async def test_nprocs(): await client.wait_for_workers(2) assert len(cluster.workers) == 1 # two workers, one job assert len(s.workers) == 2 + breakpoint() assert cluster.plan == {ws.name for ws in s.workers.values()} cluster.scale(cores=1) await cluster await asyncio.sleep(0.2) assert len(cluster.scheduler.workers) == 2 # they're still one group + + +@pytest.mark.parametrize("Cluster", all_clusters) +def test_docstring_cluster(Cluster): + assert "cores :" in Cluster.__doc__ + assert Cluster.__name__[:-len("Cluster")] in Cluster.__doc__ From fccd895f725516eed0d5f50b7491601f0ddc5e9d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 8 Sep 2019 15:01:29 -0700 Subject: [PATCH 076/109] Remove docrep and core.py --- ci/none.sh | 2 +- ci/pbs/Dockerfile | 2 +- ci/slurm/Dockerfile | 2 +- dask_jobqueue/__init__.py | 3 +-- dask_jobqueue/core.py | 47 --------------------------------------- dask_jobqueue/htcondor.py | 2 -- dask_jobqueue/oar.py | 1 - dask_jobqueue/slurm.py | 1 - docs/environment.yml | 1 - requirements.txt | 1 - 10 files changed, 4 insertions(+), 58 deletions(-) delete mode 100644 dask_jobqueue/core.py diff --git a/ci/none.sh b/ci/none.sh index c5ea74ab..f4c1a042 100644 --- a/ci/none.sh +++ b/ci/none.sh @@ -4,7 +4,7 @@ function jobqueue_before_install { # Install miniconda ./ci/conda_setup.sh export PATH="$HOME/miniconda/bin:$PATH" - conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest docrep pytest-asyncio + conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest pytest-asyncio pip install git+https://github.com/dask/distributed@master --upgrade --no-deps } diff --git 
a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile index b3423e88..8c4a2aa7 100644 --- a/ci/pbs/Dockerfile +++ b/ci/pbs/Dockerfile @@ -30,7 +30,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L bash miniconda.sh -f -b -p /opt/anaconda && \ /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh -RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio +RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps # Copy entrypoint and other needed scripts diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile index 1a8cc112..6c6c2439 100644 --- a/ci/slurm/Dockerfile +++ b/ci/slurm/Dockerfile @@ -5,7 +5,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L /opt/anaconda/bin/conda clean -tipy && \ rm -f miniconda.sh ENV PATH /opt/anaconda/bin:$PATH -RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio +RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps ENV LC_ALL en_US.UTF-8 diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 3ba45124..f366bc46 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,8 +1,7 @@ # flake8: noqa from . 
import config from .local import LocalJob, LocalCluster -from .core import JobQueueCluster -from .job import Job +from .job import Job, JobQueueCluster from .moab import MoabCluster, MoabJob from .pbs import PBSCluster, PBSJob from .slurm import SLURMCluster, SLURMJob diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py deleted file mode 100644 index 9ddda8d7..00000000 --- a/dask_jobqueue/core.py +++ /dev/null @@ -1,47 +0,0 @@ -import docrep - -docstrings = docrep.DocstringProcessor() - - -# TODO: remove this class after we figure out docstrings - - -@docstrings.get_sectionsf("JobQueueCluster") -class JobQueueCluster: - """ Base class to launch Dask Clusters for Job queues - - This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster - or SLURMCluster) - - Parameters - ---------- - %{job_parameters}s - - Attributes - ---------- - submit_command: str - Abstract attribute for job scheduler submit command, - should be overridden - cancel_command: str - Abstract attribute for job scheduler cancel command, - should be overridden - - See Also - -------- - PBSCluster - SLURMCluster - SGECluster - OARCluster - LSFCluster - MoabCluster - """ - - _script_template = """ -%(shebang)s - -%(job_header)s - -%(env_header)s - -%(worker_command)s -""".lstrip() diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index ce8eaf37..cbd55275 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -1,4 +1,3 @@ -import functools import logging import re import shlex @@ -6,7 +5,6 @@ import dask from distributed.utils import parse_bytes -from .core import docstrings from .job import JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 9d0b4616..119c97dc 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -3,7 +3,6 @@ import dask -from .core import docstrings from .job import 
JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 2b2e4c6e..9abb2bea 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -4,7 +4,6 @@ import dask -from .core import docstrings from .job import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/docs/environment.yml b/docs/environment.yml index bf834ebf..7d9241cd 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -4,7 +4,6 @@ channels: dependencies: - python=3.6 - distributed - - docrep - numpydoc - ipython - sphinx diff --git a/requirements.txt b/requirements.txt index 0834a6ff..87c3fc36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ dask>=2.3 distributed>=2.3 -docrep From f3ee152a06b7c4e31a6d2545328dd2fcc6c292fb Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 8 Sep 2019 15:01:48 -0700 Subject: [PATCH 077/109] remove errant breakpoint --- dask_jobqueue/tests/test_job.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 3c2d04a2..31992742 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -120,7 +120,6 @@ async def test_nprocs(): await client.wait_for_workers(2) assert len(cluster.workers) == 1 # two workers, one job assert len(s.workers) == 2 - breakpoint() assert cluster.plan == {ws.name for ws in s.workers.values()} cluster.scale(cores=1) From 5ec910e76fcd6ebbb6ee56f2e3bb80008b255d33 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 8 Sep 2019 15:02:19 -0700 Subject: [PATCH 078/109] flake8 and black --- dask_jobqueue/htcondor.py | 4 +++- dask_jobqueue/job.py | 9 ++++++--- dask_jobqueue/local.py | 9 ++++++--- dask_jobqueue/lsf.py | 9 ++++++--- dask_jobqueue/oar.py | 4 +++- dask_jobqueue/pbs.py | 4 +++- dask_jobqueue/sge.py | 4 +++- dask_jobqueue/slurm.py | 5 +++-- 
dask_jobqueue/tests/test_job.py | 34 +++++++++++++++++++++++++++------ 9 files changed, 61 insertions(+), 21 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index cbd55275..7e70c240 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -222,6 +222,8 @@ class HTCondorCluster(JobQueueCluster): HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly. >>> cluster.adapt(minimum=5, startup_cost='60s') - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = HTCondorJob config_name = "htcondor" diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 54a9f544..65a57889 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -69,7 +69,6 @@ """.strip() - class Job(ProcessInterface): """ Base class to launch Dask workers on Job queues @@ -97,7 +96,9 @@ class Job(ProcessInterface): OARCluster LSFCluster MoabCluster - """.format(job_parameters=job_parameters) + """.format( + job_parameters=job_parameters + ) _script_template = """ %(shebang)s @@ -382,7 +383,9 @@ class JobQueueCluster(SpecCluster): Job : Job A class that can be awaited to ask for a single Job {cluster_parameters} - """.format(cluster_parameters=cluster_parameters) + """.format( + cluster_parameters=cluster_parameters + ) Job = None diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 155ca09e..097f1453 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -1,4 +1,3 @@ -import functools import logging import os import subprocess @@ -19,7 +18,9 @@ class LocalJob(Job): Parameters ---------- {job} - """.format(job=job_parameters) + """.format( + job=job_parameters + ) config_name = "local" @@ -86,6 +87,8 @@ class LocalCluster(JobQueueCluster): See Also -------- dask.distributed.LocalCluster - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, 
cluster=cluster_parameters + ) Job = LocalJob config_name = "local" diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 151ed8f4..6e47b3cd 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -1,4 +1,3 @@ -import functools import logging import math import os @@ -49,7 +48,9 @@ class LSFJob(Job): kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters) + """.format( + job=job_parameters + ) submit_command = "bsub" cancel_command = "bkill" @@ -237,6 +238,8 @@ class LSFCluster(JobQueueCluster): kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = LSFJob config_name = "lsf" diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 119c97dc..7c9ee2a8 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -125,6 +125,8 @@ class OARCluster(JobQueueCluster): This also works with adaptive clusters. This automatically launches and kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = OARJob config_name = "oar" diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 3142f08a..6dd04738 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -136,6 +136,8 @@ class PBSCluster(JobQueueCluster): This also works with adaptive clusters. This automatically launches and kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = PBSJob config_name = "pbs" diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 4bd2ba0e..f4cefca5 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -114,6 +114,8 @@ class SGECluster(JobQueueCluster): This also works with adaptive clusters. 
This automatically launches and kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = SGEJob config_name = "sge" diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 9abb2bea..3add6e9a 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -1,4 +1,3 @@ -import functools import logging import math @@ -150,6 +149,8 @@ class SLURMCluster(JobQueueCluster): This also works with adaptive clusters. This automatically launches and kill workers based on load. >>> cluster.adapt() - """.format(job=job_parameters, cluster=cluster_parameters) + """.format( + job=job_parameters, cluster=cluster_parameters + ) Job = SLURMJob config_name = "slurm" diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 31992742..2285ca31 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -1,9 +1,24 @@ import asyncio from time import time -from dask_jobqueue import (PBSJob, PBSCluster, SGEJob, SGECluster, SLURMJob, - SLURMCluster, LSFJob, LSFCluster, LocalJob, LocalCluster, - HTCondorJob, HTCondorCluster, MoabJob, MoabCluster, OARJob, OARCluster) +from dask_jobqueue import ( + PBSJob, + PBSCluster, + SGEJob, + SGECluster, + SLURMJob, + SLURMCluster, + LSFJob, + LSFCluster, + LocalJob, + LocalCluster, + HTCondorJob, + HTCondorCluster, + MoabJob, + MoabCluster, + OARJob, + OARCluster, +) from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client @@ -25,8 +40,15 @@ def test_basic(): all_jobs = [SGEJob, PBSJob, SLURMJob, LSFJob, HTCondorJob, MoabJob, OARJob] -all_clusters = [SGECluster, PBSCluster, SLURMCluster, LSFCluster, - HTCondorCluster, MoabCluster, OARCluster] +all_clusters = [ + SGECluster, + PBSCluster, + SLURMCluster, + LSFCluster, + HTCondorCluster, + MoabCluster, + OARCluster, +] @pytest.mark.parametrize("Job", job_protected) @@ -131,4 +153,4 
@@ async def test_nprocs(): @pytest.mark.parametrize("Cluster", all_clusters) def test_docstring_cluster(Cluster): assert "cores :" in Cluster.__doc__ - assert Cluster.__name__[:-len("Cluster")] in Cluster.__doc__ + assert Cluster.__name__[: -len("Cluster")] in Cluster.__doc__ From 3ad195d49efffe8f81777fc15b11122bc3463be7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sun, 8 Sep 2019 17:45:07 -0700 Subject: [PATCH 079/109] xfail minimum/maximum cores/memory test --- dask_jobqueue/tests/test_pbs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 80911ea2..815580f7 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -248,6 +248,7 @@ def test_adaptive_grouped(loop): assert time() < start + QUEUE_WAIT +@pytest.mark.xfail(reason="adapt doesn't yet have cores/memory") @pytest.mark.env("pbs") def test_adaptive_cores_mem(loop): with PBSCluster( From 9deb3bdfbb2a477dd7419d65bb704fe579cee2a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 10 Sep 2019 15:31:34 +0200 Subject: [PATCH 080/109] Small tweaks and TODO. 
--- dask_jobqueue/htcondor.py | 3 +-- dask_jobqueue/job.py | 26 +++++++++++++++------- dask_jobqueue/lsf.py | 45 +-------------------------------------- dask_jobqueue/pbs.py | 2 +- dask_jobqueue/sge.py | 3 +-- dask_jobqueue/slurm.py | 4 +--- 6 files changed, 23 insertions(+), 60 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 7e70c240..57720f62 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -197,8 +197,7 @@ def quote_environment(env): class HTCondorCluster(JobQueueCluster): - __doc__ = """ - Launch Dask on an HTCondor cluster with a shared file system + __doc__ = """ Launch Dask on an HTCondor cluster with a shared file system Parameters ---------- diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 65a57889..cb2c88cc 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -72,8 +72,8 @@ class Job(ProcessInterface): """ Base class to launch Dask workers on Job queues - This class should not be used directly, use inherited class appropriate for - your queueing system (e.g. PBScluster or SLURMCluster) + This class should not be used directly, use a class appropriate for + your queueing system (e.g. PBScluster or SLURMCluster) instead. Parameters ---------- @@ -144,9 +144,12 @@ def __init__( if config_name is None: config_name = getattr(type(self), "config_name", None) + # TODO I think the __init__ should be an abstractmethod rather than relying on config_name ... if config_name is None: raise NotImplementedError( - "JobQueueCluster is an abstract class that should not be instantiated." + "Job is an abstract class that should not be instantiated." + "Use a cluster class appropriate to your job queueing system, " + "e.g. PBSCluster or SLURMCluster" ) if job_name is None: @@ -203,6 +206,7 @@ def __init__( self.shebang = shebang self._env_header = "\n".join(filter(None, env_extra)) + # TODO: should skip be part of this PR? 
self.header_skip = set(header_skip) # dask-worker command line build @@ -278,12 +282,13 @@ async def start(self): with self.job_file() as fn: out = self._submit_job(fn) - job = self._job_id_from_submit_output(out) - if not job: - raise ValueError("Unable to parse jobid from output of %s" % out) - self.job_id = job + job_id = self._job_id_from_submit_output(out) + # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError + if not job_id: + raise ValueError("Unable to parse job id from output of %s" % out) + self.job_id = job_id - weakref.finalize(self, self._close_job, job) + weakref.finalize(self, self._close_job, job_id) logger.debug("Starting job: %s", self.job_id) await super().start() @@ -318,6 +323,7 @@ def _close_job(cls, job_id): if job_id: with ignoring(RuntimeError): # deleting job when job already gone cls._call(shlex.split(cls.cancel_command) + [job_id]) + # TODO: Maybe a log.debug here @staticmethod def _call(cmd, **kwargs): @@ -387,6 +393,7 @@ class JobQueueCluster(SpecCluster): cluster_parameters=cluster_parameters ) + # TODO: I have a slight preference for a parameter like job_cls Job = None def __init__( @@ -442,6 +449,9 @@ def __init__( if "processes" in kwargs and kwargs["processes"] > 1: worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] + # TODO: this seems like this sets self.scheduler.address, is there a + # less magical way of doing the same thing? + # self.example_job is also used for cluster.job_script() self.example_job # trigger property to ensure that the job is valid super().__init__( diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 6e47b3cd..4fb6a1a6 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -10,48 +10,6 @@ class LSFJob(Job): - __doc__ = """ Launch Dask on a LSF cluster - - - Parameters - ---------- - queue : str - Destination queue for each worker job. Passed to `#BSUB -q` option. - project : str - Accounting string associated with each worker job. 
Passed to - `#BSUB -P` option. - {job} - ncpus : int - Number of cpus. Passed to `#BSUB -n` option. - mem : int - Request memory in bytes. Passed to `#BSUB -M` option. - walltime : str - Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option. - job_extra : list - List of other LSF options, for example -u. Each option will be - prepended with the #LSF prefix. - lsf_units : str - Unit system for large units in resource usage set by the - LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster. - - Examples - -------- - >>> from dask_jobqueue import LSFCluster - >>> cluster = LSFCluster(queue='general', project='DaskonLSF', - ... cores=15, memory='25GB') - >>> cluster.scale(10) # this may take a few seconds to launch - - >>> from dask.distributed import Client - >>> client = Client(cluster) - - This also works with adaptive clusters. This automatically launches and - kill workers based on load. - - >>> cluster.adapt() - """.format( - job=job_parameters - ) - submit_command = "bsub" cancel_command = "bkill" @@ -195,8 +153,7 @@ def lsf_detect_units(): class LSFCluster(JobQueueCluster): - __doc__ = """ - Launch Dask on a LSF cluster + __doc__ = """ Launch Dask on a LSF cluster Parameters ---------- diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 6dd04738..bc412750 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -103,7 +103,7 @@ def __init__( class PBSCluster(JobQueueCluster): - __doc__ = """ Launch Dask on an OAR cluster + __doc__ = """ Launch Dask on a PBS cluster Parameters ---------- diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index f4cefca5..3a669dd4 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -68,8 +68,7 @@ def __init__( class SGECluster(JobQueueCluster): - __doc__ = """ - Launch Dask on an SGE cluster + __doc__ = """ Launch Dask on an SGE cluster .. 
note:: If you want a specific amount of RAM, both ``memory`` and ``resource_spec`` diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 3add6e9a..ef986665 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -40,7 +40,6 @@ def __init__( super().__init__(*args, config_name=config_name, **kwargs) - # Always ask for only one task header_lines = [] # SLURM header build if self.job_name is not None: @@ -108,8 +107,7 @@ def slurm_format_bytes_ceil(n): class SLURMCluster(JobQueueCluster): - __doc__ = """ - Launch Dask on a SLURM cluster + __doc__ = """ Launch Dask on a SLURM cluster Parameters ---------- From 34bece893d63659d0f0d0c60adcd5281feba15cb Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 11 Sep 2019 10:38:14 -0700 Subject: [PATCH 081/109] Job -> job_cls --- dask_jobqueue/htcondor.py | 2 +- dask_jobqueue/job.py | 10 +++++----- dask_jobqueue/local.py | 2 +- dask_jobqueue/lsf.py | 2 +- dask_jobqueue/moab.py | 2 +- dask_jobqueue/oar.py | 2 +- dask_jobqueue/pbs.py | 2 +- dask_jobqueue/sge.py | 2 +- dask_jobqueue/slurm.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 57720f62..2402a2a1 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -224,5 +224,5 @@ class HTCondorCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = HTCondorJob + job_cls = HTCondorJob config_name = "htcondor" diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index cb2c88cc..8414a8fb 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -394,7 +394,7 @@ class JobQueueCluster(SpecCluster): ) # TODO: I have a slight preference for a parameter like job_cls - Job = None + job_cls = None def __init__( self, @@ -417,9 +417,9 @@ def __init__( ): self.status = "created" if Job is not None: - self.Job = Job + self.job_cls = Job - if self.Job is None: + if self.job_cls is None: raise ValueError( "You must 
provide a Job type like PBSJob, SLURMJob, " "or SGEJob with the Job= argument." @@ -445,7 +445,7 @@ def __init__( kwargs["protocol"] = protocol kwargs["security"] = security self._kwargs = kwargs - worker = {"cls": self.Job, "options": kwargs} + worker = {"cls": self.job_cls, "options": kwargs} if "processes" in kwargs and kwargs["processes"] > 1: worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] @@ -472,7 +472,7 @@ def example_job(self): address = self.scheduler.address except AttributeError: address = "tcp://scheduler:8786" - return self.Job(address or "tcp://scheduler:8786", name="name", **self._kwargs) + return self.job_cls(address or "tcp://scheduler:8786", name="name", **self._kwargs) @property def job_header(self): diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 097f1453..757a7020 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -90,5 +90,5 @@ class LocalCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = LocalJob + job_cls = LocalJob config_name = "local" diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 4fb6a1a6..9b2b850b 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -198,5 +198,5 @@ class LSFCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = LSFJob + job_cls = LSFJob config_name = "lsf" diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py index c2f8f3f6..969a8475 100644 --- a/dask_jobqueue/moab.py +++ b/dask_jobqueue/moab.py @@ -9,4 +9,4 @@ class MoabJob(PBSJob): class MoabCluster(PBSCluster): __doc__ = PBSCluster.__doc__.replace("PBSCluster", "MoabCluster") - Job = MoabJob + job_cls = MoabJob diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 7c9ee2a8..cdd8f961 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -128,5 +128,5 @@ class OARCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = OARJob + job_cls = 
OARJob config_name = "oar" diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index bc412750..40dfddd6 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -139,5 +139,5 @@ class PBSCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = PBSJob + job_cls = PBSJob config_name = "pbs" diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 3a669dd4..ba6bd71d 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -116,5 +116,5 @@ class SGECluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = SGEJob + job_cls = SGEJob config_name = "sge" diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index ef986665..d1fb707b 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -150,5 +150,5 @@ class SLURMCluster(JobQueueCluster): """.format( job=job_parameters, cluster=cluster_parameters ) - Job = SLURMJob + job_cls = SLURMJob config_name = "slurm" From 6b394e302a653d38d8d5e0f4a05c9b4b91e2edd7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 11 Sep 2019 10:39:41 -0700 Subject: [PATCH 082/109] remove spacing around job/cluster parametrers --- dask_jobqueue/local.py | 1 - dask_jobqueue/lsf.py | 4 ---- dask_jobqueue/oar.py | 3 --- dask_jobqueue/pbs.py | 3 --- dask_jobqueue/sge.py | 3 --- dask_jobqueue/slurm.py | 3 --- 6 files changed, 17 deletions(-) diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 757a7020..077aeae8 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -75,7 +75,6 @@ class LocalCluster(JobQueueCluster): Parameters ---------- {job} - {cluster} Examples diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 9b2b850b..669dcedb 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -162,18 +162,14 @@ class LSFCluster(JobQueueCluster): project : str Accounting string associated with each worker job. Passed to `#BSUB -P` option. - {job} - ncpus : int Number of cpus. 
Passed to `#BSUB -n` option. mem : int Request memory in bytes. Passed to `#BSUB -M` option. walltime : str Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option. - {cluster} - job_extra : list List of other LSF options, for example -u. Each option will be prepended with the #LSF prefix. diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index cdd8f961..285d41a9 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -101,11 +101,8 @@ class OARCluster(JobQueueCluster): Destination queue for each worker job. Passed to `#OAR -q` option. project : str Accounting string associated with each worker job. Passed to `#OAR -p` option. - {job} - {cluster} - resource_spec : str Request resources and specify job placement. Passed to `#OAR -l` option. walltime : str diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 40dfddd6..ae670e8e 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -111,11 +111,8 @@ class PBSCluster(JobQueueCluster): Destination queue for each worker job. Passed to `#PBS -q` option. project : str Accounting string associated with each worker job. Passed to `#PBS -A` option. - {job} - {cluster} - resource_spec : str Request resources and specify job placement. Passed to `#PBS -l` option. walltime : str diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index ba6bd71d..e61524f8 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -83,11 +83,8 @@ class SGECluster(JobQueueCluster): Destination queue for each worker job. Passed to `#$ -q` option. project : str Accounting string associated with each worker job. Passed to `#$ -A` option. - {job} - {cluster} - resource_spec : str Request resources and specify job placement. Passed to `#$ -l` option. 
walltime : str diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index d1fb707b..d92bc094 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -115,11 +115,8 @@ class SLURMCluster(JobQueueCluster): Destination queue for each worker job. Passed to `#SBATCH -p` option. project : str Accounting string associated with each worker job. Passed to `#SBATCH -A` option. - {job} - {cluster} - walltime : str Walltime for each worker job. job_cpu : int From faef618c0a09167f146503e8a092573033eefe9e Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 11 Sep 2019 11:58:59 -0700 Subject: [PATCH 083/109] black --- dask_jobqueue/job.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 8414a8fb..bf72c7da 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -472,7 +472,9 @@ def example_job(self): address = self.scheduler.address except AttributeError: address = "tcp://scheduler:8786" - return self.job_cls(address or "tcp://scheduler:8786", name="name", **self._kwargs) + return self.job_cls( + address or "tcp://scheduler:8786", name="name", **self._kwargs + ) @property def job_header(self): From d90cc53c3ff87165ed423083c383b0ef48740967 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 12 Sep 2019 08:09:41 -0700 Subject: [PATCH 084/109] remove LocalFoo from __init__.py --- dask_jobqueue/__init__.py | 1 - dask_jobqueue/tests/test_job.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index f366bc46..66168ac6 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,6 +1,5 @@ # flake8: noqa from . 
import config -from .local import LocalJob, LocalCluster from .job import Job, JobQueueCluster from .moab import MoabCluster, MoabJob from .pbs import PBSCluster, PBSJob diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 2285ca31..d32140b1 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -10,8 +10,6 @@ SLURMCluster, LSFJob, LSFCluster, - LocalJob, - LocalCluster, HTCondorJob, HTCondorCluster, MoabJob, @@ -19,6 +17,10 @@ OARJob, OARCluster, ) +from dask_jobqueue.local import ( + LocalJob, + LocalCluster, +) from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client From fa629af2d3227bc4bc02a49bf1c6c3dfcd7b1268 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 12 Sep 2019 08:29:56 -0700 Subject: [PATCH 085/109] Remove Job classes from __init__.py --- dask_jobqueue/__init__.py | 16 ++++++++-------- dask_jobqueue/tests/test_job.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 66168ac6..4d48b0f1 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,13 +1,13 @@ # flake8: noqa from . 
import config -from .job import Job, JobQueueCluster -from .moab import MoabCluster, MoabJob -from .pbs import PBSCluster, PBSJob -from .slurm import SLURMCluster, SLURMJob -from .sge import SGECluster, SGEJob -from .lsf import LSFCluster, LSFJob -from .oar import OARCluster, OARJob -from .htcondor import HTCondorCluster, HTCondorJob +from .job import JobQueueCluster +from .moab import MoabCluster +from .pbs import PBSCluster +from .slurm import SLURMCluster +from .sge import SGECluster +from .lsf import LSFCluster +from .oar import OARCluster +from .htcondor import HTCondorCluster from ._version import get_versions diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index d32140b1..46f54f6a 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -2,26 +2,27 @@ from time import time from dask_jobqueue import ( - PBSJob, PBSCluster, - SGEJob, SGECluster, - SLURMJob, SLURMCluster, - LSFJob, LSFCluster, - HTCondorJob, HTCondorCluster, - MoabJob, MoabCluster, - OARJob, OARCluster, ) from dask_jobqueue.local import ( LocalJob, LocalCluster, ) -from dask_jobqueue.job import JobQueueCluster +from dask_jobqueue.pbs import PBSJob +from dask_jobqueue.sge import SGEJob +from dask_jobqueue.slurm import SLURMJob +from dask_jobqueue.lsf import LSFJob +from dask_jobqueue.moab import MoabJob +from dask_jobqueue.htcondor import HTCondorJob +from dask_jobqueue.oar import OARJob + +from dask_jobqueue.job import Job, JobQueueCluster from dask.distributed import Scheduler, Client import pytest From bece3e459a8ce5e8053731e4ea0caf8bf6752611 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 12 Sep 2019 08:31:05 -0700 Subject: [PATCH 086/109] lint --- dask_jobqueue/tests/test_job.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 46f54f6a..4c776ace 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py 
@@ -10,10 +10,7 @@ MoabCluster, OARCluster, ) -from dask_jobqueue.local import ( - LocalJob, - LocalCluster, -) +from dask_jobqueue.local import LocalJob, LocalCluster from dask_jobqueue.pbs import PBSJob from dask_jobqueue.sge import SGEJob from dask_jobqueue.slurm import SLURMJob @@ -22,7 +19,7 @@ from dask_jobqueue.htcondor import HTCondorJob from dask_jobqueue.oar import OARJob -from dask_jobqueue.job import Job, JobQueueCluster +from dask_jobqueue.job import JobQueueCluster from dask.distributed import Scheduler, Client import pytest From eef73344d637ac58f79806d20c6e26593348d61b Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 12 Sep 2019 13:04:19 -0700 Subject: [PATCH 087/109] Raise error in LocalJob if failure This also changes _submit_job to an async function, and uses an async subprocess module from tornado --- dask_jobqueue/job.py | 4 ++-- dask_jobqueue/local.py | 23 ++++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index bf72c7da..8c20c9ef 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -262,7 +262,7 @@ def job_file(self): f.write(self.job_script()) yield fn - def _submit_job(self, script_filename): + async def _submit_job(self, script_filename): # Should we make this async friendly? 
return self._call(shlex.split(self.submit_command) + [script_filename]) @@ -281,7 +281,7 @@ async def start(self): logger.debug("Starting worker: %s", self.name) with self.job_file() as fn: - out = self._submit_job(fn) + out = await self._submit_job(fn) job_id = self._job_id_from_submit_output(out) # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError if not job_id: diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 077aeae8..8832c088 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -1,6 +1,6 @@ import logging import os -import subprocess +from tornado.process import Subprocess from .job import Job, JobQueueCluster, job_parameters, cluster_parameters @@ -39,22 +39,27 @@ def __init__( super().__init__(*args, config_name=config_name, shebang="", **kwargs) # Declare class attribute that shall be overridden - header_lines = [] - self.job_header = "\n".join(header_lines) + self.job_header = "" logger.debug("Job script: \n %s" % self.job_script()) - def _submit_job(self, script_filename): + async def _submit_job(self, script_filename): # Should we make this async friendly? with open(script_filename) as f: text = f.read().strip().split() - self.process = subprocess.Popen( - text, stdout=subprocess.PIPE, stderr=subprocess.PIPE + self.process = Subprocess( + text, stdout=Subprocess.STREAM, stderr=Subprocess.STREAM ) - # TODO this should raise if self.process.returncode != 0. 
Refactor - # Job._call to be able to return process (so that we can return self.process.pid below) - self.process.stderr.readline() # make sure that we start + lines = [] + while True: + line = await self.process.stderr.read_until(b'\n') # make sure that we start + lines.append(line.decode()) + if b"Registered to:" in line: + break + if b"error" in line.lower(): + raise Exception("Worker failed\n\n" + "".join(lines)) + return str(self.process.pid) @classmethod From 7d13e6ba58321aa17bcb142f8948be1ef58d76de Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 12 Sep 2019 18:43:10 -0700 Subject: [PATCH 088/109] Remove some TODOs --- dask_jobqueue/job.py | 6 +----- dask_jobqueue/local.py | 4 +++- dask_jobqueue/tests/test_pbs.py | 16 +--------------- dask_jobqueue/tests/test_sge.py | 3 --- 4 files changed, 5 insertions(+), 24 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 8c20c9ef..44a3d434 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -323,7 +323,7 @@ def _close_job(cls, job_id): if job_id: with ignoring(RuntimeError): # deleting job when job already gone cls._call(shlex.split(cls.cancel_command) + [job_id]) - # TODO: Maybe a log.debug here + logger.debug("Closed job %s", job_id) @staticmethod def _call(cmd, **kwargs): @@ -393,7 +393,6 @@ class JobQueueCluster(SpecCluster): cluster_parameters=cluster_parameters ) - # TODO: I have a slight preference for a parameter like job_cls job_cls = None def __init__( @@ -449,9 +448,6 @@ def __init__( if "processes" in kwargs and kwargs["processes"] > 1: worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] - # TODO: this seems like this sets self.scheduler.address, is there a - # less magical way of doing the same thing? 
- # self.example_job is also used for cluster.job_script() self.example_job # trigger property to ensure that the job is valid super().__init__( diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 8832c088..66003b09 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -53,7 +53,9 @@ async def _submit_job(self, script_filename): lines = [] while True: - line = await self.process.stderr.read_until(b'\n') # make sure that we start + line = await self.process.stderr.read_until( + b"\n" + ) # make sure that we start lines.append(line.decode()) if b"Registered to:" in line: break diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 815580f7..3de55096 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -295,10 +295,6 @@ def test_scale_grouped(loop): cluster.scale(4) # Start 2 jobs start = time() - # TODO: Is there a replacement to check for number of jobs (rather than workers) - # while len(cluster.running_jobs) != 2: - # sleep(0.100) - # assert time() < start + QUEUE_WAIT while len(list(client.scheduler_info()["workers"].values())) != 4: sleep(0.100) @@ -317,13 +313,6 @@ def test_scale_grouped(loop): cluster.scale(1) # Should leave 2 workers, 1 job start = time() - # TODO - # while len(cluster.running_jobs) != 1: - # sleep(0.100) - # assert time() < start + QUEUE_WAIT - - # assert len(cluster.running_jobs) == 1 - # workers = list(client.scheduler_info()["workers"].values()) while len(client.scheduler_info()["workers"]) != 2: sleep(0.100) assert time() < start + QUEUE_WAIT @@ -331,11 +320,8 @@ def test_scale_grouped(loop): cluster.scale(0) start = time() - # while cluster.running_jobs: - # sleep(0.100) - # assert time() < start + QUEUE_WAIT - # assert not cluster.running_jobs + assert not cluster.worker_spec while len(client.scheduler_info()["workers"]) != 0: sleep(0.100) assert time() < start + QUEUE_WAIT diff --git a/dask_jobqueue/tests/test_sge.py 
b/dask_jobqueue/tests/test_sge.py index 980de436..6020dabb 100644 --- a/dask_jobqueue/tests/test_sge.py +++ b/dask_jobqueue/tests/test_sge.py @@ -118,9 +118,6 @@ def test_complex_cancel_command(loop): sleep(0.100) assert time() < start + QUEUE_WAIT - # TODO: Is there a replacement for .stop_all_jobs? stop_all_jobs - # does make sure that the pending jobs get qdeled. - # cluster.stop_all_jobs() cluster.scale(0) start = time() From a1d1343bdc1745f36383c070f391cf678dbd34a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Sep 2019 15:57:55 +0200 Subject: [PATCH 089/109] Remove None check for job_id. --- dask_jobqueue/job.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 44a3d434..61360c06 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -282,13 +282,9 @@ async def start(self): with self.job_file() as fn: out = await self._submit_job(fn) - job_id = self._job_id_from_submit_output(out) - # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError - if not job_id: - raise ValueError("Unable to parse job id from output of %s" % out) - self.job_id = job_id + self.job_id = self._job_id_from_submit_output(out) - weakref.finalize(self, self._close_job, job_id) + weakref.finalize(self, self._close_job, self.job_id) logger.debug("Starting job: %s", self.job_id) await super().start() From e1f2e20bcc5a6e4f0d171dfb87dca7a7e41fa8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Sep 2019 16:58:00 +0200 Subject: [PATCH 090/109] Some tweaks. * Make Job.__init__ abstractmethod rather than relying on config_name None check * rename Job -> job_cls in a few remaining places * put back xfail test. 
--- dask_jobqueue/job.py | 31 ++++++++++++----------- dask_jobqueue/tests/test_job.py | 20 +++++++-------- dask_jobqueue/tests/test_jobqueue_core.py | 19 ++++++-------- 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 61360c06..226e9c6b 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -6,6 +6,7 @@ import subprocess import sys import weakref +import abc import dask from dask.utils import ignoring @@ -69,7 +70,7 @@ """.strip() -class Job(ProcessInterface): +class Job(ProcessInterface, abc.ABC): """ Base class to launch Dask workers on Job queues This class should not be used directly, use a class appropriate for @@ -111,8 +112,10 @@ class Job(ProcessInterface): # Following class attributes should be overridden by extending classes. submit_command = None cancel_command = None + config_name = None job_id_regexp = r"(?P\d+)" + @abc.abstractmethod def __init__( self, scheduler=None, @@ -134,22 +137,20 @@ def __init__( config_name=None, **kwargs ): - # """ - # This initializer should be considered as Abstract, and never used directly. - # """ self.scheduler = scheduler self.job_id = None super().__init__() - if config_name is None: - config_name = getattr(type(self), "config_name", None) - # TODO I think the __init__ should be an abstractmethod rather than relying on config_name ... if config_name is None: - raise NotImplementedError( - "Job is an abstract class that should not be instantiated." - "Use a cluster class appropriate to your job queueing system, " - "e.g. PBSCluster or SLURMCluster" + config_name = getattr(type(self), "config_name") + if config_name is None: + raise ValueError( + "Looks like you are trying to create a class that inherits from dask_jobqueue.job.Job. 
" + "If that is the case, you need to:\n" + "- set the 'config_name' class variable to a non-None value\n" + "- create a section in jobqueue.yaml with the value of 'config_name'\n" + "If that is not the case, please open an issue in https://github.com/dask/dask-jobqueue/issues." ) if job_name is None: @@ -394,7 +395,7 @@ class JobQueueCluster(SpecCluster): def __init__( self, n_workers=0, - Job: Job = None, + job_cls: Job = None, # Cluster keywords loop=None, security=None, @@ -411,13 +412,13 @@ def __init__( **kwargs ): self.status = "created" - if Job is not None: - self.job_cls = Job + if job_cls is not None: + self.job_cls = job_cls if self.job_cls is None: raise ValueError( "You must provide a Job type like PBSJob, SLURMJob, " - "or SGEJob with the Job= argument." + "or SGEJob with the job_cls= argument." ) if config_name: diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 4c776ace..894006b9 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -51,11 +51,11 @@ def test_basic(): ] -@pytest.mark.parametrize("Job", job_protected) +@pytest.mark.parametrize("job_cls", job_protected) @pytest.mark.asyncio -async def test_job(Job): +async def test_job(job_cls): async with Scheduler(port=0) as s: - job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB") + job = job_cls(scheduler=s.address, name="foo", cores=1, memory="1GB") job = await job async with Client(s.address, asynchronous=True) as client: await client.wait_for_workers(1) @@ -69,18 +69,18 @@ async def test_job(Job): assert time() < start + 10 -@pytest.mark.parametrize("Job", job_protected) +@pytest.mark.parametrize("job_cls", job_protected) @pytest.mark.asyncio -async def test_cluster(Job): +async def test_cluster(job_cls): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" + 1, cores=1, memory="1GB", job_cls=job_cls, asynchronous=True, name="foo" ) as cluster: async with Client(cluster, 
asynchronous=True) as client: assert len(cluster.workers) == 1 cluster.scale(2) await cluster assert len(cluster.workers) == 2 - assert all(isinstance(w, Job) for w in cluster.workers.values()) + assert all(isinstance(w, job_cls) for w in cluster.workers.values()) assert all(w.status == "running" for w in cluster.workers.values()) await client.wait_for_workers(2) @@ -92,11 +92,11 @@ async def test_cluster(Job): assert time() < start + 10 -@pytest.mark.parametrize("Job", job_protected) +@pytest.mark.parametrize("job_cls", job_protected) @pytest.mark.asyncio -async def test_adapt(Job): +async def test_adapt(job_cls): async with JobQueueCluster( - 1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo" + 1, cores=1, memory="1GB", job_cls=job_cls, asynchronous=True, name="foo" ) as cluster: async with Client(cluster, asynchronous=True) as client: await client.wait_for_workers(1) diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index de812234..a376e08e 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -19,13 +19,10 @@ from dask_jobqueue.sge import SGEJob -@pytest.mark.xfail def test_errors(): - with pytest.raises(NotImplementedError) as info: + with pytest.raises(ValueError, match="Job type.*job_cls="): JobQueueCluster(cores=4) - assert "abstract class" in str(info.value) - def test_command_template(): with PBSCluster(cores=2, memory="4GB") as cluster: @@ -111,7 +108,7 @@ def test_job_id_from_qsub_legacy(Cluster, qsub_return_string): assert original_job_id == cluster._job_id_from_submit_output(qsub_return_string) -@pytest.mark.parametrize("Job", [SGEJob]) +@pytest.mark.parametrize("job_cls", [SGEJob]) @pytest.mark.parametrize( "qsub_return_string", [ @@ -123,10 +120,10 @@ def test_job_id_from_qsub_legacy(Cluster, qsub_return_string): "{job_id}", ], ) -def test_job_id_from_qsub(Job, qsub_return_string): +def test_job_id_from_qsub(job_cls, 
qsub_return_string): original_job_id = "654321" qsub_return_string = qsub_return_string.format(job_id=original_job_id) - job = Job(cores=1, memory="1GB") + job = job_cls(cores=1, memory="1GB") assert original_job_id == job._job_id_from_submit_output(qsub_return_string) @@ -146,16 +143,16 @@ def test_job_id_error_handling_legacy(Cluster): cluster._job_id_from_submit_output(return_string) -@pytest.mark.parametrize("Job", [SGEJob]) -def test_job_id_error_handling(Job): +@pytest.mark.parametrize("job_cls", [SGEJob]) +def test_job_id_error_handling(job_cls): # non-matching regexp - job = Job(cores=1, memory="1GB") + job = job_cls(cores=1, memory="1GB") with pytest.raises(ValueError, match="Could not parse job id"): return_string = "there is no number here" job._job_id_from_submit_output(return_string) # no job_id named group in the regexp - job = Job(cores=1, memory="1GB") + job = job_cls(cores=1, memory="1GB") with pytest.raises(ValueError, match="You need to use a 'job_id' named group"): return_string = "Job <12345> submitted to ." job.job_id_regexp = r"(\d+)" From ccfe946523a8cf92b316f84576aad8bf7210fe0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Sep 2019 17:19:24 +0200 Subject: [PATCH 091/109] Fine to have header_skip in the PR. --- dask_jobqueue/job.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py index 226e9c6b..92bf7b12 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/job.py @@ -207,7 +207,6 @@ def __init__( self.shebang = shebang self._env_header = "\n".join(filter(None, env_extra)) - # TODO: should skip be part of this PR? 
self.header_skip = set(header_skip) # dask-worker command line build From ddb692f25226952d82cf835d32b8629d5191da01 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 13 Sep 2019 14:51:52 -0700 Subject: [PATCH 092/109] Move job.py to core.py --- dask_jobqueue/__init__.py | 2 +- dask_jobqueue/{job.py => core.py} | 2 +- dask_jobqueue/htcondor.py | 2 +- dask_jobqueue/local.py | 2 +- dask_jobqueue/lsf.py | 2 +- dask_jobqueue/oar.py | 2 +- dask_jobqueue/pbs.py | 2 +- dask_jobqueue/sge.py | 2 +- dask_jobqueue/slurm.py | 2 +- dask_jobqueue/tests/test_job.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) rename dask_jobqueue/{job.py => core.py} (99%) diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py index 4d48b0f1..4cd6a49e 100644 --- a/dask_jobqueue/__init__.py +++ b/dask_jobqueue/__init__.py @@ -1,6 +1,6 @@ # flake8: noqa from . import config -from .job import JobQueueCluster +from .core import JobQueueCluster from .moab import MoabCluster from .pbs import PBSCluster from .slurm import SLURMCluster diff --git a/dask_jobqueue/job.py b/dask_jobqueue/core.py similarity index 99% rename from dask_jobqueue/job.py rename to dask_jobqueue/core.py index 92bf7b12..dde42ce4 100644 --- a/dask_jobqueue/job.py +++ b/dask_jobqueue/core.py @@ -146,7 +146,7 @@ def __init__( config_name = getattr(type(self), "config_name") if config_name is None: raise ValueError( - "Looks like you are trying to create a class that inherits from dask_jobqueue.job.Job. " + "Looks like you are trying to create a class that inherits from dask_jobqueue.core.Job. 
" "If that is the case, you need to:\n" "- set the 'config_name' class variable to a non-None value\n" "- create a section in jobqueue.yaml with the value of 'config_name'\n" diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 2402a2a1..7becf983 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -5,7 +5,7 @@ import dask from distributed.utils import parse_bytes -from .job import JobQueueCluster, Job, job_parameters, cluster_parameters +from .core import JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index 66003b09..a0485d2b 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -2,7 +2,7 @@ import os from tornado.process import Subprocess -from .job import Job, JobQueueCluster, job_parameters, cluster_parameters +from .core import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index 669dcedb..a5b01495 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -4,7 +4,7 @@ import dask -from .job import Job, JobQueueCluster, job_parameters, cluster_parameters +from .core import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 285d41a9..91d9da1a 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -3,7 +3,7 @@ import dask -from .job import JobQueueCluster, Job, job_parameters, cluster_parameters +from .core import JobQueueCluster, Job, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index ae670e8e..b975957b 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -4,7 +4,7 @@ import dask -from .job import Job, JobQueueCluster, job_parameters, cluster_parameters +from .core import Job, 
JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index e61524f8..71745616 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -2,7 +2,7 @@ import dask -from .job import Job, JobQueueCluster, job_parameters, cluster_parameters +from .core import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index d92bc094..8d80cc6a 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -3,7 +3,7 @@ import dask -from .job import Job, JobQueueCluster, job_parameters, cluster_parameters +from .core import Job, JobQueueCluster, job_parameters, cluster_parameters logger = logging.getLogger(__name__) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 894006b9..f050f7d8 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -19,7 +19,7 @@ from dask_jobqueue.htcondor import HTCondorJob from dask_jobqueue.oar import OARJob -from dask_jobqueue.job import JobQueueCluster +from dask_jobqueue.core import JobQueueCluster from dask.distributed import Scheduler, Client import pytest From 21965e2638dedf0236d885a04b9fed52976b0923 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 13 Sep 2019 15:17:36 -0700 Subject: [PATCH 093/109] Add into example_job address --- dask_jobqueue/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index dde42ce4..e17592d0 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -463,9 +463,11 @@ def example_job(self): try: address = self.scheduler.address except AttributeError: - address = "tcp://scheduler:8786" + address = "tcp://:8786" return self.job_cls( - address or "tcp://scheduler:8786", name="name", **self._kwargs + address or "tcp://:8786", + name="name", + **self._kwargs ) 
@property From 6f6e3d261c5e5e90e0263bd2f43f203ae7adc398 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Mon, 16 Sep 2019 16:50:26 -0700 Subject: [PATCH 094/109] add changelog entry --- docs/source/changelog.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 97ddc4a2..2a08c442 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,29 @@ Changelog ========= +0.7.0 / 2019-XX-XX +------------------ + +- Base Dask-Jobqueue on top of the core ``dask.distributed.SpecCluster`` class + (:pr:`307`) + + This is nearly complete reimplementation of the dask-jobqueue logic on top + of more centralized logic. This improves standardization and adds new + features, but does include the following **breaking changes**: + + - The scale method now refers to the number of jobs rather than the + number of workers. Previously if each job launched two workers then + ``cluster.scale(4)`` would launch two jobs for a total of four workers. + Now it launches four jobs for a total of eight workers. + - The ``cluster.stop_all_jobs()`` method has been removed. + Please use ``cluster.scale(0)`` instead. + - The attributes ``running_jobs``, ``pending_jobs``, and + ``cancelled_jobs`` have been removed. These have been moved upstream to + the ``dask.distributed.SpecCluster`` class instead as ``workers`` and + ``worker_spec``, as well as ``.plan``, ``.requested``, and ``.observed``. + - The ``name`` attribute has been moved to ``job_name``. 
+ + 0.6.3 / 2019-08-18 ------------------ From 0b21d289b8edec64dc27e60b0b1474a37abeed96 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 17 Sep 2019 09:34:29 -0700 Subject: [PATCH 095/109] example_job -> _dummy_job --- dask_jobqueue/core.py | 19 +++++++++++++------ dask_jobqueue/tests/test_htcondor.py | 6 +++--- dask_jobqueue/tests/test_jobqueue_core.py | 14 +++++++------- dask_jobqueue/tests/test_pbs.py | 6 +++--- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index e17592d0..33725175 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -444,7 +444,7 @@ def __init__( if "processes" in kwargs and kwargs["processes"] > 1: worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])] - self.example_job # trigger property to ensure that the job is valid + self._dummy_job # trigger property to ensure that the job is valid super().__init__( scheduler=scheduler, @@ -459,9 +459,16 @@ def __init__( self.scale(n_workers) @property - def example_job(self): + def _dummy_job(self): + """ + Creates a Job similar to what we will use in practice + + This is used for backwards functionality and a variety of convenience + functions. It is also used on construction to raise errors if any of + the keywords are improper. + """ try: - address = self.scheduler.address + address = self.scheduler.address # Have we already connected? 
except AttributeError: address = "tcp://:8786" return self.job_cls( @@ -472,11 +479,11 @@ def example_job(self): @property def job_header(self): - return self.example_job.job_header + return self._dummy_job.job_header def job_script(self): - return self.example_job.job_script() + return self._dummy_job.job_script() @property def job_name(self): - return self.example_job.job_name + return self._dummy_job.job_name diff --git a/dask_jobqueue/tests/test_htcondor.py b/dask_jobqueue/tests/test_htcondor.py index 024473ad..cd9349f0 100644 --- a/dask_jobqueue/tests/test_htcondor.py +++ b/dask_jobqueue/tests/test_htcondor.py @@ -13,9 +13,9 @@ def test_header(): with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster: - assert cluster.example_job.job_header_dict["MY.DaskWorkerCores"] == 1 - assert cluster.example_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000 - assert cluster.example_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000 + assert cluster._dummy_job.job_header_dict["MY.DaskWorkerCores"] == 1 + assert cluster._dummy_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000 + assert cluster._dummy_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000 def test_job_script(): diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py index a376e08e..c4acd785 100644 --- a/dask_jobqueue/tests/test_jobqueue_core.py +++ b/dask_jobqueue/tests/test_jobqueue_core.py @@ -28,11 +28,11 @@ def test_command_template(): with PBSCluster(cores=2, memory="4GB") as cluster: assert ( "%s -m distributed.cli.dask_worker" % (sys.executable) - in cluster.example_job._command_template + in cluster._dummy_job._command_template ) - assert " --nthreads 2" in cluster.example_job._command_template - assert " --memory-limit " in cluster.example_job._command_template - assert " --name " in cluster.example_job._command_template + assert " --nthreads 2" in cluster._dummy_job._command_template + assert " --memory-limit " in 
cluster._dummy_job._command_template + assert " --name " in cluster._dummy_job._command_template with PBSCluster( cores=2, @@ -41,9 +41,9 @@ def test_command_template(): local_directory="/scratch", extra=["--preload", "mymodule"], ) as cluster: - assert " --death-timeout 60" in cluster.example_job._command_template - assert " --local-directory /scratch" in cluster.example_job._command_template - assert " --preload mymodule" in cluster.example_job._command_template + assert " --death-timeout 60" in cluster._dummy_job._command_template + assert " --local-directory /scratch" in cluster._dummy_job._command_template + assert " --preload mymodule" in cluster._dummy_job._command_template @pytest.mark.parametrize( diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index 3de55096..d00dd008 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -210,7 +210,7 @@ def test_adaptive(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.example_job.worker_processes + processes = cluster._dummy_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT @@ -242,7 +242,7 @@ def test_adaptive_grouped(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.example_job.worker_processes + processes = cluster._dummy_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT @@ -266,7 +266,7 @@ def test_adaptive_cores_mem(loop): assert future.result(QUEUE_WAIT) == 11 start = time() - processes = cluster.example_job.worker_processes + processes = cluster._dummy_job.worker_processes while len(client.scheduler_info()["workers"]) != processes: sleep(0.1) assert time() < start + QUEUE_WAIT From b9c24c003b881b5dbdf6e290299c8d78ed522e13 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Tue, 17 Sep 2019 12:40:48 -0700 Subject: [PATCH 096/109] Add 
scale/adapt memory/cores to docs --- docs/source/configuration.rst | 14 +++++++++++--- docs/source/howitworks.rst | 7 ++++++- docs/source/index.rst | 5 +++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index e52b75d8..2ec86e6c 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -30,12 +30,20 @@ define a single job: Note that the ``cores`` and ``memory`` keywords above correspond not to your full desired deployment, but rather to the size of a *single job* which should -be no larger than the size of a single machine in your cluster. Separately you -will specify how many jobs to deploy using the scale method. +be no larger than the size of a single machine in your cluster. + +Separately you will specify how many jobs to deploy using the scale method. +You can either specify the number of jobs, or the total number of cores or +memory that you want. .. code-block:: python - cluster.scale(12) # launch 12 workers (2 jobs of 6 workers each) of the specification provided above + cluster.scale(2) # launch 2 jobs, each of which starts 6 worker processes + cluster.scale(cores=48) # Or specify cores or memory directly + cluster.scale(memory="200 GB") # Or specify cores or memory directly + +These all accomplish the same thing. You can chose whichever makes the most +sense to you. Configuration Files diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index 00c26b3a..12a1dd78 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -24,10 +24,15 @@ object is instantiated: ) You then ask for more workers using the ``scale`` command: +You can either specify the number of jobs, or the total number of cores or +memory that you want. .. 
code-block:: python - cluster.scale(36) + + cluster.scale(2) # launch 2 jobs, each of which starts 6 worker processes + cluster.scale(cores=48) # Or specify cores or memory directly + cluster.scale(memory="200 GB") # Or specify cores or memory directly The cluster generates a traditional job script and submits that an appropriate number of times to the job queue. You can see the job script that it will diff --git a/docs/source/index.rst b/docs/source/index.rst index 3d0eec19..9392a8d8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,7 +19,7 @@ Example from dask_jobqueue import PBSCluster cluster = PBSCluster() - cluster.scale(10) # Ask for ten workers + cluster.scale(10) # Deploy ten single-node jobs from dask.distributed import Client client = Client(cluster) # Connect this local process to remote workers @@ -45,7 +45,8 @@ save resources when not actively computing. .. code-block:: python - cluster.adapt(minimum=6, maximum=90) # auto-scale between 6 and 90 workers + cluster.adapt(minimum=6, maximum=90) # auto-scale between 6 and 90 jobs + cluster.adapt(maximum_memory="10 TB") # or use core/memory limits More details ------------ From 79d108b53174cbd8ee2fc965108ddeedd6e8832c Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:08:57 -0500 Subject: [PATCH 097/109] specify jobs= keyword in scale --- docs/source/configuration.rst | 4 ++-- docs/source/howitworks.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 2ec86e6c..eb867c64 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -33,12 +33,12 @@ full desired deployment, but rather to the size of a *single job* which should be no larger than the size of a single machine in your cluster. Separately you will specify how many jobs to deploy using the scale method. 
-You can either specify the number of jobs, or the total number of cores or +You can either specify the number of workers, or the total number of cores or memory that you want. .. code-block:: python - cluster.scale(2) # launch 2 jobs, each of which starts 6 worker processes + cluster.scale(jobs=2) # launch 2 workers, each of which starts 6 worker processes cluster.scale(cores=48) # Or specify cores or memory directly cluster.scale(memory="200 GB") # Or specify cores or memory directly diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index 12a1dd78..6fc7978f 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -30,7 +30,7 @@ memory that you want. .. code-block:: python - cluster.scale(2) # launch 2 jobs, each of which starts 6 worker processes + cluster.scale(jobs=2) # launch 2 jobs, each of which starts 6 worker processes cluster.scale(cores=48) # Or specify cores or memory directly cluster.scale(memory="200 GB") # Or specify cores or memory directly From 7809511b3071dd33d43fd349f9c1a75a74808d9c Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:15:34 -0500 Subject: [PATCH 098/109] Support jobs= keyword in scale --- dask_jobqueue/core.py | 7 +++++++ dask_jobqueue/tests/test_job.py | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index 33725175..d6b1f4c0 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -1,5 +1,6 @@ from contextlib import contextmanager import logging +import math import os import re import shlex @@ -487,3 +488,9 @@ def job_script(self): @property def job_name(self): return self._dummy_job.job_name + + def scale(self, n=None, jobs=0, memory=None, cores=None): + if n is not None: + jobs = int(math.ceil(n / self._dummy_job.worker_processes)) + + return super().scale(jobs, memory=memory, cores=cores) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 
f050f7d8..aa2782c0 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -77,7 +77,7 @@ async def test_cluster(job_cls): ) as cluster: async with Client(cluster, asynchronous=True) as client: assert len(cluster.workers) == 1 - cluster.scale(2) + cluster.scale(jobs=2) await cluster assert len(cluster.workers) == 2 assert all(isinstance(w, job_cls) for w in cluster.workers.values()) @@ -131,7 +131,7 @@ def test_header_lines_skip(): @pytest.mark.asyncio -async def test_nprocs(): +async def test_nprocs_scale(): async with LocalCluster( cores=2, memory="4GB", processes=2, asynchronous=True ) as cluster: @@ -149,6 +149,13 @@ async def test_nprocs(): await asyncio.sleep(0.2) assert len(cluster.scheduler.workers) == 2 # they're still one group + cluster.scale(jobs=2) + assert len(cluster.worker_spec) == 2 + cluster.scale(5) + assert len(cluster.worker_spec) == 3 + cluster.scale(1) + assert len(cluster.worker_spec) == 1 + @pytest.mark.parametrize("Cluster", all_clusters) def test_docstring_cluster(Cluster): From 4a0c62670e22143baf06b1ed8dc2e1da6897d5c6 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:24:17 -0500 Subject: [PATCH 099/109] Add minimum/maximum_jobs to adapt --- dask_jobqueue/core.py | 13 +++++++++++++ dask_jobqueue/tests/test_job.py | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index d6b1f4c0..6a56300a 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -494,3 +494,16 @@ def scale(self, n=None, jobs=0, memory=None, cores=None): jobs = int(math.ceil(n / self._dummy_job.worker_processes)) return super().scale(jobs, memory=memory, cores=cores) + + def adapt( + self, + *args, + minimum_jobs: int = None, + maximum_jobs: int = None, + **kwargs + ): + if minimum_jobs is not None: + kwargs["minimum"] = minimum_jobs * self._dummy_job.worker_processes + if maximum_jobs is not None: + kwargs["maximum"] = maximum_jobs * 
self._dummy_job.worker_processes + return super().adapt(*args, **kwargs) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index aa2782c0..213e2dc5 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -122,6 +122,22 @@ async def test_adapt(job_cls): assert not cluster.workers +@pytest.mark.parametrize("job_cls", job_protected) +@pytest.mark.asyncio +async def test_adapt_parameters(job_cls): + async with JobQueueCluster( + cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms") + await adapt.adapt() + assert len(cluster.worker_spec) == 1 # 2 workers, 4 jobs + + adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms") + await adapt.adapt() + assert len(cluster.worker_spec) == 2 # 2 workers, 4 jobs + + def test_header_lines_skip(): job = PBSJob(cores=1, memory="1GB", job_name="foobar") assert "foobar" in job.job_script() From 289915f493971c44616c23297d862a90ea4ea255 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:24:58 -0500 Subject: [PATCH 100/109] lint --- dask_jobqueue/core.py | 6 +----- dask_jobqueue/tests/test_job.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py index 6a56300a..888e8e9d 100644 --- a/dask_jobqueue/core.py +++ b/dask_jobqueue/core.py @@ -496,11 +496,7 @@ def scale(self, n=None, jobs=0, memory=None, cores=None): return super().scale(jobs, memory=memory, cores=cores) def adapt( - self, - *args, - minimum_jobs: int = None, - maximum_jobs: int = None, - **kwargs + self, *args, minimum_jobs: int = None, maximum_jobs: int = None, **kwargs ): if minimum_jobs is not None: kwargs["minimum"] = minimum_jobs * self._dummy_job.worker_processes diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 
213e2dc5..4e839ef1 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -126,16 +126,15 @@ async def test_adapt(job_cls): @pytest.mark.asyncio async def test_adapt_parameters(job_cls): async with JobQueueCluster( - cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True, + cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True ) as cluster: - async with Client(cluster, asynchronous=True) as client: - adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms") - await adapt.adapt() - assert len(cluster.worker_spec) == 1 # 2 workers, 4 jobs + adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms") + await adapt.adapt() + assert len(cluster.worker_spec) == 1 # 2 workers, 4 jobs - adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms") - await adapt.adapt() - assert len(cluster.worker_spec) == 2 # 2 workers, 4 jobs + adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms") + await adapt.adapt() + assert len(cluster.worker_spec) == 2 # 2 workers, 4 jobs def test_header_lines_skip(): From bf4840bdee35829df107e99d1c591a34fa7067a7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:29:48 -0500 Subject: [PATCH 101/109] unxfail adaptive cores/memory pbs test --- dask_jobqueue/tests/test_pbs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py index d00dd008..2e92162c 100644 --- a/dask_jobqueue/tests/test_pbs.py +++ b/dask_jobqueue/tests/test_pbs.py @@ -248,7 +248,6 @@ def test_adaptive_grouped(loop): assert time() < start + QUEUE_WAIT -@pytest.mark.xfail(reason="adapt doesn't yet have cores/memory") @pytest.mark.env("pbs") def test_adaptive_cores_mem(loop): with PBSCluster( From 18cd5cd670d2354e7c1317af659452bcd8b73d32 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:38:51 -0500 Subject: [PATCH 102/109] Remove startup_cost keyword from HTCondor docstring This has been 
removed upstream --- dask_jobqueue/htcondor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index 7becf983..c61e91cd 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -218,9 +218,8 @@ class HTCondorCluster(JobQueueCluster): >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. - HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly. - >>> cluster.adapt(minimum=5, startup_cost='60s') + >>> cluster.adapt(minimum=5) """.format( job=job_parameters, cluster=cluster_parameters ) From e2b7cb5a8ef9cc2a2741c79c903cb23f50015e9b Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 08:52:49 -0500 Subject: [PATCH 103/109] await clusters before closing --- dask_jobqueue/tests/test_job.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 4e839ef1..71f0352f 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -130,11 +130,13 @@ async def test_adapt_parameters(job_cls): ) as cluster: adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms") await adapt.adapt() - assert len(cluster.worker_spec) == 1 # 2 workers, 4 jobs + await cluster + assert len(cluster.workers) == 1 # 2 workers, 4 jobs adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms") await adapt.adapt() - assert len(cluster.worker_spec) == 2 # 2 workers, 4 jobs + await cluster + assert len(cluster.workers) == 2 # 2 workers, 4 jobs def test_header_lines_skip(): From dd2db2df6a3f6275ff195a45af6109613e7ed44d Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 09:12:49 -0500 Subject: [PATCH 104/109] reduce core count in test SLURM was constrained --- dask_jobqueue/tests/test_job.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py index 71f0352f..71f26f09 100644 --- a/dask_jobqueue/tests/test_job.py +++ b/dask_jobqueue/tests/test_job.py @@ -126,7 +126,7 @@ async def test_adapt(job_cls): @pytest.mark.asyncio async def test_adapt_parameters(job_cls): async with JobQueueCluster( - cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True + cores=2, memory="1GB", processes=2, job_cls=job_cls, asynchronous=True ) as cluster: adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms") await adapt.adapt() From 9b11c652027865e1cbeffb857e78da077304c0ee Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 09:23:41 -0500 Subject: [PATCH 105/109] update docstrings to use the jobs= parameter --- dask_jobqueue/htcondor.py | 4 ++-- dask_jobqueue/local.py | 2 +- dask_jobqueue/lsf.py | 4 ++-- dask_jobqueue/oar.py | 4 ++-- dask_jobqueue/pbs.py | 4 ++-- dask_jobqueue/sge.py | 4 ++-- dask_jobqueue/slurm.py | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py index c61e91cd..6b3769ed 100644 --- a/dask_jobqueue/htcondor.py +++ b/dask_jobqueue/htcondor.py @@ -212,14 +212,14 @@ class HTCondorCluster(JobQueueCluster): -------- >>> from dask_jobqueue.htcondor import HTCondorCluster >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB") - >>> cluster.scale(10) + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. 
- >>> cluster.adapt(minimum=5) + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py index a0485d2b..3ac2fd12 100644 --- a/dask_jobqueue/local.py +++ b/dask_jobqueue/local.py @@ -88,7 +88,7 @@ class LocalCluster(JobQueueCluster): -------- >>> from dask_jobqueue import LocalCluster >>> cluster = LocalCluster(cores=2, memory="4 GB") - >>> cluster.scale(3) + >>> cluster.scale(jobs=3) # ask for 3 jobs See Also -------- diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index a5b01495..d1cbe53e 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -182,7 +182,7 @@ class LSFCluster(JobQueueCluster): >>> from dask_jobqueue import LSFCluster >>> cluster = LSFCluster(queue='general', project='DaskonLSF', ... cores=15, memory='25GB') - >>> cluster.scale(10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # this may take a few seconds to launch >>> from dask.distributed import Client >>> client = Client(cluster) @@ -190,7 +190,7 @@ class LSFCluster(JobQueueCluster): This also works with adaptive clusters. This automatically launches and kill workers based on load. - >>> cluster.adapt() + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 91d9da1a..7a68ad4e 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -114,14 +114,14 @@ class OARCluster(JobQueueCluster): -------- >>> from dask_jobqueue import OARCluster >>> cluster = OARCluster(queue='regular') - >>> cluster.scale(10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # this may take a few seconds to launch >>> from dask.distributed import Client >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. 
- >>> cluster.adapt() + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index b975957b..6ce2f11b 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -125,14 +125,14 @@ class PBSCluster(JobQueueCluster): >>> from dask_jobqueue import PBSCluster >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24, ... memory="500 GB") - >>> cluster.scale(10) # Ask for ten jobs + >>> cluster.scale(jobs=10) # This may take a few seconds to launch >>> from dask.distributed import Client >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. - >>> cluster.adapt() + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index 71745616..dd58d419 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -102,14 +102,14 @@ class SGECluster(JobQueueCluster): ... cores=24, ... memory="500 GB" ... ) - >>> cluster.scale(10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # this may take a few seconds to launch >>> from dask.distributed import Client >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. - >>> cluster.adapt() + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index 8d80cc6a..aaccad62 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -136,14 +136,14 @@ class SLURMCluster(JobQueueCluster): ... cores=24, ... memory="500 GB" ... 
) - >>> cluster.scale(10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # this may take a few seconds to launch >>> from dask.distributed import Client >>> client = Client(cluster) This also works with adaptive clusters. This automatically launches and kill workers based on load. - >>> cluster.adapt() + >>> cluster.adapt(maximum_jobs=20) """.format( job=job_parameters, cluster=cluster_parameters ) From 38c4c3d61c4369e2140338027249713fd09a0008 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 09:45:06 -0500 Subject: [PATCH 106/109] update docs --- docs/source/howitworks.rst | 11 +++++++---- docs/source/index.rst | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index 6fc7978f..e8f62fb0 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -23,17 +23,20 @@ object is instantiated: walltime='02:00:00', ) -You then ask for more workers using the ``scale`` command: -You can either specify the number of jobs, or the total number of cores or -memory that you want. +These parameters specify the characteristics of a *single job* or a *single +compute node*, rather than the characteristics of your computation as a whole. +For the full computation, you will then ask for a number of jobs using the +``scale`` command: .. code-block:: python - cluster.scale(jobs=2) # launch 2 jobs, each of which starts 6 worker processes cluster.scale(cores=48) # Or specify cores or memory directly cluster.scale(memory="200 GB") # Or specify cores or memory directly +You can either specify the number of jobs, or the total number of cores or +memory that you want. + The cluster generates a traditional job script and submits that an appropriate number of times to the job queue. 
You can see the job script that it will generate as follows: diff --git a/docs/source/index.rst b/docs/source/index.rst index 9392a8d8..ad9dbce3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,7 +45,7 @@ save resources when not actively computing. .. code-block:: python - cluster.adapt(minimum=6, maximum=90) # auto-scale between 6 and 90 jobs + cluster.adapt(minimum_jobs=10, maximum_jobs=100) # auto-scale between 10 and 100 jobs cluster.adapt(maximum_memory="10 TB") # or use core/memory limits More details From 23562a44bf99d7b880d320f1dc87ece74a503905 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Sep 2019 09:56:07 -0500 Subject: [PATCH 107/109] two more small doc fixes [skip ci] --- docs/source/howitworks.rst | 1 + docs/source/index.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index e8f62fb0..3e0a2ca2 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -25,6 +25,7 @@ object is instantiated: These parameters specify the characteristics of a *single job* or a *single compute node*, rather than the characteristics of your computation as a whole. +Creating the cluster object does not launch any jobs yet.
For the full computation, you will then ask for a number of jobs using the ``scale`` command: diff --git a/docs/source/index.rst b/docs/source/index.rst index ad9dbce3..f9f352ad 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,7 +19,7 @@ Example from dask_jobqueue import PBSCluster cluster = PBSCluster() - cluster.scale(10) # Deploy ten single-node jobs + cluster.scale(jobs=10) # Deploy ten single-node jobs from dask.distributed import Client client = Client(cluster) # Connect this local process to remote workers From 18660e6e1dae7ebf196aca2a6be176e1b40d4c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 25 Sep 2019 16:59:56 +0200 Subject: [PATCH 108/109] Change comment that I do not understand. --- dask_jobqueue/lsf.py | 2 +- dask_jobqueue/oar.py | 2 +- dask_jobqueue/pbs.py | 2 +- dask_jobqueue/sge.py | 2 +- dask_jobqueue/slurm.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py index d1cbe53e..348a356d 100644 --- a/dask_jobqueue/lsf.py +++ b/dask_jobqueue/lsf.py @@ -182,7 +182,7 @@ class LSFCluster(JobQueueCluster): >>> from dask_jobqueue import LSFCluster >>> cluster = LSFCluster(queue='general', project='DaskonLSF', ... 
cores=15, memory='25GB') - >>> cluster.scale(jobs=10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py index 7a68ad4e..f59f8512 100644 --- a/dask_jobqueue/oar.py +++ b/dask_jobqueue/oar.py @@ -114,7 +114,7 @@ class OARCluster(JobQueueCluster): -------- >>> from dask_jobqueue import OARCluster >>> cluster = OARCluster(queue='regular') - >>> cluster.scale(jobs=10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py index 6ce2f11b..f8b3acd1 100644 --- a/dask_jobqueue/pbs.py +++ b/dask_jobqueue/pbs.py @@ -125,7 +125,7 @@ class PBSCluster(JobQueueCluster): >>> from dask_jobqueue import PBSCluster >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24, ... memory="500 GB") - >>> cluster.scale(jobs=10) # This may take a few seconds to launch + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py index dd58d419..cc022b47 100644 --- a/dask_jobqueue/sge.py +++ b/dask_jobqueue/sge.py @@ -102,7 +102,7 @@ class SGECluster(JobQueueCluster): ... cores=24, ... memory="500 GB" ... ) - >>> cluster.scale(jobs=10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py index aaccad62..e17c85e2 100644 --- a/dask_jobqueue/slurm.py +++ b/dask_jobqueue/slurm.py @@ -136,7 +136,7 @@ class SLURMCluster(JobQueueCluster): ... cores=24, ... memory="500 GB" ... 
) - >>> cluster.scale(jobs=10) # this may take a few seconds to launch + >>> cluster.scale(jobs=10) # ask for 10 jobs >>> from dask.distributed import Client >>> client = Client(cluster) From 55aca3ada3d1ded11917469b6511e8470b75fa31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 25 Sep 2019 17:02:33 +0200 Subject: [PATCH 109/109] Use .scale(jobs=...) in doc and remove invalid changelog entry. --- docs/source/changelog.rst | 4 ---- docs/source/index.rst | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 2a08c442..069bb6e2 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -11,10 +11,6 @@ Changelog of more centralized logic. This improves standardization and adds new features, but does include the following **breaking changes**: - - The scale method now refers to the number of jobs rather than the - number of workers. Previously if each job launched two workers then - ``cluster.scale(4)`` would launch two jobs for a total of four workers. - Now it launches four jobs for a total of eight workers. - The ``cluster.stop_all_jobs()`` method has been removed. Please use ``cluster.scale(0)`` instead. - The attributes ``running_jobs``, ``pending_jobs``, and diff --git a/docs/source/index.rst b/docs/source/index.rst index f9f352ad..d4625452 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,7 +19,7 @@ Example from dask_jobqueue import PBSCluster cluster = PBSCluster() - cluster.scale(jobs=10) # Deploy ten single-node jobs + cluster.scale(jobs=10) # Deploy ten single-node jobs from dask.distributed import Client client = Client(cluster) # Connect this local process to remote workers