From cfdb97f6037c9341fb27f4fd33a8c6e2285459e3 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 5 Aug 2019 16:54:43 -0600
Subject: [PATCH 001/109] Add basic Job class, and PBS implementation

This is the first step towards rewriting with SpecCluster.
I mostly copied the implementations from the Cluster classes,
but then removed the cluster bits.
---
 dask_jobqueue/__init__.py       |   3 +-
 dask_jobqueue/job.py            | 330 ++++++++++++++++++++++++++++++++
 dask_jobqueue/pbs.py            |  71 +++++++
 dask_jobqueue/tests/test_job.py |  19 ++
 4 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 dask_jobqueue/job.py
 create mode 100644 dask_jobqueue/tests/test_job.py

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 4cd6a49e..864fcba2 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,8 +1,9 @@
 # flake8: noqa
 from . import config
 from .core import JobQueueCluster
+from .job import Job
 from .moab import MoabCluster
-from .pbs import PBSCluster
+from .pbs import PBSCluster, PBSJob
 from .slurm import SLURMCluster
 from .sge import SGECluster
 from .lsf import LSFCluster
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
new file mode 100644
index 00000000..f8fc0637
--- /dev/null
+++ b/dask_jobqueue/job.py
@@ -0,0 +1,330 @@
+import sys
+from contextlib import contextmanager
+
+import dask
+from distributed.deploy.spec import ProcessInterface
+
+import logging
+import math
+import os
+import re
+import shlex
+import subprocess
+import sys
+from collections import OrderedDict
+from contextlib import contextmanager
+
+import six
+
+import dask
+import docrep
+from .deploy import ClusterManager
+from distributed import LocalCluster
+from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
+
+logger = logging.getLogger(__name__)
+
+
+class Job(ProcessInterface):
+    """ Base class to launch Dask workers on Job queues
+
+    This class should not be used directly, use inherited class appropriate for
+    your queueing system (e.g. PBScluster or SLURMCluster)
+
+    Parameters
+    ----------
+    name : str
+        Name of Dask workers.
+    cores : int
+        Total number of cores per job
+    memory: str
+        Total amount of memory per job
+    processes : int
+        Number of processes per job
+    interface : str
+        Network interface like 'eth0' or 'ib0'.
+    death_timeout : float
+        Seconds to wait for a scheduler before closing workers
+    local_directory : str
+        Dask worker local directory for file spilling.
+    extra : list
+        Additional arguments to pass to `dask-worker`
+    env_extra : list
+        Other commands to add to script before launching worker.
+    log_directory : str
+        Directory to use for job scheduler logs.
+    shebang : str
+        Path to desired interpreter for your batch submission script.
+    python : str
+        Python executable used to launch Dask workers.
+    config_name : str
+        Section to use from jobqueue.yaml configuration file.
+    kwargs : dict
+        Additional keyword arguments to pass to `LocalCluster`
+
+    Attributes
+    ----------
+    submit_command: str
+        Abstract attribute for job scheduler submit command,
+        should be overridden
+    cancel_command: str
+        Abstract attribute for job scheduler cancel command,
+        should be overridden
+
+    See Also
+    --------
+    PBSCluster
+    SLURMCluster
+    SGECluster
+    OARCluster
+    LSFCluster
+    MoabCluster
+    """
+
+    _script_template = """
+%(shebang)s
+
+%(job_header)s
+
+%(env_header)s
+
+%(worker_command)s
+""".lstrip()
+
+    # Following class attributes should be overridden by extending classes.
+    submit_command = None
+    cancel_command = None
+    job_id_regexp = r"(?P<job_id>\d+)"
+
+    def __init__(
+        self,
+        scheduler=None,
+        name=None,
+        job_name=None,
+        cores=None,
+        memory=None,
+        processes=None,
+        interface=None,
+        death_timeout=None,
+        local_directory=None,
+        extra=None,
+        env_extra=None,
+        log_directory=None,
+        shebang=None,
+        python=sys.executable,
+        config_name=None,
+        **kwargs
+    ):
+        # """
+        # This initializer should be considered as Abstract, and never used directly.
+        # """
+        self.scheduler = scheduler
+        self.job_id = None
+
+        super().__init__()
+        if config_name is None:
+            config_name = getattr(type(self), "config_name", None)
+
+        if config_name is None:
+            raise NotImplementedError(
+                "JobQueueCluster is an abstract class that should not be instantiated."
+            )
+
+        if job_name is None:
+            job_name = dask.config.get("jobqueue.%s.name" % config_name)
+        if cores is None:
+            cores = dask.config.get("jobqueue.%s.cores" % config_name)
+        if memory is None:
+            memory = dask.config.get("jobqueue.%s.memory" % config_name)
+        if processes is None:
+            processes = dask.config.get("jobqueue.%s.processes" % config_name)
+        if interface is None:
+            interface = dask.config.get("jobqueue.%s.interface" % config_name)
+        if death_timeout is None:
+            death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
+        if local_directory is None:
+            local_directory = dask.config.get(
+                "jobqueue.%s.local-directory" % config_name
+            )
+        if extra is None:
+            extra = dask.config.get("jobqueue.%s.extra" % config_name)
+        if env_extra is None:
+            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
+        if log_directory is None:
+            log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
+        if shebang is None:
+            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)
+
+        if cores is None:
+            raise ValueError(
+                "You must specify how many cores to use per job like ``cores=8``"
+            )
+
+        if memory is None:
+            raise ValueError(
+                "You must specify how much memory to use per job like ``memory='24 GB'``"
+            )
+
+        # This attribute should be overridden
+        self.job_header = None
+
+        if interface:
+            extra += ["--interface", interface]
+            kwargs.setdefault("host", get_ip_interface(interface))
+        else:
+            kwargs.setdefault("host", "")
+
+        # Keep information on process, cores, and memory, for use in subclasses
+        self.worker_memory = parse_bytes(memory) if memory is not None else None
+        self.worker_processes = processes
+        self.worker_cores = cores
+        self.name = name
+
+        self.shebang = shebang
+
+        self._env_header = "\n".join(env_extra)
+
+        # dask-worker command line build
+        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
+            python=python
+        )
+        command_args = [dask_worker_command, self.scheduler]
+        command_args += ["--nthreads", self.worker_process_threads]
+        if processes is not None and processes > 1:
+            command_args += ["--nprocs", processes]
+
+        command_args += ["--memory-limit", self.worker_process_memory]
+        command_args += ["--name", "%s--${JOB_ID}--" % name]
+
+        if death_timeout is not None:
+            command_args += ["--death-timeout", death_timeout]
+        if local_directory is not None:
+            command_args += ["--local-directory", local_directory]
+        if extra is not None:
+            command_args += extra
+
+        self._command_template = " ".join(map(str, command_args))
+
+        self.log_directory = log_directory
+        if self.log_directory is not None:
+            if not os.path.exists(self.log_directory):
+                os.makedirs(self.log_directory)
+
+    def job_script(self):
+        """ Construct a job submission script """
+        pieces = {
+            "shebang": self.shebang,
+            "job_header": self.job_header,
+            "env_header": self._env_header,
+            "worker_command": self._command_template,
+        }
+        return self._script_template % pieces
+
+    @contextmanager
+    def job_file(self):
+        """ Write job submission script to temporary file """
+        with tmpfile(extension="sh") as fn:
+            with open(fn, "w") as f:
+                logger.debug("writing job script: \n%s", self.job_script())
+                f.write(self.job_script())
+            yield fn
+
+    def _submit_job(self, script_filename):
+        # Should we make this async friendly?
+        return self._call(shlex.split(self.submit_command) + [script_filename])
+
+    @property
+    def worker_process_threads(self):
+        return int(self.worker_cores / self.worker_processes)
+
+    @property
+    def worker_process_memory(self):
+        mem = format_bytes(self.worker_memory / self.worker_processes)
+        mem = mem.replace(" ", "")
+        return mem
+
+    async def start(self):
+        """ Start workers and point them to our local scheduler """
+        logger.debug("Starting job: %s", self.name)
+
+        with self.job_file() as fn:
+            out = self._submit_job(fn)
+            job = self._job_id_from_submit_output(out)
+            if not job:
+                raise ValueError("Unable to parse jobid from output of %s" % out)
+            self.job_id = job
+
+        await super().start()
+
+    def _job_id_from_submit_output(self, out):
+        match = re.search(self.job_id_regexp, out)
+        if match is None:
+            msg = (
+                "Could not parse job id from submission command "
+                "output.\nJob id regexp is {!r}\nSubmission command "
+                "output is:\n{}".format(self.job_id_regexp, out)
+            )
+            raise ValueError(msg)
+
+        job_id = match.groupdict().get("job_id")
+        if job_id is None:
+            msg = (
+                "You need to use a 'job_id' named group in your regexp, e.g. "
+                "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: "
+                "{!r}".format(self.job_id_regexp)
+            )
+            raise ValueError(msg)
+
+        return job_id
+
+    async def close(self):
+        logger.debug("Stopping job: %s", self.name)
+        if self.job_id:
+            self._call(shlex.split(self.cancel_command) + [self.job_id])
+
+    def _call(self, cmd, **kwargs):
+        """ Call a command using subprocess.Popen.
+
+        This centralizes calls out to the command line, providing consistent
+        outputs, logging, and an opportunity to go asynchronous in the future.
+
+        Parameters
+        ----------
+        cmd: List(str))
+            A command, each of which is a list of strings to hand to
+            subprocess.Popen
+
+        Examples
+        --------
+        >>> self._call(['ls', '/foo'])
+
+        Returns
+        -------
+        The stdout produced by the command, as string.
+
+        Raises
+        ------
+        RuntimeError if the command exits with a non-zero exit code
+        """
+        cmd_str = " ".join(cmd)
+        logger.debug(
+            "Executing the following command to command line\n{}".format(cmd_str)
+        )
+
+        proc = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs
+        )
+
+        out, err = proc.communicate()
+        if six.PY3:
+            out, err = out.decode(), err.decode()
+        if proc.returncode != 0:
+            raise RuntimeError(
+                "Command exited with non-zero exit code.\n"
+                "Exit code: {}\n"
+                "Command:\n{}\n"
+                "stdout:\n{}\n"
+                "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err)
+            )
+        return out
+
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 54459430..db75f988 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -7,6 +7,7 @@
 import dask
 
 from .core import JobQueueCluster, docstrings
+from .job import Job
 
 logger = logging.getLogger(__name__)
 
@@ -142,3 +143,73 @@ def pbs_format_bytes_ceil(n):
     if n >= 10 * 1024:
         return "%dkB" % math.ceil(n / 1024)
     return "%dB" % n
+
+
+class PBSJob(Job):
+    submit_command = "qsub"
+    cancel_command = "qdel"
+    config_name = "pbs"
+
+
+    def __init__(
+        self,
+        *args,
+        queue=None,
+        project=None,
+        resource_spec=None,
+        walltime=None,
+        job_extra=None,
+        config_name="pbs",
+        **kwargs
+    ):
+        if queue is None:
+            queue = dask.config.get("jobqueue.%s.queue" % config_name)
+        if resource_spec is None:
+            resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
+        if walltime is None:
+            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
+        if job_extra is None:
+            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
+        if project is None:
+            project = dask.config.get(
+                "jobqueue.%s.project" % config_name
+            ) or os.environ.get("PBS_ACCOUNT")
+
+        # Instantiate args and parameters from parent abstract class
+        super().__init__(*args, config_name=config_name, **kwargs)
+
+        # Try to find a project name from environment variable
+        project = project or os.environ.get("PBS_ACCOUNT")
+
+        header_lines = []
+        # PBS header build
+        if self.name is not None:
+            header_lines.append("#PBS -N %s" % self.name)
+        if queue is not None:
+            header_lines.append("#PBS -q %s" % queue)
+        if project is not None:
+            header_lines.append("#PBS -A %s" % project)
+        if resource_spec is None:
+            # Compute default resources specifications
+            resource_spec = "select=1:ncpus=%d" % self.worker_cores
+            memory_string = pbs_format_bytes_ceil(self.worker_memory)
+            resource_spec += ":mem=" + memory_string
+            logger.info(
+                "Resource specification for PBS not set, initializing it to %s"
+                % resource_spec
+            )
+        if resource_spec is not None:
+            header_lines.append("#PBS -l %s" % resource_spec)
+        if walltime is not None:
+            header_lines.append("#PBS -l walltime=%s" % walltime)
+        if self.log_directory is not None:
+            header_lines.append("#PBS -e %s/" % self.log_directory)
+            header_lines.append("#PBS -o %s/" % self.log_directory)
+        header_lines.extend(["#PBS %s" % arg for arg in job_extra])
+        header_lines.append("JOB_ID=${PBS_JOBID%%.*}")
+
+        # Declare class attribute that shall be overridden
+        self.job_header = "\n".join(header_lines)
+
+        logger.debug("Job script: \n %s" % self.job_script())
+
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
new file mode 100644
index 00000000..48ddee16
--- /dev/null
+++ b/dask_jobqueue/tests/test_job.py
@@ -0,0 +1,19 @@
+from dask_jobqueue import PBSJob
+from dask.distributed import Scheduler, Client
+from distributed.utils_test import cleanup
+import pytest
+
+
+def test_basic():
+    job = PBSJob(scheduler="127.0.0.1:12345")
+    assert "127.0.0.1:12345" in job.job_script()
+
+
+@pytest.mark.env("pbs")
+@pytest.mark.asyncio
+async def test_live():
+    async with Scheduler(port=0) as s:
+        async with PBSJob(s.address, name="foo") as job:
+            async with Client(s.address, asynchronous=True) as client:
+                await client.wait_for_workers(1)
+                assert list(s.workers.values())[0].name == "foo"

From 7fde0005db3297f44467fc1d46ac27a25c748835 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 5 Aug 2019 17:17:03 -0700
Subject: [PATCH 002/109] add pytest-asyncio and fix pbs test

---
 ci/none.sh                      | 2 +-
 ci/pbs/Dockerfile               | 2 +-
 ci/sge/Dockerfile-master        | 2 +-
 ci/sge/Dockerfile-slave         | 2 +-
 ci/slurm/Dockerfile             | 2 +-
 dask_jobqueue/tests/test_job.py | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/none.sh b/ci/none.sh
index c826f30c..6fbba1aa 100644
--- a/ci/none.sh
+++ b/ci/none.sh
@@ -4,7 +4,7 @@ function jobqueue_before_install {
   # Install miniconda
   ./ci/conda_setup.sh
   export PATH="$HOME/miniconda/bin:$PATH"
-  conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 pytest docrep
+  conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 pytest docrep pytest-asyncio
   # black only available for python 3
   if [[ "$TRAVIS_PYTHON_VERSION" =~ ^[3-9].+ ]]; then
     pip install black
diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile
index 1013c91e..47fe2770 100644
--- a/ci/pbs/Dockerfile
+++ b/ci/pbs/Dockerfile
@@ -30,7 +30,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     bash miniconda.sh -f -b -p /opt/anaconda && \
     /opt/anaconda/bin/conda clean -tipy && \
     rm -f miniconda.sh
-RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
+RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
 
 # Copy entrypoint and other needed scripts
 COPY ./*.sh /
diff --git a/ci/sge/Dockerfile-master b/ci/sge/Dockerfile-master
index d596fd60..c6f47340 100644
--- a/ci/sge/Dockerfile-master
+++ b/ci/sge/Dockerfile-master
@@ -10,7 +10,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     rm -f miniconda.sh
 ENV PATH /opt/anaconda/bin:$PATH
 ARG PYTHON_VERSION
-RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest && conda clean -tipy
+RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy
 
 COPY ./*.sh /
 COPY ./*.txt /
diff --git a/ci/sge/Dockerfile-slave b/ci/sge/Dockerfile-slave
index d97647cf..10e51d2e 100644
--- a/ci/sge/Dockerfile-slave
+++ b/ci/sge/Dockerfile-slave
@@ -10,7 +10,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     rm -f miniconda.sh
 ENV PATH /opt/anaconda/bin:$PATH
 ARG PYTHON_VERSION
-RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest && conda clean -tipy
+RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy
 
 COPY ./setup-slave.sh /
 COPY ./*.sh /
diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile
index 814cf792..e2bb7ad8 100644
--- a/ci/slurm/Dockerfile
+++ b/ci/slurm/Dockerfile
@@ -5,7 +5,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     /opt/anaconda/bin/conda clean -tipy && \
     rm -f miniconda.sh
 ENV PATH /opt/anaconda/bin:$PATH
-RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
+RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
 
 ENV LC_ALL en_US.UTF-8
 
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 48ddee16..d06aa132 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -5,7 +5,7 @@
 
 
 def test_basic():
-    job = PBSJob(scheduler="127.0.0.1:12345")
+    job = PBSJob(scheduler="127.0.0.1:12345", cores=1, memory="1 GB")
     assert "127.0.0.1:12345" in job.job_script()
 
 

From a49ac2a414873bc486efdf5be18ac876ae8a9d2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Tue, 6 Aug 2019 17:15:37 +0200
Subject: [PATCH 003/109] Add SGEJob with passing test.

Fix flake8 as well.
---
 dask_jobqueue/__init__.py       |  2 +-
 dask_jobqueue/job.py            | 10 ------
 dask_jobqueue/pbs.py            |  2 --
 dask_jobqueue/sge.py            | 61 +++++++++++++++++++++++++++++++++
 dask_jobqueue/tests/test_job.py | 25 +++++++++++---
 5 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 864fcba2..6faa8404 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -5,7 +5,7 @@
 from .moab import MoabCluster
 from .pbs import PBSCluster, PBSJob
 from .slurm import SLURMCluster
-from .sge import SGECluster
+from .sge import SGECluster, SGEJob
 from .lsf import LSFCluster
 from .oar import OARCluster
 from .htcondor import HTCondorCluster
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index f8fc0637..1a4b56fc 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -5,21 +5,12 @@
 from distributed.deploy.spec import ProcessInterface
 
 import logging
-import math
 import os
 import re
 import shlex
 import subprocess
-import sys
-from collections import OrderedDict
-from contextlib import contextmanager
-
 import six
 
-import dask
-import docrep
-from .deploy import ClusterManager
-from distributed import LocalCluster
 from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
 
 logger = logging.getLogger(__name__)
@@ -327,4 +318,3 @@ def _call(self, cmd, **kwargs):
                 "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err)
             )
         return out
-
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index db75f988..86141dc7 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -150,7 +150,6 @@ class PBSJob(Job):
     cancel_command = "qdel"
     config_name = "pbs"
 
-
     def __init__(
         self,
         *args,
@@ -212,4 +211,3 @@ def __init__(
         self.job_header = "\n".join(header_lines)
 
         logger.debug("Job script: \n %s" % self.job_script())
-
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index a9a8679a..490b7c56 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -5,6 +5,7 @@
 import dask
 
 from .core import JobQueueCluster, docstrings
+from .job import Job
 
 logger = logging.getLogger(__name__)
 
@@ -108,3 +109,63 @@ def __init__(
         self.job_header = header_template % config
 
         logger.debug("Job script: \n %s" % self.job_script())
+
+
+class SGEJob(Job):
+    submit_command = "qsub"
+    cancel_command = "qdel"
+
+    def __init__(
+        self,
+        *args,
+        queue=None,
+        project=None,
+        resource_spec=None,
+        walltime=None,
+        job_extra=None,
+        config_name="sge",
+        **kwargs
+    ):
+        if queue is None:
+            queue = dask.config.get("jobqueue.%s.queue" % config_name)
+        if project is None:
+            project = dask.config.get("jobqueue.%s.project" % config_name)
+        if resource_spec is None:
+            resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
+        if walltime is None:
+            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
+        if job_extra is None:
+            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
+
+        super().__init__(config_name=config_name, **kwargs)
+
+        header_lines = []
+        if self.name is not None:
+            header_lines.append("#$ -N %(name)s")
+        if queue is not None:
+            header_lines.append("#$ -q %(queue)s")
+        if project is not None:
+            header_lines.append("#$ -P %(project)s")
+        if resource_spec is not None:
+            header_lines.append("#$ -l %(resource_spec)s")
+        if walltime is not None:
+            header_lines.append("#$ -l h_rt=%(walltime)s")
+        if self.log_directory is not None:
+            header_lines.append("#$ -e %(log_directory)s/")
+            header_lines.append("#$ -o %(log_directory)s/")
+        header_lines.extend(["#$ -cwd", "#$ -j y"])
+        header_lines.extend(["#$ %s" % arg for arg in job_extra])
+        header_template = "\n".join(header_lines)
+
+        config = {
+            "name": self.name,
+            "queue": queue,
+            "project": project,
+            "processes": self.worker_processes,
+            "walltime": walltime,
+            "resource_spec": resource_spec,
+            "log_directory": self.log_directory,
+        }
+        self.job_header = header_template % config
+
+        logger.debug("Job script: \n %s" % self.job_script())
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index d06aa132..87bfc65f 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,6 +1,5 @@
-from dask_jobqueue import PBSJob
+from dask_jobqueue import PBSJob, SGEJob
 from dask.distributed import Scheduler, Client
-from distributed.utils_test import cleanup
 import pytest
 
 
@@ -13,7 +12,25 @@ def test_basic():
 @pytest.mark.asyncio
 async def test_live():
     async with Scheduler(port=0) as s:
-        async with PBSJob(s.address, name="foo") as job:
+        async with PBSJob(
+            scheduler=s.address, name="foo", cores=1, memory="1GB"
+        ) as job:
             async with Client(s.address, asynchronous=True) as client:
                 await client.wait_for_workers(1)
-                assert list(s.workers.values())[0].name == "foo"
+                worker_name = list(s.workers.values())[0].name
+                assert worker_name.startswith("foo")
+                assert job.job_id in worker_name
+
+
+@pytest.mark.env("sge")
+@pytest.mark.asyncio
+async def test_live_sge():
+    async with Scheduler(port=0) as s:
+        async with SGEJob(
+            scheduler=s.address, name="foo", cores=1, memory="1GB"
+        ) as job:
+            async with Client(s.address, asynchronous=True) as client:
+                await client.wait_for_workers(1)
+                worker_name = list(s.workers.values())[0].name
+                assert worker_name.startswith("foo")
+                assert job.job_id in worker_name

From 86261f4a318a58d932a2d7bcb9f8c1ec6f83aec4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Tue, 6 Aug 2019 17:28:59 +0200
Subject: [PATCH 004/109] Rewrite the test to be dask 2.2.0 compatible.

---
 dask_jobqueue/tests/test_job.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 87bfc65f..02abdf99 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -12,25 +12,23 @@ def test_basic():
 @pytest.mark.asyncio
 async def test_live():
     async with Scheduler(port=0) as s:
-        async with PBSJob(
-            scheduler=s.address, name="foo", cores=1, memory="1GB"
-        ) as job:
-            async with Client(s.address, asynchronous=True) as client:
-                await client.wait_for_workers(1)
-                worker_name = list(s.workers.values())[0].name
-                assert worker_name.startswith("foo")
-                assert job.job_id in worker_name
+        job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
+        job = await job
+        async with Client(s.address, asynchronous=True) as client:
+            await client.wait_for_workers(1)
+            worker_name = list(s.workers.values())[0].name
+            assert worker_name.startswith("foo")
+            assert job.job_id in worker_name
 
 
 @pytest.mark.env("sge")
 @pytest.mark.asyncio
 async def test_live_sge():
     async with Scheduler(port=0) as s:
-        async with SGEJob(
-            scheduler=s.address, name="foo", cores=1, memory="1GB"
-        ) as job:
-            async with Client(s.address, asynchronous=True) as client:
-                await client.wait_for_workers(1)
-                worker_name = list(s.workers.values())[0].name
-                assert worker_name.startswith("foo")
-                assert job.job_id in worker_name
+        job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
+        job = await job
+        async with Client(s.address, asynchronous=True) as client:
+            await client.wait_for_workers(1)
+            worker_name = list(s.workers.values())[0].name
+            assert worker_name.startswith("foo")
+            assert job.job_id in worker_name

From 3fdfd5723a0685343173f6511fbe522968606b27 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 12:12:44 -0600
Subject: [PATCH 005/109] Add basic JobQueueCluster SpecCluster implementation

---
 dask_jobqueue/job.py            | 43 ++++++++++++++++++++++++++++++++-
 dask_jobqueue/tests/test_job.py | 25 +++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 1a4b56fc..037d5752 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -2,7 +2,8 @@
 from contextlib import contextmanager
 
 import dask
-from distributed.deploy.spec import ProcessInterface
+from distributed.deploy.spec import ProcessInterface, SpecCluster
+from distributed.scheduler import Scheduler
 
 import logging
 import os
@@ -318,3 +319,43 @@ def _call(self, cmd, **kwargs):
                 "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err)
             )
         return out
+
+
+def JobQueueCluster(
+        *args,
+        Job : Job = None,
+        n_workers=0,
+        # Cluster keywords
+        loop=None,
+        security=None,
+        silence_logs=False,
+        name=None,
+        asynchronous=False,
+        # Scheduler keywords
+        interface=None,
+        protocol="tcp://",
+        dashboard_address=":8787",
+        # Job keywords
+        **kwargs
+    ):
+    if Job is None:
+        raise ValueError("You must provide a Job type like PBSJob, SLURMJob, "
+                          "or SGEJob with the Job= argument.")
+
+    scheduler = {
+        "cls": Scheduler,  # Use local scheduler for now
+        "options": {
+            "protocol" : protocol,
+            "interface": interface,
+            "dashboard_address": dashboard_address,
+            "security": security,
+        }
+    }
+    kwargs["interface"] = interface
+    kwargs["protocol"] = protocol
+    kwargs["security"] = security
+    worker = {"cls": Job, "options": kwargs}
+
+    return SpecCluster(scheduler=scheduler, worker=worker, loop=loop,
+            silence_logs=silence_logs,
+            asynchronous=asynchronous, name=name)
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 02abdf99..6dcc2305 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,4 +1,5 @@
 from dask_jobqueue import PBSJob, SGEJob
+from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 import pytest
 
@@ -21,6 +22,18 @@ async def test_live():
             assert job.job_id in worker_name
 
 
+@pytest.mark.env("pbs")
+@pytest.mark.asyncio
+async def test_pbs_cluster():
+    async with JobQueueCluster(cores=1, memory="1GB", Job=PBSJob,
+            asynchronous=True) as cluster:
+        cluster.scale(2)
+        await cluster
+        assert len(cluster.workers) == 2
+        assert all(isinstance(w, PBSJob) for w in cluster.workers.values())
+        assert all(w.status == "running" for w in cluster.workers.values())
+
+
 @pytest.mark.env("sge")
 @pytest.mark.asyncio
 async def test_live_sge():
@@ -32,3 +45,15 @@ async def test_live_sge():
             worker_name = list(s.workers.values())[0].name
             assert worker_name.startswith("foo")
             assert job.job_id in worker_name
+
+
+@pytest.mark.env("sge")
+@pytest.mark.asyncio
+async def test_sge_cluster():
+    async with JobQueueCluster(cores=1, memory="1GB", Job=SGEJob,
+            asynchronous=True) as cluster:
+        cluster.scale(2)
+        await cluster
+        assert len(cluster.workers) == 2
+        assert all(isinstance(w, SGEJob) for w in cluster.workers.values())
+        assert all(w.status == "running" for w in cluster.workers.values())

From 91ad132714d16b587273acbb4250469e06677c90 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 12:13:45 -0600
Subject: [PATCH 006/109] black

---
 dask_jobqueue/job.py            | 53 +++++++++++++++++++--------------
 dask_jobqueue/tests/test_job.py | 10 ++++---
 2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 037d5752..4f037ac7 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -322,40 +322,47 @@ def _call(self, cmd, **kwargs):
 
 
 def JobQueueCluster(
-        *args,
-        Job : Job = None,
-        n_workers=0,
-        # Cluster keywords
-        loop=None,
-        security=None,
-        silence_logs=False,
-        name=None,
-        asynchronous=False,
-        # Scheduler keywords
-        interface=None,
-        protocol="tcp://",
-        dashboard_address=":8787",
-        # Job keywords
-        **kwargs
-    ):
+    *args,
+    Job: Job = None,
+    n_workers=0,
+    # Cluster keywords
+    loop=None,
+    security=None,
+    silence_logs=False,
+    name=None,
+    asynchronous=False,
+    # Scheduler keywords
+    interface=None,
+    protocol="tcp://",
+    dashboard_address=":8787",
+    # Job keywords
+    **kwargs
+):
     if Job is None:
-        raise ValueError("You must provide a Job type like PBSJob, SLURMJob, "
-                          "or SGEJob with the Job= argument.")
+        raise ValueError(
+            "You must provide a Job type like PBSJob, SLURMJob, "
+            "or SGEJob with the Job= argument."
+        )
 
     scheduler = {
         "cls": Scheduler,  # Use local scheduler for now
         "options": {
-            "protocol" : protocol,
+            "protocol": protocol,
             "interface": interface,
             "dashboard_address": dashboard_address,
             "security": security,
-        }
+        },
     }
     kwargs["interface"] = interface
     kwargs["protocol"] = protocol
     kwargs["security"] = security
     worker = {"cls": Job, "options": kwargs}
 
-    return SpecCluster(scheduler=scheduler, worker=worker, loop=loop,
-            silence_logs=silence_logs,
-            asynchronous=asynchronous, name=name)
+    return SpecCluster(
+        scheduler=scheduler,
+        worker=worker,
+        loop=loop,
+        silence_logs=silence_logs,
+        asynchronous=asynchronous,
+        name=name,
+    )
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 6dcc2305..71cf2edf 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -25,8 +25,9 @@ async def test_live():
 @pytest.mark.env("pbs")
 @pytest.mark.asyncio
 async def test_pbs_cluster():
-    async with JobQueueCluster(cores=1, memory="1GB", Job=PBSJob,
-            asynchronous=True) as cluster:
+    async with JobQueueCluster(
+        cores=1, memory="1GB", Job=PBSJob, asynchronous=True
+    ) as cluster:
         cluster.scale(2)
         await cluster
         assert len(cluster.workers) == 2
@@ -50,8 +51,9 @@ async def test_live_sge():
 @pytest.mark.env("sge")
 @pytest.mark.asyncio
 async def test_sge_cluster():
-    async with JobQueueCluster(cores=1, memory="1GB", Job=SGEJob,
-            asynchronous=True) as cluster:
+    async with JobQueueCluster(
+        cores=1, memory="1GB", Job=SGEJob, asynchronous=True
+    ) as cluster:
         cluster.scale(2)
         await cluster
         assert len(cluster.workers) == 2

From d67e4845f83337696a862f2d228b9dc53b4ed08b Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 12:17:40 -0600
Subject: [PATCH 007/109] Add test for initial scale

---
 dask_jobqueue/job.py            | 10 +++++++---
 dask_jobqueue/tests/test_job.py |  6 ++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 4f037ac7..f73bf5a8 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -322,9 +322,8 @@ def _call(self, cmd, **kwargs):
 
 
 def JobQueueCluster(
-    *args,
-    Job: Job = None,
     n_workers=0,
+    Job: Job = None,
     # Cluster keywords
     loop=None,
     security=None,
@@ -358,7 +357,7 @@ def JobQueueCluster(
     kwargs["security"] = security
     worker = {"cls": Job, "options": kwargs}
 
-    return SpecCluster(
+    cluster = SpecCluster(
         scheduler=scheduler,
         worker=worker,
         loop=loop,
@@ -366,3 +365,8 @@ def JobQueueCluster(
         asynchronous=asynchronous,
         name=name,
     )
+
+    if n_workers:
+        cluster.scale(n_workers)
+
+    return cluster
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 71cf2edf..80792d31 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -26,8 +26,9 @@ async def test_live():
 @pytest.mark.asyncio
 async def test_pbs_cluster():
     async with JobQueueCluster(
-        cores=1, memory="1GB", Job=PBSJob, asynchronous=True
+            1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True
     ) as cluster:
+        assert len(cluster.workers) == 1
         cluster.scale(2)
         await cluster
         assert len(cluster.workers) == 2
@@ -52,8 +53,9 @@ async def test_live_sge():
 @pytest.mark.asyncio
 async def test_sge_cluster():
     async with JobQueueCluster(
-        cores=1, memory="1GB", Job=SGEJob, asynchronous=True
+        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True
     ) as cluster:
+        assert len(cluster.workers) == 1
         cluster.scale(2)
         await cluster
         assert len(cluster.workers) == 2

From 146555f576b3eb74285621406c0730ff9bd2c633 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Tue, 6 Aug 2019 23:05:50 +0200
Subject: [PATCH 008/109] Fix name / job_name.

---
 dask_jobqueue/job.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index f73bf5a8..f34e944f 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -92,7 +92,6 @@ def __init__(
         self,
         scheduler=None,
         name=None,
-        job_name=None,
         cores=None,
         memory=None,
         processes=None,
@@ -122,8 +121,8 @@ def __init__(
                 "JobQueueCluster is an abstract class that should not be instantiated."
             )
 
-        if job_name is None:
-            job_name = dask.config.get("jobqueue.%s.name" % config_name)
+        if name is None:
+            name = dask.config.get("jobqueue.%s.name" % config_name)
         if cores is None:
             cores = dask.config.get("jobqueue.%s.cores" % config_name)
         if memory is None:

From 83f832abcc57ff9afc22c313bc1b0846dc1a21e2 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 17:15:32 -0700
Subject: [PATCH 009/109] add names to cluster tests

---
 dask_jobqueue/tests/test_job.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 80792d31..e9d4bba1 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -26,7 +26,7 @@ async def test_live():
 @pytest.mark.asyncio
 async def test_pbs_cluster():
     async with JobQueueCluster(
-            1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True
+            1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo",
     ) as cluster:
         assert len(cluster.workers) == 1
         cluster.scale(2)
@@ -38,7 +38,7 @@ async def test_pbs_cluster():
 
 @pytest.mark.env("sge")
 @pytest.mark.asyncio
-async def test_live_sge():
+async def test_sge():
     async with Scheduler(port=0) as s:
         job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
         job = await job
@@ -53,7 +53,7 @@ async def test_live_sge():
 @pytest.mark.asyncio
 async def test_sge_cluster():
     async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True
+        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo",
     ) as cluster:
         assert len(cluster.workers) == 1
         cluster.scale(2)

From e520e214bdb2b54744dda845e48c3c80626b969d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 17:58:27 -0700
Subject: [PATCH 010/109] Add echo into job script template

---
 dask_jobqueue/htcondor.py | 13 -------------
 dask_jobqueue/job.py      |  8 +++++---
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index b751773e..5130bde3 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -53,7 +53,6 @@ class HTCondorCluster(JobQueueCluster):
 
     submit_command = "condor_submit -queue 1 -file"
     cancel_command = "condor_rm"
-    job_id_regexp = r"(?P<job_id>\d+\.\d+)"
 
     # condor sets argv[0] of the executable to "condor_exec.exe", which confuses
     # Python (can't find its libs), so we have to go through the shell.
@@ -139,18 +138,6 @@ def job_script(self):
             "executable": self.executable,
         }
 
-    def _job_id_from_submit_output(self, out):
-        cluster_id_regexp = r"submitted to cluster (\d+)"
-        match = re.search(cluster_id_regexp, out)
-        if match is None:
-            msg = (
-                "Could not parse cluster id from submission command output.\n"
-                "Cluster id regexp is {!r}\n"
-                "Submission command output is:\n{}".format(cluster_id_regexp, out)
-            )
-            raise ValueError(msg)
-        return "%s.0" % match.group(1)
-
 
 def _double_up_quotes(instr):
     return instr.replace("'", "''").replace('"', '""')
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index f34e944f..59eacb83 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -26,7 +26,7 @@ class Job(ProcessInterface):
     Parameters
     ----------
     name : str
-        Name of Dask workers.
+        Name of Dask worker.
     cores : int
         Total number of cores per job
     memory: str
@@ -80,13 +80,15 @@ class Job(ProcessInterface):
 
 %(env_header)s
 
+echo "Job ID: $JOB_ID"
+
 %(worker_command)s
 """.lstrip()
 
     # Following class attributes should be overridden by extending classes.
     submit_command = None
     cancel_command = None
-    job_id_regexp = r"(?P<job_id>\d+)"
+    job_id_regexp = r"Job ID:\s*(?P<job_id>.*)"
 
     def __init__(
         self,
@@ -185,7 +187,7 @@ def __init__(
             command_args += ["--nprocs", processes]
 
         command_args += ["--memory-limit", self.worker_process_memory]
-        command_args += ["--name", "%s--${JOB_ID}--" % name]
+        command_args += ["--name", str(name)]
 
         if death_timeout is not None:
             command_args += ["--death-timeout", death_timeout]

From c07133d2c554abf41bd36c95f0a2376b2a5cd99e Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 19:32:17 -0700
Subject: [PATCH 011/109] Revert "Add echo into job script template"

This reverts commit e520e214bdb2b54744dda845e48c3c80626b969d.
---
 dask_jobqueue/htcondor.py | 13 +++++++++++++
 dask_jobqueue/job.py      |  8 +++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 5130bde3..b751773e 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -53,6 +53,7 @@ class HTCondorCluster(JobQueueCluster):
 
     submit_command = "condor_submit -queue 1 -file"
     cancel_command = "condor_rm"
+    job_id_regexp = r"(?P<job_id>\d+\.\d+)"
 
     # condor sets argv[0] of the executable to "condor_exec.exe", which confuses
     # Python (can't find its libs), so we have to go through the shell.
@@ -138,6 +139,18 @@ def job_script(self):
             "executable": self.executable,
         }
 
+    def _job_id_from_submit_output(self, out):
+        cluster_id_regexp = r"submitted to cluster (\d+)"
+        match = re.search(cluster_id_regexp, out)
+        if match is None:
+            msg = (
+                "Could not parse cluster id from submission command output.\n"
+                "Cluster id regexp is {!r}\n"
+                "Submission command output is:\n{}".format(cluster_id_regexp, out)
+            )
+            raise ValueError(msg)
+        return "%s.0" % match.group(1)
+
 
 def _double_up_quotes(instr):
     return instr.replace("'", "''").replace('"', '""')
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 59eacb83..f34e944f 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -26,7 +26,7 @@ class Job(ProcessInterface):
     Parameters
     ----------
     name : str
-        Name of Dask worker.
+        Name of Dask workers.
     cores : int
         Total number of cores per job
     memory: str
@@ -80,15 +80,13 @@ class Job(ProcessInterface):
 
 %(env_header)s
 
-echo "Job ID: $JOB_ID"
-
 %(worker_command)s
 """.lstrip()
 
     # Following class attributes should be overridden by extending classes.
     submit_command = None
     cancel_command = None
-    job_id_regexp = r"Job ID:\s*(?P<job_id>.*)"
+    job_id_regexp = r"(?P<job_id>\d+)"
 
     def __init__(
         self,
@@ -187,7 +185,7 @@ def __init__(
             command_args += ["--nprocs", processes]
 
         command_args += ["--memory-limit", self.worker_process_memory]
-        command_args += ["--name", str(name)]
+        command_args += ["--name", "%s--${JOB_ID}--" % name]
 
         if death_timeout is not None:
             command_args += ["--death-timeout", death_timeout]

From 3ddd0b2ae709372c5c45a5097c3df452559b7dbc Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 19:40:35 -0700
Subject: [PATCH 012/109] improve job-name

---
 dask_jobqueue/job.py | 13 ++++++++-----
 dask_jobqueue/pbs.py |  4 ++--
 dask_jobqueue/sge.py |  6 +++---
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index f34e944f..48603110 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -103,6 +103,7 @@ def __init__(
         log_directory=None,
         shebang=None,
         python=sys.executable,
+        job_name="dask-worker",
         config_name=None,
         **kwargs
     ):
@@ -121,8 +122,8 @@ def __init__(
                 "JobQueueCluster is an abstract class that should not be instantiated."
             )
 
-        if name is None:
-            name = dask.config.get("jobqueue.%s.name" % config_name)
+        if job_name is None:
+            job_name = dask.config.get("jobqueue.%s.name" % config_name)
         if cores is None:
             cores = dask.config.get("jobqueue.%s.cores" % config_name)
         if memory is None:
@@ -170,6 +171,7 @@ def __init__(
         self.worker_processes = processes
         self.worker_cores = cores
         self.name = name
+        self.job_name = job_name
 
         self.shebang = shebang
 
@@ -185,7 +187,7 @@ def __init__(
             command_args += ["--nprocs", processes]
 
         command_args += ["--memory-limit", self.worker_process_memory]
-        command_args += ["--name", "%s--${JOB_ID}--" % name]
+        command_args += ["--name", str(name)]
 
         if death_timeout is not None:
             command_args += ["--death-timeout", death_timeout]
@@ -236,7 +238,7 @@ def worker_process_memory(self):
 
     async def start(self):
         """ Start workers and point them to our local scheduler """
-        logger.debug("Starting job: %s", self.name)
+        logger.debug("Starting worker: %s", self.name)
 
         with self.job_file() as fn:
             out = self._submit_job(fn)
@@ -245,6 +247,7 @@ async def start(self):
                 raise ValueError("Unable to parse jobid from output of %s" % out)
             self.job_id = job
 
+        logger.debug("Starting job: %s", self.job_id)
         await super().start()
 
     def _job_id_from_submit_output(self, out):
@@ -269,7 +272,7 @@ def _job_id_from_submit_output(self, out):
         return job_id
 
     async def close(self):
-        logger.debug("Stopping job: %s", self.name)
+        logger.debug("Stopping worker: %s job: %s", self.name, self.job_id)
         if self.job_id:
             self._call(shlex.split(self.cancel_command) + [self.job_id])
 
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 86141dc7..728e045d 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -182,8 +182,8 @@ def __init__(
 
         header_lines = []
         # PBS header build
-        if self.name is not None:
-            header_lines.append("#PBS -N %s" % self.name)
+        if self.job_name is not None:
+            header_lines.append("#PBS -N %s" % self.job_name)
         if queue is not None:
             header_lines.append("#PBS -q %s" % queue)
         if project is not None:
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 490b7c56..04db64d6 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -140,8 +140,8 @@ def __init__(
         super().__init__(config_name=config_name, **kwargs)
 
         header_lines = []
-        if self.name is not None:
-            header_lines.append("#$ -N %(name)s")
+        if self.job_name is not None:
+            header_lines.append("#$ -N %(job-name)s")
         if queue is not None:
             header_lines.append("#$ -q %(queue)s")
         if project is not None:
@@ -158,7 +158,7 @@ def __init__(
         header_template = "\n".join(header_lines)
 
         config = {
-            "name": self.name,
+            "job-name": self.job_name,
             "queue": queue,
             "project": project,
             "processes": self.worker_processes,

From 6f81aec57d6695baf1af057ac8b624e068466257 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 19:40:49 -0700
Subject: [PATCH 013/109] black

---
 dask_jobqueue/tests/test_job.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index e9d4bba1..8297fcc1 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -26,7 +26,7 @@ async def test_live():
 @pytest.mark.asyncio
 async def test_pbs_cluster():
     async with JobQueueCluster(
-            1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo",
+        1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo"
     ) as cluster:
         assert len(cluster.workers) == 1
         cluster.scale(2)
@@ -53,7 +53,7 @@ async def test_sge():
 @pytest.mark.asyncio
 async def test_sge_cluster():
     async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo",
+        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo"
     ) as cluster:
         assert len(cluster.workers) == 1
         cluster.scale(2)

From 370ea47fe94df468edbc9c01cf1a77c03258370e Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 20:16:16 -0700
Subject: [PATCH 014/109] cleanup jobname

---
 dask_jobqueue/job.py            | 2 +-
 dask_jobqueue/tests/test_job.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 48603110..66b158fb 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -103,7 +103,7 @@ def __init__(
         log_directory=None,
         shebang=None,
         python=sys.executable,
-        job_name="dask-worker",
+        job_name=None,
         config_name=None,
         **kwargs
     ):
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 8297fcc1..49563508 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -17,9 +17,7 @@ async def test_live():
         job = await job
         async with Client(s.address, asynchronous=True) as client:
             await client.wait_for_workers(1)
-            worker_name = list(s.workers.values())[0].name
-            assert worker_name.startswith("foo")
-            assert job.job_id in worker_name
+            assert list(s.workers.values())[0].name == "foo"
 
 
 @pytest.mark.env("pbs")

From e7bc03b610a2555fae2c04f1199e0b8fa5d1b821 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 6 Aug 2019 20:17:46 -0700
Subject: [PATCH 015/109] get debug information for tests in sge

---
 ci/sge.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/sge.sh b/ci/sge.sh
index 1673e9b3..1f244283 100644
--- a/ci/sge.sh
+++ b/ci/sge.sh
@@ -17,7 +17,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E sge"
+    docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E sge"
 }
 
 function jobqueue_after_script {

From 9cdac926d42a9769518fc486e6282a234866098e Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 07:01:17 -0700
Subject: [PATCH 016/109] pass through `*args`

---
 dask_jobqueue/sge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 04db64d6..f50fd1cf 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -137,7 +137,7 @@ def __init__(
         if job_extra is None:
             job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
 
-        super().__init__(config_name=config_name, **kwargs)
+        super().__init__(*args, config_name=config_name, **kwargs)
 
         header_lines = []
         if self.job_name is not None:

From 09e4d6fde8632731aa410a28f667b32cce76ef30 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 07:15:33 -0700
Subject: [PATCH 017/109] cleanup sge test

---
 dask_jobqueue/tests/test_job.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 49563508..0b104765 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -11,7 +11,7 @@ def test_basic():
 
 @pytest.mark.env("pbs")
 @pytest.mark.asyncio
-async def test_live():
+async def test_pbs_job():
     async with Scheduler(port=0) as s:
         job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
         job = await job
@@ -36,15 +36,13 @@ async def test_pbs_cluster():
 
 @pytest.mark.env("sge")
 @pytest.mark.asyncio
-async def test_sge():
+async def test_sge_job():
     async with Scheduler(port=0) as s:
         job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
         job = await job
         async with Client(s.address, asynchronous=True) as client:
             await client.wait_for_workers(1)
-            worker_name = list(s.workers.values())[0].name
-            assert worker_name.startswith("foo")
-            assert job.job_id in worker_name
+            assert list(s.workers.values())[0].name == "foo"
 
 
 @pytest.mark.env("sge")

From f5c86ebe5372a3258d81dd5bcd59db75c0510974 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 07:32:33 -0700
Subject: [PATCH 018/109] test scale down

---
 dask_jobqueue/tests/test_job.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 0b104765..7673dace 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -57,3 +57,10 @@ async def test_sge_cluster():
         assert len(cluster.workers) == 2
         assert all(isinstance(w, SGEJob) for w in cluster.workers.values())
         assert all(w.status == "running" for w in cluster.workers.values())
+
+        cluster.scale(1)
+        await cluster
+        start = time()
+        while len(cluster.scheduler.workers) != 1:
+            await asyncio.sleep(0.1)
+            assert time() < start + 5

From 4b7182632fec2f0c813267d1d043428a79222c98 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 07:35:51 -0700
Subject: [PATCH 019/109] parametrize tests

---
 dask_jobqueue/tests/test_job.py | 39 +++++++++------------------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 7673dace..cbb98b53 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -9,47 +9,28 @@ def test_basic():
     assert "127.0.0.1:12345" in job.job_script()
 
 
-@pytest.mark.env("pbs")
-@pytest.mark.asyncio
-async def test_pbs_job():
-    async with Scheduler(port=0) as s:
-        job = PBSJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
-        job = await job
-        async with Client(s.address, asynchronous=True) as client:
-            await client.wait_for_workers(1)
-            assert list(s.workers.values())[0].name == "foo"
-
-
-@pytest.mark.env("pbs")
-@pytest.mark.asyncio
-async def test_pbs_cluster():
-    async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=PBSJob, asynchronous=True, name="foo"
-    ) as cluster:
-        assert len(cluster.workers) == 1
-        cluster.scale(2)
-        await cluster
-        assert len(cluster.workers) == 2
-        assert all(isinstance(w, PBSJob) for w in cluster.workers.values())
-        assert all(w.status == "running" for w in cluster.workers.values())
+job_params = [
+    pytest.param(SGEJob, marks=[pytest.mark.env("sge")]),
+    pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]),
+]
 
 
-@pytest.mark.env("sge")
+@pytest.mark.parametrize("Job", job_params)
 @pytest.mark.asyncio
-async def test_sge_job():
+async def test_job(Job):
     async with Scheduler(port=0) as s:
-        job = SGEJob(scheduler=s.address, name="foo", cores=1, memory="1GB")
+        job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB")
         job = await job
         async with Client(s.address, asynchronous=True) as client:
             await client.wait_for_workers(1)
             assert list(s.workers.values())[0].name == "foo"
 
 
-@pytest.mark.env("sge")
+@pytest.mark.parametrize("Job", job_params)
 @pytest.mark.asyncio
-async def test_sge_cluster():
+async def test_cluster(Job):
     async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=SGEJob, asynchronous=True, name="foo"
+        1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
     ) as cluster:
         assert len(cluster.workers) == 1
         cluster.scale(2)

From 6b554ab7a203ce4d659c5545933833165a8522ac Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 13:39:45 -0600
Subject: [PATCH 020/109] cleanup tests

---
 dask_jobqueue/tests/test_job.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index cbb98b53..fcc98926 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,6 +1,9 @@
+from time import time
+
 from dask_jobqueue import PBSJob, SGEJob
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
+
 import pytest
 
 
@@ -36,12 +39,11 @@ async def test_cluster(Job):
         cluster.scale(2)
         await cluster
         assert len(cluster.workers) == 2
-        assert all(isinstance(w, SGEJob) for w in cluster.workers.values())
+        assert all(isinstance(w, Job) for w in cluster.workers.values())
         assert all(w.status == "running" for w in cluster.workers.values())
 
         cluster.scale(1)
         await cluster
-        start = time()
         while len(cluster.scheduler.workers) != 1:
             await asyncio.sleep(0.1)
             assert time() < start + 5

From cada8a8027effed2ca3b370ad24afb7e9c6066ef Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 7 Aug 2019 13:39:51 -0600
Subject: [PATCH 021/109] close job with weakref.finalize

---
 dask_jobqueue/job.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 66b158fb..3deda7c8 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -1,16 +1,16 @@
-import sys
 from contextlib import contextmanager
-
-import dask
-from distributed.deploy.spec import ProcessInterface, SpecCluster
-from distributed.scheduler import Scheduler
-
 import logging
 import os
 import re
 import shlex
 import subprocess
 import six
+import sys
+import weakref
+
+import dask
+from distributed.deploy.spec import ProcessInterface, SpecCluster
+from distributed.scheduler import Scheduler
 
 from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
 
@@ -247,6 +247,8 @@ async def start(self):
                 raise ValueError("Unable to parse jobid from output of %s" % out)
             self.job_id = job
 
+        weakref.finalize(self, self._close_job, job)
+
         logger.debug("Starting job: %s", self.job_id)
         await super().start()
 
@@ -273,10 +275,15 @@ def _job_id_from_submit_output(self, out):
 
     async def close(self):
         logger.debug("Stopping worker: %s job: %s", self.name, self.job_id)
-        if self.job_id:
-            self._call(shlex.split(self.cancel_command) + [self.job_id])
+        self._close_job(self.job_id)
+
+    @classmethod
+    def _close_job(cls, job_id):
+        if job_id:
+            cls._call(shlex.split(cls.cancel_command) + [job_id])
 
-    def _call(self, cmd, **kwargs):
+    @staticmethod
+    def _call(cmd, **kwargs):
         """ Call a command using subprocess.Popen.
 
         This centralizes calls out to the command line, providing consistent

From 1d70a19feed475cbc0f276ac93df6082bda51a1d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sat, 10 Aug 2019 08:34:37 -0600
Subject: [PATCH 022/109] Add SLURMJob

---
 dask_jobqueue/__init__.py       |  2 +-
 dask_jobqueue/slurm.py          | 99 +++++++++++++++++++++++++++++++++
 dask_jobqueue/tests/test_job.py |  5 +-
 3 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 6faa8404..ed6d4de8 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -4,7 +4,7 @@
 from .job import Job
 from .moab import MoabCluster
 from .pbs import PBSCluster, PBSJob
-from .slurm import SLURMCluster
+from .slurm import SLURMCluster, SLURMJob
 from .sge import SGECluster, SGEJob
 from .lsf import LSFCluster
 from .oar import OARCluster
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 7b051972..2d617267 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -6,6 +6,7 @@
 import dask
 
 from .core import JobQueueCluster, docstrings
+from .job import Job
 
 logger = logging.getLogger(__name__)
 
@@ -145,3 +146,101 @@ def slurm_format_bytes_ceil(n):
     if n >= 1024:
         return "%dK" % math.ceil(n / 1024)
     return "1K" % n
+
+
+class SLURMJob(Job):
+    __doc__ = docstrings.with_indents(
+        """ Launch Dask on a SLURM cluster
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#SBATCH -p` option.
+    project : str
+        Accounting string associated with each worker job. Passed to `#SBATCH -A` option.
+    walltime : str
+        Walltime for each worker job.
+    job_cpu : int
+        Number of cpu to book in SLURM, if None, defaults to worker `threads * processes`
+    job_mem : str
+        Amount of memory to request in SLURM. If None, defaults to worker
+        processes * memory
+    job_extra : list
+        List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix.
+    %(JobQueueCluster.parameters)s
+
+    Examples
+    --------
+    """,
+        4,
+    )
+
+    # Override class variables
+    submit_command = "sbatch"
+    cancel_command = "scancel"
+
+    def __init__(
+        self,
+        *args,
+        queue=None,
+        project=None,
+        walltime=None,
+        job_cpu=None,
+        job_mem=None,
+        job_extra=None,
+        config_name="slurm",
+        **kwargs
+    ):
+        if queue is None:
+            queue = dask.config.get("jobqueue.%s.queue" % config_name)
+        if project is None:
+            project = dask.config.get("jobqueue.%s.project" % config_name)
+        if walltime is None:
+            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
+        if job_cpu is None:
+            job_cpu = dask.config.get("jobqueue.%s.job-cpu" % config_name)
+        if job_mem is None:
+            job_mem = dask.config.get("jobqueue.%s.job-mem" % config_name)
+        if job_extra is None:
+            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
+
+        super().__init__(*args, config_name=config_name, **kwargs)
+
+        # Always ask for only one task
+        header_lines = []
+        # SLURM header build
+        if self.job_name is not None:
+            header_lines.append("#SBATCH -J %s" % self.job_name)
+        if self.log_directory is not None:
+            header_lines.append(
+                "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.job_name or "worker")
+            )
+            header_lines.append(
+                "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.job_name or "worker")
+            )
+        if queue is not None:
+            header_lines.append("#SBATCH -p %s" % queue)
+        if project is not None:
+            header_lines.append("#SBATCH -A %s" % project)
+
+        # Init resources, always 1 task,
+        # and then number of cpu is processes * threads if not set
+        header_lines.append("#SBATCH -n 1")
+        header_lines.append(
+            "#SBATCH --cpus-per-task=%d" % (job_cpu or self.worker_cores)
+        )
+        # Memory
+        memory = job_mem
+        if job_mem is None:
+            memory = slurm_format_bytes_ceil(self.worker_memory)
+        if memory is not None:
+            header_lines.append("#SBATCH --mem=%s" % memory)
+
+        if walltime is not None:
+            header_lines.append("#SBATCH -t %s" % walltime)
+        header_lines.extend(["#SBATCH %s" % arg for arg in job_extra])
+
+        header_lines.append("\nJOB_ID=${SLURM_JOB_ID%;*}")
+
+        # Declare class attribute that shall be overridden
+        self.job_header = "\n".join(header_lines)
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index fcc98926..1a4340cb 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,6 +1,7 @@
+import asyncio
 from time import time
 
-from dask_jobqueue import PBSJob, SGEJob
+from dask_jobqueue import PBSJob, SGEJob, SLURMJob
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -15,6 +16,7 @@ def test_basic():
 job_params = [
     pytest.param(SGEJob, marks=[pytest.mark.env("sge")]),
     pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]),
+    pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]),
 ]
 
 
@@ -43,6 +45,7 @@ async def test_cluster(Job):
         assert all(w.status == "running" for w in cluster.workers.values())
 
         cluster.scale(1)
+        start = time()
         await cluster
         while len(cluster.scheduler.workers) != 1:
             await asyncio.sleep(0.1)

From 2545ebcd6a9b0aafe6b537485092cf2fa6ba3c42 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sat, 10 Aug 2019 09:05:42 -0600
Subject: [PATCH 023/109] Add test for adaptive

---
 dask_jobqueue/tests/test_job.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 1a4340cb..a25bef27 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -50,3 +50,28 @@ async def test_cluster(Job):
         while len(cluster.scheduler.workers) != 1:
             await asyncio.sleep(0.1)
             assert time() < start + 5
+
+
+@pytest.mark.parametrize("Job", job_params)
+@pytest.mark.asyncio
+async def test_adapt(Job):
+    async with JobQueueCluster(
+        1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
+    ) as cluster:
+        cluster.adapt(minimum=0, maximum=4, interval="10ms")
+
+        start = time()
+        while len(cluster.scheduler.workers):
+            await asyncio.sleep(0.050)
+            assert time() < start + 5
+
+        async with Client(cluster, asynchronous=True) as client:
+            future = client.submit(lambda: 0)
+            await client.wait_for_workers(1)
+
+            del future
+
+            start = time()
+            while len(cluster.scheduler.workers):
+                await asyncio.sleep(0.050)
+                assert time() < start + 5

From dae702dcff20f048cfe1f774b042458852e5a3dc Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sat, 10 Aug 2019 09:20:14 -0600
Subject: [PATCH 024/109] fixup adaptive tests

---
 dask_jobqueue/tests/test_job.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index a25bef27..d314227c 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -58,14 +58,17 @@ async def test_adapt(Job):
     async with JobQueueCluster(
         1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
     ) as cluster:
-        cluster.adapt(minimum=0, maximum=4, interval="10ms")
+        async with Client(cluster, asynchronous=True) as client:
+            await client.wait_for_workers(1)
+            cluster.adapt(minimum=0, maximum=4, interval="10ms")
 
-        start = time()
-        while len(cluster.scheduler.workers):
-            await asyncio.sleep(0.050)
-            assert time() < start + 5
+            start = time()
+            while len(cluster.scheduler.workers):
+                await asyncio.sleep(0.050)
+                assert time() < start + 10
+            assert not cluster.worker_spec
+            assert not cluster.workers
 
-        async with Client(cluster, asynchronous=True) as client:
             future = client.submit(lambda: 0)
             await client.wait_for_workers(1)
 
@@ -74,4 +77,6 @@ async def test_adapt(Job):
             start = time()
             while len(cluster.scheduler.workers):
                 await asyncio.sleep(0.050)
-                assert time() < start + 5
+                assert time() < start + 10
+            assert not cluster.worker_spec
+            assert not cluster.workers

From 043888b4eb7d2579349b13f6228f973930542808 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sat, 10 Aug 2019 09:38:43 -0600
Subject: [PATCH 025/109] black

---
 dask_jobqueue/slurm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 2d617267..ec0b6f04 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -213,10 +213,12 @@ def __init__(
             header_lines.append("#SBATCH -J %s" % self.job_name)
         if self.log_directory is not None:
             header_lines.append(
-                "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.job_name or "worker")
+                "#SBATCH -e %s/%s-%%J.err"
+                % (self.log_directory, self.job_name or "worker")
             )
             header_lines.append(
-                "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.job_name or "worker")
+                "#SBATCH -o %s/%s-%%J.out"
+                % (self.log_directory, self.job_name or "worker")
             )
         if queue is not None:
             header_lines.append("#SBATCH -p %s" % queue)

From f3974bb5eac426c8a1c79b70ddb3759a8865baaf Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sat, 10 Aug 2019 09:41:29 -0600
Subject: [PATCH 026/109] close job in test_job

---
 dask_jobqueue/tests/test_job.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index d314227c..a76f70b5 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -30,6 +30,13 @@ async def test_job(Job):
             await client.wait_for_workers(1)
             assert list(s.workers.values())[0].name == "foo"
 
+        await job.close()
+
+        start = time()
+        while len(s.workers):
+            await asyncio.sleep(0.1)
+            assert time() < start + 5
+
 
 @pytest.mark.parametrize("Job", job_params)
 @pytest.mark.asyncio

From abcb4e1101fefa99b462d10df143a31f36493350 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 11 Aug 2019 13:32:18 -0600
Subject: [PATCH 027/109] Also wait for workers in test

---
 dask_jobqueue/tests/test_job.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index a76f70b5..6ab335ac 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -70,7 +70,7 @@ async def test_adapt(Job):
             cluster.adapt(minimum=0, maximum=4, interval="10ms")
 
             start = time()
-            while len(cluster.scheduler.workers):
+            while len(cluster.scheduler.workers) or cluster.workers:
                 await asyncio.sleep(0.050)
                 assert time() < start + 10
             assert not cluster.worker_spec
@@ -82,7 +82,7 @@ async def test_adapt(Job):
             del future
 
             start = time()
-            while len(cluster.scheduler.workers):
+            while len(cluster.scheduler.workers) or cluster.workers:
                 await asyncio.sleep(0.050)
                 assert time() < start + 10
             assert not cluster.worker_spec

From 5a709c0bbb77795545a9fe29630f5ad79d58dcc6 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 13 Aug 2019 14:58:24 -0400
Subject: [PATCH 028/109] add dask/distributed git master to CI

---
 ci/pbs/Dockerfile        | 1 +
 ci/sge/Dockerfile-master | 1 +
 ci/sge/Dockerfile-slave  | 1 +
 ci/slurm/Dockerfile      | 1 +
 4 files changed, 4 insertions(+)

diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile
index 47fe2770..b3423e88 100644
--- a/ci/pbs/Dockerfile
+++ b/ci/pbs/Dockerfile
@@ -31,6 +31,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     /opt/anaconda/bin/conda clean -tipy && \
     rm -f miniconda.sh
 RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
+RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 # Copy entrypoint and other needed scripts
 COPY ./*.sh /
diff --git a/ci/sge/Dockerfile-master b/ci/sge/Dockerfile-master
index c6f47340..d6e486d2 100644
--- a/ci/sge/Dockerfile-master
+++ b/ci/sge/Dockerfile-master
@@ -11,6 +11,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
 ENV PATH /opt/anaconda/bin:$PATH
 ARG PYTHON_VERSION
 RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy
+RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 COPY ./*.sh /
 COPY ./*.txt /
diff --git a/ci/sge/Dockerfile-slave b/ci/sge/Dockerfile-slave
index 10e51d2e..777547ef 100644
--- a/ci/sge/Dockerfile-slave
+++ b/ci/sge/Dockerfile-slave
@@ -11,6 +11,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
 ENV PATH /opt/anaconda/bin:$PATH
 ARG PYTHON_VERSION
 RUN conda install -c conda-forge python=$PYTHON_VERSION dask distributed pytest pytest-asyncio && conda clean -tipy
+RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 COPY ./setup-slave.sh /
 COPY ./*.sh /
diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile
index e2bb7ad8..1a8cc112 100644
--- a/ci/slurm/Dockerfile
+++ b/ci/slurm/Dockerfile
@@ -6,6 +6,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     rm -f miniconda.sh
 ENV PATH /opt/anaconda/bin:$PATH
 RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
+RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 ENV LC_ALL en_US.UTF-8
 

From 9b0b0f8c9d4562991bb4cf8b66b5e3e300a1ff04 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 19 Aug 2019 07:26:11 -0600
Subject: [PATCH 029/109] Relax test_slurm.py to not check jobs attributes

---
 dask_jobqueue/tests/test_slurm.py | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py
index fee6ed02..9d9eb262 100644
--- a/dask_jobqueue/tests/test_slurm.py
+++ b/dask_jobqueue/tests/test_slurm.py
@@ -25,7 +25,7 @@ def test_header():
         assert "#SBATCH --mem=27G" in cluster.job_header
         assert "#SBATCH -t 00:02:00" in cluster.job_header
         assert "#SBATCH -p" not in cluster.job_header
-        assert "#SBATCH -A" not in cluster.job_header
+        # assert "#SBATCH -A" not in cluster.job_header
 
     with SLURMCluster(
         queue="regular",
@@ -51,7 +51,7 @@ def test_header():
         assert "#SBATCH -n 1" in cluster.job_header
         assert "#SBATCH -t " in cluster.job_header
         assert "#SBATCH -p" not in cluster.job_header
-        assert "#SBATCH -A" not in cluster.job_header
+        # assert "#SBATCH -A" not in cluster.job_header
 
 
 def test_job_script():
@@ -68,7 +68,7 @@ def test_job_script():
         assert "#SBATCH --mem=27G" in job_script
         assert "#SBATCH -t 00:02:00" in job_script
         assert "#SBATCH -p" not in job_script
-        assert "#SBATCH -A" not in job_script
+        # assert "#SBATCH -A" not in job_script
 
         assert "export " not in job_script
 
@@ -97,7 +97,7 @@ def test_job_script():
         assert "#SBATCH --mem=27G" in job_script
         assert "#SBATCH -t 00:02:00" in job_script
         assert "#SBATCH -p" not in job_script
-        assert "#SBATCH -A" not in job_script
+        # assert "#SBATCH -A" not in job_script
 
         assert 'export LANG="en_US.utf8"' in job_script
         assert 'export LANGUAGE="en_US.utf8"' in job_script
@@ -125,13 +125,10 @@ def test_basic(loop):
             cluster.scale(2)
 
             start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            client.wait_for_workers(2)
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
-            assert cluster.running_jobs
 
             workers = list(client.scheduler_info()["workers"].values())
             w = workers[0]
@@ -141,7 +138,7 @@ def test_basic(loop):
             cluster.scale(0)
 
             start = time()
-            while cluster.running_jobs:
+            while client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
@@ -161,27 +158,17 @@ def test_adaptive(loop):
             future = client.submit(lambda x: x + 1, 10)
 
             start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            client.wait_for_workers(1)
 
             assert future.result(QUEUE_WAIT) == 11
 
-            start = time()
-            processes = cluster.worker_processes
-            while len(client.scheduler_info()["workers"]) != processes:
-                sleep(0.1)
-                assert time() < start + QUEUE_WAIT
-
             del future
 
             start = time()
-            while cluster.running_jobs:
+            while client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert cluster.finished_jobs
-
 
 def test_config_name_slurm_takes_custom_config():
     conf = {

From 1443435fd8257ea15e7d932d34a6cad4caf629cc Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 20 Aug 2019 16:25:57 -0600
Subject: [PATCH 030/109] Replace SLURMCluster with new variant

---
 dask_jobqueue/job.py              | 108 +++++++++++++++------------
 dask_jobqueue/slurm.py            | 120 ++----------------------------
 dask_jobqueue/tests/__init__.py   |   2 +-
 dask_jobqueue/tests/test_job.py   |  28 +++----
 dask_jobqueue/tests/test_slurm.py |   4 +-
 5 files changed, 85 insertions(+), 177 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 3deda7c8..9903db85 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -330,52 +330,66 @@ def _call(cmd, **kwargs):
         return out
 
 
-def JobQueueCluster(
-    n_workers=0,
-    Job: Job = None,
-    # Cluster keywords
-    loop=None,
-    security=None,
-    silence_logs=False,
-    name=None,
-    asynchronous=False,
-    # Scheduler keywords
-    interface=None,
-    protocol="tcp://",
-    dashboard_address=":8787",
-    # Job keywords
-    **kwargs
-):
-    if Job is None:
-        raise ValueError(
-            "You must provide a Job type like PBSJob, SLURMJob, "
-            "or SGEJob with the Job= argument."
+class JobQueueCluster(SpecCluster):
+
+    def __init__(
+        self,
+        n_workers=0,
+        Job: Job = None,
+        # Cluster keywords
+        loop=None,
+        security=None,
+        silence_logs=False,
+        name=None,
+        asynchronous=False,
+        # Scheduler keywords
+        interface=None,
+        protocol="tcp://",
+        dashboard_address=":8787",
+        # Job keywords
+        **kwargs
+    ):
+        if Job is None:
+            raise ValueError(
+                "You must provide a Job type like PBSJob, SLURMJob, "
+                "or SGEJob with the Job= argument."
+            )
+
+
+        scheduler = {
+            "cls": Scheduler,  # Use local scheduler for now
+            "options": {
+                "protocol": protocol,
+                "interface": interface,
+                "dashboard_address": dashboard_address,
+                "security": security,
+            },
+        }
+        kwargs["interface"] = interface
+        kwargs["protocol"] = protocol
+        kwargs["security"] = security
+        worker = {"cls": Job, "options": kwargs}
+        self.example_job = Job("tcp://scheduler:8786", name="name", **kwargs)
+
+        super().__init__(
+            scheduler=scheduler,
+            worker=worker,
+            loop=loop,
+            silence_logs=silence_logs,
+            asynchronous=asynchronous,
+            name=name,
         )
 
-    scheduler = {
-        "cls": Scheduler,  # Use local scheduler for now
-        "options": {
-            "protocol": protocol,
-            "interface": interface,
-            "dashboard_address": dashboard_address,
-            "security": security,
-        },
-    }
-    kwargs["interface"] = interface
-    kwargs["protocol"] = protocol
-    kwargs["security"] = security
-    worker = {"cls": Job, "options": kwargs}
-
-    cluster = SpecCluster(
-        scheduler=scheduler,
-        worker=worker,
-        loop=loop,
-        silence_logs=silence_logs,
-        asynchronous=asynchronous,
-        name=name,
-    )
-
-    if n_workers:
-        cluster.scale(n_workers)
-
-    return cluster
+        if n_workers:
+            self.scale(n_workers)
+
+    @property
+    def job_header(self):
+        return self.example_job.job_header
+
+    def job_script(self):
+        return self.example_job.job_script()
+
+    @property
+    def name(self):
+        return self.example_job.job_name
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index ec0b6f04..a0a97550 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -1,128 +1,17 @@
 from __future__ import absolute_import, division, print_function
 
+import functools
 import logging
 import math
 
 import dask
 
-from .core import JobQueueCluster, docstrings
-from .job import Job
+from .core import docstrings
+from .job import Job, JobQueueCluster
 
 logger = logging.getLogger(__name__)
 
 
-class SLURMCluster(JobQueueCluster):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a SLURM cluster
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#SBATCH -p` option.
-    project : str
-        Accounting string associated with each worker job. Passed to `#SBATCH -A` option.
-    walltime : str
-        Walltime for each worker job.
-    job_cpu : int
-        Number of cpu to book in SLURM, if None, defaults to worker `threads * processes`
-    job_mem : str
-        Amount of memory to request in SLURM. If None, defaults to worker
-        processes * memory
-    job_extra : list
-        List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> from dask_jobqueue import SLURMCluster
-    >>> cluster = SLURMCluster(processes=6, cores=24, memory="120GB",
-                               env_extra=['export LANG="en_US.utf8"',
-                                          'export LANGUAGE="en_US.utf8"',
-                                          'export LC_ALL="en_US.utf8"'])
-    >>> cluster.scale(10)  # this may take a few seconds to launch
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-
-    >>> cluster.adapt()
-    """,
-        4,
-    )
-
-    # Override class variables
-    submit_command = "sbatch"
-    cancel_command = "scancel"
-
-    def __init__(
-        self,
-        queue=None,
-        project=None,
-        walltime=None,
-        job_cpu=None,
-        job_mem=None,
-        job_extra=None,
-        config_name="slurm",
-        **kwargs
-    ):
-        if queue is None:
-            queue = dask.config.get("jobqueue.%s.queue" % config_name)
-        if project is None:
-            project = dask.config.get("jobqueue.%s.project" % config_name)
-        if walltime is None:
-            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
-        if job_cpu is None:
-            job_cpu = dask.config.get("jobqueue.%s.job-cpu" % config_name)
-        if job_mem is None:
-            job_mem = dask.config.get("jobqueue.%s.job-mem" % config_name)
-        if job_extra is None:
-            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
-
-        super(SLURMCluster, self).__init__(config_name=config_name, **kwargs)
-
-        # Always ask for only one task
-        header_lines = []
-        # SLURM header build
-        if self.name is not None:
-            header_lines.append("#SBATCH -J %s" % self.name)
-        if self.log_directory is not None:
-            header_lines.append(
-                "#SBATCH -e %s/%s-%%J.err" % (self.log_directory, self.name or "worker")
-            )
-            header_lines.append(
-                "#SBATCH -o %s/%s-%%J.out" % (self.log_directory, self.name or "worker")
-            )
-        if queue is not None:
-            header_lines.append("#SBATCH -p %s" % queue)
-        if project is not None:
-            header_lines.append("#SBATCH -A %s" % project)
-
-        # Init resources, always 1 task,
-        # and then number of cpu is processes * threads if not set
-        header_lines.append("#SBATCH -n 1")
-        header_lines.append(
-            "#SBATCH --cpus-per-task=%d" % (job_cpu or self.worker_cores)
-        )
-        # Memory
-        memory = job_mem
-        if job_mem is None:
-            memory = slurm_format_bytes_ceil(self.worker_memory)
-        if memory is not None:
-            header_lines.append("#SBATCH --mem=%s" % memory)
-
-        if walltime is not None:
-            header_lines.append("#SBATCH -t %s" % walltime)
-        header_lines.extend(["#SBATCH %s" % arg for arg in job_extra])
-
-        header_lines.append("JOB_ID=${SLURM_JOB_ID%;*}")
-
-        # Declare class attribute that shall be overridden
-        self.job_header = "\n".join(header_lines)
-
-        logger.debug("Job script: \n %s" % self.job_script())
-
-
 def slurm_format_bytes_ceil(n):
     """ Format bytes as text.
 
@@ -246,3 +135,6 @@ def __init__(
 
         # Declare class attribute that shall be overridden
         self.job_header = "\n".join(header_lines)
+
+
+SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob)
diff --git a/dask_jobqueue/tests/__init__.py b/dask_jobqueue/tests/__init__.py
index 9d193036..7c7408c7 100644
--- a/dask_jobqueue/tests/__init__.py
+++ b/dask_jobqueue/tests/__init__.py
@@ -1,3 +1,3 @@
 from __future__ import absolute_import, division, print_function
 
-QUEUE_WAIT = 15  # seconds
+QUEUE_WAIT = 60  # seconds
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 6ab335ac..392fdb09 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -44,19 +44,21 @@ async def test_cluster(Job):
     async with JobQueueCluster(
         1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
     ) as cluster:
-        assert len(cluster.workers) == 1
-        cluster.scale(2)
-        await cluster
-        assert len(cluster.workers) == 2
-        assert all(isinstance(w, Job) for w in cluster.workers.values())
-        assert all(w.status == "running" for w in cluster.workers.values())
-
-        cluster.scale(1)
-        start = time()
-        await cluster
-        while len(cluster.scheduler.workers) != 1:
-            await asyncio.sleep(0.1)
-            assert time() < start + 5
+        async with Client(cluster, asynchronous=True) as client:
+            assert len(cluster.workers) == 1
+            cluster.scale(2)
+            await cluster
+            assert len(cluster.workers) == 2
+            assert all(isinstance(w, Job) for w in cluster.workers.values())
+            assert all(w.status == "running" for w in cluster.workers.values())
+            await client.wait_for_workers(2)
+
+            cluster.scale(1)
+            start = time()
+            await cluster
+            while len(cluster.scheduler.workers) > 1:
+                await asyncio.sleep(0.1)
+                assert time() < start + 10
 
 
 @pytest.mark.parametrize("Job", job_params)
diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py
index 9d9eb262..4aeac472 100644
--- a/dask_jobqueue/tests/test_slurm.py
+++ b/dask_jobqueue/tests/test_slurm.py
@@ -117,7 +117,7 @@ def test_basic(loop):
         cores=2,
         processes=1,
         memory="2GB",
-        job_extra=["-D /"],
+        # job_extra=["-D /"],
         loop=loop,
     ) as cluster:
         with Client(cluster) as client:
@@ -150,7 +150,7 @@ def test_adaptive(loop):
         cores=2,
         processes=1,
         memory="2GB",
-        job_extra=["-D /"],
+        # job_extra=["-D /"],
         loop=loop,
     ) as cluster:
         cluster.adapt()

From 29dfccbcea50e06dfe440b40b2023452ff65a14a Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 20 Aug 2019 17:54:14 -0600
Subject: [PATCH 031/109] update tests

---
 dask_jobqueue/job.py                      | 3 +--
 dask_jobqueue/tests/test_jobqueue_core.py | 4 ++--
 dask_jobqueue/tests/test_slurm.py         | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 9903db85..7f6daf47 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -355,7 +355,6 @@ def __init__(
                 "or SGEJob with the Job= argument."
             )
 
-
         scheduler = {
             "cls": Scheduler,  # Use local scheduler for now
             "options": {
@@ -391,5 +390,5 @@ def job_script(self):
         return self.example_job.job_script()
 
     @property
-    def name(self):
+    def job_name(self):
         return self.example_job.job_name
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 5a94663e..222276ff 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -104,7 +104,7 @@ def test_forward_ip():
 
 
 @pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster]
+    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
 )
 @pytest.mark.parametrize(
     "qsub_return_string",
@@ -125,7 +125,7 @@ def test_job_id_from_qsub(Cluster, qsub_return_string):
 
 
 @pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster]
+    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
 )
 def test_job_id_error_handling(Cluster):
     # non-matching regexp
diff --git a/dask_jobqueue/tests/test_slurm.py b/dask_jobqueue/tests/test_slurm.py
index db9a0e9b..6b26ffc0 100644
--- a/dask_jobqueue/tests/test_slurm.py
+++ b/dask_jobqueue/tests/test_slurm.py
@@ -192,4 +192,4 @@ def test_config_name_slurm_takes_custom_config():
 
     with dask.config.set({"jobqueue.slurm-config-name": conf}):
         with SLURMCluster(config_name="slurm-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"

From 152d8e4350eca2e84592a4d41a769c14153cc32d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 20 Aug 2019 19:38:39 -0700
Subject: [PATCH 032/109] remove SLURMCluster from another test

---
 dask_jobqueue/tests/test_jobqueue_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 222276ff..61a2272b 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -63,7 +63,7 @@ def test_shebang_settings(Cluster):
 
 
 @pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster]
+    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
 )
 def test_repr(Cluster):
     with Cluster(

From 87d33f45a8f7acb76a2832174bb965d3a870af61 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 21 Aug 2019 16:00:22 -0700
Subject: [PATCH 033/109] move around functions in slurm.py

---
 dask_jobqueue/slurm.py | 50 +++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index ea1faade..d3ed18b7 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -10,31 +10,6 @@
 logger = logging.getLogger(__name__)
 
 
-def slurm_format_bytes_ceil(n):
-    """ Format bytes as text.
-
-    SLURM expects KiB, MiB or Gib, but names it KB, MB, GB. SLURM does not handle Bytes, only starts at KB.
-
-    >>> slurm_format_bytes_ceil(1)
-    '1K'
-    >>> slurm_format_bytes_ceil(1234)
-    '2K'
-    >>> slurm_format_bytes_ceil(12345678)
-    '13M'
-    >>> slurm_format_bytes_ceil(1234567890)
-    '2G'
-    >>> slurm_format_bytes_ceil(15000000000)
-    '14G'
-    """
-    if n >= (1024 ** 3):
-        return "%dG" % math.ceil(n / (1024 ** 3))
-    if n >= (1024 ** 2):
-        return "%dM" % math.ceil(n / (1024 ** 2))
-    if n >= 1024:
-        return "%dK" % math.ceil(n / 1024)
-    return "1K" % n
-
-
 class SLURMJob(Job):
     __doc__ = docstrings.with_indents(
         """ Launch Dask on a SLURM cluster
@@ -136,3 +111,28 @@ def __init__(
 
 
 SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob)
+
+
+def slurm_format_bytes_ceil(n):
+    """ Format a number of bytes as a SLURM memory string, rounding up.
+
+    SLURM expects KiB, MiB or GiB but names them K, M, G; it has no unit below K, so n < 1024 gives '1K'.
+
+    >>> slurm_format_bytes_ceil(1)
+    '1K'
+    >>> slurm_format_bytes_ceil(1234)
+    '2K'
+    >>> slurm_format_bytes_ceil(12345678)
+    '12M'
+    >>> slurm_format_bytes_ceil(1234567890)
+    '2G'
+    >>> slurm_format_bytes_ceil(15000000000)
+    '14G'
+    """
+    if n >= (1024 ** 3):
+        return "%dG" % math.ceil(n / (1024 ** 3))
+    if n >= (1024 ** 2):
+        return "%dM" % math.ceil(n / (1024 ** 2))
+    if n >= 1024:
+        return "%dK" % math.ceil(n / 1024)
+    return "1K"

From c5ac57355b31ce4cae2055dc358887c722f4cd18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 22 Aug 2019 14:12:24 +0200
Subject: [PATCH 034/109] SGECluster now uses SpecCluster.

With some quick test fixes.
---
 dask_jobqueue/sge.py            | 108 ++------------------------------
 dask_jobqueue/tests/test_sge.py |  36 ++++++-----
 2 files changed, 25 insertions(+), 119 deletions(-)

diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 0abd49e9..5ee873e8 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -1,114 +1,13 @@
 import logging
+import functools
 
 import dask
 
-from .core import JobQueueCluster, docstrings
-from .job import Job
+from .job import Job, JobQueueCluster
 
 logger = logging.getLogger(__name__)
 
 
-class SGECluster(JobQueueCluster):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a SGE cluster
-
-    .. note::
-        If you want a specific amount of RAM, both ``memory`` and ``resource_spec``
-        must be specified. The exact syntax of ``resource_spec`` is defined by your
-        GridEngine system administrator. The amount of ``memory`` requested should
-        match the ``resource_spec``, so that Dask's memory management system can
-        perform accurately.
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#$ -q` option.
-    project : str
-        Accounting string associated with each worker job. Passed to `#$ -A` option.
-    resource_spec : str
-        Request resources and specify job placement. Passed to `#$ -l` option.
-    walltime : str
-        Walltime for each worker job.
-    job_extra : list
-        List of other SGE options, for example -w e. Each option will be
-        prepended with the #$ prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> from dask_jobqueue import SGECluster
-    >>> cluster = SGECluster(queue='regular')
-    >>> cluster.scale(10)  # this may take a few seconds to launch
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-
-    >>> cluster.adapt()
-    """,
-        4,
-    )
-
-    # Override class variables
-    submit_command = "qsub -terse"
-    cancel_command = "qdel"
-
-    def __init__(
-        self,
-        queue=None,
-        project=None,
-        resource_spec=None,
-        walltime=None,
-        job_extra=None,
-        config_name="sge",
-        **kwargs
-    ):
-        if queue is None:
-            queue = dask.config.get("jobqueue.%s.queue" % config_name)
-        if project is None:
-            project = dask.config.get("jobqueue.%s.project" % config_name)
-        if resource_spec is None:
-            resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
-        if walltime is None:
-            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
-        if job_extra is None:
-            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
-
-        super().__init__(config_name=config_name, **kwargs)
-
-        header_lines = []
-        if self.name is not None:
-            header_lines.append("#$ -N %(name)s")
-        if queue is not None:
-            header_lines.append("#$ -q %(queue)s")
-        if project is not None:
-            header_lines.append("#$ -P %(project)s")
-        if resource_spec is not None:
-            header_lines.append("#$ -l %(resource_spec)s")
-        if walltime is not None:
-            header_lines.append("#$ -l h_rt=%(walltime)s")
-        if self.log_directory is not None:
-            header_lines.append("#$ -e %(log_directory)s/")
-            header_lines.append("#$ -o %(log_directory)s/")
-        header_lines.extend(["#$ -cwd", "#$ -j y"])
-        header_lines.extend(["#$ %s" % arg for arg in job_extra])
-        header_template = "\n".join(header_lines)
-
-        config = {
-            "name": self.name,
-            "queue": queue,
-            "project": project,
-            "processes": self.worker_processes,
-            "walltime": walltime,
-            "resource_spec": resource_spec,
-            "log_directory": self.log_directory,
-        }
-        self.job_header = header_template % config
-
-        logger.debug("Job script: \n %s" % self.job_script())
-
-
 class SGEJob(Job):
     submit_command = "qsub"
     cancel_command = "qdel"
@@ -167,3 +66,6 @@ def __init__(
         self.job_header = header_template % config
 
         logger.debug("Job script: \n %s" % self.job_script())
+
+
+SGECluster = functools.partial(JobQueueCluster, Job=SGEJob)
diff --git a/dask_jobqueue/tests/test_sge.py b/dask_jobqueue/tests/test_sge.py
index 0d8dac75..980de436 100644
--- a/dask_jobqueue/tests/test_sge.py
+++ b/dask_jobqueue/tests/test_sge.py
@@ -19,13 +19,13 @@ def test_basic(loop):
             cluster.scale(2)
 
             start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
+            while not client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
-            assert cluster.running_jobs
+            assert len(client.scheduler_info()["workers"]) > 0
 
             workers = list(client.scheduler_info()["workers"].values())
             w = workers[0]
@@ -35,7 +35,7 @@ def test_basic(loop):
             cluster.scale(0)
 
             start = time()
-            while cluster.running_jobs:
+            while client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
@@ -65,7 +65,7 @@ def test_config_name_sge_takes_custom_config():
 
     with dask.config.set({"jobqueue.sge-config-name": conf}):
         with SGECluster(config_name="sge-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"
 
 
 def test_job_script(tmpdir):
@@ -107,19 +107,23 @@ def test_complex_cancel_command(loop):
     with SGECluster(
         walltime="00:02:00", cores=1, processes=1, memory="2GB", loop=loop
     ) as cluster:
-        username = "root"
-        cluster.cancel_command = "qdel -u {}".format(username)
+        with Client(cluster) as client:
+            username = "root"
+            cluster.cancel_command = "qdel -u {}".format(username)
 
-        cluster.scale(2)
+            cluster.scale(2)
 
-        start = time()
-        while not cluster.running_jobs:
-            sleep(0.100)
-            assert time() < start + QUEUE_WAIT
+            start = time()
+            while not client.scheduler_info()["workers"]:
+                sleep(0.100)
+                assert time() < start + QUEUE_WAIT
 
-        cluster.stop_all_jobs()
+            # TODO: Is there a replacement for .stop_all_jobs? stop_all_jobs
+            # does make sure that the pending jobs get qdeled.
+            # cluster.stop_all_jobs()
+            cluster.scale(0)
 
-        start = time()
-        while cluster.running_jobs:
-            sleep(0.100)
-            assert time() < start + QUEUE_WAIT
+            start = time()
+            while client.scheduler_info()["workers"]:
+                sleep(0.100)
+                assert time() < start + QUEUE_WAIT

From f3847e080211d1fbbcd7c2403ca8e168de8e2acb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 22 Aug 2019 17:02:35 +0200
Subject: [PATCH 035/109] Two more fixes now that ._job_id_from_submit_output
 has moved to the job level.

---
 dask_jobqueue/tests/test_jobqueue_core.py | 56 +++++++++++++++++++----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 61a2272b..159583c5 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -16,6 +16,8 @@
     OARCluster,
 )
 
+from dask_jobqueue.sge import SGEJob
+
 
 def test_errors():
     with pytest.raises(NotImplementedError) as info:
@@ -62,12 +64,15 @@ def test_shebang_settings(Cluster):
         assert job_script.startswith(default_shebang)
 
 
-@pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
-)
+@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster])
 def test_repr(Cluster):
     with Cluster(
-        walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker"
+        # TODO name -> job_name could be a problem ...
+        walltime="00:02:00",
+        processes=4,
+        cores=8,
+        memory="28GB",
+        job_name="dask-worker",
     ) as cluster:
         cluster_repr = repr(cluster)
         assert cluster.__class__.__name__ in cluster_repr
@@ -103,9 +108,7 @@ def test_forward_ip():
         assert cluster.local_cluster.scheduler.ip == default_ip
 
 
-@pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
-)
+@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster])
 @pytest.mark.parametrize(
     "qsub_return_string",
     [
@@ -117,17 +120,34 @@ def test_forward_ip():
         "{job_id}",
     ],
 )
-def test_job_id_from_qsub(Cluster, qsub_return_string):
+def test_job_id_from_qsub_legacy(Cluster, qsub_return_string):
     original_job_id = "654321"
     qsub_return_string = qsub_return_string.format(job_id=original_job_id)
     with Cluster(cores=1, memory="1GB") as cluster:
         assert original_job_id == cluster._job_id_from_submit_output(qsub_return_string)
 
 
+@pytest.mark.parametrize("Job", [SGEJob])
 @pytest.mark.parametrize(
-    "Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster]
+    "qsub_return_string",
+    [
+        "{job_id}.admin01",
+        "Request {job_id}.asdf was sumbitted to queue: standard.",
+        "sbatch: Submitted batch job {job_id}",
+        "{job_id};cluster",
+        "Job <{job_id}> is submitted to default queue <normal>.",
+        "{job_id}",
+    ],
 )
-def test_job_id_error_handling(Cluster):
+def test_job_id_from_qsub(Job, qsub_return_string):
+    original_job_id = "654321"
+    qsub_return_string = qsub_return_string.format(job_id=original_job_id)
+    job = Job(cores=1, memory="1GB")
+    assert original_job_id == job._job_id_from_submit_output(qsub_return_string)
+
+
+@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster])
+def test_job_id_error_handling_legacy(Cluster):
     # non-matching regexp
     with Cluster(cores=1, memory="1GB") as cluster:
         with pytest.raises(ValueError, match="Could not parse job id"):
@@ -142,6 +162,22 @@ def test_job_id_error_handling(Cluster):
             cluster._job_id_from_submit_output(return_string)
 
 
+@pytest.mark.parametrize("Job", [SGEJob])
+def test_job_id_error_handling(Job):
+    # submit output that the job id regexp does not match
+    unparsable_job = Job(cores=1, memory="1GB")
+    unparsable_output = "there is no number here"
+    with pytest.raises(ValueError, match="Could not parse job id"):
+        unparsable_job._job_id_from_submit_output(unparsable_output)
+
+    # custom regexp that has no 'job_id' named group
+    groupless_job = Job(cores=1, memory="1GB")
+    groupless_job.job_id_regexp = r"(\d+)"
+    groupless_output = "Job <12345> submitted to <normal>."
+    with pytest.raises(ValueError, match="You need to use a 'job_id' named group"):
+        groupless_job._job_id_from_submit_output(groupless_output)
+
+
 def test_log_directory(tmpdir):
     shutil.rmtree(tmpdir.strpath, ignore_errors=True)
     with PBSCluster(cores=1, memory="1GB"):

From 1ff904d97d3a65b4b1c467c3c6d7de215fedf581 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 22 Aug 2019 15:31:44 -0700
Subject: [PATCH 036/109] silence logs by default

---
 dask_jobqueue/job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 7f6daf47..a20f57c5 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -339,7 +339,7 @@ def __init__(
         # Cluster keywords
         loop=None,
         security=None,
-        silence_logs=False,
+        silence_logs="error",
         name=None,
         asynchronous=False,
         # Scheduler keywords

From afe8e9e4c5d661cb7b0e34cf5b215e7131475404 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 22 Aug 2019 15:32:08 -0700
Subject: [PATCH 037/109] bump requirements

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ba31c0ce..0834a6ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-dask>=2
-distributed>=2.1
+dask>=2.3
+distributed>=2.3
 docrep

From 95f00ac84c4c919019dc86cc7224b569a1b47013 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 12:46:22 -0700
Subject: [PATCH 038/109] add config_name to JobQueueCluster

---
 dask_jobqueue/job.py   | 7 +++++++
 dask_jobqueue/sge.py   | 2 +-
 dask_jobqueue/slurm.py | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index a20f57c5..ebdfc7d8 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -346,6 +346,7 @@ def __init__(
         interface=None,
         protocol="tcp://",
         dashboard_address=":8787",
+        config_name=None,
         # Job keywords
         **kwargs
     ):
@@ -355,6 +356,10 @@ def __init__(
                 "or SGEJob with the Job= argument."
             )
 
+        if config_name:
+            if interface is None:
+                interface = dask.config.get("jobqueue.%s.interface" % config_name)
+
         scheduler = {
             "cls": Scheduler,  # Use local scheduler for now
             "options": {
@@ -364,6 +369,8 @@ def __init__(
                 "security": security,
             },
         }
+        if config_name:
+            kwargs["config_name"] = config_name
         kwargs["interface"] = interface
         kwargs["protocol"] = protocol
         kwargs["security"] = security
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 5ee873e8..79ced212 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -68,4 +68,4 @@ def __init__(
         logger.debug("Job script: \n %s" % self.job_script())
 
 
-SGECluster = functools.partial(JobQueueCluster, Job=SGEJob)
+SGECluster = functools.partial(JobQueueCluster, Job=SGEJob, config_name="sge")
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index d3ed18b7..b5082c5b 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -110,7 +110,7 @@ def __init__(
         self.job_header = "\n".join(header_lines)
 
 
-SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob)
+SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob, config_name="slurm")
 
 
 def slurm_format_bytes_ceil(n):

From 191186100fc9a5b62eea4bad2b07f9632fc6e03c Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 13:22:53 -0700
Subject: [PATCH 039/109] Move repr functionality upstream to Cluster.__repr__

https://github.com/dask/distributed/pull/2995
---
 dask_jobqueue/tests/test_jobqueue_core.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 159583c5..8f301295 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -64,23 +64,6 @@ def test_shebang_settings(Cluster):
         assert job_script.startswith(default_shebang)
 
 
-@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, SGECluster, LSFCluster])
-def test_repr(Cluster):
-    with Cluster(
-        # TODO name -> job_name could be a problem ...
-        walltime="00:02:00",
-        processes=4,
-        cores=8,
-        memory="28GB",
-        job_name="dask-worker",
-    ) as cluster:
-        cluster_repr = repr(cluster)
-        assert cluster.__class__.__name__ in cluster_repr
-        assert "cores=0" in cluster_repr
-        assert "memory=0 B" in cluster_repr
-        assert "workers=0" in cluster_repr
-
-
 @pytest.mark.parametrize(
     "Cluster", [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster]
 )

From 716dd40cf0b05ca0e559c75aebc8fb6d0d2fa328 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 13:39:41 -0700
Subject: [PATCH 040/109] move pbs and moab

---
 dask_jobqueue/job.py                      |   2 +
 dask_jobqueue/moab.py                     |  46 ++-------
 dask_jobqueue/pbs.py                      | 115 +---------------------
 dask_jobqueue/tests/test_jobqueue_core.py |  23 ++---
 dask_jobqueue/tests/test_pbs.py           |   8 +-
 5 files changed, 30 insertions(+), 164 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index ebdfc7d8..a7db2e12 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -344,6 +344,7 @@ def __init__(
         asynchronous=False,
         # Scheduler keywords
         interface=None,
+        host=None,
         protocol="tcp://",
         dashboard_address=":8787",
         config_name=None,
@@ -365,6 +366,7 @@ def __init__(
             "options": {
                 "protocol": protocol,
                 "interface": interface,
+                "host": host,
                 "dashboard_address": dashboard_address,
                 "security": security,
             },
diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py
index 0114ac91..3c6f5bc5 100644
--- a/dask_jobqueue/moab.py
+++ b/dask_jobqueue/moab.py
@@ -1,45 +1,13 @@
-from .core import docstrings
-from .pbs import PBSCluster
+import functools
 
+from .job import JobQueueCluster
+from .pbs import PBSJob
 
-class MoabCluster(PBSCluster):
-    __doc__ = docstrings.with_indents(
-        """Launch Dask on a Moab cluster
 
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#PBS -q` option.
-    project : str
-        Accounting string associated with each worker job. Passed to
-        `#PBS -A` option.
-    resource_spec : str
-        Request resources and specify job placement. Passed to `#PBS -l` option.
-    walltime : str
-        Walltime for each worker job.
-    job_extra : list
-        List of other PBS options, for example -j oe. Each option will be prepended with the #PBS prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> import os
-    >>> from dask_jobqueue import MoabCluster
-    >>> cluster = MoabCluster(processes=6, cores=6, project='gfdl_m',
-                              memory='96G', resource_spec='96G',
-                              job_extra=['-d /home/First.Last', '-M none'],
-                              local_directory=os.getenv('TMPDIR', '/tmp'))
-    >>> cluster.scale(60)  # submit enough jobs to deploy 10 workers
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-
-    >>> cluster.adapt()
-    """,
-        4,
-    )
+class MoabJob(PBSJob):
     submit_command = "msub"
     cancel_command = "canceljob"
     scheduler_name = "moab"
+
+
+MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name='pbs')
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 1314d618..383e70a3 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -1,123 +1,15 @@
+import functools
 import logging
 import math
 import os
 
 import dask
 
-from .core import JobQueueCluster, docstrings
-from .job import Job
+from .job import Job, JobQueueCluster
 
 logger = logging.getLogger(__name__)
 
 
-class PBSCluster(JobQueueCluster):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a PBS cluster
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#PBS -q` option.
-    project : str
-        Accounting string associated with each worker job. Passed to
-        `#PBS -A` option.
-    resource_spec : str
-        Request resources and specify job placement. Passed to `#PBS -l`
-        option.
-    walltime : str
-        Walltime for each worker job.
-    job_extra : list
-        List of other PBS options, for example -j oe. Each option will be prepended with the #PBS prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> from dask_jobqueue import PBSCluster
-    >>> cluster = PBSCluster(queue='regular', project='DaskOnPBS', cores=12)
-    >>> cluster.scale(10)  # this may take a few seconds to launch
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-
-    >>> cluster.adapt()
-
-    It is a good practice to define local_directory to your PBS system scratch directory:
-
-    >>> cluster = PBSCluster(queue='regular', project='DaskOnPBS',
-    ...                      local_directory='$TMPDIR',
-    ...                      cores=24, processes=6, memory='100GB')
-    """,
-        4,
-    )
-
-    # Override class variables
-    submit_command = "qsub"
-    cancel_command = "qdel"
-
-    def __init__(
-        self,
-        queue=None,
-        project=None,
-        resource_spec=None,
-        walltime=None,
-        job_extra=None,
-        config_name="pbs",
-        **kwargs
-    ):
-        if queue is None:
-            queue = dask.config.get("jobqueue.%s.queue" % config_name)
-        if resource_spec is None:
-            resource_spec = dask.config.get("jobqueue.%s.resource-spec" % config_name)
-        if walltime is None:
-            walltime = dask.config.get("jobqueue.%s.walltime" % config_name)
-        if job_extra is None:
-            job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
-        if project is None:
-            project = dask.config.get(
-                "jobqueue.%s.project" % config_name
-            ) or os.environ.get("PBS_ACCOUNT")
-
-        # Instantiate args and parameters from parent abstract class
-        super().__init__(config_name=config_name, **kwargs)
-
-        # Try to find a project name from environment variable
-        project = project or os.environ.get("PBS_ACCOUNT")
-
-        header_lines = []
-        # PBS header build
-        if self.name is not None:
-            header_lines.append("#PBS -N %s" % self.name)
-        if queue is not None:
-            header_lines.append("#PBS -q %s" % queue)
-        if project is not None:
-            header_lines.append("#PBS -A %s" % project)
-        if resource_spec is None:
-            # Compute default resources specifications
-            resource_spec = "select=1:ncpus=%d" % self.worker_cores
-            memory_string = pbs_format_bytes_ceil(self.worker_memory)
-            resource_spec += ":mem=" + memory_string
-            logger.info(
-                "Resource specification for PBS not set, initializing it to %s"
-                % resource_spec
-            )
-        if resource_spec is not None:
-            header_lines.append("#PBS -l %s" % resource_spec)
-        if walltime is not None:
-            header_lines.append("#PBS -l walltime=%s" % walltime)
-        if self.log_directory is not None:
-            header_lines.append("#PBS -e %s/" % self.log_directory)
-            header_lines.append("#PBS -o %s/" % self.log_directory)
-        header_lines.extend(["#PBS %s" % arg for arg in job_extra])
-        header_lines.append("JOB_ID=${PBS_JOBID%%.*}")
-
-        # Declare class attribute that shall be overridden
-        self.job_header = "\n".join(header_lines)
-
-        logger.debug("Job script: \n %s" % self.job_script())
-
-
 def pbs_format_bytes_ceil(n):
     """ Format bytes as text.
 
@@ -209,3 +101,6 @@ def __init__(
         self.job_header = "\n".join(header_lines)
 
         logger.debug("Job script: \n %s" % self.job_script())
+
+
+PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name='pbs')
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 8f301295..2020bcb5 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -30,11 +30,11 @@ def test_command_template():
     with PBSCluster(cores=2, memory="4GB") as cluster:
         assert (
             "%s -m distributed.cli.dask_worker" % (sys.executable)
-            in cluster._command_template
+            in cluster.example_job._command_template
         )
-        assert " --nthreads 2" in cluster._command_template
-        assert " --memory-limit " in cluster._command_template
-        assert " --name " in cluster._command_template
+        assert " --nthreads 2" in cluster.example_job._command_template
+        assert " --memory-limit " in cluster.example_job._command_template
+        assert " --name " in cluster.example_job._command_template
 
     with PBSCluster(
         cores=2,
@@ -43,9 +43,9 @@ def test_command_template():
         local_directory="/scratch",
         extra=["--preload", "mymodule"],
     ) as cluster:
-        assert " --death-timeout 60" in cluster._command_template
-        assert " --local-directory /scratch" in cluster._command_template
-        assert " --preload mymodule" in cluster._command_template
+        assert " --death-timeout 60" in cluster.example_job._command_template
+        assert " --local-directory /scratch" in cluster.example_job._command_template
+        assert " --preload mymodule" in cluster.example_job._command_template
 
 
 @pytest.mark.parametrize(
@@ -82,16 +82,16 @@ def test_forward_ip():
         name="dask-worker",
         host=ip,
     ) as cluster:
-        assert cluster.local_cluster.scheduler.ip == ip
+        assert cluster.scheduler.ip == ip
 
     default_ip = socket.gethostbyname("")
     with PBSCluster(
         walltime="00:02:00", processes=4, cores=8, memory="28GB", name="dask-worker"
     ) as cluster:
-        assert cluster.local_cluster.scheduler.ip == default_ip
+        assert cluster.scheduler.ip == default_ip
 
 
-@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster])
+@pytest.mark.parametrize("Cluster", [LSFCluster])
 @pytest.mark.parametrize(
     "qsub_return_string",
     [
@@ -129,7 +129,7 @@ def test_job_id_from_qsub(Job, qsub_return_string):
     assert original_job_id == job._job_id_from_submit_output(qsub_return_string)
 
 
-@pytest.mark.parametrize("Cluster", [PBSCluster, MoabCluster, LSFCluster])
+@pytest.mark.parametrize("Cluster", [LSFCluster])
 def test_job_id_error_handling_legacy(Cluster):
     # non-matching regexp
     with Cluster(cores=1, memory="1GB") as cluster:
@@ -170,6 +170,7 @@ def test_log_directory(tmpdir):
         assert os.path.exists(tmpdir.strpath)
 
 
+@pytest.mark.skip
 def test_jobqueue_cluster_call(tmpdir):
     cluster = PBSCluster(cores=1, memory="1GB")
 
diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index f7abb53f..1e2f4de8 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -22,7 +22,6 @@ def test_header(Cluster):
         assert "#PBS -l walltime=00:02:00" in cluster.job_header
         assert "#PBS -q" not in cluster.job_header
         assert "#PBS -A" not in cluster.job_header
-        assert "--name dask-worker--${JOB_ID}--" in cluster.job_script()
 
     with Cluster(
         queue="regular",
@@ -388,7 +387,7 @@ def test_config_name_pbs_takes_custom_config():
 
     with dask.config.set({"jobqueue.pbs-config-name": conf}):
         with PBSCluster(config_name="pbs-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"
 
 
 def test_informative_errors():
@@ -401,6 +400,7 @@ def test_informative_errors():
     assert "cores" in str(info.value)
 
 
-def test_adapt(loop):
-    with PBSCluster(loop, cores=1, memory="1 GB") as cluster:
+@pytest.mark.asyncio
+async def test_adapt(loop):
+    async with PBSCluster(cores=1, memory="1 GB", asynchronous=True) as cluster:
         cluster.adapt()

From 1f206815c19d3c0001debeb1172cab99ef692d7b Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 16:03:18 -0700
Subject: [PATCH 041/109] move over LSF

---
 dask_jobqueue/lsf.py                      | 20 ++++++++++++--------
 dask_jobqueue/tests/test_jobqueue_core.py |  4 ++--
 dask_jobqueue/tests/test_lsf.py           |  3 +--
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index fd8d20af..95042a8d 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -1,15 +1,17 @@
+import functools
 import logging
 import math
 import os
 
 import dask
 
-from .core import JobQueueCluster, docstrings
+from .core import docstrings
+from .job import Job, JobQueueCluster
 
 logger = logging.getLogger(__name__)
 
 
-class LSFCluster(JobQueueCluster):
+class LSFJob(Job):
     __doc__ = docstrings.with_indents(
         """ Launch Dask on a LSF cluster
 
@@ -32,12 +34,12 @@ class LSFCluster(JobQueueCluster):
     lsf_units : str
         Unit system for large units in resource usage set by the
         LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster.
-    %(JobQueueCluster.parameters)s
+    %(Job.parameters)s
 
     Examples
     --------
     >>> from dask_jobqueue import LSFCluster
-    >>> cluster = LSFcluster(queue='general', project='DaskonLSF',
+    >>> cluster = LSFCluster(queue='general', project='DaskonLSF',
     ...                      cores=15, memory='25GB')
     >>> cluster.scale(10)  # this may take a few seconds to launch
 
@@ -51,13 +53,12 @@ class LSFCluster(JobQueueCluster):
     """,
         4,
     )
-
-    # Override class variables
     submit_command = "bsub <"
     cancel_command = "bkill"
 
     def __init__(
         self,
+        *args,
         queue=None,
         project=None,
         ncpus=None,
@@ -84,12 +85,12 @@ def __init__(
             lsf_units = dask.config.get("jobqueue.%s.lsf-units" % config_name)
 
         # Instantiate args and parameters from parent abstract class
-        super().__init__(config_name=config_name, **kwargs)
+        super().__init__(*args, config_name=config_name, **kwargs)
 
         header_lines = []
         # LSF header build
         if self.name is not None:
-            header_lines.append("#BSUB -J %s" % self.name)
+            header_lines.append("#BSUB -J %s" % self.job_name)
         if self.log_directory is not None:
             header_lines.append(
                 "#BSUB -e %s/%s-%%J.err" % (self.log_directory, self.name or "worker")
@@ -196,3 +197,6 @@ def lsf_detect_units():
             "default unit of %s." % unit
         )
     return unit
+
+
+LSFCluster = functools.partial(JobQueueCluster, Job=LSFJob, config_name="lsf")
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 2020bcb5..929140ec 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -91,7 +91,7 @@ def test_forward_ip():
         assert cluster.scheduler.ip == default_ip
 
 
-@pytest.mark.parametrize("Cluster", [LSFCluster])
+@pytest.mark.parametrize("Cluster", [])
 @pytest.mark.parametrize(
     "qsub_return_string",
     [
@@ -129,7 +129,7 @@ def test_job_id_from_qsub(Job, qsub_return_string):
     assert original_job_id == job._job_id_from_submit_output(qsub_return_string)
 
 
-@pytest.mark.parametrize("Cluster", [LSFCluster])
+@pytest.mark.parametrize("Cluster", [])
 def test_job_id_error_handling_legacy(Cluster):
     # non-matching regexp
     with Cluster(cores=1, memory="1GB") as cluster:
diff --git a/dask_jobqueue/tests/test_lsf.py b/dask_jobqueue/tests/test_lsf.py
index bd3ca5e7..7915a1bb 100644
--- a/dask_jobqueue/tests/test_lsf.py
+++ b/dask_jobqueue/tests/test_lsf.py
@@ -25,7 +25,6 @@ def test_header():
         assert "#BSUB -W 00:02" in cluster.job_header
         assert "#BSUB -q" not in cluster.job_header
         assert "#BSUB -P" not in cluster.job_header
-        assert "--name dask-worker--${JOB_ID}--" in cluster.job_script()
 
     with LSFCluster(
         queue="general",
@@ -249,7 +248,7 @@ def test_config_name_lsf_takes_custom_config():
 
     with dask.config.set({"jobqueue.lsf-config-name": conf}):
         with LSFCluster(config_name="lsf-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"
 
 
 def test_informative_errors():

From 172ecd43917f2f35a6d51d1fcb825267521425f1 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 16:11:01 -0700
Subject: [PATCH 042/109] Add header_skip option to Job constructor

---
 dask_jobqueue/__init__.py       |  4 ++--
 dask_jobqueue/job.py            | 11 ++++++++++-
 dask_jobqueue/tests/test_job.py | 11 ++++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index ed6d4de8..d88ec417 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -2,11 +2,11 @@
 from . import config
 from .core import JobQueueCluster
 from .job import Job
-from .moab import MoabCluster
+from .moab import MoabCluster, MoabJob
 from .pbs import PBSCluster, PBSJob
 from .slurm import SLURMCluster, SLURMJob
 from .sge import SGECluster, SGEJob
-from .lsf import LSFCluster
+from .lsf import LSFCluster, LSFJob
 from .oar import OARCluster
 from .htcondor import HTCondorCluster
 
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index a7db2e12..1e70bfaa 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -43,6 +43,9 @@ class Job(ProcessInterface):
         Additional arguments to pass to `dask-worker`
     env_extra : list
         Other commands to add to script before launching worker.
+    header_skip : list
+        Lines to skip in the header.
+        Header lines matching this text will be removed
     log_directory : str
         Directory to use for job scheduler logs.
     shebang : str
@@ -100,6 +103,7 @@ def __init__(
         local_directory=None,
         extra=None,
         env_extra=None,
+        header_skip=None,
         log_directory=None,
         shebang=None,
         python=sys.executable,
@@ -142,6 +146,8 @@ def __init__(
             extra = dask.config.get("jobqueue.%s.extra" % config_name)
         if env_extra is None:
             env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
+        if header_skip is None:
+            header_skip = dask.config.get("jobqueue.%s.header-skip" % config_name, ())
         if log_directory is None:
             log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
         if shebang is None:
@@ -176,6 +182,7 @@ def __init__(
         self.shebang = shebang
 
         self._env_header = "\n".join(env_extra)
+        self.header_skip = set(header_skip)
 
         # dask-worker command line build
         dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
@@ -205,9 +212,11 @@ def __init__(
 
     def job_script(self):
         """ Construct a job submission script """
+        header = "\n".join([line for line in self.job_header.split("\n") if not any(skip
+            in line for skip in self.header_skip)])
         pieces = {
             "shebang": self.shebang,
-            "job_header": self.job_header,
+            "job_header": header,
             "env_header": self._env_header,
             "worker_command": self._command_template,
         }
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 392fdb09..c7669b80 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,7 +1,7 @@
 import asyncio
 from time import time
 
-from dask_jobqueue import PBSJob, SGEJob, SLURMJob
+from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -17,6 +17,7 @@ def test_basic():
     pytest.param(SGEJob, marks=[pytest.mark.env("sge")]),
     pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]),
     pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]),
+    pytest.param(LSFJob, marks=[pytest.mark.env("lsf")]),
 ]
 
 
@@ -89,3 +90,11 @@ async def test_adapt(Job):
                 assert time() < start + 10
             assert not cluster.worker_spec
             assert not cluster.workers
+
+
+def test_header_lines_skip():
+    job = PBSJob(cores=1, memory="1GB", job_name="foobar")
+    assert "foobar" in job.job_script()
+
+    job = PBSJob(cores=1, memory="1GB", job_name="foobar", header_skip=["-N"])
+    assert "foobar" not in job.job_script()

From 9bf286a4a71e4515e9bc756a2b1be4047930a573 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 16:54:25 -0700
Subject: [PATCH 043/109] simplify bsub management in lsf

---
 dask_jobqueue/lsf.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 95042a8d..12126ee2 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -53,7 +53,7 @@ class LSFJob(Job):
     """,
         4,
     )
-    submit_command = "bsub <"
+    submit_command = "bsub"
     cancel_command = "bkill"
 
     def __init__(
@@ -134,10 +134,6 @@ def __init__(
 
         logger.debug("Job script: \n %s" % self.job_script())
 
-    def _submit_job(self, script_filename):
-        piped_cmd = [self.submit_command + " " + script_filename + " 2> /dev/null"]
-        return self._call(piped_cmd, shell=True)
-
 
 def lsf_format_bytes_ceil(n, lsf_units="mb"):
     """ Format bytes as text

From f91bec220b986c4a734eff61c18d535f31ee72ae Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 18:00:57 -0700
Subject: [PATCH 044/109] add nanny keyword

---
 dask_jobqueue/job.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 1e70bfaa..a448b7fd 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -33,6 +33,8 @@ class Job(ProcessInterface):
         Total amount of memory per job
     processes : int
         Number of processes per job
+    nanny : bool
+        Whether or not to start a nanny process
     interface : str
         Network interface like 'eth0' or 'ib0'.
     death_timeout : float
@@ -54,8 +56,6 @@ class Job(ProcessInterface):
         Python executable used to launch Dask workers.
     config_name : str
         Section to use from jobqueue.yaml configuration file.
-    kwargs : dict
-        Additional keyword arguments to pass to `LocalCluster`
 
     Attributes
     ----------
@@ -98,6 +98,7 @@ def __init__(
         cores=None,
         memory=None,
         processes=None,
+        nanny=True,
         interface=None,
         death_timeout=None,
         local_directory=None,
@@ -195,6 +196,7 @@ def __init__(
 
         command_args += ["--memory-limit", self.worker_process_memory]
         command_args += ["--name", str(name)]
+        command_args += ["--nanny" if nanny else "--no-nanny"]
 
         if death_timeout is not None:
             command_args += ["--death-timeout", death_timeout]

From 6db1366bc36d39d0527ad5fd7fc04fbaab422cde Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 25 Aug 2019 18:01:04 -0700
Subject: [PATCH 045/109] remove docstring wrapping in LSF for now

---
 dask_jobqueue/lsf.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 12126ee2..5d030102 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -12,8 +12,9 @@
 
 
 class LSFJob(Job):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a LSF cluster
+    """ Launch Dask on a LSF cluster
+
+    See also the docstring for Job for more parameters
 
     Parameters
     ----------
@@ -34,7 +35,6 @@ class LSFJob(Job):
     lsf_units : str
         Unit system for large units in resource usage set by the
         LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster.
-    %(Job.parameters)s
 
     Examples
     --------
@@ -50,9 +50,7 @@ class LSFJob(Job):
     kill workers based on load.
 
     >>> cluster.adapt()
-    """,
-        4,
-    )
+    """
     submit_command = "bsub"
     cancel_command = "bkill"
 

From f523cb58a1a1db6c9bbc4e59798f380a6162e889 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 07:21:06 -0700
Subject: [PATCH 046/109] black + flake8

---
 dask_jobqueue/job.py  | 10 +++++++---
 dask_jobqueue/lsf.py  |  2 +-
 dask_jobqueue/moab.py |  2 +-
 dask_jobqueue/pbs.py  |  2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index a448b7fd..3fe2a431 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -214,8 +214,13 @@ def __init__(
 
     def job_script(self):
         """ Construct a job submission script """
-        header = "\n".join([line for line in self.job_header.split("\n") if not any(skip
-            in line for skip in self.header_skip)])
+        header = "\n".join(
+            [
+                line
+                for line in self.job_header.split("\n")
+                if not any(skip in line for skip in self.header_skip)
+            ]
+        )
         pieces = {
             "shebang": self.shebang,
             "job_header": header,
@@ -342,7 +347,6 @@ def _call(cmd, **kwargs):
 
 
 class JobQueueCluster(SpecCluster):
-
     def __init__(
         self,
         n_workers=0,
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 5d030102..31170f0c 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -5,7 +5,6 @@
 
 import dask
 
-from .core import docstrings
 from .job import Job, JobQueueCluster
 
 logger = logging.getLogger(__name__)
@@ -51,6 +50,7 @@ class LSFJob(Job):
 
     >>> cluster.adapt()
     """
+
     submit_command = "bsub"
     cancel_command = "bkill"
 
diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py
index 3c6f5bc5..a6fe664e 100644
--- a/dask_jobqueue/moab.py
+++ b/dask_jobqueue/moab.py
@@ -10,4 +10,4 @@ class MoabJob(PBSJob):
     scheduler_name = "moab"
 
 
-MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name='pbs')
+MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name="pbs")
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 383e70a3..e387cbfd 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -103,4 +103,4 @@ def __init__(
         logger.debug("Job script: \n %s" % self.job_script())
 
 
-PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name='pbs')
+PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name="pbs")

From 00352ac7763c0c6111488e82fe93ff343097d66d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 08:16:15 -0700
Subject: [PATCH 047/109] cleanup pbs testing

---
 dask_jobqueue/tests/test_pbs.py | 44 +++++++++------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 1e2f4de8..f1d3400d 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -112,11 +112,7 @@ def test_basic(loop):
         with Client(cluster) as client:
 
             cluster.scale(2)
-
-            start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            client.wait_for_workers(2)
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
@@ -130,11 +126,11 @@ def test_basic(loop):
             cluster.scale(0)
 
             start = time()
-            while cluster.running_jobs:
+            while client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert not cluster.running_jobs
+            assert not cluster.workers and not cluster.worker_spec
 
 
 @pytest.mark.env("pbs")
@@ -151,15 +147,11 @@ def test_scale_cores_memory(loop):
         with Client(cluster) as client:
 
             cluster.scale(cores=2)
-
-            start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            client.wait_for_workers(2)
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
-            assert cluster.running_jobs
+            assert cluster.workers
 
             workers = list(client.scheduler_info()["workers"].values())
             w = workers[0]
@@ -169,11 +161,11 @@ def test_scale_cores_memory(loop):
             cluster.scale(memory="0GB")
 
             start = time()
-            while cluster.running_jobs:
+            while client.scheduler_info()["workers"]:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert not cluster.running_jobs
+            assert not cluster.workers
 
 
 @pytest.mark.env("pbs")
@@ -194,11 +186,11 @@ def test_basic_scale_edge_cases(loop):
         # Wait to see what happens
         sleep(0.2)
         start = time()
-        while cluster.pending_jobs or cluster.running_jobs:
+        while cluster.workers or client.scheduler_info()["workers"]:
             sleep(0.1)
             assert time() < start + QUEUE_WAIT
 
-        assert not (cluster.pending_jobs or cluster.running_jobs)
+        assert not cluster.workers
 
 
 @pytest.mark.env("pbs")
@@ -226,12 +218,10 @@ def test_adaptive(loop):
             del future
 
             start = time()
-            while cluster.pending_jobs or cluster.running_jobs:
+            while client.scheduler_info()["workers"] or cluster.workers:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert cluster.finished_jobs
-
 
 @pytest.mark.env("pbs")
 def test_adaptive_grouped(loop):
@@ -246,19 +236,11 @@ def test_adaptive_grouped(loop):
     ) as cluster:
         cluster.adapt(minimum=1)  # at least 1 worker
         with Client(cluster) as client:
-            start = time()
-            while not (cluster.pending_jobs or cluster.running_jobs):
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            client.wait_for_workers(1)
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
 
-            start = time()
-            while not cluster.running_jobs:
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
-
             start = time()
             processes = cluster.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
@@ -291,12 +273,10 @@ def test_adaptive_cores_mem(loop):
             del future
 
             start = time()
-            while cluster.pending_jobs or cluster.running_jobs:
+            while cluster.workers:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert cluster.finished_jobs
-
 
 @pytest.mark.env("pbs")
 def test_scale_grouped(loop):

From 1b0e212e7d4fe0fedf60ba37ac78d2df16f7942d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 13:20:22 -0700
Subject: [PATCH 048/109] typo

---
 dask_jobqueue/tests/test_pbs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index f1d3400d..4eaeae8b 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -186,7 +186,7 @@ def test_basic_scale_edge_cases(loop):
         # Wait to see what happens
         sleep(0.2)
         start = time()
-        while cluster.workers or client.scheduler_info()["workers"]:
+        while cluster.workers:
             sleep(0.1)
             assert time() < start + QUEUE_WAIT
 

From 5eda238bb9680400eabb53ea5bad11b7b46ec23a Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 13:38:23 -0700
Subject: [PATCH 049/109] move Cluster.example_job down to a property

---
 dask_jobqueue/job.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 3fe2a431..6e82b710 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -391,8 +391,9 @@ def __init__(
         kwargs["interface"] = interface
         kwargs["protocol"] = protocol
         kwargs["security"] = security
+        self._kwargs = kwargs
+        self._Job = Job
         worker = {"cls": Job, "options": kwargs}
-        self.example_job = Job("tcp://scheduler:8786", name="name", **kwargs)
 
         super().__init__(
             scheduler=scheduler,
@@ -406,6 +407,10 @@ def __init__(
         if n_workers:
             self.scale(n_workers)
 
+    @property
+    def example_job(self):
+        return self._Job(self.scheduler.address or "tcp://scheduler:8786", name="name", **self._kwargs)
+
     @property
     def job_header(self):
         return self.example_job.job_header

From e10ce606d9ac78344692f2aa73510ab155f653e4 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 13:45:51 -0700
Subject: [PATCH 050/109] verify that we can create a Job on instantiation

---
 dask_jobqueue/job.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 6e82b710..58411e61 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -394,6 +394,7 @@ def __init__(
         self._kwargs = kwargs
         self._Job = Job
         worker = {"cls": Job, "options": kwargs}
+        self.example_job
 
         super().__init__(
             scheduler=scheduler,
@@ -409,7 +410,11 @@ def __init__(
 
     @property
     def example_job(self):
-        return self._Job(self.scheduler.address or "tcp://scheduler:8786", name="name", **self._kwargs)
+        try:
+            address = self.scheduler.address
+        except AttributeError:
+            address = "tcp://scheduler:8786"
+        return self._Job(address or "tcp://scheduler:8786", name="name", **self._kwargs)
 
     @property
     def job_header(self):

From 5b8cb14be50173eaf7b80f072c81a49658e2a304 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 15:11:50 -0700
Subject: [PATCH 051/109] cleanup pbs tests

---
 dask_jobqueue/tests/test_pbs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 4eaeae8b..f6e835a8 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -147,7 +147,7 @@ def test_scale_cores_memory(loop):
         with Client(cluster) as client:
 
             cluster.scale(cores=2)
-            client.wait_for_workers(2)
+            client.wait_for_workers(1)
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11

From 50340d5c57fe668c60a46f777adbdd223e2a4eff Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 26 Aug 2019 15:13:45 -0700
Subject: [PATCH 052/109] make hanging slurm test verbose

---
 ci/slurm.sh                     | 2 +-
 dask_jobqueue/tests/test_job.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/ci/slurm.sh b/ci/slurm.sh
index 8f12aadf..a0cb1bc0 100644
--- a/ci/slurm.sh
+++ b/ci/slurm.sh
@@ -18,7 +18,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm"
+    docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm -s"
 }
 
 function jobqueue_after_script {
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index c7669b80..9a6cf606 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -25,18 +25,26 @@ def test_basic():
 @pytest.mark.asyncio
 async def test_job(Job):
     async with Scheduler(port=0) as s:
+        print(1)
         job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB")
+        print(2)
         job = await job
+        print(3)
         async with Client(s.address, asynchronous=True) as client:
+            print(4)
             await client.wait_for_workers(1)
+            print(5)
             assert list(s.workers.values())[0].name == "foo"
 
+        print(6)
         await job.close()
+        print(7)
 
         start = time()
         while len(s.workers):
             await asyncio.sleep(0.1)
             assert time() < start + 5
+        print(8)
 
 
 @pytest.mark.parametrize("Job", job_params)

From 75392348e48bbd4291c6d35cafcf8a6553b8a085 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 27 Aug 2019 14:29:32 -0700
Subject: [PATCH 053/109] Add LocalJob for testing

---
 dask_jobqueue/__init__.py       |  1 +
 dask_jobqueue/job.py            | 11 +++----
 dask_jobqueue/jobqueue.yaml     | 15 ++++++++++
 dask_jobqueue/local.py          | 53 +++++++++++++++++++++++++++++++++
 dask_jobqueue/tests/test_job.py |  5 ++--
 5 files changed, 76 insertions(+), 9 deletions(-)
 create mode 100644 dask_jobqueue/local.py

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index d88ec417..3659631c 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,5 +1,6 @@
 # flake8: noqa
 from . import config
+from .local import LocalJob, LocalCluster
 from .core import JobQueueCluster
 from .job import Job
 from .moab import MoabCluster, MoabJob
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 58411e61..de7fa833 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -4,7 +4,6 @@
 import re
 import shlex
 import subprocess
-import six
 import sys
 import weakref
 
@@ -80,9 +79,7 @@ class Job(ProcessInterface):
 %(shebang)s
 
 %(job_header)s
-
 %(env_header)s
-
 %(worker_command)s
 """.lstrip()
 
@@ -182,7 +179,7 @@ def __init__(
 
         self.shebang = shebang
 
-        self._env_header = "\n".join(env_extra)
+        self._env_header = "\n".join(filter(None, env_extra))
         self.header_skip = set(header_skip)
 
         # dask-worker command line build
@@ -329,12 +326,12 @@ def _call(cmd, **kwargs):
         )
 
         proc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, **kwargs
         )
 
         out, err = proc.communicate()
-        if six.PY3:
-            out, err = out.decode(), err.decode()
+        out, err = out.decode(), err.decode()
+
         if proc.returncode != 0:
             raise RuntimeError(
                 "Command exited with non-zero exit code.\n"
diff --git a/dask_jobqueue/jobqueue.yaml b/dask_jobqueue/jobqueue.yaml
index eb1d3b49..d2174258 100644
--- a/dask_jobqueue/jobqueue.yaml
+++ b/dask_jobqueue/jobqueue.yaml
@@ -161,3 +161,18 @@ jobqueue:
     log-directory: null
     shebang: "#!/usr/bin/env condor_submit"
 
+  local:
+    name: dask-worker
+    # Dask worker options
+    cores: null                 # Total number of cores per job
+    memory: null                # Total amount of memory per job
+    processes: 1                # Number of Python processes per job
+
+    interface: null             # Network interface to use like eth0 or ib0
+    death-timeout: 60           # Number of seconds to wait if a worker can not find a scheduler
+    local-directory: null       # Location of fast local storage like /scratch or $TMPDIR
+
+    extra: []
+    env-extra: []
+    job-extra: []
+    log-directory: null
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
new file mode 100644
index 00000000..9c227891
--- /dev/null
+++ b/dask_jobqueue/local.py
@@ -0,0 +1,53 @@
+import functools
+import logging
+import os
+import subprocess
+
+from .job import Job, JobQueueCluster
+
+logger = logging.getLogger(__name__)
+
+
+class LocalJob(Job):
+    """ This is mostly used for testing.  It runs locally. """
+
+    config_name = "local"
+
+    def __init__(
+        self,
+        *args,
+        queue=None,
+        project=None,
+        resource_spec=None,
+        walltime=None,
+        job_extra=None,
+        config_name="local",
+        **kwargs
+    ):
+        # Instantiate args and parameters from parent abstract class
+        super().__init__(*args, config_name=config_name, shebang="", **kwargs)
+
+        # Declare class attribute that shall be overridden
+        header_lines = []
+        self.job_header = "\n".join(header_lines)
+
+        logger.debug("Job script: \n %s" % self.job_script())
+
+    def _submit_job(self, script_filename):
+        # Should we make this async friendly?
+        with open(script_filename) as f:
+            text = f.read().strip().split()
+        self.process = subprocess.Popen(
+            text, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        self.process.stderr.readline()  # make sure that we start
+        return str(self.process.pid)
+
+    @classmethod
+    def _close_job(self, job_id):
+        os.kill(int(job_id), 9)
+        # from distributed.utils_test import terminate_process
+        # terminate_process(self.process)
+
+
+LocalCluster = functools.partial(JobQueueCluster, Job=LocalJob, config_name="local")
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 9a6cf606..26b3f0ba 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,7 +1,7 @@
 import asyncio
 from time import time
 
-from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob
+from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -18,6 +18,7 @@ def test_basic():
     pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]),
     pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]),
     pytest.param(LSFJob, marks=[pytest.mark.env("lsf")]),
+    LocalJob,
 ]
 
 
@@ -43,7 +44,7 @@ async def test_job(Job):
         start = time()
         while len(s.workers):
             await asyncio.sleep(0.1)
-            assert time() < start + 5
+            assert time() < start + 10
         print(8)
 
 

From 01481bf4507583b5ec44ebef964398956ef81842 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 28 Aug 2019 08:50:43 -0700
Subject: [PATCH 054/109] Add empty jobs to fill out worker spec with many
 processes

---
 dask_jobqueue/job.py            | 20 ++++++++++++++++++++
 dask_jobqueue/tests/test_job.py | 26 +++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index de7fa833..9821c6bc 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -5,6 +5,7 @@
 import shlex
 import subprocess
 import sys
+import toolz
 import weakref
 
 import dask
@@ -405,6 +406,21 @@ def __init__(
         if n_workers:
             self.scale(n_workers)
 
+    def new_worker_spec(self):
+        spec = super().new_worker_spec()
+        nprocs = self.new_spec["options"]["processes"]
+        if nprocs >= 1:
+            [(name, value)] = spec.items()
+            value = value.copy()
+            value["options"] = toolz.assoc(value["options"], "name", name)
+            name = str(name)
+
+            spec = {name + "-0": value}
+            for i in range(1, nprocs):
+                spec[name + "-" + str(i)] = {"cls": EmptyJob}
+
+        return spec
+
     @property
     def example_job(self):
         try:
@@ -423,3 +439,7 @@ def job_script(self):
     @property
     def job_name(self):
         return self.example_job.job_name
+
+
+class EmptyJob(ProcessInterface):
+    pass
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 26b3f0ba..9123f906 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,7 +1,7 @@
 import asyncio
 from time import time
 
-from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob
+from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob, LocalCluster
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -107,3 +107,27 @@ def test_header_lines_skip():
 
     job = PBSJob(cores=1, memory="1GB", job_name="foobar", header_skip=["-N"])
     assert "foobar" not in job.job_script()
+
+
+@pytest.mark.asyncio
+async def test_nprocs():
+    async with LocalCluster(
+        cores=2, memory="4GB", processes=2, asynchronous=True
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            cluster.scale(cores=2)
+            assert len(cluster.worker_spec) == 2  # two workers
+            await cluster
+            await client.wait_for_workers(2)
+
+            assert set(cluster.workers) == {
+                ws.name for ws in cluster.scheduler.workers.values()
+            }
+
+            cluster.scale(cores=1)
+            await cluster
+            await asyncio.sleep(0.2)
+            assert len(cluster.scheduler.workers) == 2  # they're still one group
+
+            # this fails
+            # assert len(cluster.workers) == len(cluster.worker_spec) == 2

From 3f81a997eddd99c21d771104a729266e5664acd6 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 28 Aug 2019 15:16:45 -0700
Subject: [PATCH 055/109] cleanup new_worker_spec

---
 dask_jobqueue/job.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 9821c6bc..950079ba 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -407,8 +407,10 @@ def __init__(
             self.scale(n_workers)
 
     def new_worker_spec(self):
-        spec = super().new_worker_spec()
-        nprocs = self.new_spec["options"]["processes"]
+        spec = {self._i: self.new_spec}
+        self._i += 1
+
+        nprocs = self.new_spec.get("options", {}).get("processes", 1)
         if nprocs >= 1:
             [(name, value)] = spec.items()
             value = value.copy()

From 5c97395dd9804d44a56ffbeb6bde7304a6e0344c Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 28 Aug 2019 15:29:39 -0700
Subject: [PATCH 056/109] add distributed master to CI for none

---
 ci/none.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/none.sh b/ci/none.sh
index 6aeb3afd..c5ea74ab 100644
--- a/ci/none.sh
+++ b/ci/none.sh
@@ -5,6 +5,7 @@ function jobqueue_before_install {
   ./ci/conda_setup.sh
   export PATH="$HOME/miniconda/bin:$PATH"
   conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest docrep pytest-asyncio
+  pip install git+https://github.com/dask/distributed@master --upgrade --no-deps
 }
 
 function jobqueue_install {

From 44f01206b7c0c0c3c8adea5af49b21cfd9e8cf98 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 28 Aug 2019 17:11:12 -0700
Subject: [PATCH 057/109] Update multi-job work for upstream PR

See https://github.com/dask/distributed/pull/3013
---
 dask_jobqueue/job.py            | 22 ++++------------------
 dask_jobqueue/tests/test_job.py | 12 ++++--------
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 950079ba..e4dafb57 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -392,7 +392,10 @@ def __init__(
         self._kwargs = kwargs
         self._Job = Job
         worker = {"cls": Job, "options": kwargs}
-        self.example_job
+        if "processes" in kwargs and kwargs["processes"] > 1:
+            worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
+
+        self.example_job  # trigger property to ensure that the job is valid
 
         super().__init__(
             scheduler=scheduler,
@@ -406,23 +409,6 @@ def __init__(
         if n_workers:
             self.scale(n_workers)
 
-    def new_worker_spec(self):
-        spec = {self._i: self.new_spec}
-        self._i += 1
-
-        nprocs = self.new_spec.get("options", {}).get("processes", 1)
-        if nprocs >= 1:
-            [(name, value)] = spec.items()
-            value = value.copy()
-            value["options"] = toolz.assoc(value["options"], "name", name)
-            name = str(name)
-
-            spec = {name + "-0": value}
-            for i in range(1, nprocs):
-                spec[name + "-" + str(i)] = {"cls": EmptyJob}
-
-        return spec
-
     @property
     def example_job(self):
         try:
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 9123f906..c6874ec1 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -114,20 +114,16 @@ async def test_nprocs():
     async with LocalCluster(
         cores=2, memory="4GB", processes=2, asynchronous=True
     ) as cluster:
+        s = cluster.scheduler
         async with Client(cluster, asynchronous=True) as client:
             cluster.scale(cores=2)
-            assert len(cluster.worker_spec) == 2  # two workers
             await cluster
             await client.wait_for_workers(2)
-
-            assert set(cluster.workers) == {
-                ws.name for ws in cluster.scheduler.workers.values()
-            }
+            assert len(cluster.workers) == 1  # two workers, one job
+            assert len(s.workers) == 2
+            assert cluster.plan == {ws.name for ws in s.workers.values()}
 
             cluster.scale(cores=1)
             await cluster
             await asyncio.sleep(0.2)
             assert len(cluster.scheduler.workers) == 2  # they're still one group
-
-            # this fails
-            # assert len(cluster.workers) == len(cluster.worker_spec) == 2

From 9d3e181362d2f99a10c06e2739ea8ee0863c4b86 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 29 Aug 2019 09:33:07 -0700
Subject: [PATCH 058/109] remove errant shell=True keyword

---
 dask_jobqueue/job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index e4dafb57..a429ab19 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -327,7 +327,7 @@ def _call(cmd, **kwargs):
         )
 
         proc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, **kwargs
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs
         )
 
         out, err = proc.communicate()

From 2ac776c2ea8e1eac07c31fa135af5340b085f050 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 29 Aug 2019 11:51:33 -0700
Subject: [PATCH 059/109] relax Cluster name test, add status

---
 dask_jobqueue/job.py                      | 1 +
 dask_jobqueue/tests/test_jobqueue_core.py | 7 +++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index a429ab19..8cc70eb9 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -364,6 +364,7 @@ def __init__(
         # Job keywords
         **kwargs
     ):
+        self.status = "created"
         if Job is None:
             raise ValueError(
                 "You must provide a Job type like PBSJob, SLURMJob, "
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index 51db8bb5..af289fd6 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -200,12 +200,11 @@ def test_jobqueue_cluster_call(tmpdir):
     [PBSCluster, MoabCluster, SLURMCluster, SGECluster, LSFCluster, OARCluster],
 )
 def test_cluster_has_cores_and_memory(Cluster):
-    cls_name = Cluster.__name__ + r"\("
-    with pytest.raises(ValueError, match=cls_name + r"cores=\d, memory='\d+GB'"):
+    with pytest.raises(ValueError, match=r"cores=\d, memory='\d+GB'"):
         Cluster()
 
-    with pytest.raises(ValueError, match=cls_name + r"cores=\d, memory='1GB'"):
+    with pytest.raises(ValueError, match=r"cores=\d, memory='1GB'"):
         Cluster(memory="1GB")
 
-    with pytest.raises(ValueError, match=cls_name + r"cores=4, memory='\d+GB'"):
+    with pytest.raises(ValueError, match=r"cores=4, memory='\d+GB'"):
         Cluster(cores=4)

From 875ce5fc7578a7ee103051978bccca63c871a1b9 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 29 Aug 2019 12:03:25 -0700
Subject: [PATCH 060/109] copy over cores=, memory= error message from master

---
 dask_jobqueue/job.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 8cc70eb9..eab69877 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -5,7 +5,6 @@
 import shlex
 import subprocess
 import sys
-import toolz
 import weakref
 
 import dask
@@ -152,14 +151,12 @@ def __init__(
         if shebang is None:
             shebang = dask.config.get("jobqueue.%s.shebang" % config_name)
 
-        if cores is None:
-            raise ValueError(
-                "You must specify how many cores to use per job like ``cores=8``"
-            )
-
-        if memory is None:
+        if cores is None or memory is None:
             raise ValueError(
-                "You must specify how much memory to use per job like ``memory='24 GB'``"
+                "You must specify how much cores and memory per job you want to use, for example:\n"
+                "cluster = {}(cores={}, memory={!r})".format(
+                    self.__class__.__name__, cores or 8, memory or "24GB"
+                )
             )
 
         # This attribute should be overridden

From fbbfc4b12c4a051b2d72842c9a82452f2c80a6b1 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 29 Aug 2019 13:36:37 -0700
Subject: [PATCH 061/109] add -s to pbs test

---
 ci/pbs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/pbs.sh b/ci/pbs.sh
index 2229d812..0132a49a 100644
--- a/ci/pbs.sh
+++ b/ci/pbs.sh
@@ -19,7 +19,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E pbs"
+    docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E pbs"
 }
 
 function jobqueue_after_script {

From a92d0ff7189e951e76837c64cf54028a58d22140 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 29 Aug 2019 15:32:32 -0600
Subject: [PATCH 062/109] ignore Runtime Errors when closing jobs

---
 dask_jobqueue/job.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index eab69877..c4de6230 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -8,6 +8,7 @@
 import weakref
 
 import dask
+from dask.utils import ignoring
 from distributed.deploy.spec import ProcessInterface, SpecCluster
 from distributed.scheduler import Scheduler
 
@@ -291,7 +292,8 @@ async def close(self):
     @classmethod
     def _close_job(cls, job_id):
         if job_id:
-            cls._call(shlex.split(cls.cancel_command) + [job_id])
+            with ignoring(RuntimeError):  # deleting job when job already gone
+                cls._call(shlex.split(cls.cancel_command) + [job_id])
 
     @staticmethod
     def _call(cmd, **kwargs):

From 0a338f74439bd1982718c6703977f6e713d7e078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 30 Aug 2019 16:54:46 +0200
Subject: [PATCH 063/109] Friday afternoon semi random attempt.

---
 ci/slurm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/slurm.sh b/ci/slurm.sh
index a0cb1bc0..065bfa02 100644
--- a/ci/slurm.sh
+++ b/ci/slurm.sh
@@ -18,7 +18,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -E slurm -s"
+    docker exec -it slurmctld /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -E slurm -s"
 }
 
 function jobqueue_after_script {

From db5c397d18b71e987d9e0f14dcca54cb0d492906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 30 Aug 2019 17:35:50 +0200
Subject: [PATCH 064/109] Same attempt with PBS. Does two half-random attempts
 make a complete random attempt?

---
 ci/pbs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/pbs.sh b/ci/pbs.sh
index 0132a49a..e0f8d7d3 100644
--- a/ci/pbs.sh
+++ b/ci/pbs.sh
@@ -19,7 +19,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; pytest dask_jobqueue --verbose -s -E pbs"
+    docker exec -it -u pbsuser pbs_master /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs"
 }
 
 function jobqueue_after_script {

From b73ccb0d6a439aca531630ffd3ac5f831e5c9898 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 30 Aug 2019 20:11:31 +0200
Subject: [PATCH 065/109] dask-worker could not start because it was run as
 pbsuser in /.

Could not create the /dask-worker folder.
---
 ci/pbs.sh              | 2 +-
 dask_jobqueue/local.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/pbs.sh b/ci/pbs.sh
index e0f8d7d3..5fb068df 100644
--- a/ci/pbs.sh
+++ b/ci/pbs.sh
@@ -19,7 +19,7 @@ function jobqueue_install {
 }
 
 function jobqueue_script {
-    docker exec -it -u pbsuser pbs_master /bin/bash -c "pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs"
+    docker exec -it -u pbsuser pbs_master /bin/bash -c "cd; pytest /dask-jobqueue/dask_jobqueue --verbose -s -E pbs"
 }
 
 function jobqueue_after_script {
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 9c227891..1c269d0c 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -40,6 +40,9 @@ def _submit_job(self, script_filename):
         self.process = subprocess.Popen(
             text, stdout=subprocess.PIPE, stderr=subprocess.PIPE
         )
+        # TODO this should raise if self.process.returncode != 0. Refactor
+        # Job._call to be able to return process (so that we can return self.process.pid below)
+
         self.process.stderr.readline()  # make sure that we start
         return str(self.process.pid)
 

From 4ea85e834c259215de6de256eb44f5d35dd64670 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 30 Aug 2019 20:27:46 +0200
Subject: [PATCH 066/109] Remove .running_jobs.

---
 dask_jobqueue/tests/test_pbs.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index f6e835a8..68d9913e 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -116,7 +116,7 @@ def test_basic(loop):
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
-            assert cluster.running_jobs
+            # assert cluster.running_jobs
 
             workers = list(client.scheduler_info()["workers"].values())
             w = workers[0]
@@ -294,9 +294,10 @@ def test_scale_grouped(loop):
             cluster.scale(4)  # Start 2 jobs
 
             start = time()
-            while len(cluster.running_jobs) != 2:
-                sleep(0.100)
-                assert time() < start + QUEUE_WAIT
+            # TODO: Is there a replacement to check for number of jobs (rather than workers)
+            # while len(cluster.running_jobs) != 2:
+            #     sleep(0.100)
+            #     assert time() < start + QUEUE_WAIT
 
             while len(list(client.scheduler_info()["workers"].values())) != 4:
                 sleep(0.100)
@@ -304,7 +305,7 @@ def test_scale_grouped(loop):
 
             future = client.submit(lambda x: x + 1, 10)
             assert future.result(QUEUE_WAIT) == 11
-            assert cluster.running_jobs
+            # assert cluster.running_jobs
 
             workers = list(client.scheduler_info()["workers"].values())
             w = workers[0]
@@ -315,23 +316,29 @@ def test_scale_grouped(loop):
             cluster.scale(1)  # Should leave 2 workers, 1 job
 
             start = time()
-            while len(cluster.running_jobs) != 1:
+            # TODO
+            # while len(cluster.running_jobs) != 1:
+            #     sleep(0.100)
+            #     assert time() < start + QUEUE_WAIT
+
+            # assert len(cluster.running_jobs) == 1
+            # workers = list(client.scheduler_info()["workers"].values())
+            while len(cluster.scheduler_info()['workers']) != 2:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert len(cluster.running_jobs) == 1
-            workers = list(client.scheduler_info()["workers"].values())
-            assert len(workers) == 2
-
             cluster.scale(0)
 
             start = time()
-            while cluster.running_jobs:
+            # while cluster.running_jobs:
+            #     sleep(0.100)
+            #     assert time() < start + QUEUE_WAIT
+
+            # assert not cluster.running_jobs
+            while len(cluster.scheduler_info()['workers']) != 0:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            assert not cluster.running_jobs
-
 
 def test_config(loop):
     with dask.config.set(

From 0789ed9e115652900fc3555e24ac49d34242c154 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 30 Aug 2019 20:43:45 +0200
Subject: [PATCH 067/109] Fix cluster -> client

---
 dask_jobqueue/tests/test_pbs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 68d9913e..39a0b933 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -323,7 +323,7 @@ def test_scale_grouped(loop):
 
             # assert len(cluster.running_jobs) == 1
             # workers = list(client.scheduler_info()["workers"].values())
-            while len(cluster.scheduler_info()['workers']) != 2:
+            while len(client.scheduler_info()['workers']) != 2:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
@@ -335,7 +335,7 @@ def test_scale_grouped(loop):
             #     assert time() < start + QUEUE_WAIT
 
             # assert not cluster.running_jobs
-            while len(cluster.scheduler_info()['workers']) != 0:
+            while len(client.scheduler_info()['workers']) != 0:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 

From 2215227a059ce8853f37345f404d93b8395a1833 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 30 Aug 2019 19:53:51 -0700
Subject: [PATCH 068/109] get worker_processes from example job

---
 dask_jobqueue/tests/test_pbs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 39a0b933..b876427c 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -210,7 +210,7 @@ def test_adaptive(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.worker_processes
+            processes = cluster.example_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT
@@ -242,7 +242,7 @@ def test_adaptive_grouped(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.worker_processes
+            processes = cluster.example_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT
@@ -265,7 +265,7 @@ def test_adaptive_cores_mem(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.worker_processes
+            processes = cluster.example_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT

From 6de262eb7b7dbceb0a758302a39a5693f0524037 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 30 Aug 2019 19:54:10 -0700
Subject: [PATCH 069/109] black

---
 dask_jobqueue/tests/test_pbs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index b876427c..80911ea2 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -323,7 +323,7 @@ def test_scale_grouped(loop):
 
             # assert len(cluster.running_jobs) == 1
             # workers = list(client.scheduler_info()["workers"].values())
-            while len(client.scheduler_info()['workers']) != 2:
+            while len(client.scheduler_info()["workers"]) != 2:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
@@ -335,7 +335,7 @@ def test_scale_grouped(loop):
             #     assert time() < start + QUEUE_WAIT
 
             # assert not cluster.running_jobs
-            while len(client.scheduler_info()['workers']) != 0:
+            while len(client.scheduler_info()["workers"]) != 0:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 

From f64c3e8870208d33335c9aaa89294983b62accd7 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 30 Aug 2019 20:28:55 -0700
Subject: [PATCH 070/109] Move over Oar

---
 dask_jobqueue/oar.py            | 16 +++++++++++-----
 dask_jobqueue/tests/test_oar.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 39dc7dda..eb2196e9 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -1,14 +1,16 @@
+import functools
 import logging
 import shlex
 
 import dask
 
-from .core import JobQueueCluster, docstrings
+from .core import docstrings
+from .job import JobQueueCluster, Job
 
 logger = logging.getLogger(__name__)
 
 
-class OARCluster(JobQueueCluster):
+class OARJob(Job):
     __doc__ = docstrings.with_indents(
         """ Launch Dask on a OAR cluster
 
@@ -49,6 +51,7 @@ class OARCluster(JobQueueCluster):
 
     def __init__(
         self,
+        *args,
         queue=None,
         project=None,
         resource_spec=None,
@@ -68,11 +71,11 @@ def __init__(
         if job_extra is None:
             job_extra = dask.config.get("jobqueue.%s.job-extra" % config_name)
 
-        super().__init__(config_name=config_name, **kwargs)
+        super().__init__(*args, config_name=config_name, **kwargs)
 
         header_lines = []
-        if self.name is not None:
-            header_lines.append("#OAR -n %s" % self.name)
+        if self.job_name is not None:
+            header_lines.append("#OAR -n %s" % self.job_name)
         if queue is not None:
             header_lines.append("#OAR -q %s" % queue)
         if project is not None:
@@ -121,3 +124,6 @@ def _submit_job(self, fn):
         oarsub_command = " ".join([self.submit_command] + oarsub_options)
         oarsub_command_split = shlex.split(oarsub_command) + [inline_script]
         return self._call(oarsub_command_split)
+
+
+OARCluster = functools.partial(JobQueueCluster, Job=OARJob, config_name="oar")
diff --git a/dask_jobqueue/tests/test_oar.py b/dask_jobqueue/tests/test_oar.py
index b7eaaa7e..5035852b 100644
--- a/dask_jobqueue/tests/test_oar.py
+++ b/dask_jobqueue/tests/test_oar.py
@@ -110,4 +110,4 @@ def test_config_name_oar_takes_custom_config():
 
     with dask.config.set({"jobqueue.oar-config-name": conf}):
         with OARCluster(config_name="oar-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"

From 860b85168f2379ee63608d5e5da293bc0381674d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 30 Aug 2019 20:32:02 -0700
Subject: [PATCH 071/109] move over htcondor

---
 dask_jobqueue/htcondor.py            | 13 +++++++++----
 dask_jobqueue/tests/test_htcondor.py |  8 ++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 7b1efcba..942775a8 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -1,3 +1,4 @@
+import functools
 import logging
 import re
 import shlex
@@ -5,12 +6,13 @@
 import dask
 from distributed.utils import parse_bytes
 
-from .core import JobQueueCluster, docstrings
+from .core import docstrings
+from .job import JobQueueCluster, Job
 
 logger = logging.getLogger(__name__)
 
 
-class HTCondorCluster(JobQueueCluster):
+class HTCondorJob(Job):
     __doc__ = docstrings.with_indents(
         """ Launch Dask on an HTCondor cluster with a shared file system
 
@@ -57,7 +59,7 @@ class HTCondorCluster(JobQueueCluster):
     # Python (can't find its libs), so we have to go through the shell.
     executable = "/bin/sh"
 
-    def __init__(self, disk=None, job_extra=None, config_name="htcondor", **kwargs):
+    def __init__(self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs):
         if disk is None:
             disk = dask.config.get("jobqueue.%s.disk" % config_name)
         if disk is None:
@@ -71,7 +73,7 @@ def __init__(self, disk=None, job_extra=None, config_name="htcondor", **kwargs):
             self.job_extra = job_extra
 
         # Instantiate args and parameters from parent abstract class
-        super().__init__(config_name=config_name, **kwargs)
+        super().__init__(*args, config_name=config_name, **kwargs)
 
         env_extra = kwargs.get("env_extra", None)
         if env_extra is None:
@@ -220,3 +222,6 @@ def quote_environment(env):
         entries.append("%s=%s" % (k, qv))
 
     return " ".join(entries)
+
+
+HTCondorCluster = functools.partial(JobQueueCluster, Job=HTCondorJob, config_name="htcondor")
diff --git a/dask_jobqueue/tests/test_htcondor.py b/dask_jobqueue/tests/test_htcondor.py
index 4f386680..024473ad 100644
--- a/dask_jobqueue/tests/test_htcondor.py
+++ b/dask_jobqueue/tests/test_htcondor.py
@@ -13,9 +13,9 @@
 
 def test_header():
     with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster:
-        assert cluster.job_header_dict["MY.DaskWorkerCores"] == 1
-        assert cluster.job_header_dict["MY.DaskWorkerDisk"] == 100000000
-        assert cluster.job_header_dict["MY.DaskWorkerMemory"] == 100000000
+        assert cluster.example_job.job_header_dict["MY.DaskWorkerCores"] == 1
+        assert cluster.example_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000
+        assert cluster.example_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000
 
 
 def test_job_script():
@@ -98,4 +98,4 @@ def test_config_name_htcondor_takes_custom_config():
 
     with dask.config.set({"jobqueue.htcondor-config-name": conf}):
         with HTCondorCluster(config_name="htcondor-config-name") as cluster:
-            assert cluster.name == "myname"
+            assert cluster.job_name == "myname"

From 512b76676b08156dbf42663c609d0310dec4cd9d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 30 Aug 2019 20:37:23 -0700
Subject: [PATCH 072/109] remove old core and deploy code

---
 dask_jobqueue/core.py                     | 484 +---------------------
 dask_jobqueue/deploy/__init__.py          |   2 -
 dask_jobqueue/deploy/cluster_manager.py   | 392 ------------------
 dask_jobqueue/htcondor.py                 |   8 +-
 dask_jobqueue/job.py                      |   4 -
 dask_jobqueue/tests/test_jobqueue_core.py |   1 +
 6 files changed, 9 insertions(+), 882 deletions(-)
 delete mode 100644 dask_jobqueue/deploy/__init__.py
 delete mode 100644 dask_jobqueue/deploy/cluster_manager.py

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index 5f6e6e1c..80d91c88 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -1,85 +1,13 @@
-import logging
-import math
-import os
-import re
-import shlex
-import subprocess
-import sys
-from collections import OrderedDict
-from contextlib import contextmanager
-
-import dask
 import docrep
-from .deploy import ClusterManager
-from distributed import LocalCluster
-from distributed.diagnostics.plugin import SchedulerPlugin
-from distributed.utils import format_bytes, parse_bytes, tmpfile, get_ip_interface
 
-logger = logging.getLogger(__name__)
 docstrings = docrep.DocstringProcessor()
 
 
-def _job_id_from_worker_name(name):
-    """ utility to parse the job ID from the worker name
-
-    template: 'prefix--jobid--suffix'
-    """
-    _, job_id, _ = name.split("--")
-    return job_id
-
-
-class JobQueuePlugin(SchedulerPlugin):
-    def __init__(self):
-        self.pending_jobs = OrderedDict()
-        self.running_jobs = OrderedDict()
-        self.finished_jobs = OrderedDict()
-        self.all_workers = {}
-
-    def add_worker(self, scheduler, worker=None, name=None, **kwargs):
-        """ Run when a new worker enters the cluster"""
-        logger.debug("adding worker %s", worker)
-        w = scheduler.workers[worker]
-        job_id = _job_id_from_worker_name(w.name)
-        logger.debug("job id for new worker: %s", job_id)
-        self.all_workers[worker] = (w.name, job_id)
-
-        # if this is the first worker for this job, move job to running
-        if job_id not in self.running_jobs:
-            logger.debug("%s is a new job or restarting worker", job_id)
-            if job_id in self.pending_jobs:
-                logger.debug("%s is a new job, adding to running_jobs", job_id)
-                self.running_jobs[job_id] = self.pending_jobs.pop(job_id)
-            elif job_id in self.finished_jobs:
-                logger.warning(
-                    "Worker %s restart in Job %s. " "This can be due to memory issue.",
-                    w,
-                    job_id,
-                )
-                self.running_jobs[job_id] = self.finished_jobs.pop(job_id)
-            else:
-                logger.error("Unknown job_id: %s for worker %s", job_id, w)
-                self.running_jobs[job_id] = {}
-
-        # add worker to dict of workers in this job
-        self.running_jobs[job_id][w.name] = w
-
-    def remove_worker(self, scheduler=None, worker=None, **kwargs):
-        """ Run when a worker leaves the cluster"""
-        logger.debug("removing worker %s", worker)
-        name, job_id = self.all_workers[worker]
-        logger.debug("removing worker name (%s) and job_id (%s)", name, job_id)
-
-        # remove worker from this job
-        self.running_jobs[job_id].pop(name, None)
-
-        # once there are no more workers, move this job to finished_jobs
-        if not self.running_jobs[job_id]:
-            logger.debug("that was the last worker for job %s", job_id)
-            self.finished_jobs[job_id] = self.running_jobs.pop(job_id)
+# TODO: remove this class after we figure out docstrings
 
 
 @docstrings.get_sectionsf("JobQueueCluster")
-class JobQueueCluster(ClusterManager):
+class JobQueueCluster:
     """ Base class to launch Dask Clusters for Job queues
 
     This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster
@@ -144,411 +72,3 @@ class JobQueueCluster(ClusterManager):
 
 %(worker_command)s
 """.lstrip()
-
-    # Following class attributes should be overridden by extending classes.
-    submit_command = None
-    cancel_command = None
-    job_id_regexp = r"(?P<job_id>\d+)"
-
-    def __init__(
-        self,
-        name=None,
-        cores=None,
-        memory=None,
-        processes=None,
-        interface=None,
-        death_timeout=None,
-        local_directory=None,
-        extra=None,
-        env_extra=None,
-        log_directory=None,
-        shebang=None,
-        python=sys.executable,
-        config_name=None,
-        **kwargs
-    ):
-        """ """
-        # """
-        # This initializer should be considered as Abstract, and never used directly.
-        # """
-        super().__init__()
-
-        if config_name is None:
-            raise NotImplementedError(
-                "JobQueueCluster is an abstract class that should not be instantiated."
-            )
-
-        if name is None:
-            name = dask.config.get("jobqueue.%s.name" % config_name)
-        if cores is None:
-            cores = dask.config.get("jobqueue.%s.cores" % config_name)
-        if memory is None:
-            memory = dask.config.get("jobqueue.%s.memory" % config_name)
-        if processes is None:
-            processes = dask.config.get("jobqueue.%s.processes" % config_name)
-        if interface is None:
-            interface = dask.config.get("jobqueue.%s.interface" % config_name)
-        if death_timeout is None:
-            death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
-        if local_directory is None:
-            local_directory = dask.config.get(
-                "jobqueue.%s.local-directory" % config_name
-            )
-        if extra is None:
-            extra = dask.config.get("jobqueue.%s.extra" % config_name)
-        if env_extra is None:
-            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
-        if log_directory is None:
-            log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
-        if shebang is None:
-            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)
-
-        if cores is None or memory is None:
-            raise ValueError(
-                "You must specify how much cores and memory per job you want to use, for example:\n"
-                "cluster = {}(cores={}, memory={!r})".format(
-                    self.__class__.__name__, cores or 8, memory or "24GB"
-                )
-            )
-
-        # This attribute should be overridden
-        self.job_header = None
-
-        if interface:
-            extra += ["--interface", interface]
-            kwargs.setdefault("host", get_ip_interface(interface))
-        else:
-            kwargs.setdefault("host", "")
-
-        # Bokeh diagnostics server should listen on all interfaces
-        kwargs.setdefault("dashboard_address", ("", 8787))
-        self.local_cluster = LocalCluster(n_workers=0, **kwargs)
-
-        # Keep information on process, cores, and memory, for use in subclasses
-        self.worker_memory = parse_bytes(memory) if memory is not None else None
-        self.worker_processes = processes
-        self.worker_cores = cores
-        self.name = name
-
-        # plugin for tracking job status
-        self._scheduler_plugin = JobQueuePlugin()
-        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)
-
-        self._adaptive = None
-
-        self.shebang = shebang
-
-        self._env_header = "\n".join(env_extra)
-
-        # dask-worker command line build
-        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
-            python=python
-        )
-        command_args = [dask_worker_command, self.scheduler.address]
-        command_args += ["--nthreads", self.worker_process_threads]
-        if processes is not None and processes > 1:
-            command_args += ["--nprocs", processes]
-
-        command_args += ["--memory-limit", self.worker_process_memory]
-        command_args += ["--name", "%s--${JOB_ID}--" % name]
-
-        if death_timeout is not None:
-            command_args += ["--death-timeout", death_timeout]
-        if local_directory is not None:
-            command_args += ["--local-directory", local_directory]
-        if extra is not None:
-            command_args += extra
-
-        self._command_template = " ".join(map(str, command_args))
-
-        self.log_directory = log_directory
-        if self.log_directory is not None:
-            if not os.path.exists(self.log_directory):
-                os.makedirs(self.log_directory)
-
-    def __repr__(self):
-        running_workers = self._count_active_workers()
-        running_cores = running_workers * self.worker_process_threads
-        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
-        total_workers = total_jobs * self.worker_processes
-        running_memory = running_workers * self.worker_memory / self.worker_processes
-
-        return (
-            self.__class__.__name__
-            + "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)"
-            % (
-                running_cores,
-                format_bytes(running_memory),
-                running_workers,
-                total_workers,
-                len(self.running_jobs),
-                total_jobs,
-            )
-        )
-
-    @property
-    def pending_jobs(self):
-        """ Jobs pending in the queue """
-        return self._scheduler_plugin.pending_jobs
-
-    @property
-    def running_jobs(self):
-        """ Jobs with currently active workers """
-        return self._scheduler_plugin.running_jobs
-
-    @property
-    def finished_jobs(self):
-        """ Jobs that have finished """
-        return self._scheduler_plugin.finished_jobs
-
-    @property
-    def worker_process_threads(self):
-        return int(self.worker_cores / self.worker_processes)
-
-    @property
-    def worker_process_memory(self):
-        mem = format_bytes(self.worker_memory / self.worker_processes)
-        mem = mem.replace(" ", "")
-        return mem
-
-    @property
-    def jobqueue_worker_spec(self):
-        """ single worker process info needed for scaling on cores or memory """
-        return {
-            "cores": self.worker_process_threads,
-            "memory": self.worker_process_memory,
-        }
-
-    @property
-    def workers(self):
-        """ workers currently connected to the scheduler """
-        return self.scheduler.workers
-
-    def job_script(self):
-        """ Construct a job submission script """
-        pieces = {
-            "shebang": self.shebang,
-            "job_header": self.job_header,
-            "env_header": self._env_header,
-            "worker_command": self._command_template,
-        }
-        return self._script_template % pieces
-
-    @contextmanager
-    def job_file(self):
-        """ Write job submission script to temporary file """
-        with tmpfile(extension="sh") as fn:
-            with open(fn, "w") as f:
-                logger.debug("writing job script: \n%s", self.job_script())
-                f.write(self.job_script())
-            yield fn
-
-    def _submit_job(self, script_filename):
-        return self._call(shlex.split(self.submit_command) + [script_filename])
-
-    def start_workers(self, n=1):
-        """ Start workers and point them to our local scheduler """
-        logger.debug("starting %s workers", n)
-        num_jobs = int(math.ceil(n / self.worker_processes))
-        for _ in range(num_jobs):
-            with self.job_file() as fn:
-                out = self._submit_job(fn)
-                job = self._job_id_from_submit_output(out)
-                if not job:
-                    raise ValueError("Unable to parse jobid from output of %s" % out)
-                logger.debug("started job: %s", job)
-                self.pending_jobs[job] = {}
-
-    @property
-    def scheduler(self):
-        """ The scheduler of this cluster """
-        return self.local_cluster.scheduler
-
-    def _call(self, cmd, **kwargs):
-        """ Call a command using subprocess.Popen.
-
-        This centralizes calls out to the command line, providing consistent
-        outputs, logging, and an opportunity to go asynchronous in the future.
-
-        Parameters
-        ----------
-        cmd: List(str))
-            A command, each of which is a list of strings to hand to
-            subprocess.Popen
-
-        Examples
-        --------
-        >>> self._call(['ls', '/foo'])
-
-        Returns
-        -------
-        The stdout produced by the command, as string.
-
-        Raises
-        ------
-        RuntimeError if the command exits with a non-zero exit code
-        """
-        cmd_str = " ".join(cmd)
-        logger.debug(
-            "Executing the following command to command line\n{}".format(cmd_str)
-        )
-
-        proc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs
-        )
-
-        out, err = proc.communicate()
-        out, err = out.decode(), err.decode()
-        if proc.returncode != 0:
-            raise RuntimeError(
-                "Command exited with non-zero exit code.\n"
-                "Exit code: {}\n"
-                "Command:\n{}\n"
-                "stdout:\n{}\n"
-                "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err)
-            )
-        return out
-
-    def stop_workers(self, workers):
-        """ Stop a list of workers"""
-        logger.debug("Stopping workers: %s", workers)
-        if not workers:
-            return
-        jobs = self._del_pending_jobs()  # stop pending jobs too
-        for w in workers:
-            if isinstance(w, dict):
-                jobs.append(_job_id_from_worker_name(w["name"]))
-            else:
-                jobs.append(_job_id_from_worker_name(w.name))
-        self.stop_jobs(jobs)
-
-    def stop_jobs(self, jobs):
-        """ Stop a list of jobs"""
-        logger.debug("Stopping jobs: %s", jobs)
-        if jobs:
-            jobs = list(jobs)
-            self._call(shlex.split(self.cancel_command) + list(set(jobs)))
-
-        # if any of these jobs were pending, we should remove those now
-        for job_id in jobs:
-            if job_id in self.pending_jobs:
-                del self.pending_jobs[job_id]
-
-    def scale_up(self, n, **kwargs):
-        """ Brings total worker count up to ``n`` """
-        active_and_pending = self._count_active_and_pending_workers()
-        if n >= active_and_pending:
-            logger.debug("Scaling up to %d workers.", n)
-            self.start_workers(n - active_and_pending)
-        else:
-            # scale_up should not be called if n < active + pending jobs
-            logger.warning(
-                "JobQueueCluster.scale_up was called with a"
-                " number of workers lower that what is already"
-                " running or pending"
-            )
-
-    def _count_active_and_pending_workers(self):
-        active_and_pending = (
-            self._count_active_workers() + self._count_pending_workers()
-        )
-        logger.debug("Found %d active/pending workers.", active_and_pending)
-        assert len(self.scheduler.workers) <= active_and_pending
-        return active_and_pending
-
-    def _count_active_workers(self):
-        active_workers = sum([len(j) for j in self.running_jobs.values()])
-        assert len(self.scheduler.workers) == active_workers
-        return active_workers
-
-    def _count_pending_workers(self):
-        return self.worker_processes * len(self.pending_jobs)
-
-    def scale_down(self, workers, n=None):
-        """ Close the workers with the given addresses """
-        if n is None:
-            # Adaptive currently calls directly scale_down, we need to handle this
-            # Need to only keep active workers minus those adaptive wants to stop
-            n = self._count_active_workers() - len(workers)
-        logger.debug("Scaling down to %d Workers: %s", n, workers)
-        active_and_pending = self._count_active_and_pending_workers()
-        n_to_close = active_and_pending - n
-        if n_to_close < 0:
-            logger.warning(
-                "JobQueueCluster.scale_down was called with"
-                " a number of worker greater than what is"
-                " already running or pending."
-            )
-        elif n_to_close <= self._count_pending_workers():
-            # We only need to kill some pending jobs,
-            to_kill = int(n_to_close / self.worker_processes)
-            jobs = list(self.pending_jobs.keys())[-to_kill:]
-            logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs)
-            self.stop_jobs(jobs)
-        else:
-            worker_states = []
-            for w in workers:
-                try:
-                    # Get the actual WorkerState
-                    worker_states.append(self.scheduler.workers[w])
-                except KeyError:
-                    logger.debug("worker %s is already gone", w)
-            self.stop_workers(worker_states)
-
-    def stop_all_jobs(self):
-        """ Stops all running and pending jobs """
-        jobs = self._del_pending_jobs()
-        jobs += list(self.running_jobs.keys())
-        self.stop_jobs(set(jobs))
-
-    def close(self, **kwargs):
-        """ Stops all running and pending jobs and stops scheduler """
-        self.stop_all_jobs()
-        return self.local_cluster.close(**kwargs)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self.close()
-        self.local_cluster.__exit__(type, value, traceback)
-
-    def _del_pending_jobs(self):
-        jobs = list(self.pending_jobs.keys())
-        logger.debug("Deleting pending jobs %s" % jobs)
-        for job_id in jobs:
-            del self.pending_jobs[job_id]
-        return jobs
-
-    def _job_id_from_submit_output(self, out):
-        match = re.search(self.job_id_regexp, out)
-        if match is None:
-            msg = (
-                "Could not parse job id from submission command "
-                "output.\nJob id regexp is {!r}\nSubmission command "
-                "output is:\n{}".format(self.job_id_regexp, out)
-            )
-            raise ValueError(msg)
-
-        job_id = match.groupdict().get("job_id")
-        if job_id is None:
-            msg = (
-                "You need to use a 'job_id' named group in your regexp, e.g. "
-                "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: "
-                "{!r}".format(self.job_id_regexp)
-            )
-            raise ValueError(msg)
-
-        return job_id
-
-    @staticmethod
-    def worker_key(worker_state):
-        return _job_id_from_worker_name(worker_state.name)
-
-    @property
-    def scheduler_comm(self):
-        return self.local_cluster.scheduler_comm
-
-    @property
-    def scheduler_info(self):
-        return self.local_cluster.scheduler_info
diff --git a/dask_jobqueue/deploy/__init__.py b/dask_jobqueue/deploy/__init__.py
deleted file mode 100644
index c9e11c0f..00000000
--- a/dask_jobqueue/deploy/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# flake8: noqa
-from .cluster_manager import ClusterManager
diff --git a/dask_jobqueue/deploy/cluster_manager.py b/dask_jobqueue/deploy/cluster_manager.py
deleted file mode 100644
index 6910c82f..00000000
--- a/dask_jobqueue/deploy/cluster_manager.py
+++ /dev/null
@@ -1,392 +0,0 @@
-import logging
-import math
-
-from tornado import gen
-
-from distributed.deploy.adaptive import Adaptive
-from distributed.utils import (
-    log_errors,
-    ignoring,
-    parse_bytes,
-    PeriodicCallback,
-    format_bytes,
-    format_dashboard_link,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class ClusterManager:
-    """ Intermediate Cluster object that should lead to a real ClusterManager
-
-    This tries to improve upstream Cluster object and underlines needs for
-    better decoupling between ClusterManager and Scheduler object
-
-    This currently expects a local Scheduler defined on the object, but should
-    eventually only rely on RPC calls on remote or local scheduler.
-    It provides common methods and an IPython widget display.
-
-    Clusters inheriting from this class should provide the following:
-
-    1.  A local ``Scheduler`` object at ``.scheduler``. In the future, just
-        a URL to local or remote scheduler.
-    2.  scale_up and scale_down methods as defined below::
-
-        def scale_up(self, n: int):
-            ''' Brings total worker count up to ``n`` '''
-
-        def scale_down(self, workers: List[str], n: int):
-            ''' Close the workers with the given addresses or remove pending
-                workers to match n running workers.
-            '''
-    3.  Optionally worker_key: Callable(WorkerState):
-            ''' Callable mapping a WorkerState object to a group, see
-                Scheduler.workers_to_close
-            '''
-    4.  jobqueue_worker_spec dict attribute if scale(cores=...) or scale(memory=...)
-        can be used by users.
-            jobqueue_worker_spec = {'cores': 4, 'memory': '16 GB'}
-
-    This will provide a general ``scale`` method as well as an IPython widget
-    for display.
-
-    Things the will need to change for the complete Cluster Manager Design:
-    -   ClusterManager:
-        - Use it's own event loop, or the notebook one.
-        - Connect to a local or remote Scheduler through RPC, and then
-          communicate with it.
-        - Ability to start a local or remote scheduler.
-        - Ability to work with different worker pools: in scale, adaptive,
-          jobqueue_worker_spec...
-    -   Scheduler
-        - Provide some remote methods:
-          - retire_workers(n: int): close enough workers ot have only n
-            running at the end. Return the closed workers.
-          - status of connected worker, e.g. scheduler_info()
-
-    Examples
-    --------
-
-    >>> from distributed.deploy import Cluster
-    >>> class MyCluster(cluster):
-    ...     def scale_up(self, n):
-    ...         ''' Bring the total worker count up to n '''
-    ...         pass
-    ...     def scale_down(self, workers, n=None):
-    ...         ''' Close the workers with the given addresses '''
-    ...         pass
-
-    >>> cluster = MyCluster()
-    >>> cluster.scale(5)                       # scale manually
-    >>> cluster.adapt(minimum=1, maximum=100)  # scale automatically
-    >>> cluster.scale(cores=100)               # scale manually to cores nb
-    """
-
-    def __init__(self, adaptive_options={}):
-        self._target_scale = 0
-        self._adaptive_options = adaptive_options
-        self._adaptive_options.setdefault("worker_key", self.worker_key)
-
-    def adapt(
-        self,
-        minimum_cores=None,
-        maximum_cores=None,
-        minimum_memory=None,
-        maximum_memory=None,
-        **kwargs
-    ):
-        """ Turn on adaptivity
-        For keyword arguments see dask.distributed.Adaptive
-        Instead of minimum and maximum parameters which apply to the number of
-        worker, If Cluster object implements jobqueue_worker_spec attribute, one can
-        use the following parameters:
-        Parameters
-        ----------
-        minimum_cores: int
-            Minimum number of cores for the cluster
-        maximum_cores: int
-            Maximum number of cores for the cluster
-        minimum_memory: str
-            Minimum amount of memory for the cluster
-        maximum_memory: str
-            Maximum amount of memory for the cluster
-        Examples
-        --------
-        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
-        >>> cluster.adapt(minimum_cores=24, maximum_cores=96)
-        >>> cluster.adapt(minimum_memory='60 GB', maximum_memory= '1 TB')
-        """
-        with ignoring(AttributeError):
-            self._adaptive.stop()
-        if not hasattr(self, "_adaptive_options"):
-            self._adaptive_options = {}
-        if "minimum" not in kwargs:
-            if minimum_cores is not None:
-                kwargs["minimum"] = self._get_nb_workers_from_cores(minimum_cores)
-            elif minimum_memory is not None:
-                kwargs["minimum"] = self._get_nb_workers_from_memory(minimum_memory)
-        if "maximum" not in kwargs:
-            if maximum_cores is not None:
-                kwargs["maximum"] = self._get_nb_workers_from_cores(maximum_cores)
-            elif maximum_memory is not None:
-                kwargs["maximum"] = self._get_nb_workers_from_memory(maximum_memory)
-        self._adaptive_options.update(kwargs)
-        try:
-            self._adaptive = Adaptive(self.scheduler, self, **self._adaptive_options)
-        except Exception:
-            self._adaptive = Adaptive(self, **self._adaptive_options)
-        return self._adaptive
-
-    @property
-    def scheduler_address(self):
-        return self.scheduler.address
-
-    @property
-    def dashboard_link(self):
-        host = self.scheduler.address.split("://")[1].split(":")[0]
-        port = self.scheduler.services["dashboard"].port
-        return format_dashboard_link(host, port)
-
-    @gen.coroutine
-    def _scale(self, n=None, cores=None, memory=None):
-        """ Asynchronously called scale method
-
-        This allows to do every operation with a coherent context
-        """
-        with log_errors():
-            if [n, cores, memory].count(None) != 2:
-                raise ValueError(
-                    "One and only one of n, cores, memory kwargs"
-                    " should be used, n={}, cores={}, memory={}"
-                    " provided.".format(n, cores, memory)
-                )
-            if n is None:
-                if cores is not None:
-                    n = self._get_nb_workers_from_cores(cores)
-                elif memory is not None:
-                    n = self._get_nb_workers_from_memory(memory)
-
-            # here we rely on a ClusterManager attribute to retrieve the
-            # active and pending workers
-            if n == self._target_scale:
-                pass
-            elif n > self._target_scale:
-                self.scale_up(n)
-            else:
-                # TODO to_close may be empty if some workers are pending
-                # This may not be useful to call scheduler methods in this case
-                # Scheduler interface here may need to be modified
-                to_close = self.scheduler.workers_to_close(
-                    n=len(self.scheduler.workers) - n, minimum=n, key=self.worker_key
-                )
-                logger.debug("Closing workers: %s", to_close)
-                # Should  be an RPC call here
-                yield self.scheduler.retire_workers(workers=to_close)
-                # To close may be empty if just asking to remove pending
-                # workers, so we should also give a target number
-                self.scale_down(to_close, n)
-            self._target_scale = n
-
-    def scale(self, n=None, cores=None, memory=None):
-        """ Scale cluster to n workers or to the given number of cores or
-        memory
-        number of cores and memory are converted into number of workers using
-        jobqueue_worker_spec attribute.
-        Parameters
-        ----------
-        n: int
-            Target number of workers
-        cores: int
-            Target number of cores
-        memory: str
-            Target amount of available memory
-        Example
-        -------
-        >>> cluster.scale(10)  # scale cluster to ten workers
-        >>> cluster.scale(cores=100) # scale cluster to 100 cores
-        >>> cluster.scale(memory='1 TB') # scale cluster to 1 TB memory
-        See Also
-        --------
-        Cluster.scale_up
-        Cluster.scale_down
-        Cluster.jobqueue_worker_spec
-        """
-        # TODO we should not rely on scheduler loop here, self should have its
-        # own loop
-        self.scheduler.loop.add_callback(self._scale, n, cores, memory)
-
-    def _widget_status(self):
-        workers = len(self.scheduler.workers)
-        cores = sum(ws.nthreads for ws in self.scheduler.workers.values())
-        memory = sum(ws.memory_limit for ws in self.scheduler.workers.values())
-        memory = format_bytes(memory)
-        text = """
-<div>
-  <style scoped>
-    .dataframe tbody tr th:only-of-type {
-        vertical-align: middle;
-    }
-
-    .dataframe tbody tr th {
-        vertical-align: top;
-    }
-
-    .dataframe thead th {
-        text-align: right;
-    }
-  </style>
-  <table style="text-align: right;">
-    <tr><th>Workers</th> <td>%d</td></tr>
-    <tr><th>Cores</th> <td>%d</td></tr>
-    <tr><th>Memory</th> <td>%s</td></tr>
-  </table>
-</div>
-""" % (
-            workers,
-            cores,
-            memory,
-        )
-        return text
-
-    def _widget(self):
-        """ Create IPython widget for display within a notebook """
-        try:
-            return self._cached_widget
-        except AttributeError:
-            pass
-
-        from ipywidgets import (
-            Layout,
-            VBox,
-            HBox,
-            IntText,
-            Button,
-            HTML,
-            Accordion,
-            Text,
-        )
-
-        layout = Layout(width="150px")
-
-        if "dashboard" in self.scheduler.services:
-            link = self.dashboard_link
-            link = '<p><b>Dashboard: </b><a href="%s" target="_blank">%s</a></p>\n' % (
-                link,
-                link,
-            )
-        else:
-            link = ""
-
-        title = "<h2>%s</h2>" % type(self).__name__
-        title = HTML(title)
-        dashboard = HTML(link)
-
-        status = HTML(self._widget_status(), layout=Layout(min_width="150px"))
-
-        request = IntText(0, description="Workers", layout=layout)
-        scale = Button(description="Scale", layout=layout)
-        request_cores = IntText(0, description="Cores", layout=layout)
-        scale_cores = Button(description="Scale", layout=layout)
-        request_memory = Text("O GB", description="Memory", layout=layout)
-        scale_memory = Button(description="Scale", layout=layout)
-
-        minimum = IntText(0, description="Minimum", layout=layout)
-        maximum = IntText(0, description="Maximum", layout=layout)
-        adapt = Button(description="Adapt", layout=layout)
-        minimum_cores = IntText(0, description="Min cores", layout=layout)
-        maximum_cores = IntText(0, description="Max cores", layout=layout)
-        adapt_cores = Button(description="Adapt", layout=layout)
-        minimum_mem = Text("0 GB", description="Min memory", layout=layout)
-        maximum_mem = Text("0 GB", description="Max memory", layout=layout)
-        adapt_mem = Button(description="Adapt", layout=layout)
-
-        scale_hbox = [HBox([request, scale])]
-        adapt_hbox = [HBox([minimum, maximum, adapt])]
-        if hasattr(self, "jobqueue_worker_spec"):
-            scale_hbox.append(HBox([request_cores, scale_cores]))
-            scale_hbox.append(HBox([request_memory, scale_memory]))
-            adapt_hbox.append(HBox([minimum_cores, maximum_cores, adapt_cores]))
-            adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem]))
-
-        accordion = Accordion(
-            [VBox(scale_hbox), VBox(adapt_hbox)], layout=Layout(min_width="500px")
-        )
-        accordion.selected_index = None
-        accordion.set_title(0, "Manual Scaling")
-        accordion.set_title(1, "Adaptive Scaling")
-
-        box = VBox([title, HBox([status, accordion]), dashboard])
-
-        self._cached_widget = box
-
-        def adapt_cb(b):
-            self.adapt(minimum=minimum.value, maximum=maximum.value)
-
-        def adapt_cores_cb(b):
-            self.adapt(
-                minimum_cores=minimum_cores.value, maximum_cores=maximum_cores.value
-            )
-
-        def adapt_mem_cb(b):
-            self.adapt(
-                minimum_memory=minimum_mem.value, maximum_memory=maximum_mem.value
-            )
-
-        adapt.on_click(adapt_cb)
-        adapt_cores.on_click(adapt_cores_cb)
-        adapt_mem.on_click(adapt_mem_cb)
-
-        def scale_cb(request, kwarg):
-            def request_cb(b):
-                with log_errors():
-                    arg = request.value
-                    with ignoring(AttributeError):
-                        self._adaptive.stop()
-                    local_kwargs = dict()
-                    local_kwargs[kwarg] = arg
-                    self.scale(**local_kwargs)
-
-            return request_cb
-
-        scale.on_click(scale_cb(request, "n"))
-        scale_cores.on_click(scale_cb(request_cores, "cores"))
-        scale_memory.on_click(scale_cb(request_memory, "memory"))
-
-        def update():
-            status.value = self._widget_status()
-
-        pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop)
-        self.scheduler.periodic_callbacks["cluster-repr"] = pc
-        pc.start()
-
-        return box
-
-    def _ipython_display_(self, **kwargs):
-        return self._widget()._ipython_display_(**kwargs)
-
-    def worker_key(self, worker_state):
-        """ Callable mapping a WorkerState object to a group, see
-            Scheduler.workers_to_close
-        """
-        return worker_state
-
-    def _get_nb_workers_from_cores(self, cores):
-        return math.ceil(cores / self.jobqueue_worker_spec["cores"])
-
-    def _get_nb_workers_from_memory(self, memory):
-        return math.ceil(
-            parse_bytes(memory) / parse_bytes(self.jobqueue_worker_spec["memory"])
-        )
-
-    @property
-    def jobqueue_worker_spec(self):
-        """ single worker process info needed for scaling on cores or memory """
-        raise NotImplementedError(
-            "{} class does not provide jobqueue_worker_spec "
-            "attribute, needed for scaling with "
-            "cores or memory kwargs.".format(self.__class__.__name__)
-        )
-
-    @property
-    def loop(self):
-        return self.scheduler.loop
diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 942775a8..758a4c6a 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -59,7 +59,9 @@ class HTCondorJob(Job):
     # Python (can't find its libs), so we have to go through the shell.
     executable = "/bin/sh"
 
-    def __init__(self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs):
+    def __init__(
+        self, *args, disk=None, job_extra=None, config_name="htcondor", **kwargs
+    ):
         if disk is None:
             disk = dask.config.get("jobqueue.%s.disk" % config_name)
         if disk is None:
@@ -224,4 +226,6 @@ def quote_environment(env):
     return " ".join(entries)
 
 
-HTCondorCluster = functools.partial(JobQueueCluster, Job=HTCondorJob, config_name="htcondor")
+HTCondorCluster = functools.partial(
+    JobQueueCluster, Job=HTCondorJob, config_name="htcondor"
+)
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index c4de6230..d0652204 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -427,7 +427,3 @@ def job_script(self):
     @property
     def job_name(self):
         return self.example_job.job_name
-
-
-class EmptyJob(ProcessInterface):
-    pass
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index af289fd6..de812234 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -19,6 +19,7 @@
 from dask_jobqueue.sge import SGEJob
 
 
+@pytest.mark.xfail
 def test_errors():
     with pytest.raises(NotImplementedError) as info:
         JobQueueCluster(cores=4)

From 2014f02c334453f7d89b9021cd6dc426b6a92e6a Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 1 Sep 2019 11:47:43 -0700
Subject: [PATCH 073/109] Avoid modifying config value in place

---
 dask_jobqueue/job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index d0652204..fb12ed02 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -164,7 +164,7 @@ def __init__(
         self.job_header = None
 
         if interface:
-            extra += ["--interface", interface]
+            extra = extra + ["--interface", interface]
             kwargs.setdefault("host", get_ip_interface(interface))
         else:
             kwargs.setdefault("host", "")

From f2be218663a832a490c1c6e392c852229771b97a Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 14:13:27 -0700
Subject: [PATCH 074/109] Use subclasses for FooClusters rather than
 functools.partial

---
 dask_jobqueue/htcondor.py |  6 +++---
 dask_jobqueue/job.py      | 12 ++++++++----
 dask_jobqueue/local.py    |  4 +++-
 dask_jobqueue/lsf.py      |  4 +++-
 dask_jobqueue/moab.py     |  8 +++-----
 dask_jobqueue/oar.py      |  5 +++--
 dask_jobqueue/pbs.py      |  5 +++--
 dask_jobqueue/sge.py      |  5 +++--
 dask_jobqueue/slurm.py    |  8 +++++---
 9 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 758a4c6a..680b28a6 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -226,6 +226,6 @@ def quote_environment(env):
     return " ".join(entries)
 
 
-HTCondorCluster = functools.partial(
-    JobQueueCluster, Job=HTCondorJob, config_name="htcondor"
-)
+class HTCondorCluster(JobQueueCluster):
+    Job = HTCondorJob
+    config_name = "htcondor"
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index fb12ed02..21e3b7ba 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -344,6 +344,8 @@ def _call(cmd, **kwargs):
 
 
 class JobQueueCluster(SpecCluster):
+    Job = None
+
     def __init__(
         self,
         n_workers=0,
@@ -364,7 +366,10 @@ def __init__(
         **kwargs
     ):
         self.status = "created"
-        if Job is None:
+        if Job is not None:
+            self.Job = Job
+
+        if self.Job is None:
             raise ValueError(
                 "You must provide a Job type like PBSJob, SLURMJob, "
                 "or SGEJob with the Job= argument."
@@ -390,8 +395,7 @@ def __init__(
         kwargs["protocol"] = protocol
         kwargs["security"] = security
         self._kwargs = kwargs
-        self._Job = Job
-        worker = {"cls": Job, "options": kwargs}
+        worker = {"cls": self.Job, "options": kwargs}
         if "processes" in kwargs and kwargs["processes"] > 1:
             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
 
@@ -415,7 +419,7 @@ def example_job(self):
             address = self.scheduler.address
         except AttributeError:
             address = "tcp://scheduler:8786"
-        return self._Job(address or "tcp://scheduler:8786", name="name", **self._kwargs)
+        return self.Job(address or "tcp://scheduler:8786", name="name", **self._kwargs)
 
     @property
     def job_header(self):
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 1c269d0c..2e9adc5c 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -53,4 +53,6 @@ def _close_job(self, job_id):
         # terminate_process(self.process)
 
 
-LocalCluster = functools.partial(JobQueueCluster, Job=LocalJob, config_name="local")
+class LocalCluster(JobQueueCluster):
+    Job = LocalJob
+    config_name = "local"
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 31170f0c..2c232413 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -193,4 +193,6 @@ def lsf_detect_units():
     return unit
 
 
-LSFCluster = functools.partial(JobQueueCluster, Job=LSFJob, config_name="lsf")
+class LSFCluster(JobQueueCluster):
+    Job = LSFJob
+    config_name = "lsf"
diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py
index a6fe664e..d796ddac 100644
--- a/dask_jobqueue/moab.py
+++ b/dask_jobqueue/moab.py
@@ -1,7 +1,4 @@
-import functools
-
-from .job import JobQueueCluster
-from .pbs import PBSJob
+from .pbs import PBSJob, PBSCluster
 
 
 class MoabJob(PBSJob):
@@ -10,4 +7,5 @@ class MoabJob(PBSJob):
     scheduler_name = "moab"
 
 
-MoabCluster = functools.partial(JobQueueCluster, Job=MoabJob, config_name="pbs")
+class MoabCluster(PBSCluster):
+    Job = MoabJob
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index eb2196e9..40c9ee9e 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import shlex
 
@@ -126,4 +125,6 @@ def _submit_job(self, fn):
         return self._call(oarsub_command_split)
 
 
-OARCluster = functools.partial(JobQueueCluster, Job=OARJob, config_name="oar")
+class OARCluster(JobQueueCluster):
+    Job = OARJob
+    config_name = "oar"
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index e387cbfd..92a75bda 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import math
 import os
@@ -103,4 +102,6 @@ def __init__(
         logger.debug("Job script: \n %s" % self.job_script())
 
 
-PBSCluster = functools.partial(JobQueueCluster, Job=PBSJob, config_name="pbs")
+class PBSCluster(JobQueueCluster):
+    Job = PBSJob
+    config_name = "pbs"
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 79ced212..3a1a4a33 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -1,5 +1,4 @@
 import logging
-import functools
 
 import dask
 
@@ -68,4 +67,6 @@ def __init__(
         logger.debug("Job script: \n %s" % self.job_script())
 
 
-SGECluster = functools.partial(JobQueueCluster, Job=SGEJob, config_name="sge")
+class SGECluster(JobQueueCluster):
+    Job = SGEJob
+    config_name = "sge"
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index b5082c5b..c00adabc 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -110,9 +110,6 @@ def __init__(
         self.job_header = "\n".join(header_lines)
 
 
-SLURMCluster = functools.partial(JobQueueCluster, Job=SLURMJob, config_name="slurm")
-
-
 def slurm_format_bytes_ceil(n):
     """ Format bytes as text.
 
@@ -136,3 +133,8 @@ def slurm_format_bytes_ceil(n):
     if n >= 1024:
         return "%dK" % math.ceil(n / 1024)
     return "1K" % n
+
+
+class SLURMCluster(JobQueueCluster):
+    Job = SLURMJob
+    config_name = "slurm"

From 195c707e62b517dd522490ee3010ebe779932cd8 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 14:54:38 -0700
Subject: [PATCH 075/109] Add docstrings

---
 dask_jobqueue/__init__.py       |  4 +-
 dask_jobqueue/core.py           | 29 +------------
 dask_jobqueue/htcondor.py       | 56 ++++++++++++-------------
 dask_jobqueue/job.py            | 74 +++++++++++++++++++++++++--------
 dask_jobqueue/local.py          | 37 ++++++++++++++++-
 dask_jobqueue/lsf.py            | 52 +++++++++++++++++++++--
 dask_jobqueue/moab.py           |  1 +
 dask_jobqueue/oar.py            | 67 ++++++++++++++---------------
 dask_jobqueue/pbs.py            | 36 +++++++++++++++-
 dask_jobqueue/sge.py            | 49 +++++++++++++++++++++-
 dask_jobqueue/slurm.py          | 70 +++++++++++++++++++------------
 dask_jobqueue/tests/test_job.py | 32 ++++++++------
 12 files changed, 350 insertions(+), 157 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 3659631c..3ba45124 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -8,8 +8,8 @@
 from .slurm import SLURMCluster, SLURMJob
 from .sge import SGECluster, SGEJob
 from .lsf import LSFCluster, LSFJob
-from .oar import OARCluster
-from .htcondor import HTCondorCluster
+from .oar import OARCluster, OARJob
+from .htcondor import HTCondorCluster, HTCondorJob
 
 from ._version import get_versions
 
diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index 80d91c88..9ddda8d7 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -15,34 +15,7 @@ class JobQueueCluster:
 
     Parameters
     ----------
-    name : str
-        Name of Dask workers.
-    cores : int
-        Total number of cores per job
-    memory: str
-        Total amount of memory per job
-    processes : int
-        Number of processes per job
-    interface : str
-        Network interface like 'eth0' or 'ib0'.
-    death_timeout : float
-        Seconds to wait for a scheduler before closing workers
-    local_directory : str
-        Dask worker local directory for file spilling.
-    extra : list
-        Additional arguments to pass to `dask-worker`
-    env_extra : list
-        Other commands to add to script before launching worker.
-    log_directory : str
-        Directory to use for job scheduler logs.
-    shebang : str
-        Path to desired interpreter for your batch submission script.
-    python : str
-        Python executable used to launch Dask workers.
-    config_name : str
-        Section to use from jobqueue.yaml configuration file.
-    kwargs : dict
-        Additional keyword arguments to pass to `LocalCluster`
+    %{job_parameters}s
 
     Attributes
     ----------
diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 680b28a6..ce8eaf37 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -7,40 +7,12 @@
 from distributed.utils import parse_bytes
 
 from .core import docstrings
-from .job import JobQueueCluster, Job
+from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
 
 class HTCondorJob(Job):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on an HTCondor cluster with a shared file system
-
-    Parameters
-    ----------
-    disk : str
-        Total amount of disk per job
-    job_extra : dict
-        Extra submit file attributes for the job
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> from dask_jobqueue.htcondor import HTCondorCluster
-    >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB")
-    >>> cluster.scale(10)
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-    HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly.
-
-    >>> cluster.adapt(minimum=5, startup_cost='60s')
-    """,
-        4,
-    )
-
     _script_template = """
 %(shebang)s
 
@@ -227,5 +199,31 @@ def quote_environment(env):
 
 
 class HTCondorCluster(JobQueueCluster):
+    __doc__ = """
+    Launch Dask on an HTCondor cluster with a shared file system
+
+    Parameters
+    ----------
+    disk : str
+        Total amount of disk per job
+    job_extra : dict
+        Extra submit file attributes for the job
+    {job}
+    {cluster}
+
+    Examples
+    --------
+    >>> from dask_jobqueue.htcondor import HTCondorCluster
+    >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB")
+    >>> cluster.scale(10)
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and kills workers based on load.
+    HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly.
+
+    >>> cluster.adapt(minimum=5, startup_cost='60s')
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = HTCondorJob
     config_name = "htcondor"
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 21e3b7ba..54a9f544 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -16,31 +16,22 @@
 
 logger = logging.getLogger(__name__)
 
-
-class Job(ProcessInterface):
-    """ Base class to launch Dask workers on Job queues
-
-    This class should not be used directly, use inherited class appropriate for
-    your queueing system (e.g. PBScluster or SLURMCluster)
-
-    Parameters
-    ----------
-    name : str
-        Name of Dask workers.
+job_parameters = """
     cores : int
         Total number of cores per job
     memory: str
         Total amount of memory per job
     processes : int
-        Number of processes per job
-    nanny : bool
-        Whether or not to start a nanny process
+        Cut the job up into this many processes.
+        Good for GIL workloads or for nodes with many cores.
     interface : str
         Network interface like 'eth0' or 'ib0'.
-    death_timeout : float
-        Seconds to wait for a scheduler before closing workers
+    nanny : bool
+        Whether or not to start a nanny process
     local_directory : str
         Dask worker local directory for file spilling.
+    death_timeout : float
+        Seconds to wait for a scheduler before closing workers
     extra : list
         Additional arguments to pass to `dask-worker`
     env_extra : list
@@ -54,8 +45,40 @@ class Job(ProcessInterface):
         Path to desired interpreter for your batch submission script.
     python : str
         Python executable used to launch Dask workers.
+        Defaults to the Python that is submitting these jobs
     config_name : str
         Section to use from jobqueue.yaml configuration file.
+    name : str
+        Name of Dask worker.  This is typically set by the Cluster
+""".strip()
+
+
+cluster_parameters = """
+    n_workers : int
+        Number of workers to start by default.  Defaults to 0.
+        See the scale method
+    silence_logs : str
+        Log level like "debug", "info", or "error" to emit here if the
+        scheduler is started locally
+    asynchronous : bool
+        Whether or not to run this cluster object with the async/await syntax
+    security : Security
+        A dask.distributed security object if you're using TLS/SSL
+    dashboard_address : str or int
+        An address like ":8787" on which to host the Scheduler's dashboard
+""".strip()
+
+
+
+class Job(ProcessInterface):
+    """ Base class to launch Dask workers on Job queues
+
+    This class should not be used directly, use inherited class appropriate for
+    your queueing system (e.g. PBSCluster or SLURMCluster)
+
+    Parameters
+    ----------
+    {job_parameters}
 
     Attributes
     ----------
@@ -74,7 +97,7 @@ class Job(ProcessInterface):
     OARCluster
     LSFCluster
     MoabCluster
-    """
+    """.format(job_parameters=job_parameters)
 
     _script_template = """
 %(shebang)s
@@ -344,6 +367,23 @@ def _call(cmd, **kwargs):
 
 
 class JobQueueCluster(SpecCluster):
+    __doc__ = """ Deploy Dask on a Job queuing system
+
+    This is a superclass, and is rarely used directly.  It is more common to
+    use an object like SGECluster, SLURMCluster, PBSCluster, LSFCluster, or
+    others.
+
+    However, it can be used directly if you have a custom ``Job`` type.
+    This class relies heavily on being passed a ``Job`` type that is able to
+    launch one Job on a job queueing system.
+
+    Parameters
+    ----------
+    Job : Job
+        A class that can be awaited to ask for a single Job
+    {cluster_parameters}
+    """.format(cluster_parameters=cluster_parameters)
+
     Job = None
 
     def __init__(
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 2e9adc5c..155ca09e 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -3,13 +3,23 @@
 import os
 import subprocess
 
-from .job import Job, JobQueueCluster
+from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
 
 class LocalJob(Job):
-    """ This is mostly used for testing.  It runs locally. """
+    __doc__ = """ Use Dask Jobqueue with local bash commands
+
+    This is mostly for testing.  It uses all the same machinery of
+    dask-jobqueue, but rather than submitting jobs to some external job
+    queueing system, it launches them locally.  For normal local use, please
+    see ``dask.distributed.LocalCluster``
+
+    Parameters
+    ----------
+    {job}
+    """.format(job=job_parameters)
 
     config_name = "local"
 
@@ -54,5 +64,28 @@ def _close_job(self, job_id):
 
 
 class LocalCluster(JobQueueCluster):
+    __doc__ = """ Use dask-jobqueue with local bash commands
+
+    This is mostly for testing.  It uses all the same machinery of
+    dask-jobqueue, but rather than submitting jobs to some external job
+    queueing system, it launches them locally.  For normal local use, please
+    see ``dask.distributed.LocalCluster``
+
+    Parameters
+    ----------
+    {job}
+
+    {cluster}
+
+    Examples
+    --------
+    >>> from dask_jobqueue import LocalCluster
+    >>> cluster = LocalCluster(cores=2, memory="4 GB")
+    >>> cluster.scale(3)
+
+    See Also
+    --------
+    dask.distributed.LocalCluster
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = LocalJob
     config_name = "local"
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 2c232413..151ed8f4 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -5,15 +5,14 @@
 
 import dask
 
-from .job import Job, JobQueueCluster
+from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
 
 class LSFJob(Job):
-    """ Launch Dask on a LSF cluster
+    __doc__ = """ Launch Dask on a LSF cluster
 
-    See also the docstring for Job for more parameters
 
     Parameters
     ----------
@@ -22,6 +21,7 @@ class LSFJob(Job):
     project : str
         Accounting string associated with each worker job. Passed to
         `#BSUB -P` option.
+    {job}
     ncpus : int
         Number of cpus. Passed to `#BSUB -n` option.
     mem : int
@@ -49,7 +49,7 @@ class LSFJob(Job):
     kill workers based on load.
 
     >>> cluster.adapt()
-    """
+    """.format(job=job_parameters)
 
     submit_command = "bsub"
     cancel_command = "bkill"
@@ -194,5 +194,49 @@ def lsf_detect_units():
 
 
 class LSFCluster(JobQueueCluster):
+    __doc__ = """
+    Launch Dask on a LSF cluster
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#BSUB -q` option.
+    project : str
+        Accounting string associated with each worker job. Passed to
+        `#BSUB -P` option.
+
+    {job}
+
+    ncpus : int
+        Number of cpus. Passed to `#BSUB -n` option.
+    mem : int
+        Request memory in bytes. Passed to `#BSUB -M` option.
+    walltime : str
+        Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option.
+
+    {cluster}
+
+    job_extra : list
+        List of other LSF options, for example -u. Each option will be
+        prepended with the #LSF prefix.
+    lsf_units : str
+        Unit system for large units in resource usage set by the
+        LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster.
+
+    Examples
+    --------
+    >>> from dask_jobqueue import LSFCluster
+    >>> cluster = LSFCluster(queue='general', project='DaskonLSF',
+    ...                      cores=15, memory='25GB')
+    >>> cluster.scale(10)  # this may take a few seconds to launch
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and
+    kills workers based on load.
+
+    >>> cluster.adapt()
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = LSFJob
     config_name = "lsf"
diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py
index d796ddac..c2f8f3f6 100644
--- a/dask_jobqueue/moab.py
+++ b/dask_jobqueue/moab.py
@@ -8,4 +8,5 @@ class MoabJob(PBSJob):
 
 
 class MoabCluster(PBSCluster):
+    __doc__ = PBSCluster.__doc__.replace("PBSCluster", "MoabCluster")
     Job = MoabJob
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 40c9ee9e..9d0b4616 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -4,44 +4,12 @@
 import dask
 
 from .core import docstrings
-from .job import JobQueueCluster, Job
+from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
 
 class OARJob(Job):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a OAR cluster
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#OAR -q` option.
-    project : str
-        Accounting string associated with each worker job. Passed to `#OAR -p` option.
-    resource_spec : str
-        Request resources and specify job placement. Passed to `#OAR -l` option.
-    walltime : str
-        Walltime for each worker job.
-    job_extra : list
-        List of other OAR options, for example `-t besteffort`. Each option will be prepended with the #OAR prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    >>> from dask_jobqueue import OARCluster
-    >>> cluster = OARCluster(queue='regular')
-    >>> cluster.scale(10)  # this may take a few seconds to launch
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-
-    >>> cluster.adapt()
-    """,
-        4,
-    )
 
     # Override class variables
     submit_command = "oarsub"
@@ -126,5 +94,38 @@ def _submit_job(self, fn):
 
 
 class OARCluster(JobQueueCluster):
+    __doc__ = """ Launch Dask on an OAR cluster
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#OAR -q` option.
+    project : str
+        Accounting string associated with each worker job. Passed to `#OAR -p` option.
+
+    {job}
+
+    {cluster}
+
+    resource_spec : str
+        Request resources and specify job placement. Passed to `#OAR -l` option.
+    walltime : str
+        Walltime for each worker job.
+    job_extra : list
+        List of other OAR options, for example `-t besteffort`. Each option will be prepended with the #OAR prefix.
+
+    Examples
+    --------
+    >>> from dask_jobqueue import OARCluster
+    >>> cluster = OARCluster(queue='regular')
+    >>> cluster.scale(10)  # this may take a few seconds to launch
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and kills workers based on load.
+
+    >>> cluster.adapt()
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = OARJob
     config_name = "oar"
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 92a75bda..3142f08a 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -4,7 +4,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster
+from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
@@ -103,5 +103,39 @@ def __init__(
 
 
 class PBSCluster(JobQueueCluster):
+    __doc__ = """ Launch Dask on a PBS cluster
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#PBS -q` option.
+    project : str
+        Accounting string associated with each worker job. Passed to `#PBS -A` option.
+
+    {job}
+
+    {cluster}
+
+    resource_spec : str
+        Request resources and specify job placement. Passed to `#PBS -l` option.
+    walltime : str
+        Walltime for each worker job.
+    job_extra : list
+        List of other PBS options. Each option will be prepended with the #PBS prefix.
+
+    Examples
+    --------
+    >>> from dask_jobqueue import PBSCluster
+    >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24,
+    ...     memory="500 GB")
+    >>> cluster.scale(10)  # Ask for ten jobs
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and kills workers based on load.
+
+    >>> cluster.adapt()
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = PBSJob
     config_name = "pbs"
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 3a1a4a33..4bd2ba0e 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -2,7 +2,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster
+from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
@@ -68,5 +68,52 @@ def __init__(
 
 
 class SGECluster(JobQueueCluster):
+    __doc__ = """
+    Launch Dask on an SGE cluster
+
+    .. note::
+        If you want a specific amount of RAM, both ``memory`` and ``resource_spec``
+        must be specified. The exact syntax of ``resource_spec`` is defined by your
+        GridEngine system administrator. The amount of ``memory`` requested should
+        match the ``resource_spec``, so that Dask's memory management system can
+        perform accurately.
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#$ -q` option.
+    project : str
+        Accounting string associated with each worker job. Passed to `#$ -A` option.
+
+    {job}
+
+    {cluster}
+
+    resource_spec : str
+        Request resources and specify job placement. Passed to `#$ -l` option.
+    walltime : str
+        Walltime for each worker job.
+    job_extra : list
+        List of other SGE options, for example -w e. Each option will be
+        prepended with the #$ prefix.
+
+    Examples
+    --------
+    >>> from dask_jobqueue import SGECluster
+    >>> cluster = SGECluster(
+    ...     queue='regular',
+    ...     project="myproj",
+    ...     cores=24,
+    ...     memory="500 GB"
+    ... )
+    >>> cluster.scale(10)  # this may take a few seconds to launch
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and kills workers based on load.
+
+    >>> cluster.adapt()
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = SGEJob
     config_name = "sge"
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index c00adabc..2b2e4c6e 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -5,38 +5,12 @@
 import dask
 
 from .core import docstrings
-from .job import Job, JobQueueCluster
+from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
 
 class SLURMJob(Job):
-    __doc__ = docstrings.with_indents(
-        """ Launch Dask on a SLURM cluster
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#SBATCH -p` option.
-    project : str
-        Accounting string associated with each worker job. Passed to `#SBATCH -A` option.
-    walltime : str
-        Walltime for each worker job.
-    job_cpu : int
-        Number of cpu to book in SLURM, if None, defaults to worker `threads * processes`
-    job_mem : str
-        Amount of memory to request in SLURM. If None, defaults to worker
-        processes * memory
-    job_extra : list
-        List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix.
-    %(JobQueueCluster.parameters)s
-
-    Examples
-    --------
-    """,
-        4,
-    )
-
     # Override class variables
     submit_command = "sbatch"
     cancel_command = "scancel"
@@ -136,5 +110,47 @@ def slurm_format_bytes_ceil(n):
 
 
 class SLURMCluster(JobQueueCluster):
+    __doc__ = """
+    Launch Dask on a SLURM cluster
+
+    Parameters
+    ----------
+    queue : str
+        Destination queue for each worker job. Passed to `#SBATCH -p` option.
+    project : str
+        Accounting string associated with each worker job. Passed to `#SBATCH -A` option.
+
+    {job}
+
+    {cluster}
+
+    walltime : str
+        Walltime for each worker job.
+    job_cpu : int
+        Number of cpu to book in SLURM, if None, defaults to worker `threads * processes`
+    job_mem : str
+        Amount of memory to request in SLURM. If None, defaults to worker
+        processes * memory
+    job_extra : list
+        List of other Slurm options, for example -j oe. Each option will be prepended with the #SBATCH prefix.
+
+    Examples
+    --------
+    >>> from dask_jobqueue import SLURMCluster
+    >>> cluster = SLURMCluster(
+    ...     queue='regular',
+    ...     project="myproj",
+    ...     cores=24,
+    ...     memory="500 GB"
+    ... )
+    >>> cluster.scale(10)  # this may take a few seconds to launch
+
+    >>> from dask.distributed import Client
+    >>> client = Client(cluster)
+
+    This also works with adaptive clusters.  This automatically launches and kills workers based on load.
+
+    >>> cluster.adapt()
+    """.format(job=job_parameters, cluster=cluster_parameters)
     Job = SLURMJob
     config_name = "slurm"
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index c6874ec1..3c2d04a2 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,7 +1,9 @@
 import asyncio
 from time import time
 
-from dask_jobqueue import PBSJob, SGEJob, SLURMJob, LSFJob, LocalJob, LocalCluster
+from dask_jobqueue import (PBSJob, PBSCluster, SGEJob, SGECluster, SLURMJob,
+        SLURMCluster, LSFJob, LSFCluster, LocalJob, LocalCluster,
+        HTCondorJob, HTCondorCluster, MoabJob, MoabCluster, OARJob, OARCluster)
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -13,7 +15,7 @@ def test_basic():
     assert "127.0.0.1:12345" in job.job_script()
 
 
-job_params = [
+job_protected = [
     pytest.param(SGEJob, marks=[pytest.mark.env("sge")]),
     pytest.param(PBSJob, marks=[pytest.mark.env("pbs")]),
     pytest.param(SLURMJob, marks=[pytest.mark.env("slurm")]),
@@ -22,33 +24,30 @@ def test_basic():
 ]
 
 
-@pytest.mark.parametrize("Job", job_params)
+all_jobs = [SGEJob, PBSJob, SLURMJob, LSFJob, HTCondorJob, MoabJob, OARJob]
+all_clusters = [SGECluster, PBSCluster, SLURMCluster, LSFCluster,
+        HTCondorCluster, MoabCluster, OARCluster]
+
+
+@pytest.mark.parametrize("Job", job_protected)
 @pytest.mark.asyncio
 async def test_job(Job):
     async with Scheduler(port=0) as s:
-        print(1)
         job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB")
-        print(2)
         job = await job
-        print(3)
         async with Client(s.address, asynchronous=True) as client:
-            print(4)
             await client.wait_for_workers(1)
-            print(5)
             assert list(s.workers.values())[0].name == "foo"
 
-        print(6)
         await job.close()
-        print(7)
 
         start = time()
         while len(s.workers):
             await asyncio.sleep(0.1)
             assert time() < start + 10
-        print(8)
 
 
-@pytest.mark.parametrize("Job", job_params)
+@pytest.mark.parametrize("Job", job_protected)
 @pytest.mark.asyncio
 async def test_cluster(Job):
     async with JobQueueCluster(
@@ -71,7 +70,7 @@ async def test_cluster(Job):
                 assert time() < start + 10
 
 
-@pytest.mark.parametrize("Job", job_params)
+@pytest.mark.parametrize("Job", job_protected)
 @pytest.mark.asyncio
 async def test_adapt(Job):
     async with JobQueueCluster(
@@ -121,9 +120,16 @@ async def test_nprocs():
             await client.wait_for_workers(2)
             assert len(cluster.workers) == 1  # two workers, one job
             assert len(s.workers) == 2
+            breakpoint()
             assert cluster.plan == {ws.name for ws in s.workers.values()}
 
             cluster.scale(cores=1)
             await cluster
             await asyncio.sleep(0.2)
             assert len(cluster.scheduler.workers) == 2  # they're still one group
+
+
+@pytest.mark.parametrize("Cluster", all_clusters)
+def test_docstring_cluster(Cluster):
+    assert "cores :" in Cluster.__doc__
+    assert Cluster.__name__[:-len("Cluster")] in Cluster.__doc__

From fccd895f725516eed0d5f50b7491601f0ddc5e9d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 15:01:29 -0700
Subject: [PATCH 076/109] Remove docrep and core.py

---
 ci/none.sh                |  2 +-
 ci/pbs/Dockerfile         |  2 +-
 ci/slurm/Dockerfile       |  2 +-
 dask_jobqueue/__init__.py |  3 +--
 dask_jobqueue/core.py     | 47 ---------------------------------------
 dask_jobqueue/htcondor.py |  2 --
 dask_jobqueue/oar.py      |  1 -
 dask_jobqueue/slurm.py    |  1 -
 docs/environment.yml      |  1 -
 requirements.txt          |  1 -
 10 files changed, 4 insertions(+), 58 deletions(-)
 delete mode 100644 dask_jobqueue/core.py

diff --git a/ci/none.sh b/ci/none.sh
index c5ea74ab..f4c1a042 100644
--- a/ci/none.sh
+++ b/ci/none.sh
@@ -4,7 +4,7 @@ function jobqueue_before_install {
   # Install miniconda
   ./ci/conda_setup.sh
   export PATH="$HOME/miniconda/bin:$PATH"
-  conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest docrep pytest-asyncio
+  conda install --yes -c conda-forge python=$TRAVIS_PYTHON_VERSION dask distributed flake8 black pytest pytest-asyncio
   pip install git+https://github.com/dask/distributed@master --upgrade --no-deps
 }
 
diff --git a/ci/pbs/Dockerfile b/ci/pbs/Dockerfile
index b3423e88..8c4a2aa7 100644
--- a/ci/pbs/Dockerfile
+++ b/ci/pbs/Dockerfile
@@ -30,7 +30,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     bash miniconda.sh -f -b -p /opt/anaconda && \
     /opt/anaconda/bin/conda clean -tipy && \
     rm -f miniconda.sh
-RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
+RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio
 RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 # Copy entrypoint and other needed scripts
diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile
index 1a8cc112..6c6c2439 100644
--- a/ci/slurm/Dockerfile
+++ b/ci/slurm/Dockerfile
@@ -5,7 +5,7 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
     /opt/anaconda/bin/conda clean -tipy && \
     rm -f miniconda.sh
 ENV PATH /opt/anaconda/bin:$PATH
-RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep pytest-asyncio
+RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest pytest-asyncio
 RUN pip install git+https://github.com/dask/distributed --upgrade --no-deps
 
 ENV LC_ALL en_US.UTF-8
diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 3ba45124..f366bc46 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,8 +1,7 @@
 # flake8: noqa
 from . import config
 from .local import LocalJob, LocalCluster
-from .core import JobQueueCluster
-from .job import Job
+from .job import Job, JobQueueCluster
 from .moab import MoabCluster, MoabJob
 from .pbs import PBSCluster, PBSJob
 from .slurm import SLURMCluster, SLURMJob
diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
deleted file mode 100644
index 9ddda8d7..00000000
--- a/dask_jobqueue/core.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import docrep
-
-docstrings = docrep.DocstringProcessor()
-
-
-# TODO: remove this class after we figure out docstrings
-
-
-@docstrings.get_sectionsf("JobQueueCluster")
-class JobQueueCluster:
-    """ Base class to launch Dask Clusters for Job queues
-
-    This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster
-    or SLURMCluster)
-
-    Parameters
-    ----------
-    %{job_parameters}s
-
-    Attributes
-    ----------
-    submit_command: str
-        Abstract attribute for job scheduler submit command,
-        should be overridden
-    cancel_command: str
-        Abstract attribute for job scheduler cancel command,
-        should be overridden
-
-    See Also
-    --------
-    PBSCluster
-    SLURMCluster
-    SGECluster
-    OARCluster
-    LSFCluster
-    MoabCluster
-    """
-
-    _script_template = """
-%(shebang)s
-
-%(job_header)s
-
-%(env_header)s
-
-%(worker_command)s
-""".lstrip()
diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index ce8eaf37..cbd55275 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import re
 import shlex
@@ -6,7 +5,6 @@
 import dask
 from distributed.utils import parse_bytes
 
-from .core import docstrings
 from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 9d0b4616..119c97dc 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -3,7 +3,6 @@
 
 import dask
 
-from .core import docstrings
 from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 2b2e4c6e..9abb2bea 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -4,7 +4,6 @@
 
 import dask
 
-from .core import docstrings
 from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
diff --git a/docs/environment.yml b/docs/environment.yml
index bf834ebf..7d9241cd 100644
--- a/docs/environment.yml
+++ b/docs/environment.yml
@@ -4,7 +4,6 @@ channels:
 dependencies:
   - python=3.6
   - distributed
-  - docrep
   - numpydoc
   - ipython
   - sphinx
diff --git a/requirements.txt b/requirements.txt
index 0834a6ff..87c3fc36 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 dask>=2.3
 distributed>=2.3
-docrep

From f3ee152a06b7c4e31a6d2545328dd2fcc6c292fb Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 15:01:48 -0700
Subject: [PATCH 077/109] remove errant breakpoint

---
 dask_jobqueue/tests/test_job.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 3c2d04a2..31992742 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -120,7 +120,6 @@ async def test_nprocs():
             await client.wait_for_workers(2)
             assert len(cluster.workers) == 1  # two workers, one job
             assert len(s.workers) == 2
-            breakpoint()
             assert cluster.plan == {ws.name for ws in s.workers.values()}
 
             cluster.scale(cores=1)

From 5ec910e76fcd6ebbb6ee56f2e3bb80008b255d33 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 15:02:19 -0700
Subject: [PATCH 078/109] flake8 and black

---
 dask_jobqueue/htcondor.py       |  4 +++-
 dask_jobqueue/job.py            |  9 ++++++---
 dask_jobqueue/local.py          |  9 ++++++---
 dask_jobqueue/lsf.py            |  9 ++++++---
 dask_jobqueue/oar.py            |  4 +++-
 dask_jobqueue/pbs.py            |  4 +++-
 dask_jobqueue/sge.py            |  4 +++-
 dask_jobqueue/slurm.py          |  5 +++--
 dask_jobqueue/tests/test_job.py | 34 +++++++++++++++++++++++++++------
 9 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index cbd55275..7e70c240 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -222,6 +222,8 @@ class HTCondorCluster(JobQueueCluster):
     HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly.
 
     >>> cluster.adapt(minimum=5, startup_cost='60s')
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = HTCondorJob
     config_name = "htcondor"
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 54a9f544..65a57889 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -69,7 +69,6 @@
 """.strip()
 
 
-
 class Job(ProcessInterface):
     """ Base class to launch Dask workers on Job queues
 
@@ -97,7 +96,9 @@ class Job(ProcessInterface):
     OARCluster
     LSFCluster
     MoabCluster
-    """.format(job_parameters=job_parameters)
+    """.format(
+        job_parameters=job_parameters
+    )
 
     _script_template = """
 %(shebang)s
@@ -382,7 +383,9 @@ class JobQueueCluster(SpecCluster):
     Job : Job
         A class that can be awaited to ask for a single Job
     {cluster_parameters}
-    """.format(cluster_parameters=cluster_parameters)
+    """.format(
+        cluster_parameters=cluster_parameters
+    )
 
     Job = None
 
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 155ca09e..097f1453 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import os
 import subprocess
@@ -19,7 +18,9 @@ class LocalJob(Job):
     Parameters
     ----------
     {job}
-    """.format(job=job_parameters)
+    """.format(
+        job=job_parameters
+    )
 
     config_name = "local"
 
@@ -86,6 +87,8 @@ class LocalCluster(JobQueueCluster):
     See Also
     --------
     dask.distributed.LocalCluster
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = LocalJob
     config_name = "local"
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 151ed8f4..6e47b3cd 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import math
 import os
@@ -49,7 +48,9 @@ class LSFJob(Job):
     kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters)
+    """.format(
+        job=job_parameters
+    )
 
     submit_command = "bsub"
     cancel_command = "bkill"
@@ -237,6 +238,8 @@ class LSFCluster(JobQueueCluster):
     kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = LSFJob
     config_name = "lsf"
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 119c97dc..7c9ee2a8 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -125,6 +125,8 @@ class OARCluster(JobQueueCluster):
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = OARJob
     config_name = "oar"
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 3142f08a..6dd04738 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -136,6 +136,8 @@ class PBSCluster(JobQueueCluster):
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = PBSJob
     config_name = "pbs"
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 4bd2ba0e..f4cefca5 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -114,6 +114,8 @@ class SGECluster(JobQueueCluster):
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = SGEJob
     config_name = "sge"
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 9abb2bea..3add6e9a 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -1,4 +1,3 @@
-import functools
 import logging
 import math
 
@@ -150,6 +149,8 @@ class SLURMCluster(JobQueueCluster):
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
     >>> cluster.adapt()
-    """.format(job=job_parameters, cluster=cluster_parameters)
+    """.format(
+        job=job_parameters, cluster=cluster_parameters
+    )
     Job = SLURMJob
     config_name = "slurm"
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 31992742..2285ca31 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -1,9 +1,24 @@
 import asyncio
 from time import time
 
-from dask_jobqueue import (PBSJob, PBSCluster, SGEJob, SGECluster, SLURMJob,
-        SLURMCluster, LSFJob, LSFCluster, LocalJob, LocalCluster,
-        HTCondorJob, HTCondorCluster, MoabJob, MoabCluster, OARJob, OARCluster)
+from dask_jobqueue import (
+    PBSJob,
+    PBSCluster,
+    SGEJob,
+    SGECluster,
+    SLURMJob,
+    SLURMCluster,
+    LSFJob,
+    LSFCluster,
+    LocalJob,
+    LocalCluster,
+    HTCondorJob,
+    HTCondorCluster,
+    MoabJob,
+    MoabCluster,
+    OARJob,
+    OARCluster,
+)
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
@@ -25,8 +40,15 @@ def test_basic():
 
 
 all_jobs = [SGEJob, PBSJob, SLURMJob, LSFJob, HTCondorJob, MoabJob, OARJob]
-all_clusters = [SGECluster, PBSCluster, SLURMCluster, LSFCluster,
-        HTCondorCluster, MoabCluster, OARCluster]
+all_clusters = [
+    SGECluster,
+    PBSCluster,
+    SLURMCluster,
+    LSFCluster,
+    HTCondorCluster,
+    MoabCluster,
+    OARCluster,
+]
 
 
 @pytest.mark.parametrize("Job", job_protected)
@@ -131,4 +153,4 @@ async def test_nprocs():
 @pytest.mark.parametrize("Cluster", all_clusters)
 def test_docstring_cluster(Cluster):
     assert "cores :" in Cluster.__doc__
-    assert Cluster.__name__[:-len("Cluster")] in Cluster.__doc__
+    assert Cluster.__name__[: -len("Cluster")] in Cluster.__doc__

From 3ad195d49efffe8f81777fc15b11122bc3463be7 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Sun, 8 Sep 2019 17:45:07 -0700
Subject: [PATCH 079/109] xfail minimum/maximum cores/memory test

---
 dask_jobqueue/tests/test_pbs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 80911ea2..815580f7 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -248,6 +248,7 @@ def test_adaptive_grouped(loop):
                 assert time() < start + QUEUE_WAIT
 
 
+@pytest.mark.xfail(reason="adapt doesn't yet have cores/memory")
 @pytest.mark.env("pbs")
 def test_adaptive_cores_mem(loop):
     with PBSCluster(

From 9deb3bdfbb2a477dd7419d65bb704fe579cee2a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Tue, 10 Sep 2019 15:31:34 +0200
Subject: [PATCH 080/109] Small tweaks and TODO.

---
 dask_jobqueue/htcondor.py |  3 +--
 dask_jobqueue/job.py      | 26 +++++++++++++++-------
 dask_jobqueue/lsf.py      | 45 +--------------------------------------
 dask_jobqueue/pbs.py      |  2 +-
 dask_jobqueue/sge.py      |  3 +--
 dask_jobqueue/slurm.py    |  4 +---
 6 files changed, 23 insertions(+), 60 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 7e70c240..57720f62 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -197,8 +197,7 @@ def quote_environment(env):
 
 
 class HTCondorCluster(JobQueueCluster):
-    __doc__ = """
-    Launch Dask on an HTCondor cluster with a shared file system
+    __doc__ = """ Launch Dask on an HTCondor cluster with a shared file system
 
     Parameters
     ----------
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 65a57889..cb2c88cc 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -72,8 +72,8 @@
 class Job(ProcessInterface):
     """ Base class to launch Dask workers on Job queues
 
-    This class should not be used directly, use inherited class appropriate for
-    your queueing system (e.g. PBScluster or SLURMCluster)
+    This class should not be used directly, use a class appropriate for
+    your queueing system (e.g. PBScluster or SLURMCluster) instead.
 
     Parameters
     ----------
@@ -144,9 +144,12 @@ def __init__(
         if config_name is None:
             config_name = getattr(type(self), "config_name", None)
 
+        # TODO I think the __init__ should be an abstractmethod rather than relying on config_name ...
         if config_name is None:
             raise NotImplementedError(
-                "JobQueueCluster is an abstract class that should not be instantiated."
+                "Job is an abstract class that should not be instantiated."
+                "Use a cluster class appropriate to your job queueing system, "
+                "e.g. PBSCluster or SLURMCluster"
             )
 
         if job_name is None:
@@ -203,6 +206,7 @@ def __init__(
         self.shebang = shebang
 
         self._env_header = "\n".join(filter(None, env_extra))
+        # TODO: should skip be part of this PR?
         self.header_skip = set(header_skip)
 
         # dask-worker command line build
@@ -278,12 +282,13 @@ async def start(self):
 
         with self.job_file() as fn:
             out = self._submit_job(fn)
-            job = self._job_id_from_submit_output(out)
-            if not job:
-                raise ValueError("Unable to parse jobid from output of %s" % out)
-            self.job_id = job
+            job_id = self._job_id_from_submit_output(out)
+            # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError
+            if not job_id:
+                raise ValueError("Unable to parse job id from output of %s" % out)
+            self.job_id = job_id
 
-        weakref.finalize(self, self._close_job, job)
+        weakref.finalize(self, self._close_job, job_id)
 
         logger.debug("Starting job: %s", self.job_id)
         await super().start()
@@ -318,6 +323,7 @@ def _close_job(cls, job_id):
         if job_id:
             with ignoring(RuntimeError):  # deleting job when job already gone
                 cls._call(shlex.split(cls.cancel_command) + [job_id])
+            # TODO: Maybe a log.debug here
 
     @staticmethod
     def _call(cmd, **kwargs):
@@ -387,6 +393,7 @@ class JobQueueCluster(SpecCluster):
         cluster_parameters=cluster_parameters
     )
 
+    # TODO: I have a slight preference for a parameter like job_cls
     Job = None
 
     def __init__(
@@ -442,6 +449,9 @@ def __init__(
         if "processes" in kwargs and kwargs["processes"] > 1:
             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
 
+        # TODO: this seems like this sets self.scheduler.address, is there a
+        # less magical way of doing the same thing?
+        # self.example_job is also used for cluster.job_script()
         self.example_job  # trigger property to ensure that the job is valid
 
         super().__init__(
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 6e47b3cd..4fb6a1a6 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -10,48 +10,6 @@
 
 
 class LSFJob(Job):
-    __doc__ = """ Launch Dask on a LSF cluster
-
-
-    Parameters
-    ----------
-    queue : str
-        Destination queue for each worker job. Passed to `#BSUB -q` option.
-    project : str
-        Accounting string associated with each worker job. Passed to
-        `#BSUB -P` option.
-    {job}
-    ncpus : int
-        Number of cpus. Passed to `#BSUB -n` option.
-    mem : int
-        Request memory in bytes. Passed to `#BSUB -M` option.
-    walltime : str
-        Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option.
-    job_extra : list
-        List of other LSF options, for example -u. Each option will be
-        prepended with the #LSF prefix.
-    lsf_units : str
-        Unit system for large units in resource usage set by the
-        LSF_UNIT_FOR_LIMITS in the lsf.conf file of a cluster.
-
-    Examples
-    --------
-    >>> from dask_jobqueue import LSFCluster
-    >>> cluster = LSFCluster(queue='general', project='DaskonLSF',
-    ...                      cores=15, memory='25GB')
-    >>> cluster.scale(10)  # this may take a few seconds to launch
-
-    >>> from dask.distributed import Client
-    >>> client = Client(cluster)
-
-    This also works with adaptive clusters.  This automatically launches and
-    kill workers based on load.
-
-    >>> cluster.adapt()
-    """.format(
-        job=job_parameters
-    )
-
     submit_command = "bsub"
     cancel_command = "bkill"
 
@@ -195,8 +153,7 @@ def lsf_detect_units():
 
 
 class LSFCluster(JobQueueCluster):
-    __doc__ = """
-    Launch Dask on a LSF cluster
+    __doc__ = """ Launch Dask on a LSF cluster
 
     Parameters
     ----------
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 6dd04738..bc412750 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -103,7 +103,7 @@ def __init__(
 
 
 class PBSCluster(JobQueueCluster):
-    __doc__ = """ Launch Dask on an OAR cluster
+    __doc__ = """ Launch Dask on a PBS cluster
 
     Parameters
     ----------
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index f4cefca5..3a669dd4 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -68,8 +68,7 @@ def __init__(
 
 
 class SGECluster(JobQueueCluster):
-    __doc__ = """
-    Launch Dask on an SGE cluster
+    __doc__ = """ Launch Dask on an SGE cluster
 
     .. note::
         If you want a specific amount of RAM, both ``memory`` and ``resource_spec``
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 3add6e9a..ef986665 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -40,7 +40,6 @@ def __init__(
 
         super().__init__(*args, config_name=config_name, **kwargs)
 
-        # Always ask for only one task
         header_lines = []
         # SLURM header build
         if self.job_name is not None:
@@ -108,8 +107,7 @@ def slurm_format_bytes_ceil(n):
 
 
 class SLURMCluster(JobQueueCluster):
-    __doc__ = """
-    Launch Dask on a SLURM cluster
+    __doc__ = """ Launch Dask on a SLURM cluster
 
     Parameters
     ----------

From 34bece893d63659d0f0d0c60adcd5281feba15cb Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 11 Sep 2019 10:38:14 -0700
Subject: [PATCH 081/109] Job -> job_cls

---
 dask_jobqueue/htcondor.py |  2 +-
 dask_jobqueue/job.py      | 10 +++++-----
 dask_jobqueue/local.py    |  2 +-
 dask_jobqueue/lsf.py      |  2 +-
 dask_jobqueue/moab.py     |  2 +-
 dask_jobqueue/oar.py      |  2 +-
 dask_jobqueue/pbs.py      |  2 +-
 dask_jobqueue/sge.py      |  2 +-
 dask_jobqueue/slurm.py    |  2 +-
 9 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 57720f62..2402a2a1 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -224,5 +224,5 @@ class HTCondorCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = HTCondorJob
+    job_cls = HTCondorJob
     config_name = "htcondor"
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index cb2c88cc..8414a8fb 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -394,7 +394,7 @@ class JobQueueCluster(SpecCluster):
     )
 
     # TODO: I have a slight preference for a parameter like job_cls
-    Job = None
+    job_cls = None
 
     def __init__(
         self,
@@ -417,9 +417,9 @@ def __init__(
     ):
         self.status = "created"
         if Job is not None:
-            self.Job = Job
+            self.job_cls = Job
 
-        if self.Job is None:
+        if self.job_cls is None:
             raise ValueError(
                 "You must provide a Job type like PBSJob, SLURMJob, "
                 "or SGEJob with the Job= argument."
@@ -445,7 +445,7 @@ def __init__(
         kwargs["protocol"] = protocol
         kwargs["security"] = security
         self._kwargs = kwargs
-        worker = {"cls": self.Job, "options": kwargs}
+        worker = {"cls": self.job_cls, "options": kwargs}
         if "processes" in kwargs and kwargs["processes"] > 1:
             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
 
@@ -472,7 +472,7 @@ def example_job(self):
             address = self.scheduler.address
         except AttributeError:
             address = "tcp://scheduler:8786"
-        return self.Job(address or "tcp://scheduler:8786", name="name", **self._kwargs)
+        return self.job_cls(address or "tcp://scheduler:8786", name="name", **self._kwargs)
 
     @property
     def job_header(self):
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 097f1453..757a7020 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -90,5 +90,5 @@ class LocalCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = LocalJob
+    job_cls = LocalJob
     config_name = "local"
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 4fb6a1a6..9b2b850b 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -198,5 +198,5 @@ class LSFCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = LSFJob
+    job_cls = LSFJob
     config_name = "lsf"
diff --git a/dask_jobqueue/moab.py b/dask_jobqueue/moab.py
index c2f8f3f6..969a8475 100644
--- a/dask_jobqueue/moab.py
+++ b/dask_jobqueue/moab.py
@@ -9,4 +9,4 @@ class MoabJob(PBSJob):
 
 class MoabCluster(PBSCluster):
     __doc__ = PBSCluster.__doc__.replace("PBSCluster", "MoabCluster")
-    Job = MoabJob
+    job_cls = MoabJob
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 7c9ee2a8..cdd8f961 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -128,5 +128,5 @@ class OARCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = OARJob
+    job_cls = OARJob
     config_name = "oar"
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index bc412750..40dfddd6 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -139,5 +139,5 @@ class PBSCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = PBSJob
+    job_cls = PBSJob
     config_name = "pbs"
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 3a669dd4..ba6bd71d 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -116,5 +116,5 @@ class SGECluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = SGEJob
+    job_cls = SGEJob
     config_name = "sge"
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index ef986665..d1fb707b 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -150,5 +150,5 @@ class SLURMCluster(JobQueueCluster):
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
-    Job = SLURMJob
+    job_cls = SLURMJob
     config_name = "slurm"

From 6b394e302a653d38d8d5e0f4a05c9b4b91e2edd7 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 11 Sep 2019 10:39:41 -0700
Subject: [PATCH 082/109] remove spacing around job/cluster parameters

---
 dask_jobqueue/local.py | 1 -
 dask_jobqueue/lsf.py   | 4 ----
 dask_jobqueue/oar.py   | 3 ---
 dask_jobqueue/pbs.py   | 3 ---
 dask_jobqueue/sge.py   | 3 ---
 dask_jobqueue/slurm.py | 3 ---
 6 files changed, 17 deletions(-)

diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 757a7020..077aeae8 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -75,7 +75,6 @@ class LocalCluster(JobQueueCluster):
     Parameters
     ----------
     {job}
-
     {cluster}
 
     Examples
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 9b2b850b..669dcedb 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -162,18 +162,14 @@ class LSFCluster(JobQueueCluster):
     project : str
         Accounting string associated with each worker job. Passed to
         `#BSUB -P` option.
-
     {job}
-
     ncpus : int
         Number of cpus. Passed to `#BSUB -n` option.
     mem : int
         Request memory in bytes. Passed to `#BSUB -M` option.
     walltime : str
         Walltime for each worker job in HH:MM. Passed to `#BSUB -W` option.
-
     {cluster}
-
     job_extra : list
         List of other LSF options, for example -u. Each option will be
         prepended with the #LSF prefix.
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index cdd8f961..285d41a9 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -101,11 +101,8 @@ class OARCluster(JobQueueCluster):
         Destination queue for each worker job. Passed to `#OAR -q` option.
     project : str
         Accounting string associated with each worker job. Passed to `#OAR -p` option.
-
     {job}
-
     {cluster}
-
     resource_spec : str
         Request resources and specify job placement. Passed to `#OAR -l` option.
     walltime : str
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 40dfddd6..ae670e8e 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -111,11 +111,8 @@ class PBSCluster(JobQueueCluster):
         Destination queue for each worker job. Passed to `#PBS -q` option.
     project : str
         Accounting string associated with each worker job. Passed to `#PBS -A` option.
-
     {job}
-
     {cluster}
-
     resource_spec : str
         Request resources and specify job placement. Passed to `#PBS -l` option.
     walltime : str
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index ba6bd71d..e61524f8 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -83,11 +83,8 @@ class SGECluster(JobQueueCluster):
         Destination queue for each worker job. Passed to `#$ -q` option.
     project : str
         Accounting string associated with each worker job. Passed to `#$ -A` option.
-
     {job}
-
     {cluster}
-
     resource_spec : str
         Request resources and specify job placement. Passed to `#$ -l` option.
     walltime : str
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index d1fb707b..d92bc094 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -115,11 +115,8 @@ class SLURMCluster(JobQueueCluster):
         Destination queue for each worker job. Passed to `#SBATCH -p` option.
     project : str
         Accounting string associated with each worker job. Passed to `#SBATCH -A` option.
-
     {job}
-
     {cluster}
-
     walltime : str
         Walltime for each worker job.
     job_cpu : int

From faef618c0a09167f146503e8a092573033eefe9e Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 11 Sep 2019 11:58:59 -0700
Subject: [PATCH 083/109] black

---
 dask_jobqueue/job.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 8414a8fb..bf72c7da 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -472,7 +472,9 @@ def example_job(self):
             address = self.scheduler.address
         except AttributeError:
             address = "tcp://scheduler:8786"
-        return self.job_cls(address or "tcp://scheduler:8786", name="name", **self._kwargs)
+        return self.job_cls(
+            address or "tcp://scheduler:8786", name="name", **self._kwargs
+        )
 
     @property
     def job_header(self):

From d90cc53c3ff87165ed423083c383b0ef48740967 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 12 Sep 2019 08:09:41 -0700
Subject: [PATCH 084/109] remove LocalFoo from __init__.py

---
 dask_jobqueue/__init__.py       | 1 -
 dask_jobqueue/tests/test_job.py | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index f366bc46..66168ac6 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,6 +1,5 @@
 # flake8: noqa
 from . import config
-from .local import LocalJob, LocalCluster
 from .job import Job, JobQueueCluster
 from .moab import MoabCluster, MoabJob
 from .pbs import PBSCluster, PBSJob
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 2285ca31..d32140b1 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -10,8 +10,6 @@
     SLURMCluster,
     LSFJob,
     LSFCluster,
-    LocalJob,
-    LocalCluster,
     HTCondorJob,
     HTCondorCluster,
     MoabJob,
@@ -19,6 +17,10 @@
     OARJob,
     OARCluster,
 )
+from dask_jobqueue.local import (
+    LocalJob,
+    LocalCluster,
+)
 from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 

From fa629af2d3227bc4bc02a49bf1c6c3dfcd7b1268 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 12 Sep 2019 08:29:56 -0700
Subject: [PATCH 085/109] Remove Job classes from __init__.py

---
 dask_jobqueue/__init__.py       | 16 ++++++++--------
 dask_jobqueue/tests/test_job.py | 17 +++++++++--------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 66168ac6..4d48b0f1 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,13 +1,13 @@
 # flake8: noqa
 from . import config
-from .job import Job, JobQueueCluster
-from .moab import MoabCluster, MoabJob
-from .pbs import PBSCluster, PBSJob
-from .slurm import SLURMCluster, SLURMJob
-from .sge import SGECluster, SGEJob
-from .lsf import LSFCluster, LSFJob
-from .oar import OARCluster, OARJob
-from .htcondor import HTCondorCluster, HTCondorJob
+from .job import JobQueueCluster
+from .moab import MoabCluster
+from .pbs import PBSCluster
+from .slurm import SLURMCluster
+from .sge import SGECluster
+from .lsf import LSFCluster
+from .oar import OARCluster
+from .htcondor import HTCondorCluster
 
 from ._version import get_versions
 
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index d32140b1..46f54f6a 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -2,26 +2,27 @@
 from time import time
 
 from dask_jobqueue import (
-    PBSJob,
     PBSCluster,
-    SGEJob,
     SGECluster,
-    SLURMJob,
     SLURMCluster,
-    LSFJob,
     LSFCluster,
-    HTCondorJob,
     HTCondorCluster,
-    MoabJob,
     MoabCluster,
-    OARJob,
     OARCluster,
 )
 from dask_jobqueue.local import (
     LocalJob,
     LocalCluster,
 )
-from dask_jobqueue.job import JobQueueCluster
+from dask_jobqueue.pbs import PBSJob
+from dask_jobqueue.sge import SGEJob
+from dask_jobqueue.slurm import SLURMJob
+from dask_jobqueue.lsf import LSFJob
+from dask_jobqueue.moab import MoabJob
+from dask_jobqueue.htcondor import HTCondorJob
+from dask_jobqueue.oar import OARJob
+
+from dask_jobqueue.job import Job, JobQueueCluster
 from dask.distributed import Scheduler, Client
 
 import pytest

From bece3e459a8ce5e8053731e4ea0caf8bf6752611 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 12 Sep 2019 08:31:05 -0700
Subject: [PATCH 086/109] lint

---
 dask_jobqueue/tests/test_job.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 46f54f6a..4c776ace 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -10,10 +10,7 @@
     MoabCluster,
     OARCluster,
 )
-from dask_jobqueue.local import (
-    LocalJob,
-    LocalCluster,
-)
+from dask_jobqueue.local import LocalJob, LocalCluster
 from dask_jobqueue.pbs import PBSJob
 from dask_jobqueue.sge import SGEJob
 from dask_jobqueue.slurm import SLURMJob
@@ -22,7 +19,7 @@
 from dask_jobqueue.htcondor import HTCondorJob
 from dask_jobqueue.oar import OARJob
 
-from dask_jobqueue.job import Job, JobQueueCluster
+from dask_jobqueue.job import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
 import pytest

From eef73344d637ac58f79806d20c6e26593348d61b Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 12 Sep 2019 13:04:19 -0700
Subject: [PATCH 087/109] Raise error in LocalJob if failure

This also changes _submit_job to an async function,
and uses an async subprocess module from tornado
---
 dask_jobqueue/job.py   |  4 ++--
 dask_jobqueue/local.py | 23 ++++++++++++++---------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index bf72c7da..8c20c9ef 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -262,7 +262,7 @@ def job_file(self):
                 f.write(self.job_script())
             yield fn
 
-    def _submit_job(self, script_filename):
+    async def _submit_job(self, script_filename):
         # Should we make this async friendly?
         return self._call(shlex.split(self.submit_command) + [script_filename])
 
@@ -281,7 +281,7 @@ async def start(self):
         logger.debug("Starting worker: %s", self.name)
 
         with self.job_file() as fn:
-            out = self._submit_job(fn)
+            out = await self._submit_job(fn)
             job_id = self._job_id_from_submit_output(out)
             # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError
             if not job_id:
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 077aeae8..8832c088 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -1,6 +1,6 @@
 import logging
 import os
-import subprocess
+from tornado.process import Subprocess
 
 from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
 
@@ -39,22 +39,27 @@ def __init__(
         super().__init__(*args, config_name=config_name, shebang="", **kwargs)
 
         # Declare class attribute that shall be overridden
-        header_lines = []
-        self.job_header = "\n".join(header_lines)
+        self.job_header = ""
 
         logger.debug("Job script: \n %s" % self.job_script())
 
-    def _submit_job(self, script_filename):
+    async def _submit_job(self, script_filename):
         # Should we make this async friendly?
         with open(script_filename) as f:
             text = f.read().strip().split()
-        self.process = subprocess.Popen(
-            text, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        self.process = Subprocess(
+            text, stdout=Subprocess.STREAM, stderr=Subprocess.STREAM
         )
-        # TODO this should raise if self.process.returncode != 0. Refactor
-        # Job._call to be able to return process (so that we can return self.process.pid below)
 
-        self.process.stderr.readline()  # make sure that we start
+        lines = []
+        while True:
+            line = await self.process.stderr.read_until(b'\n')  # make sure that we start
+            lines.append(line.decode())
+            if b"Registered to:" in line:
+                break
+            if b"error" in line.lower():
+                raise Exception("Worker failed\n\n" + "".join(lines))
+
         return str(self.process.pid)
 
     @classmethod

From 7d13e6ba58321aa17bcb142f8948be1ef58d76de Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Thu, 12 Sep 2019 18:43:10 -0700
Subject: [PATCH 088/109] Remove some TODOs

---
 dask_jobqueue/job.py            |  6 +-----
 dask_jobqueue/local.py          |  4 +++-
 dask_jobqueue/tests/test_pbs.py | 16 +---------------
 dask_jobqueue/tests/test_sge.py |  3 ---
 4 files changed, 5 insertions(+), 24 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 8c20c9ef..44a3d434 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -323,7 +323,7 @@ def _close_job(cls, job_id):
         if job_id:
             with ignoring(RuntimeError):  # deleting job when job already gone
                 cls._call(shlex.split(cls.cancel_command) + [job_id])
-            # TODO: Maybe a log.debug here
+            logger.debug("Closed job %s", job_id)
 
     @staticmethod
     def _call(cmd, **kwargs):
@@ -393,7 +393,6 @@ class JobQueueCluster(SpecCluster):
         cluster_parameters=cluster_parameters
     )
 
-    # TODO: I have a slight preference for a parameter like job_cls
     job_cls = None
 
     def __init__(
@@ -449,9 +448,6 @@ def __init__(
         if "processes" in kwargs and kwargs["processes"] > 1:
             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
 
-        # TODO: this seems like this sets self.scheduler.address, is there a
-        # less magical way of doing the same thing?
-        # self.example_job is also used for cluster.job_script()
         self.example_job  # trigger property to ensure that the job is valid
 
         super().__init__(
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 8832c088..66003b09 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -53,7 +53,9 @@ async def _submit_job(self, script_filename):
 
         lines = []
         while True:
-            line = await self.process.stderr.read_until(b'\n')  # make sure that we start
+            line = await self.process.stderr.read_until(
+                b"\n"
+            )  # make sure that we start
             lines.append(line.decode())
             if b"Registered to:" in line:
                 break
diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 815580f7..3de55096 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -295,10 +295,6 @@ def test_scale_grouped(loop):
             cluster.scale(4)  # Start 2 jobs
 
             start = time()
-            # TODO: Is there a replacement to check for number of jobs (rather than workers)
-            # while len(cluster.running_jobs) != 2:
-            #     sleep(0.100)
-            #     assert time() < start + QUEUE_WAIT
 
             while len(list(client.scheduler_info()["workers"].values())) != 4:
                 sleep(0.100)
@@ -317,13 +313,6 @@ def test_scale_grouped(loop):
             cluster.scale(1)  # Should leave 2 workers, 1 job
 
             start = time()
-            # TODO
-            # while len(cluster.running_jobs) != 1:
-            #     sleep(0.100)
-            #     assert time() < start + QUEUE_WAIT
-
-            # assert len(cluster.running_jobs) == 1
-            # workers = list(client.scheduler_info()["workers"].values())
             while len(client.scheduler_info()["workers"]) != 2:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
@@ -331,11 +320,8 @@ def test_scale_grouped(loop):
             cluster.scale(0)
 
             start = time()
-            # while cluster.running_jobs:
-            #     sleep(0.100)
-            #     assert time() < start + QUEUE_WAIT
 
-            # assert not cluster.running_jobs
+            assert not cluster.worker_spec
             while len(client.scheduler_info()["workers"]) != 0:
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
diff --git a/dask_jobqueue/tests/test_sge.py b/dask_jobqueue/tests/test_sge.py
index 980de436..6020dabb 100644
--- a/dask_jobqueue/tests/test_sge.py
+++ b/dask_jobqueue/tests/test_sge.py
@@ -118,9 +118,6 @@ def test_complex_cancel_command(loop):
                 sleep(0.100)
                 assert time() < start + QUEUE_WAIT
 
-            # TODO: Is there a replacement for .stop_all_jobs? stop_all_jobs
-            # does make sure that the pending jobs get qdeled.
-            # cluster.stop_all_jobs()
             cluster.scale(0)
 
             start = time()

From a1d1343bdc1745f36383c070f391cf678dbd34a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 13 Sep 2019 15:57:55 +0200
Subject: [PATCH 089/109] Remove None check for job_id.

---
 dask_jobqueue/job.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 44a3d434..61360c06 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -282,13 +282,9 @@ async def start(self):
 
         with self.job_file() as fn:
             out = await self._submit_job(fn)
-            job_id = self._job_id_from_submit_output(out)
-            # TODO: why is this needed since _job_id_from_submit_output already raise a ValueError
-            if not job_id:
-                raise ValueError("Unable to parse job id from output of %s" % out)
-            self.job_id = job_id
+            self.job_id = self._job_id_from_submit_output(out)
 
-        weakref.finalize(self, self._close_job, job_id)
+        weakref.finalize(self, self._close_job, self.job_id)
 
         logger.debug("Starting job: %s", self.job_id)
         await super().start()

From e1f2e20bcc5a6e4f0d171dfb87dca7a7e41fa8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 13 Sep 2019 16:58:00 +0200
Subject: [PATCH 090/109] Some tweaks.

* Make Job.__init__ an abstractmethod rather than relying on the config_name None check
* rename Job -> job_cls in a few remaining places
* put back xfail test.
---
 dask_jobqueue/job.py                      | 31 ++++++++++++-----------
 dask_jobqueue/tests/test_job.py           | 20 +++++++--------
 dask_jobqueue/tests/test_jobqueue_core.py | 19 ++++++--------
 3 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 61360c06..226e9c6b 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -6,6 +6,7 @@
 import subprocess
 import sys
 import weakref
+import abc
 
 import dask
 from dask.utils import ignoring
@@ -69,7 +70,7 @@
 """.strip()
 
 
-class Job(ProcessInterface):
+class Job(ProcessInterface, abc.ABC):
     """ Base class to launch Dask workers on Job queues
 
     This class should not be used directly, use a class appropriate for
@@ -111,8 +112,10 @@ class Job(ProcessInterface):
     # Following class attributes should be overridden by extending classes.
     submit_command = None
     cancel_command = None
+    config_name = None
     job_id_regexp = r"(?P<job_id>\d+)"
 
+    @abc.abstractmethod
     def __init__(
         self,
         scheduler=None,
@@ -134,22 +137,20 @@ def __init__(
         config_name=None,
         **kwargs
     ):
-        # """
-        # This initializer should be considered as Abstract, and never used directly.
-        # """
         self.scheduler = scheduler
         self.job_id = None
 
         super().__init__()
-        if config_name is None:
-            config_name = getattr(type(self), "config_name", None)
 
-        # TODO I think the __init__ should be an abstractmethod rather than relying on config_name ...
         if config_name is None:
-            raise NotImplementedError(
-                "Job is an abstract class that should not be instantiated."
-                "Use a cluster class appropriate to your job queueing system, "
-                "e.g. PBSCluster or SLURMCluster"
+            config_name = getattr(type(self), "config_name")
+        if config_name is None:
+            raise ValueError(
+                "Looks like you are trying to create a class that inherits from dask_jobqueue.job.Job. "
+                "If that is the case, you need to:\n"
+                "- set the 'config_name' class variable to a non-None value\n"
+                "- create a section in jobqueue.yaml with the value of 'config_name'\n"
+                "If that is not the case, please open an issue in https://github.com/dask/dask-jobqueue/issues."
             )
 
         if job_name is None:
@@ -394,7 +395,7 @@ class JobQueueCluster(SpecCluster):
     def __init__(
         self,
         n_workers=0,
-        Job: Job = None,
+        job_cls: Job = None,
         # Cluster keywords
         loop=None,
         security=None,
@@ -411,13 +412,13 @@ def __init__(
         **kwargs
     ):
         self.status = "created"
-        if Job is not None:
-            self.job_cls = Job
+        if job_cls is not None:
+            self.job_cls = job_cls
 
         if self.job_cls is None:
             raise ValueError(
                 "You must provide a Job type like PBSJob, SLURMJob, "
-                "or SGEJob with the Job= argument."
+                "or SGEJob with the job_cls= argument."
             )
 
         if config_name:
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 4c776ace..894006b9 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -51,11 +51,11 @@ def test_basic():
 ]
 
 
-@pytest.mark.parametrize("Job", job_protected)
+@pytest.mark.parametrize("job_cls", job_protected)
 @pytest.mark.asyncio
-async def test_job(Job):
+async def test_job(job_cls):
     async with Scheduler(port=0) as s:
-        job = Job(scheduler=s.address, name="foo", cores=1, memory="1GB")
+        job = job_cls(scheduler=s.address, name="foo", cores=1, memory="1GB")
         job = await job
         async with Client(s.address, asynchronous=True) as client:
             await client.wait_for_workers(1)
@@ -69,18 +69,18 @@ async def test_job(Job):
             assert time() < start + 10
 
 
-@pytest.mark.parametrize("Job", job_protected)
+@pytest.mark.parametrize("job_cls", job_protected)
 @pytest.mark.asyncio
-async def test_cluster(Job):
+async def test_cluster(job_cls):
     async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
+        1, cores=1, memory="1GB", job_cls=job_cls, asynchronous=True, name="foo"
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 1
             cluster.scale(2)
             await cluster
             assert len(cluster.workers) == 2
-            assert all(isinstance(w, Job) for w in cluster.workers.values())
+            assert all(isinstance(w, job_cls) for w in cluster.workers.values())
             assert all(w.status == "running" for w in cluster.workers.values())
             await client.wait_for_workers(2)
 
@@ -92,11 +92,11 @@ async def test_cluster(Job):
                 assert time() < start + 10
 
 
-@pytest.mark.parametrize("Job", job_protected)
+@pytest.mark.parametrize("job_cls", job_protected)
 @pytest.mark.asyncio
-async def test_adapt(Job):
+async def test_adapt(job_cls):
     async with JobQueueCluster(
-        1, cores=1, memory="1GB", Job=Job, asynchronous=True, name="foo"
+        1, cores=1, memory="1GB", job_cls=job_cls, asynchronous=True, name="foo"
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             await client.wait_for_workers(1)
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index de812234..a376e08e 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -19,13 +19,10 @@
 from dask_jobqueue.sge import SGEJob
 
 
-@pytest.mark.xfail
 def test_errors():
-    with pytest.raises(NotImplementedError) as info:
+    with pytest.raises(ValueError, match="Job type.*job_cls="):
         JobQueueCluster(cores=4)
 
-    assert "abstract class" in str(info.value)
-
 
 def test_command_template():
     with PBSCluster(cores=2, memory="4GB") as cluster:
@@ -111,7 +108,7 @@ def test_job_id_from_qsub_legacy(Cluster, qsub_return_string):
         assert original_job_id == cluster._job_id_from_submit_output(qsub_return_string)
 
 
-@pytest.mark.parametrize("Job", [SGEJob])
+@pytest.mark.parametrize("job_cls", [SGEJob])
 @pytest.mark.parametrize(
     "qsub_return_string",
     [
@@ -123,10 +120,10 @@ def test_job_id_from_qsub_legacy(Cluster, qsub_return_string):
         "{job_id}",
     ],
 )
-def test_job_id_from_qsub(Job, qsub_return_string):
+def test_job_id_from_qsub(job_cls, qsub_return_string):
     original_job_id = "654321"
     qsub_return_string = qsub_return_string.format(job_id=original_job_id)
-    job = Job(cores=1, memory="1GB")
+    job = job_cls(cores=1, memory="1GB")
     assert original_job_id == job._job_id_from_submit_output(qsub_return_string)
 
 
@@ -146,16 +143,16 @@ def test_job_id_error_handling_legacy(Cluster):
             cluster._job_id_from_submit_output(return_string)
 
 
-@pytest.mark.parametrize("Job", [SGEJob])
-def test_job_id_error_handling(Job):
+@pytest.mark.parametrize("job_cls", [SGEJob])
+def test_job_id_error_handling(job_cls):
     # non-matching regexp
-    job = Job(cores=1, memory="1GB")
+    job = job_cls(cores=1, memory="1GB")
     with pytest.raises(ValueError, match="Could not parse job id"):
         return_string = "there is no number here"
         job._job_id_from_submit_output(return_string)
 
     # no job_id named group in the regexp
-    job = Job(cores=1, memory="1GB")
+    job = job_cls(cores=1, memory="1GB")
     with pytest.raises(ValueError, match="You need to use a 'job_id' named group"):
         return_string = "Job <12345> submitted to <normal>."
         job.job_id_regexp = r"(\d+)"

From ccfe946523a8cf92b316f84576aad8bf7210fe0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Fri, 13 Sep 2019 17:19:24 +0200
Subject: [PATCH 091/109] Fine to have header_skip in the PR.

---
 dask_jobqueue/job.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dask_jobqueue/job.py b/dask_jobqueue/job.py
index 226e9c6b..92bf7b12 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/job.py
@@ -207,7 +207,6 @@ def __init__(
         self.shebang = shebang
 
         self._env_header = "\n".join(filter(None, env_extra))
-        # TODO: should skip be part of this PR?
         self.header_skip = set(header_skip)
 
         # dask-worker command line build

From ddb692f25226952d82cf835d32b8629d5191da01 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 13 Sep 2019 14:51:52 -0700
Subject: [PATCH 092/109] Move job.py to core.py

---
 dask_jobqueue/__init__.py         | 2 +-
 dask_jobqueue/{job.py => core.py} | 2 +-
 dask_jobqueue/htcondor.py         | 2 +-
 dask_jobqueue/local.py            | 2 +-
 dask_jobqueue/lsf.py              | 2 +-
 dask_jobqueue/oar.py              | 2 +-
 dask_jobqueue/pbs.py              | 2 +-
 dask_jobqueue/sge.py              | 2 +-
 dask_jobqueue/slurm.py            | 2 +-
 dask_jobqueue/tests/test_job.py   | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)
 rename dask_jobqueue/{job.py => core.py} (99%)

diff --git a/dask_jobqueue/__init__.py b/dask_jobqueue/__init__.py
index 4d48b0f1..4cd6a49e 100644
--- a/dask_jobqueue/__init__.py
+++ b/dask_jobqueue/__init__.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 from . import config
-from .job import JobQueueCluster
+from .core import JobQueueCluster
 from .moab import MoabCluster
 from .pbs import PBSCluster
 from .slurm import SLURMCluster
diff --git a/dask_jobqueue/job.py b/dask_jobqueue/core.py
similarity index 99%
rename from dask_jobqueue/job.py
rename to dask_jobqueue/core.py
index 92bf7b12..dde42ce4 100644
--- a/dask_jobqueue/job.py
+++ b/dask_jobqueue/core.py
@@ -146,7 +146,7 @@ def __init__(
             config_name = getattr(type(self), "config_name")
         if config_name is None:
             raise ValueError(
-                "Looks like you are trying to create a class that inherits from dask_jobqueue.job.Job. "
+                "Looks like you are trying to create a class that inherits from dask_jobqueue.core.Job. "
                 "If that is the case, you need to:\n"
                 "- set the 'config_name' class variable to a non-None value\n"
                 "- create a section in jobqueue.yaml with the value of 'config_name'\n"
diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 2402a2a1..7becf983 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -5,7 +5,7 @@
 import dask
 from distributed.utils import parse_bytes
 
-from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
+from .core import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index 66003b09..a0485d2b 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -2,7 +2,7 @@
 import os
 from tornado.process import Subprocess
 
-from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
+from .core import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index 669dcedb..a5b01495 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -4,7 +4,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
+from .core import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 285d41a9..91d9da1a 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -3,7 +3,7 @@
 
 import dask
 
-from .job import JobQueueCluster, Job, job_parameters, cluster_parameters
+from .core import JobQueueCluster, Job, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index ae670e8e..b975957b 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -4,7 +4,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
+from .core import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index e61524f8..71745616 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -2,7 +2,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
+from .core import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index d92bc094..8d80cc6a 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -3,7 +3,7 @@
 
 import dask
 
-from .job import Job, JobQueueCluster, job_parameters, cluster_parameters
+from .core import Job, JobQueueCluster, job_parameters, cluster_parameters
 
 logger = logging.getLogger(__name__)
 
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 894006b9..f050f7d8 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -19,7 +19,7 @@
 from dask_jobqueue.htcondor import HTCondorJob
 from dask_jobqueue.oar import OARJob
 
-from dask_jobqueue.job import JobQueueCluster
+from dask_jobqueue.core import JobQueueCluster
 from dask.distributed import Scheduler, Client
 
 import pytest

From 21965e2638dedf0236d885a04b9fed52976b0923 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Fri, 13 Sep 2019 15:17:36 -0700
Subject: [PATCH 093/109] Add <insert-scheduler-address-here> into example_job
 address

---
 dask_jobqueue/core.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index dde42ce4..e17592d0 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -463,9 +463,11 @@ def example_job(self):
         try:
             address = self.scheduler.address
         except AttributeError:
-            address = "tcp://scheduler:8786"
+            address = "tcp://<insert-scheduler-address-here>:8786"
         return self.job_cls(
-            address or "tcp://scheduler:8786", name="name", **self._kwargs
+            address or "tcp://<insert-scheduler-address-here>:8786",
+            name="name",
+            **self._kwargs
         )
 
     @property

From 6f6e3d261c5e5e90e0263bd2f43f203ae7adc398 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Mon, 16 Sep 2019 16:50:26 -0700
Subject: [PATCH 094/109] add changelog entry

---
 docs/source/changelog.rst | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 97ddc4a2..2a08c442 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,29 @@
 Changelog
 =========
 
+0.7.0 / 2019-XX-XX
+------------------
+
+-   Base Dask-Jobqueue on top of the core ``dask.distributed.SpecCluster`` class
+    (:pr:`307`)
+
+    This is a nearly complete reimplementation of the dask-jobqueue logic on top
+    of more centralized logic.  This improves standardization and adds new
+    features, but does include the following **breaking changes**:
+
+    -   The scale method now refers to the number of jobs rather than the
+        number of workers.  Previously if each job launched two workers then
+        ``cluster.scale(4)`` would launch two jobs for a total of four workers.
+        Now it launches four jobs for a total of eight workers.
+    -   The ``cluster.stop_all_jobs()`` method has been removed.
+        Please use ``cluster.scale(0)`` instead.
+    -   The attributes ``running_jobs``, ``pending_jobs``, and
+        ``cancelled_jobs`` have been removed.  These have been moved upstream to
+        the ``dask.distributed.SpecCluster`` class instead as ``workers`` and
+        ``worker_spec``, as well as ``.plan``, ``.requested``, and ``.observed``.
+    -   The ``name`` attribute has been moved to ``job_name``.
+
+
 0.6.3 / 2019-08-18
 ------------------
 

From 0b21d289b8edec64dc27e60b0b1474a37abeed96 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 17 Sep 2019 09:34:29 -0700
Subject: [PATCH 095/109] example_job -> _dummy_job

---
 dask_jobqueue/core.py                     | 19 +++++++++++++------
 dask_jobqueue/tests/test_htcondor.py      |  6 +++---
 dask_jobqueue/tests/test_jobqueue_core.py | 14 +++++++-------
 dask_jobqueue/tests/test_pbs.py           |  6 +++---
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index e17592d0..33725175 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -444,7 +444,7 @@ def __init__(
         if "processes" in kwargs and kwargs["processes"] > 1:
             worker["group"] = ["-" + str(i) for i in range(kwargs["processes"])]
 
-        self.example_job  # trigger property to ensure that the job is valid
+        self._dummy_job  # trigger property to ensure that the job is valid
 
         super().__init__(
             scheduler=scheduler,
@@ -459,9 +459,16 @@ def __init__(
             self.scale(n_workers)
 
     @property
-    def example_job(self):
+    def _dummy_job(self):
+        """
+        Creates a Job similar to what we will use in practice
+
+        This is used for backwards compatibility and a variety of convenience
+        functions.  It is also used on construction to raise errors if any of
+        the keywords are improper.
+        """
         try:
-            address = self.scheduler.address
+            address = self.scheduler.address  # Have we already connected?
         except AttributeError:
             address = "tcp://<insert-scheduler-address-here>:8786"
         return self.job_cls(
@@ -472,11 +479,11 @@ def example_job(self):
 
     @property
     def job_header(self):
-        return self.example_job.job_header
+        return self._dummy_job.job_header
 
     def job_script(self):
-        return self.example_job.job_script()
+        return self._dummy_job.job_script()
 
     @property
     def job_name(self):
-        return self.example_job.job_name
+        return self._dummy_job.job_name
diff --git a/dask_jobqueue/tests/test_htcondor.py b/dask_jobqueue/tests/test_htcondor.py
index 024473ad..cd9349f0 100644
--- a/dask_jobqueue/tests/test_htcondor.py
+++ b/dask_jobqueue/tests/test_htcondor.py
@@ -13,9 +13,9 @@
 
 def test_header():
     with HTCondorCluster(cores=1, memory="100MB", disk="100MB") as cluster:
-        assert cluster.example_job.job_header_dict["MY.DaskWorkerCores"] == 1
-        assert cluster.example_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000
-        assert cluster.example_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000
+        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerCores"] == 1
+        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerDisk"] == 100000000
+        assert cluster._dummy_job.job_header_dict["MY.DaskWorkerMemory"] == 100000000
 
 
 def test_job_script():
diff --git a/dask_jobqueue/tests/test_jobqueue_core.py b/dask_jobqueue/tests/test_jobqueue_core.py
index a376e08e..c4acd785 100644
--- a/dask_jobqueue/tests/test_jobqueue_core.py
+++ b/dask_jobqueue/tests/test_jobqueue_core.py
@@ -28,11 +28,11 @@ def test_command_template():
     with PBSCluster(cores=2, memory="4GB") as cluster:
         assert (
             "%s -m distributed.cli.dask_worker" % (sys.executable)
-            in cluster.example_job._command_template
+            in cluster._dummy_job._command_template
         )
-        assert " --nthreads 2" in cluster.example_job._command_template
-        assert " --memory-limit " in cluster.example_job._command_template
-        assert " --name " in cluster.example_job._command_template
+        assert " --nthreads 2" in cluster._dummy_job._command_template
+        assert " --memory-limit " in cluster._dummy_job._command_template
+        assert " --name " in cluster._dummy_job._command_template
 
     with PBSCluster(
         cores=2,
@@ -41,9 +41,9 @@ def test_command_template():
         local_directory="/scratch",
         extra=["--preload", "mymodule"],
     ) as cluster:
-        assert " --death-timeout 60" in cluster.example_job._command_template
-        assert " --local-directory /scratch" in cluster.example_job._command_template
-        assert " --preload mymodule" in cluster.example_job._command_template
+        assert " --death-timeout 60" in cluster._dummy_job._command_template
+        assert " --local-directory /scratch" in cluster._dummy_job._command_template
+        assert " --preload mymodule" in cluster._dummy_job._command_template
 
 
 @pytest.mark.parametrize(
diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index 3de55096..d00dd008 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -210,7 +210,7 @@ def test_adaptive(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.example_job.worker_processes
+            processes = cluster._dummy_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT
@@ -242,7 +242,7 @@ def test_adaptive_grouped(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.example_job.worker_processes
+            processes = cluster._dummy_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT
@@ -266,7 +266,7 @@ def test_adaptive_cores_mem(loop):
             assert future.result(QUEUE_WAIT) == 11
 
             start = time()
-            processes = cluster.example_job.worker_processes
+            processes = cluster._dummy_job.worker_processes
             while len(client.scheduler_info()["workers"]) != processes:
                 sleep(0.1)
                 assert time() < start + QUEUE_WAIT

From b9c24c003b881b5dbdf6e290299c8d78ed522e13 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Tue, 17 Sep 2019 12:40:48 -0700
Subject: [PATCH 096/109] Add scale/adapt memory/cores to docs

---
 docs/source/configuration.rst | 14 +++++++++++---
 docs/source/howitworks.rst    |  7 ++++++-
 docs/source/index.rst         |  5 +++--
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst
index e52b75d8..2ec86e6c 100644
--- a/docs/source/configuration.rst
+++ b/docs/source/configuration.rst
@@ -30,12 +30,20 @@ define a single job:
 
 Note that the ``cores`` and ``memory`` keywords above correspond not to your
 full desired deployment, but rather to the size of a *single job* which should
-be no larger than the size of a single machine in your cluster.  Separately you
-will specify how many jobs to deploy using the scale method.
+be no larger than the size of a single machine in your cluster.
+
+Separately you will specify how many jobs to deploy using the scale method.
+You can either specify the number of jobs, or the total number of cores or
+memory that you want.
 
 .. code-block:: python
 
-   cluster.scale(12)  # launch 12 workers (2 jobs of 6 workers each) of the specification provided above
+   cluster.scale(2)  # launch 2 jobs, each of which starts 6 worker processes
+   cluster.scale(cores=48)  # Or specify cores or memory directly
+   cluster.scale(memory="200 GB")  # Or specify cores or memory directly
+
+These all accomplish the same thing.  You can choose whichever makes the most
+sense to you.
 
 
 Configuration Files
diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
index 00c26b3a..12a1dd78 100644
--- a/docs/source/howitworks.rst
+++ b/docs/source/howitworks.rst
@@ -24,10 +24,15 @@ object is instantiated:
    )
 
 You then ask for more workers using the ``scale`` command:
+You can either specify the number of jobs, or the total number of cores or
+memory that you want.
 
 .. code-block:: python
 
-   cluster.scale(36)
+
+   cluster.scale(2)  # launch 2 jobs, each of which starts 6 worker processes
+   cluster.scale(cores=48)  # Or specify cores or memory directly
+   cluster.scale(memory="200 GB")  # Or specify cores or memory directly
 
 The cluster generates a traditional job script and submits that an appropriate
 number of times to the job queue.  You can see the job script that it will
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 3d0eec19..9392a8d8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,7 +19,7 @@ Example
 
    from dask_jobqueue import PBSCluster
    cluster = PBSCluster()
-   cluster.scale(10)         # Ask for ten workers
+   cluster.scale(10)         # Deploy ten single-node jobs
 
    from dask.distributed import Client
    client = Client(cluster)  # Connect this local process to remote workers
@@ -45,7 +45,8 @@ save resources when not actively computing.
 
 .. code-block:: python
 
-   cluster.adapt(minimum=6, maximum=90)  # auto-scale between 6 and 90 workers
+   cluster.adapt(minimum=6, maximum=90)  # auto-scale between 6 and 90 jobs
+   cluster.adapt(maximum_memory="10 TB")  # or use core/memory limits
 
 More details
 ------------

From 79d108b53174cbd8ee2fc965108ddeedd6e8832c Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:08:57 -0500
Subject: [PATCH 097/109] specify jobs= keyword in scale

---
 docs/source/configuration.rst | 4 ++--
 docs/source/howitworks.rst    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst
index 2ec86e6c..eb867c64 100644
--- a/docs/source/configuration.rst
+++ b/docs/source/configuration.rst
@@ -33,12 +33,12 @@ full desired deployment, but rather to the size of a *single job* which should
 be no larger than the size of a single machine in your cluster.
 
 Separately you will specify how many jobs to deploy using the scale method.
-You can either specify the number of jobs, or the total number of cores or
+You can either specify the number of workers, or the total number of cores or
 memory that you want.
 
 .. code-block:: python
 
-   cluster.scale(2)  # launch 2 jobs, each of which starts 6 worker processes
+   cluster.scale(jobs=2)  # launch 2 jobs, each of which starts 6 worker processes
    cluster.scale(cores=48)  # Or specify cores or memory directly
    cluster.scale(memory="200 GB")  # Or specify cores or memory directly
 
diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
index 12a1dd78..6fc7978f 100644
--- a/docs/source/howitworks.rst
+++ b/docs/source/howitworks.rst
@@ -30,7 +30,7 @@ memory that you want.
 .. code-block:: python
 
 
-   cluster.scale(2)  # launch 2 jobs, each of which starts 6 worker processes
+   cluster.scale(jobs=2)  # launch 2 jobs, each of which starts 6 worker processes
    cluster.scale(cores=48)  # Or specify cores or memory directly
    cluster.scale(memory="200 GB")  # Or specify cores or memory directly
 

From 7809511b3071dd33d43fd349f9c1a75a74808d9c Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:15:34 -0500
Subject: [PATCH 098/109] Support jobs= keyword in scale

---
 dask_jobqueue/core.py           |  7 +++++++
 dask_jobqueue/tests/test_job.py | 11 +++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index 33725175..d6b1f4c0 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
 import logging
+import math
 import os
 import re
 import shlex
@@ -487,3 +488,9 @@ def job_script(self):
     @property
     def job_name(self):
         return self._dummy_job.job_name
+
+    def scale(self, n=None, jobs=0, memory=None, cores=None):
+        if n is not None:
+            jobs = int(math.ceil(n / self._dummy_job.worker_processes))
+
+        return super().scale(jobs, memory=memory, cores=cores)
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index f050f7d8..aa2782c0 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -77,7 +77,7 @@ async def test_cluster(job_cls):
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 1
-            cluster.scale(2)
+            cluster.scale(jobs=2)
             await cluster
             assert len(cluster.workers) == 2
             assert all(isinstance(w, job_cls) for w in cluster.workers.values())
@@ -131,7 +131,7 @@ def test_header_lines_skip():
 
 
 @pytest.mark.asyncio
-async def test_nprocs():
+async def test_nprocs_scale():
     async with LocalCluster(
         cores=2, memory="4GB", processes=2, asynchronous=True
     ) as cluster:
@@ -149,6 +149,13 @@ async def test_nprocs():
             await asyncio.sleep(0.2)
             assert len(cluster.scheduler.workers) == 2  # they're still one group
 
+            cluster.scale(jobs=2)
+            assert len(cluster.worker_spec) == 2
+            cluster.scale(5)
+            assert len(cluster.worker_spec) == 3
+            cluster.scale(1)
+            assert len(cluster.worker_spec) == 1
+
 
 @pytest.mark.parametrize("Cluster", all_clusters)
 def test_docstring_cluster(Cluster):

From 4a0c62670e22143baf06b1ed8dc2e1da6897d5c6 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:24:17 -0500
Subject: [PATCH 099/109] Add minimum/maximum_jobs to adapt

---
 dask_jobqueue/core.py           | 13 +++++++++++++
 dask_jobqueue/tests/test_job.py | 16 ++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index d6b1f4c0..6a56300a 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -494,3 +494,16 @@ def scale(self, n=None, jobs=0, memory=None, cores=None):
             jobs = int(math.ceil(n / self._dummy_job.worker_processes))
 
         return super().scale(jobs, memory=memory, cores=cores)
+
+    def adapt(
+        self,
+        *args,
+        minimum_jobs: int = None,
+        maximum_jobs: int = None,
+        **kwargs
+    ):
+        if minimum_jobs is not None:
+            kwargs["minimum"] = minimum_jobs * self._dummy_job.worker_processes
+        if maximum_jobs is not None:
+            kwargs["maximum"] = maximum_jobs * self._dummy_job.worker_processes
+        return super().adapt(*args, **kwargs)
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index aa2782c0..213e2dc5 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -122,6 +122,22 @@ async def test_adapt(job_cls):
             assert not cluster.workers
 
 
+@pytest.mark.parametrize("job_cls", job_protected)
+@pytest.mark.asyncio
+async def test_adapt_parameters(job_cls):
+    async with JobQueueCluster(
+        cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms")
+            await adapt.adapt()
+            assert len(cluster.worker_spec) == 1  # 2 workers, 4 jobs
+
+            adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms")
+            await adapt.adapt()
+            assert len(cluster.worker_spec) == 2  # 2 workers, 4 jobs
+
+
 def test_header_lines_skip():
     job = PBSJob(cores=1, memory="1GB", job_name="foobar")
     assert "foobar" in job.job_script()

From 289915f493971c44616c23297d862a90ea4ea255 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:24:58 -0500
Subject: [PATCH 100/109] lint

---
 dask_jobqueue/core.py           |  6 +-----
 dask_jobqueue/tests/test_job.py | 15 +++++++--------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/dask_jobqueue/core.py b/dask_jobqueue/core.py
index 6a56300a..888e8e9d 100644
--- a/dask_jobqueue/core.py
+++ b/dask_jobqueue/core.py
@@ -496,11 +496,7 @@ def scale(self, n=None, jobs=0, memory=None, cores=None):
         return super().scale(jobs, memory=memory, cores=cores)
 
     def adapt(
-        self,
-        *args,
-        minimum_jobs: int = None,
-        maximum_jobs: int = None,
-        **kwargs
+        self, *args, minimum_jobs: int = None, maximum_jobs: int = None, **kwargs
     ):
         if minimum_jobs is not None:
             kwargs["minimum"] = minimum_jobs * self._dummy_job.worker_processes
diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 213e2dc5..4e839ef1 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -126,16 +126,15 @@ async def test_adapt(job_cls):
 @pytest.mark.asyncio
 async def test_adapt_parameters(job_cls):
     async with JobQueueCluster(
-        cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True,
+        cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True
     ) as cluster:
-        async with Client(cluster, asynchronous=True) as client:
-            adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms")
-            await adapt.adapt()
-            assert len(cluster.worker_spec) == 1  # 2 workers, 4 jobs
+        adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms")
+        await adapt.adapt()
+        assert len(cluster.worker_spec) == 1  # 2 workers, 4 jobs
 
-            adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms")
-            await adapt.adapt()
-            assert len(cluster.worker_spec) == 2  # 2 workers, 4 jobs
+        adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms")
+        await adapt.adapt()
+        assert len(cluster.worker_spec) == 2  # 2 workers, 4 jobs
 
 
 def test_header_lines_skip():

From bf4840bdee35829df107e99d1c591a34fa7067a7 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:29:48 -0500
Subject: [PATCH 101/109] unxfail adaptive cores/memory pbs test

---
 dask_jobqueue/tests/test_pbs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_pbs.py b/dask_jobqueue/tests/test_pbs.py
index d00dd008..2e92162c 100644
--- a/dask_jobqueue/tests/test_pbs.py
+++ b/dask_jobqueue/tests/test_pbs.py
@@ -248,7 +248,6 @@ def test_adaptive_grouped(loop):
                 assert time() < start + QUEUE_WAIT
 
 
-@pytest.mark.xfail(reason="adapt doesn't yet have cores/memory")
 @pytest.mark.env("pbs")
 def test_adaptive_cores_mem(loop):
     with PBSCluster(

From 18cd5cd670d2354e7c1317af659452bcd8b73d32 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:38:51 -0500
Subject: [PATCH 102/109] Remove startup_cost keyword from HTCondor docstring

This has been removed upstream
---
 dask_jobqueue/htcondor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index 7becf983..c61e91cd 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -218,9 +218,8 @@ class HTCondorCluster(JobQueueCluster):
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
-    HTCondor can take longer to start jobs than other batch systems - tune Adaptive parameters accordingly.
 
-    >>> cluster.adapt(minimum=5, startup_cost='60s')
+    >>> cluster.adapt(minimum=5)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )

From e2b7cb5a8ef9cc2a2741c79c903cb23f50015e9b Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 08:52:49 -0500
Subject: [PATCH 103/109] await clusters before closing

---
 dask_jobqueue/tests/test_job.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 4e839ef1..71f0352f 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -130,11 +130,13 @@ async def test_adapt_parameters(job_cls):
     ) as cluster:
         adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms")
         await adapt.adapt()
-        assert len(cluster.worker_spec) == 1  # 2 workers, 4 jobs
+        await cluster
+        assert len(cluster.workers) == 1  # minimum=2 workers -> 1 job
 
         adapt = cluster.adapt(minimum_jobs=2, maximum_jobs=4, interval="10ms")
         await adapt.adapt()
-        assert len(cluster.worker_spec) == 2  # 2 workers, 4 jobs
+        await cluster
+        assert len(cluster.workers) == 2  # minimum_jobs=2 -> 2 jobs
 
 
 def test_header_lines_skip():

From dd2db2df6a3f6275ff195a45af6109613e7ed44d Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 09:12:49 -0500
Subject: [PATCH 104/109] reduce core count in test

SLURM was constrained
---
 dask_jobqueue/tests/test_job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_jobqueue/tests/test_job.py b/dask_jobqueue/tests/test_job.py
index 71f0352f..71f26f09 100644
--- a/dask_jobqueue/tests/test_job.py
+++ b/dask_jobqueue/tests/test_job.py
@@ -126,7 +126,7 @@ async def test_adapt(job_cls):
 @pytest.mark.asyncio
 async def test_adapt_parameters(job_cls):
     async with JobQueueCluster(
-        cores=4, memory="2GB", processes=2, job_cls=job_cls, asynchronous=True
+        cores=2, memory="1GB", processes=2, job_cls=job_cls, asynchronous=True
     ) as cluster:
         adapt = cluster.adapt(minimum=2, maximum=4, interval="10ms")
         await adapt.adapt()

From 9b11c652027865e1cbeffb857e78da077304c0ee Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 09:23:41 -0500
Subject: [PATCH 105/109] update docstrings to use the jobs= parameter

---
 dask_jobqueue/htcondor.py | 4 ++--
 dask_jobqueue/local.py    | 2 +-
 dask_jobqueue/lsf.py      | 4 ++--
 dask_jobqueue/oar.py      | 4 ++--
 dask_jobqueue/pbs.py      | 4 ++--
 dask_jobqueue/sge.py      | 4 ++--
 dask_jobqueue/slurm.py    | 4 ++--
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/dask_jobqueue/htcondor.py b/dask_jobqueue/htcondor.py
index c61e91cd..6b3769ed 100644
--- a/dask_jobqueue/htcondor.py
+++ b/dask_jobqueue/htcondor.py
@@ -212,14 +212,14 @@ class HTCondorCluster(JobQueueCluster):
     --------
     >>> from dask_jobqueue.htcondor import HTCondorCluster
     >>> cluster = HTCondorCluster(cores=24, memory="4GB", disk="4GB")
-    >>> cluster.scale(10)
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
-    >>> cluster.adapt(minimum=5)
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
diff --git a/dask_jobqueue/local.py b/dask_jobqueue/local.py
index a0485d2b..3ac2fd12 100644
--- a/dask_jobqueue/local.py
+++ b/dask_jobqueue/local.py
@@ -88,7 +88,7 @@ class LocalCluster(JobQueueCluster):
     --------
     >>> from dask_jobqueue import LocalCluster
     >>> cluster = LocalCluster(cores=2, memory="4 GB")
-    >>> cluster.scale(3)
+    >>> cluster.scale(jobs=3)  # ask for 3 jobs
 
     See Also
     --------
diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index a5b01495..d1cbe53e 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -182,7 +182,7 @@ class LSFCluster(JobQueueCluster):
     >>> from dask_jobqueue import LSFCluster
     >>> cluster = LSFCluster(queue='general', project='DaskonLSF',
     ...                      cores=15, memory='25GB')
-    >>> cluster.scale(10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
@@ -190,7 +190,7 @@ class LSFCluster(JobQueueCluster):
     This also works with adaptive clusters.  This automatically launches and
     kill workers based on load.
 
-    >>> cluster.adapt()
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 91d9da1a..7a68ad4e 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -114,14 +114,14 @@ class OARCluster(JobQueueCluster):
     --------
     >>> from dask_jobqueue import OARCluster
     >>> cluster = OARCluster(queue='regular')
-    >>> cluster.scale(10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
-    >>> cluster.adapt()
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index b975957b..6ce2f11b 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -125,14 +125,14 @@ class PBSCluster(JobQueueCluster):
     >>> from dask_jobqueue import PBSCluster
     >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24,
     ...     memory="500 GB")
-    >>> cluster.scale(10)  # Ask for ten jobs
+    >>> cluster.scale(jobs=10)  # This may take a few seconds to launch
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
-    >>> cluster.adapt()
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index 71745616..dd58d419 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -102,14 +102,14 @@ class SGECluster(JobQueueCluster):
     ...     cores=24,
     ...     memory="500 GB"
     ... )
-    >>> cluster.scale(10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
-    >>> cluster.adapt()
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index 8d80cc6a..aaccad62 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -136,14 +136,14 @@ class SLURMCluster(JobQueueCluster):
     ...     cores=24,
     ...     memory="500 GB"
     ... )
-    >>> cluster.scale(10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
 
     This also works with adaptive clusters.  This automatically launches and kill workers based on load.
 
-    >>> cluster.adapt()
+    >>> cluster.adapt(maximum_jobs=20)
     """.format(
         job=job_parameters, cluster=cluster_parameters
     )

From 38c4c3d61c4369e2140338027249713fd09a0008 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 09:45:06 -0500
Subject: [PATCH 106/109] update docs

---
 docs/source/howitworks.rst | 11 +++++++----
 docs/source/index.rst      |  2 +-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
index 6fc7978f..e8f62fb0 100644
--- a/docs/source/howitworks.rst
+++ b/docs/source/howitworks.rst
@@ -23,17 +23,20 @@ object is instantiated:
         walltime='02:00:00',
    )
 
-You then ask for more workers using the ``scale`` command:
-You can either specify the number of jobs, or the total number of cores or
-memory that you want.
+These parameters specify the characteristics of a *single job* or a *single
+compute node*, rather than the characteristics of your computation as a whole.
+For the full computation, you will then ask for a number of jobs using the
+``scale`` command:
 
 .. code-block:: python
 
-
    cluster.scale(jobs=2)  # launch 2 jobs, each of which starts 6 worker processes
    cluster.scale(cores=48)  # Or specify cores or memory directly
    cluster.scale(memory="200 GB")  # Or specify cores or memory directly
 
+You can either specify the number of jobs, or the total number of cores or
+memory that you want.
+
 The cluster generates a traditional job script and submits that an appropriate
 number of times to the job queue.  You can see the job script that it will
 generate as follows:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9392a8d8..ad9dbce3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -45,7 +45,7 @@ save resources when not actively computing.
 
 .. code-block:: python
 
-   cluster.adapt(minimum=6, maximum=90)  # auto-scale between 6 and 90 jobs
+   cluster.adapt(minimum_jobs=10, maximum_jobs=100)  # auto-scale between 10 and 100 jobs
    cluster.adapt(maximum_memory="10 TB")  # or use core/memory limits
 
 More details

From 23562a44bf99d7b880d320f1dc87ece74a503905 Mon Sep 17 00:00:00 2001
From: Matthew Rocklin <mrocklin@gmail.com>
Date: Wed, 25 Sep 2019 09:56:07 -0500
Subject: [PATCH 107/109] two more small doc fixes [skip ci]

---
 docs/source/howitworks.rst | 1 +
 docs/source/index.rst      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
index e8f62fb0..3e0a2ca2 100644
--- a/docs/source/howitworks.rst
+++ b/docs/source/howitworks.rst
@@ -25,6 +25,7 @@ object is instantiated:
 
 These parameters specify the characteristics of a *single job* or a *single
 compute node*, rather than the characteristics of your computation as a whole.
+Instantiating the cluster does not launch any jobs yet.
 For the full computation, you will then ask for a number of jobs using the
 ``scale`` command:
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ad9dbce3..f9f352ad 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,7 +19,7 @@ Example
 
    from dask_jobqueue import PBSCluster
    cluster = PBSCluster()
-   cluster.scale(10)         # Deploy ten single-node jobs
+   cluster.scale(jobs=10)         # Deploy ten single-node jobs
 
    from dask.distributed import Client
    client = Client(cluster)  # Connect this local process to remote workers

From 18660e6e1dae7ebf196aca2a6be176e1b40d4c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Wed, 25 Sep 2019 16:59:56 +0200
Subject: [PATCH 108/109] Change comment that I do not understand.

---
 dask_jobqueue/lsf.py   | 2 +-
 dask_jobqueue/oar.py   | 2 +-
 dask_jobqueue/pbs.py   | 2 +-
 dask_jobqueue/sge.py   | 2 +-
 dask_jobqueue/slurm.py | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dask_jobqueue/lsf.py b/dask_jobqueue/lsf.py
index d1cbe53e..348a356d 100644
--- a/dask_jobqueue/lsf.py
+++ b/dask_jobqueue/lsf.py
@@ -182,7 +182,7 @@ class LSFCluster(JobQueueCluster):
     >>> from dask_jobqueue import LSFCluster
     >>> cluster = LSFCluster(queue='general', project='DaskonLSF',
     ...                      cores=15, memory='25GB')
-    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
diff --git a/dask_jobqueue/oar.py b/dask_jobqueue/oar.py
index 7a68ad4e..f59f8512 100644
--- a/dask_jobqueue/oar.py
+++ b/dask_jobqueue/oar.py
@@ -114,7 +114,7 @@ class OARCluster(JobQueueCluster):
     --------
     >>> from dask_jobqueue import OARCluster
     >>> cluster = OARCluster(queue='regular')
-    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
diff --git a/dask_jobqueue/pbs.py b/dask_jobqueue/pbs.py
index 6ce2f11b..f8b3acd1 100644
--- a/dask_jobqueue/pbs.py
+++ b/dask_jobqueue/pbs.py
@@ -125,7 +125,7 @@ class PBSCluster(JobQueueCluster):
     >>> from dask_jobqueue import PBSCluster
     >>> cluster = PBSCluster(queue='regular', project="myproj", cores=24,
     ...     memory="500 GB")
-    >>> cluster.scale(jobs=10)  # This may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
diff --git a/dask_jobqueue/sge.py b/dask_jobqueue/sge.py
index dd58d419..cc022b47 100644
--- a/dask_jobqueue/sge.py
+++ b/dask_jobqueue/sge.py
@@ -102,7 +102,7 @@ class SGECluster(JobQueueCluster):
     ...     cores=24,
     ...     memory="500 GB"
     ... )
-    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)
diff --git a/dask_jobqueue/slurm.py b/dask_jobqueue/slurm.py
index aaccad62..e17c85e2 100644
--- a/dask_jobqueue/slurm.py
+++ b/dask_jobqueue/slurm.py
@@ -136,7 +136,7 @@ class SLURMCluster(JobQueueCluster):
     ...     cores=24,
     ...     memory="500 GB"
     ... )
-    >>> cluster.scale(jobs=10)  # this may take a few seconds to launch
+    >>> cluster.scale(jobs=10)  # ask for 10 jobs
 
     >>> from dask.distributed import Client
     >>> client = Client(cluster)

From 55aca3ada3d1ded11917469b6511e8470b75fa31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Wed, 25 Sep 2019 17:02:33 +0200
Subject: [PATCH 109/109] Use .scale(jobs=...) in doc and remove invalid
 changelog entry.

---
 docs/source/changelog.rst | 4 ----
 docs/source/index.rst     | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 2a08c442..069bb6e2 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -11,10 +11,6 @@ Changelog
     of more centralized logic.  This improves standardization and adds new
     features, but does include the following **breaking changes**:
 
-    -   The scale method now refers to the number of jobs rather than the
-        number of workers.  Previously if each job launched two workers then
-        ``cluster.scale(4)`` would launch two jobs for a total of four workers.
-        Now it launches four jobs for a total of eight workers.
     -   The ``cluster.stop_all_jobs()`` method has been removed.
         Please use ``cluster.scale(0)`` instead.
     -   The attributes ``running_jobs``, ``pending_jobs``, and
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f9f352ad..d4625452 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,7 +19,7 @@ Example
 
    from dask_jobqueue import PBSCluster
    cluster = PBSCluster()
-   cluster.scale(jobs=10)         # Deploy ten single-node jobs
+   cluster.scale(jobs=10)    # Deploy ten single-node jobs
 
    from dask.distributed import Client
    client = Client(cluster)  # Connect this local process to remote workers