Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ matrix:
env:
- OS=ubuntu-14.04
- JOBQUEUE=pbs
- python: "3.6"
env:
- OS=ubuntu-14.04
- JOBQUEUE=slurm

env:
global:
- DOCKER_COMPOSE_VERSION=1.6.0
Expand Down
2 changes: 0 additions & 2 deletions ci/pbs/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
/opt/anaconda/bin/conda clean -tipy && \
rm -f miniconda.sh
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
# Take the latest version of distributed due to test failure otherwise (see #47 comment by mrocklin)
RUN pip install --no-cache-dir git+https://github.com/dask/distributed.git --upgrade

# Copy entrypoint and other needed scripts
COPY ./*.sh /
Expand Down
30 changes: 30 additions & 0 deletions ci/slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
    # Print tool versions, bring up the SLURM test cluster, then list the
    # resulting containers and images so the CI log shows what was started.
    docker version
    docker-compose version

    # start slurm cluster
    # Run in a subshell so the caller's working directory is never changed,
    # and only launch start-slurm.sh when the cd actually succeeded.
    (cd ./ci/slurm && ./start-slurm.sh)

    docker ps -a
    docker images
}

function jobqueue_install {
    # Install dask-jobqueue inside the slurmctld container from the checkout
    # mounted at /dask-jobqueue (see ci/slurm/docker-compose.yml).
    # `&&` instead of `;` so setup.py is not run from an unrelated directory
    # when the cd fails.
    docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue && python setup.py install"
}

function jobqueue_script {
    # Run the dask_jobqueue test suite inside the slurmctld container,
    # passing `-E slurm` to py.test.
    # `&&` instead of `;` so py.test is not run from an unrelated directory
    # when the cd fails.
    docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue && py.test dask_jobqueue --verbose -E slurm"
}

function jobqueue_after_script {
    # Post-run diagnostics: execute sinfo, squeue and `sacct -l` in turn
    # inside the slurmctld container so their output lands in the CI log.
    local slurm_cmd
    for slurm_cmd in 'sinfo' 'squeue' 'sacct -l'; do
        docker exec -it slurmctld bash -c "$slurm_cmd"
    done
}
12 changes: 12 additions & 0 deletions ci/slurm/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Test image for the SLURM CI job: the upstream SLURM cluster image plus a
# Miniconda Python 3.6 environment with dask, distributed and the test tools.
# NOTE(review): the base image is untagged, so every build tracks whatever the
# registry currently serves; pin a tag or digest once a known-good one is chosen.
FROM giovtorres/slurm-docker-cluster

# Install Miniconda into /opt/anaconda.
# curl flags: -f fails the build on an HTTP error instead of saving the error
# page as miniconda.sh, -L follows redirects, -sS hides the progress meter but
# still prints errors.
RUN curl -fsSL -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash miniconda.sh -f -b -p /opt/anaconda && \
    /opt/anaconda/bin/conda clean -tipy && \
    rm -f miniconda.sh
ENV PATH=/opt/anaconda/bin:$PATH

# Clean the conda caches in the same layer that populated them; cleaning in a
# later layer would not shrink the image.
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep && \
    conda clean -tipy

ENV LC_ALL=en_US.UTF-8

# Cluster configuration shared by every service built from this image.
COPY slurm.conf /etc/slurm/slurm.conf
81 changes: 81 additions & 0 deletions ci/slurm/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
version: "2.2"

services:
# Database backing SLURM accounting: creates the slurm_acct_db database that
# ci/slurm/slurm.conf names in AccountingStorageLoc, persisted in the
# var_lib_mysql named volume.
# NOTE(review): the slurm user's password is hard-coded; presumably acceptable
# only because this is a throwaway CI database - confirm.
mysql:
image: mysql:5.7
hostname: mysql
container_name: mysql
environment:
MYSQL_RANDOM_ROOT_PASSWORD: "yes"
MYSQL_DATABASE: slurm_acct_db
MYSQL_USER: slurm
MYSQL_PASSWORD: password
volumes:
- var_lib_mysql:/var/lib/mysql

# Accounting daemon, built from the Dockerfile in this directory and started
# with the "slurmdbd" command. Its hostname and exposed port (6819) are the
# AccountingStorageHost / AccountingStoragePort values in slurm.conf.
# depends_on lists mysql; start-slurm.sh separately retries cluster
# registration until it stops reporting a sacctmgr error.
slurmdbd:
build: .
command: ["slurmdbd"]
container_name: slurmdbd
hostname: slurmdbd
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- var_log_slurm:/var/log/slurm
expose:
- "6819"
depends_on:
- mysql

# Controller node, started with the "slurmctld" command. Its hostname and
# exposed port (6817) are the ControlMachine / SlurmctldPort values in
# slurm.conf. This is the container ci/slurm.sh runs `docker exec` against.
slurmctld:
build: .
command: ["slurmctld"]
container_name: slurmctld
hostname: slurmctld
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
# Two directories up from ci/slurm, mounted where ci/slurm.sh expects the
# dask-jobqueue checkout for `python setup.py install` and py.test.
- ../..:/dask-jobqueue
expose:
- "6817"
depends_on:
- "slurmdbd"

# First compute node, started with the "slurmd" command. Hostname c1 falls in
# the NodeName=c[1-2] range of slurm.conf; exposed port 6818 is its SlurmdPort.
# Mounts the same munge, slurm config, job directory and log volumes as
# slurmctld.
c1:
build: .
command: ["slurmd"]
hostname: c1
container_name: c1
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
expose:
- "6818"
depends_on:
- "slurmctld"

# Second compute node; identical to c1 apart from its hostname and
# container name.
c2:
build: .
command: ["slurmd"]
hostname: c2
container_name: c2
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
expose:
- "6818"
depends_on:
- "slurmctld"

# Named volumes referenced by the services above. etc_munge, etc_slurm and
# var_log_slurm are mounted by every SLURM service; slurm_jobdir by slurmctld
# and the compute nodes; var_lib_mysql only by mysql.
volumes:
etc_munge:
etc_slurm:
slurm_jobdir:
var_lib_mysql:
var_log_slurm:
5 changes: 5 additions & 0 deletions ci/slurm/register_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Register the cluster named "linux" through sacctmgr inside the slurmctld
# container, then restart the slurmdbd and slurmctld services.
# `set -e` aborts at the first failing command, so the restart only runs when
# the registration command succeeded.
set -e

docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux"
docker-compose restart slurmdbd slurmctld
94 changes: 94 additions & 0 deletions ci/slurm/slurm.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# slurm.conf
#
# See the slurm.conf man page for more information.
#
# Cluster identity and controller endpoints.
# ClusterName is the name register_cluster.sh registers with sacctmgr
# ("add cluster name=linux"); ControlMachine/ControlAddr are the hostname of
# the slurmctld service in docker-compose.yml.
ClusterName=linux
ControlMachine=slurmctld
ControlAddr=slurmctld
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
# Ports match the `expose` entries of the slurmctld (6817) and c1/c2 (6818)
# services in docker-compose.yml.
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp.log
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
# Accounting goes through the slurmdbd service: host and port match its
# hostname and `expose` entry in docker-compose.yml, and AccountingStorageLoc
# is the MYSQL_DATABASE created by the mysql service.
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurmdbd
AccountingStoragePort=6819
AccountingStorageLoc=slurm_acct_db
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
# c1 and c2 are the hostnames of the two slurmd services in docker-compose.yml.
NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN
#
# PARTITIONS
# Single default partition spanning both nodes. DefMemPerCPU (2048) times the
# CPUs per node (2) equals each node's RealMemory (4096).
PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
9 changes: 9 additions & 0 deletions ci/slurm/start-slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Build and start the SLURM docker-compose cluster, then register it by
# running register_cluster.sh until its output no longer contains
# "sacctmgr: error". Exits non-zero if the containers cannot be started or the
# cluster is still not ready after max_attempts tries.

# Without running containers every registration attempt would fail for an
# unrelated reason, so stop here instead of reporting a configured cluster.
docker-compose up --build -d || exit 1

max_attempts=60   # with the 2 s sleep below: give up after about two minutes
attempt=1
while true
do
    # Capture stderr together with stdout: the error text is only detectable
    # if the stream it is written to actually reaches grep.
    output=$(./register_cluster.sh 2>&1)
    echo "$output"
    if ! echo "$output" | grep -q "sacctmgr: error"; then
        break
    fi
    if [ "$attempt" -ge "$max_attempts" ]; then
        echo "SLURM cluster did not become ready after $max_attempts attempts" >&2
        exit 1
    fi
    echo "Waiting for SLURM cluster to become ready";
    sleep 2
    attempt=$((attempt + 1))
done
echo "SLURM properly configured"
23 changes: 13 additions & 10 deletions dask_jobqueue/tests/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ def test_job_script():

@pytest.mark.env("slurm") # noqa: F811
def test_basic(loop):
with SLURMCluster(walltime='00:02:00', threads_per_worker=2, memory='7GB',
loop=loop) as cluster:
with SLURMCluster(walltime='00:02:00', threads=2, processes=1, memory='4GB',
job_extra=['-D /'], loop=loop) as cluster:
with Client(cluster) as client:
workers = cluster.start_workers(2)
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -93,7 +93,7 @@ def test_basic(loop):

info = client.scheduler_info()
w = list(info['workers'].values())[0]
assert w['memory_limit'] == 7e9
assert w['memory_limit'] == 4e9
assert w['ncores'] == 2

cluster.stop_workers(workers)
Expand All @@ -108,7 +108,8 @@ def test_basic(loop):

@pytest.mark.env("slurm") # noqa: F811
def test_adaptive(loop):
with SLURMCluster(walltime='00:02:00', loop=loop) as cluster:
with SLURMCluster(walltime='00:02:00', threads=2, processes=1, memory='4GB',
job_extra=['-D /'], loop=loop) as cluster:
cluster.adapt()
with Client(cluster) as client:
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -117,8 +118,8 @@ def test_adaptive(loop):
assert cluster.jobs

start = time()
while (len(client.scheduler_info()['workers']) !=
cluster.config['processes']):
processes = cluster.worker_processes
while (len(client.scheduler_info()['workers']) != processes):
sleep(0.1)
assert time() < start + 10

Expand All @@ -129,7 +130,9 @@ def test_adaptive(loop):
sleep(0.100)
assert time() < start + 10

start = time()
while cluster.jobs:
sleep(0.100)
assert time() < start + 10
# There is probably a bug to fix in the adaptive methods of the JobQueueCluster
# Currently cluster.jobs is not cleaned up.
# start = time()
# while cluster.jobs:
# sleep(0.100)
# assert time() < start + 10