Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ matrix:
- OS=ubuntu-14.04
# JOBQUEUE=none is for tests that do not need a cluster to run
- JOBQUEUE=none
- python: "3.6"
env:
- OS=ubuntu-14.04
- JOBQUEUE=pbs
env:
global:
- DOCKER_COMPOSE_VERSION=1.6.0
Expand All @@ -40,8 +44,8 @@ install:
- jobqueue_install
script:
- jobqueue_script
after_success:
- jobqueue_after_success
after_script:
- jobqueue_after_script

# TODO
# - pip install --no-cache-dir coveralls
Expand Down
4 changes: 2 additions & 2 deletions ci/none.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ function jobqueue_script {
py.test --verbose
}

function jobqueue_after_success {
echo "Hurrah"
function jobqueue_after_script {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remind me the difference between after_success and after_script, is that that after_script can still fail the build but not after_success?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The difference as I understand it is that after_success is run only if the script succeeded, and after_script is run no matter if the script succeeded or failed.
I don't believe any of those can fail the build.
I needed after_script to debug the failures.

echo "Done."
}
40 changes: 40 additions & 0 deletions ci/pbs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version

# start pbs cluster
cd ./ci/pbs
./start-pbs.sh
cd -

docker exec -it -u pbsuser pbs_master pbsnodes -a
docker ps -a
docker images
}

function jobqueue_install {
docker exec -it pbs_master /bin/bash -c "cd /dask-jobqueue; python setup.py install"
}

function jobqueue_script {
docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; py.test dask_jobqueue --verbose -E pbs"
}

function jobqueue_after_script {
docker exec -it -u pbsuser pbs_master qstat -fx
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/sched_logs/*'
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/server_logs/*'
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/server_priv/accounting/*'
docker exec -it pbs_slave_1 bash -c 'cat /var/spool/pbs/mom_logs/*'
docker exec -it pbs_slave_1 bash -c 'cat /var/spool/pbs/spool/*'
docker exec -it pbs_slave_1 bash -c 'cat /tmp/*.e*'
docker exec -it pbs_slave_1 bash -c 'cat /tmp/*.o*'
docker exec -it pbs_slave_2 bash -c 'cat /var/spool/pbs/mom_logs/*'
docker exec -it pbs_slave_2 bash -c 'cat /var/spool/pbs/spool/*'
docker exec -it pbs_slave_2 bash -c 'cat /tmp/*.e*'
docker exec -it pbs_slave_2 bash -c 'cat /tmp/*.o*'
}
42 changes: 42 additions & 0 deletions ci/pbs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# inspired from https://github.com/PBSPro/pbspro/blob/v18.1.beta/docker/centos7/
# multi-stage build
# build script will be triggered
FROM centos:7.4.1708 AS builder
# install dependencies for building
RUN yum install -y gcc make rpm-build libtool hwloc-devel libX11-devel \
libXt-devel libedit-devel libical-devel ncurses-devel perl \
postgresql-devel python-devel tcl-devel tk-devel swig expat-devel \
openssl-devel libXext libXft git
# get known PBS Pro source code
RUN git clone --branch v14.1.2 https://github.com/pbspro/pbspro.git /src/pbspro
COPY build.sh /
RUN bash /build.sh

# base image
FROM centos:7.4.1708
LABEL description="PBS Professional Open Source and conda"

#The pbs master node name, can be overriden if needed
ENV PBS_MASTER pbs_master
ENV PATH /opt/pbs/bin:/opt/anaconda/bin:$PATH
ENV LANG en_US.UTF-8
ENV LC_ALL en_US.UTF-8

COPY --from=builder /root/rpmbuild/RPMS/x86_64/pbspro-server-*.rpm .
# install pbspro and useful packages
RUN yum install -y pbspro-server-*.rpm curl bzip2 git gcc sudo openssh-server && yum clean all
# install python
RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -f -b -p /opt/anaconda && \
/opt/anaconda/bin/conda clean -tipy && \
rm -f miniconda.sh
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
# Take the latest version of distributed due to test failure otherwise (see #47 comment by mrocklin)
RUN pip install --no-cache-dir git+https://github.com/dask/distributed.git --upgrade

# Copy entrypoint and other needed scripts
COPY ./*.sh /
RUN chmod a+x ./*.sh

# default entrypoint launch pbs master
ENTRYPOINT ["bash", "/master-entrypoint.sh"]
10 changes: 10 additions & 0 deletions ci/pbs/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
cd /src/pbspro
./autogen.sh
./configure -prefix=/opt/pbs
make dist
mkdir /root/rpmbuild /root/rpmbuild/SOURCES /root/rpmbuild/SPECS
cp pbspro-*.tar.gz /root/rpmbuild/SOURCES
cp pbspro.spec /root/rpmbuild/SPECS
cd /root/rpmbuild/SPECS
rpmbuild -ba pbspro.spec
42 changes: 42 additions & 0 deletions ci/pbs/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
version: "2"

services:

master:
build: .
container_name: pbs_master
hostname: pbs_master
volumes:
- ../..:/dask-jobqueue
command: bash /run-master.sh

slave_one:
build: .
container_name: pbs_slave_1
hostname: pbs_slave_1
volumes:
- ../..:/dask-jobqueue
entrypoint: "bash /slave-entrypoint.sh"
command: bash /run-slave.sh
links:
- "master:pbs_master"
environment:
- PBS_MASTER=pbs_master
depends_on:
- master

slave_two:
build: .
container_name: pbs_slave_2
hostname: pbs_slave_2
volumes:
- ../..:/dask-jobqueue
entrypoint: "bash /slave-entrypoint.sh"
command: bash /run-slave.sh
links:
- "master:pbs_master"
environment:
- PBS_MASTER=pbs_master
depends_on:
- master

16 changes: 16 additions & 0 deletions ci/pbs/master-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
pbs_conf_file=/etc/pbs.conf
mom_conf_file=/var/spool/pbs/mom_priv/config
hostname=$(hostname)

# replace hostname in pbs.conf and mom_priv/config
sed -i "s/PBS_SERVER=.*/PBS_SERVER=$hostname/" $pbs_conf_file
sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file

# start PBS Pro
/etc/init.d/pbs start

# create default non-root user
adduser pbsuser

exec "$@"
13 changes: 13 additions & 0 deletions ci/pbs/run-master.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Reduce time between PBS scheduling and add history
qmgr -c "set server scheduler_iteration = 20"
qmgr -c "set server job_history_enable = True"
qmgr -c "set server job_history_duration = 24:00:00"

# add two slaves to pbs
qmgr -c "create node pbs_slave_1"
qmgr -c "create node pbs_slave_2"

# Start hanging process to leave the container up and running
python -m http.server 8888
4 changes: 4 additions & 0 deletions ci/pbs/run-slave.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Start hanging process to leave the container up and running
python -m http.server 8888
25 changes: 25 additions & 0 deletions ci/pbs/slave-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/sh
pbs_conf_file=/etc/pbs.conf
mom_conf_file=/var/spool/pbs/mom_priv/config
hostname=$(hostname)

# replace hostname in pbs.conf and mom_priv/config
sed -i "s/PBS_SERVER=.*/PBS_SERVER=$PBS_MASTER/" $pbs_conf_file
sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file
sed -i "s/PBS_START_SERVER=.*/PBS_START_SERVER=0/" $pbs_conf_file
sed -i "s/PBS_START_SCHED=.*/PBS_START_SCHED=0/" $pbs_conf_file
sed -i "s/PBS_START_COMM=.*/PBS_START_COMM=0/" $pbs_conf_file
sed -i "s/PBS_START_MOM=.*/PBS_START_MOM=1/" $pbs_conf_file

# Prevent PBS trying to use scp between host for stdout and stderr file of jobs
# On standard PBS deployement, you would use a shared mount, or correctly configured passwordless scp
echo "\$usecp *:/home/ /home/" >> $mom_conf_file
echo "\$usecp *:/dask-jobqueue/ /tmp/" >> $mom_conf_file

# start PBS Pro
/etc/init.d/pbs start

# create default non-root user
adduser pbsuser

exec "$@"
9 changes: 9 additions & 0 deletions ci/pbs/start-pbs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

docker-compose up -d
while [ `docker exec -it -u pbsuser pbs_master pbsnodes -a | grep "Mom = pbs_slave" | wc -l` -ne 2 ]
do
echo "Waiting for PBS slave nodes to become available";
sleep 2
done
echo "PBS properly configured"
2 changes: 1 addition & 1 deletion ci/sge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ function jobqueue_script {
docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; py.test dask_jobqueue --verbose -E sge"
}

function jobqueue_after_success {
function jobqueue_after_script {
docker exec -it sge_master bash -c 'cat /tmp/sge*'
docker exec -it slave_one bash -c 'cat /tmp/exec*'
docker exec -it slave_two bash -c 'cat /tmp/exec*'
Expand Down
21 changes: 12 additions & 9 deletions dask_jobqueue/tests/test_pbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def test_job_script():

@pytest.mark.env("pbs") # noqa: F811
def test_basic(loop):
with PBSCluster(walltime='00:02:00', threads_per_worker=2, memory='7GB',
interface='ib0', loop=loop) as cluster:
with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB', local_directory='/tmp',
job_extra=['-V'], loop=loop) as cluster:
with Client(cluster) as client:
workers = cluster.start_workers(2)
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -87,7 +87,7 @@ def test_basic(loop):

info = client.scheduler_info()
w = list(info['workers'].values())[0]
assert w['memory_limit'] == 7e9
assert w['memory_limit'] == 2e9
assert w['ncores'] == 2

cluster.stop_workers(workers)
Expand All @@ -102,7 +102,8 @@ def test_basic(loop):

@pytest.mark.env("pbs") # noqa: F811
def test_adaptive(loop):
with PBSCluster(walltime='00:02:00', loop=loop) as cluster:
with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB', local_directory='/tmp',
job_extra=['-V'], loop=loop) as cluster:
cluster.adapt()
with Client(cluster) as client:
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -111,7 +112,7 @@ def test_adaptive(loop):
assert cluster.jobs

start = time()
processes = cluster.config['processes']
processes = cluster.worker_processes
while len(client.scheduler_info()['workers']) != processes:
sleep(0.1)
assert time() < start + 10
Expand All @@ -123,7 +124,9 @@ def test_adaptive(loop):
sleep(0.100)
assert time() < start + 10

start = time()
while cluster.jobs:
sleep(0.100)
assert time() < start + 10
# There is probably a bug to fix in the adaptive methods of the JobQueueCluster
# Currently cluster.jobs is not cleaned up.
#start = time()
#while cluster.jobs:
# sleep(0.100)
# assert time() < start + 10