From c8cb61c72fffd9e06a580073a97125f8ab83dd11 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Tue, 31 Aug 2021 14:22:18 +0800 Subject: [PATCH 1/4] Remove duplicated setting of `allow_growth` in trainer. --- deepmd/train/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 72dc61418e..e82033d1b5 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -397,7 +397,6 @@ def _init_session(self): config = get_tf_session_config() device, idx = self.run_opt.my_device.split(":", 1) if device == "gpu": - config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = idx self.sess = tf.Session(config=config) From 5c8f18b1cae451c5d6b43687428076f4f27f313d Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Tue, 31 Aug 2021 14:23:00 +0800 Subject: [PATCH 2/4] Make parallel training UT independent of its working folder. --- source/tests/test_parallel_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/tests/test_parallel_training.py b/source/tests/test_parallel_training.py index e2310e8267..0c80890abe 100644 --- a/source/tests/test_parallel_training.py +++ b/source/tests/test_parallel_training.py @@ -20,7 +20,7 @@ def test_two_workers(self): penv = os.environ.copy() if len(get_gpus() or []) > 1: penv['CUDA_VISIBLE_DEVICES'] = '0,1' - popen = sp.Popen(command, shell=True, env=penv, stdout=sp.PIPE, stderr=sp.STDOUT) + popen = sp.Popen(command, shell=True, cwd=str(tests_path), env=penv, stdout=sp.PIPE, stderr=sp.STDOUT) for line in iter(popen.stdout.readline, b''): if hasattr(line, 'decode'): line = line.decode('utf-8') From d38338f1a1bcc4d896c708cf02b1be0bcc154a43 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Tue, 31 Aug 2021 21:46:33 +0800 Subject: [PATCH 3/4] Skip parallel-training tests when there is only 1 GPU card. 
--- source/tests/test_parallel_training.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/tests/test_parallel_training.py b/source/tests/test_parallel_training.py index 0c80890abe..320d8ca22c 100644 --- a/source/tests/test_parallel_training.py +++ b/source/tests/test_parallel_training.py @@ -18,8 +18,11 @@ def setUp(self): def test_two_workers(self): command = 'horovodrun -np 2 dp train -m workers ' + self.input_file penv = os.environ.copy() - if len(get_gpus() or []) > 1: + num_gpus = len(get_gpus() or []) + if num_gpus > 1: penv['CUDA_VISIBLE_DEVICES'] = '0,1' + elif num_gpus == 1: + raise unittest.SkipTest("At least 2 GPU cards are needed for parallel-training tests.") popen = sp.Popen(command, shell=True, cwd=str(tests_path), env=penv, stdout=sp.PIPE, stderr=sp.STDOUT) for line in iter(popen.stdout.readline, b''): if hasattr(line, 'decode'): From 428000020b54eae27c0186b35982b95c23c2ec73 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Tue, 31 Aug 2021 22:12:18 +0800 Subject: [PATCH 4/4] Enable parallel training UT in GitHub CI. --- .github/workflows/test_python.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 8517560fc9..27f8e866d6 100644 --- a/.github/workflows/test_python.yml +++ b/.github/workflows/test_python.yml @@ -71,4 +71,9 @@ jobs: CXX: g++-${{ matrix.gcc }} TENSORFLOW_VERSION: ${{ matrix.tf }} - run: dp --version + - name: Prepare parallel runtime + if: ${{ matrix.tf == '' }} + run: | + sudo apt install libopenmpi-dev openmpi-bin + HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_TENSORFLOW=1 pip install horovod mpi4py - run: pytest --cov=deepmd source/tests && codecov