Patch summary (text before the first "diff --git" header is ignored by patch tools):
  * .github/workflows/test_python.yml: add a "Prepare parallel runtime" step that
    runs only when matrix.tf is empty; it installs libopenmpi-dev/openmpi-bin via
    apt and then horovod and mpi4py via pip (HOROVOD_WITHOUT_GLOO=1,
    HOROVOD_WITH_TENSORFLOW=1).
  * deepmd/train/trainer.py: _init_session no longer sets
    config.gpu_options.allow_growth = True on the "gpu" branch.
  * source/tests/test_parallel_training.py: test_two_workers raises
    unittest.SkipTest when exactly one GPU is reported, and launches horovodrun
    with cwd=str(tests_path).
NOTE(review): the line structure was restored from a single-line copy of this
patch; leading whitespace inside the hunks was reconstructed from syntax, so
confirm it against the target files before applying.

diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 8517560fc9..27f8e866d6 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -71,4 +71,9 @@ jobs:
         CXX: g++-${{ matrix.gcc }}
         TENSORFLOW_VERSION: ${{ matrix.tf }}
     - run: dp --version
+    - name: Prepare parallel runtime
+      if: ${{ matrix.tf == '' }}
+      run: |
+        sudo apt install libopenmpi-dev openmpi-bin
+        HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_TENSORFLOW=1 pip install horovod mpi4py
     - run: pytest --cov=deepmd source/tests && codecov
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index 72dc61418e..e82033d1b5 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -397,7 +397,6 @@ def _init_session(self):
         config = get_tf_session_config()
         device, idx = self.run_opt.my_device.split(":", 1)
         if device == "gpu":
-            config.gpu_options.allow_growth = True
             config.gpu_options.visible_device_list = idx
         self.sess = tf.Session(config=config)
 
diff --git a/source/tests/test_parallel_training.py b/source/tests/test_parallel_training.py
index e2310e8267..320d8ca22c 100644
--- a/source/tests/test_parallel_training.py
+++ b/source/tests/test_parallel_training.py
@@ -18,9 +18,12 @@ def setUp(self):
     def test_two_workers(self):
         command = 'horovodrun -np 2 dp train -m workers ' + self.input_file
         penv = os.environ.copy()
-        if len(get_gpus() or []) > 1:
+        num_gpus = len(get_gpus() or [])
+        if num_gpus > 1:
             penv['CUDA_VISIBLE_DEVICES'] = '0,1'
-        popen = sp.Popen(command, shell=True, env=penv, stdout=sp.PIPE, stderr=sp.STDOUT)
+        elif num_gpus == 1:
+            raise unittest.SkipTest("At least 2 GPU cards are needed for parallel-training tests.")
+        popen = sp.Popen(command, shell=True, cwd=str(tests_path), env=penv, stdout=sp.PIPE, stderr=sp.STDOUT)
         for line in iter(popen.stdout.readline, b''):
             if hasattr(line, 'decode'):
                 line = line.decode('utf-8')