From c7435a83a67ff588636d9e821b18ce3f36bcf165 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Mon, 6 Jan 2025 16:56:57 +0800 Subject: [PATCH 1/2] fix: lower `num_workers` to 4 For multi-task training in PyTorch, each data source has its own dataloader. If each dataloader uses a large number of workers, there will be many worker processes stressing the CPU. Signed-off-by: Chun Cai --- deepmd/pt/utils/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py index 9803f8d04d..0e1322a640 100644 --- a/deepmd/pt/utils/env.py +++ b/deepmd/pt/utils/env.py @@ -21,7 +21,7 @@ ncpus = len(os.sched_getaffinity(0)) except AttributeError: ncpus = os.cpu_count() -NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(8, ncpus))) +NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus))) # Make sure DDP uses correct device if applicable LOCAL_RANK = os.environ.get("LOCAL_RANK") LOCAL_RANK = int(0 if LOCAL_RANK is None else LOCAL_RANK) From 3fdf2b79f3e1f9c5c40e65569f5c9362f8888585 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Mon, 6 Jan 2025 17:14:36 +0800 Subject: [PATCH 2/2] update docs --- doc/env.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/env.md b/doc/env.md index 3cf42b724a..4ca7101236 100644 --- a/doc/env.md +++ b/doc/env.md @@ -72,7 +72,7 @@ Default backend. :::{envvar} NUM_WORKERS -**Default**: 8 or the number of cores (whichever is smaller) +**Default**: 4 or the number of cores (whichever is smaller) {{ pytorch_icon }} Number of subprocesses to use for data loading in the PyTorch backend. See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for details.