From 1fd45446cdf8e6d517e82f1047417da27cf5f5b4 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 26 May 2025 13:31:54 +0800 Subject: [PATCH 01/14] feat: add rglob by patterns --- deepmd/common.py | 21 +++++++++++++++++++++ deepmd/pt/entrypoints/main.py | 6 ++++-- deepmd/utils/data_system.py | 10 ++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/deepmd/common.py b/deepmd/common.py index c9873a6d94..8ab8722bce 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -206,6 +206,27 @@ def expand_sys_str(root_dir: Union[str, Path]) -> list[str]: matches.append(str(root_dir)) return matches +def rglob_sys_str(root_dir: str, patterns: list[str]) -> list[str]: + """Recursively iterate over directories taking those that contain `type.raw` file. + + Parameters + ---------- + root_dir : str, Path + starting directory + patterns : list[str] + list of glob patterns to match directories + + Returns + ------- + list[str] + list of string pointing to system directories + """ + root_dir = Path(root_dir) + matches = [] + for pattern in patterns: + matches.extend([str(d) for d in root_dir.rglob(pattern) if (d / "type.raw").is_file()]) + return matches + def get_np_precision(precision: "_PRECISION") -> np.dtype: """Get numpy precision constant from string. diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 3fe507ecc2..0e248583ec 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -114,9 +114,11 @@ def prepare_trainer_input_single( validation_dataset_params["systems"] if validation_dataset_params else None ) training_systems = training_dataset_params["systems"] - training_systems = process_systems(training_systems) + trn_patterns = training_dataset_params.get("rglob_patterns", None) + training_systems = process_systems(training_systems, patterns=trn_patterns) if validation_systems is not None: - validation_systems = process_systems(validation_systems) + val_patterns = validation_dataset_params.get("rglob_patterns", None) + validation_systems = process_systems(validation_systems, val_patterns) # stat files stat_file_path_single = data_dict_single.get("stat_file", None) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index fbc9c1a684..081f0ddf5a 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -16,6 +16,7 @@ import deepmd.utils.random as dp_random from deepmd.common import ( expand_sys_str, + rglob_sys_str, make_default_mesh, ) from deepmd.env import ( @@ -730,7 +731,7 @@ def prob_sys_size_ext(keywords, nsystems, nbatch): return sys_probs -def process_systems(systems: Union[str, list[str]]) -> list[str]: +def process_systems(systems: Union[str, list[str]], patterns: Optional[list[str]]=None) -> list[str]: """Process the user-input systems. If it is a single directory, search for all the systems in the directory. @@ -740,6 +741,8 @@ def process_systems(systems: Union[str, list[str]]) -> list[str]: ---------- systems : str or list of str The user-input systems + patterns : list of str, optional + The patterns to match the systems, by default None Returns ------- @@ -747,7 +750,10 @@ def process_systems(systems: Union[str, list[str]]) -> list[str]: The valid systems """ if isinstance(systems, str): - systems = expand_sys_str(systems) + if patterns is None: + systems = expand_sys_str(systems) + else: + systems = rglob_sys_str(systems, patterns) elif isinstance(systems, list): systems = systems.copy() return systems From b5c6be4765595fed4e0822950e4b9b7c7fab526f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 May 2025 05:34:23 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/common.py | 5 ++++- deepmd/utils/data_system.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/deepmd/common.py b/deepmd/common.py index 8ab8722bce..d7cc118ba1 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -206,6 +206,7 @@ def expand_sys_str(root_dir: Union[str, Path]) -> list[str]: matches.append(str(root_dir)) return matches + def rglob_sys_str(root_dir: str, patterns: list[str]) -> list[str]: """Recursively iterate over directories taking those that contain `type.raw` file. @@ -224,7 +225,9 @@ def rglob_sys_str(root_dir: str, patterns: list[str]) -> list[str]: root_dir = Path(root_dir) matches = [] for pattern in patterns: - matches.extend([str(d) for d in root_dir.rglob(pattern) if (d / "type.raw").is_file()]) + matches.extend( + [str(d) for d in root_dir.rglob(pattern) if (d / "type.raw").is_file()] + ) return matches diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 081f0ddf5a..072eb5b7b4 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -16,8 +16,8 @@ import deepmd.utils.random as dp_random from deepmd.common import ( expand_sys_str, - rglob_sys_str, make_default_mesh, + rglob_sys_str, ) from deepmd.env import ( GLOBAL_NP_FLOAT_PRECISION, @@ -731,7 +731,9 @@ def prob_sys_size_ext(keywords, nsystems, nbatch): return sys_probs -def process_systems(systems: Union[str, list[str]], patterns: Optional[list[str]]=None) -> list[str]: +def process_systems( + systems: Union[str, list[str]], patterns: Optional[list[str]] = None +) -> list[str]: """Process the user-input systems. If it is a single directory, search for all the systems in the directory. From 6f999331fa9b2e0a71b12e50c763979645cea3c2 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 26 May 2025 13:57:23 +0800 Subject: [PATCH 03/14] feat: add argcheck --- deepmd/utils/argcheck.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 7554abc7e6..126c69b80a 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2886,6 +2886,9 @@ def training_data_args(): # ! added by Ziyao: new specification style for data "This key can be provided with a list that specifies the systems, or be provided with a string " "by which the prefix of all systems are given and the list of the systems is automatically generated." ) + doc_patterns = ( + "The customized patterns used in `rglob` to collect all training systems. " + ) doc_batch_size = f'This key can be \n\n\ - list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\ - int: all {link_sys} use the same batch size.\n\n\ @@ -2909,6 +2912,9 @@ def training_data_args(): # ! added by Ziyao: new specification style for data Argument( "systems", [list[str], str], optional=False, default=".", doc=doc_systems ), + Argument( + "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + ), Argument( "batch_size", [list[int], int, str], @@ -2955,6 +2961,9 @@ def validation_data_args(): # ! added by Ziyao: new specification style for dat "This key can be provided with a list that specifies the systems, or be provided with a string " "by which the prefix of all systems are given and the list of the systems is automatically generated." ) + doc_patterns = ( + "The customized patterns used in `rglob` to collect all validation systems. " + ) doc_batch_size = f'This key can be \n\n\ - list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\ - int: all {link_sys} use the same batch size.\n\n\ @@ -2974,6 +2983,9 @@ def validation_data_args(): # ! added by Ziyao: new specification style for dat args = [ Argument( "systems", [list[str], str], optional=False, default=".", doc=doc_systems + ), + Argument( + "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns ), Argument( "batch_size", From 1aa0ddf9a29fb1333d4c37d951e32df0812980de Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 26 May 2025 13:58:15 +0800 Subject: [PATCH 04/14] fix: add duplicates check --- deepmd/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/common.py b/deepmd/common.py index d7cc118ba1..23b55203d3 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -228,7 +228,8 @@ def rglob_sys_str(root_dir: str, patterns: list[str]) -> list[str]: matches.extend( [str(d) for d in root_dir.rglob(pattern) if (d / "type.raw").is_file()] ) - return matches + return list(set(matches)) # remove duplicates + def get_np_precision(precision: "_PRECISION") -> np.dtype: From 030143e7a1bac3e49cdbe5cf0faaf985f45e814c Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 26 May 2025 14:01:49 +0800 Subject: [PATCH 05/14] doc: add support PT only --- deepmd/utils/argcheck.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 126c69b80a..0d6437ab62 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2913,7 +2913,7 @@ def training_data_args(): # ! added by Ziyao: new specification style for data "systems", [list[str], str], optional=False, default=".", doc=doc_systems ), Argument( - "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + doc_only_pt_supported ), Argument( "batch_size", @@ -2985,7 +2985,7 @@ def validation_data_args(): # ! added by Ziyao: new specification style for dat "systems", [list[str], str], optional=False, default=".", doc=doc_systems ), Argument( - "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + doc_only_pt_supported ), Argument( "batch_size", From 9b704878fa360069f9d04b2378044c51f855ab25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 May 2025 06:06:05 +0000 Subject: [PATCH 06/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/common.py | 1 - deepmd/utils/argcheck.py | 14 +++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/deepmd/common.py b/deepmd/common.py index 23b55203d3..3ab936db67 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -231,7 +231,6 @@ def rglob_sys_str(root_dir: str, patterns: list[str]) -> list[str]: return list(set(matches)) # remove duplicates - def get_np_precision(precision: "_PRECISION") -> np.dtype: """Get numpy precision constant from string. diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 0d6437ab62..0ac2fa8b82 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2913,7 +2913,11 @@ def training_data_args(): # ! added by Ziyao: new specification style for data "systems", [list[str], str], optional=False, default=".", doc=doc_systems ), Argument( - "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + doc_only_pt_supported + "rglob_patterns", + [list[str]], + optional=True, + default=None, + doc=doc_patterns + doc_only_pt_supported, ), Argument( "batch_size", @@ -2984,8 +2988,12 @@ def validation_data_args(): # ! added by Ziyao: new specification style for dat Argument( "systems", [list[str], str], optional=False, default=".", doc=doc_systems ), - Argument( - "rglob_patterns", [list[str]], optional=True, default=None, doc=doc_patterns + doc_only_pt_supported + Argument( + "rglob_patterns", + [list[str]], + optional=True, + default=None, + doc=doc_patterns + doc_only_pt_supported, ), Argument( "batch_size", From 7c247d5067c7dca4a44573f947663077afc7b8df Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 27 May 2025 14:44:22 +0800 Subject: [PATCH 07/14] fix: pass patterns to get_data --- deepmd/utils/data_system.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 072eb5b7b4..07dab35a90 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -785,7 +785,8 @@ def get_data( The data system """ systems = jdata["systems"] - systems = process_systems(systems) + rglob_patterns = jdata.get("rglob_patterns", None) + systems = process_systems(systems, patterns=rglob_patterns) batch_size = jdata["batch_size"] sys_probs = jdata.get("sys_probs", None) From fd59bf9f133ea96304b585a7e920c51461558170 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:46:26 +0800 Subject: [PATCH 08/14] feat: add integration test --- source/tests/pt/test_training.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index ad52c5db16..237fa9e8a6 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -516,5 +516,22 @@ def tearDown(self) -> None: shutil.rmtree(f) +class TestCustomizedRGLOB(unittest.TestCase, DPTrainTest): + def setUp(self) -> None: + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" + self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) + self.config["training"]["validation_data"]["rglob_patterns"] = "water/data/data_1" + self.config["training"]["validation_data"]["systems"] = str(Path(__file__).parent) + self.config["model"] = deepcopy(model_dpa1) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + + def tearDown(self) -> None: + DPTrainTest.tearDown(self) + if __name__ == "__main__": unittest.main() From ff52ad294d11166d5fbdff5795010f1ff4249858 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Jun 2025 04:48:06 +0000 Subject: [PATCH 09/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/tests/pt/test_training.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 237fa9e8a6..9db43c1b37 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -524,8 +524,12 @@ def setUp(self) -> None: data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) - self.config["training"]["validation_data"]["rglob_patterns"] = "water/data/data_1" - self.config["training"]["validation_data"]["systems"] = str(Path(__file__).parent) + self.config["training"]["validation_data"]["rglob_patterns"] = ( + "water/data/data_1" + ) + self.config["training"]["validation_data"]["systems"] = str( + Path(__file__).parent + ) self.config["model"] = deepcopy(model_dpa1) self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1 @@ -533,5 +537,6 @@ def setUp(self) -> None: def tearDown(self) -> None: DPTrainTest.tearDown(self) + if __name__ == "__main__": unittest.main() From e621ce205f740effdaaa13be622f8d9e025bdaac Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:52:32 +0800 Subject: [PATCH 10/14] chore: remove redundant --- source/tests/pt/test_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 9db43c1b37..cfc63c7c4b 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -521,7 +521,6 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) - data_file = [str(Path(__file__).parent / "water/data/data_0")] self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) self.config["training"]["validation_data"]["rglob_patterns"] = ( From fc402537d3a2bac9368e90d6b5e718363c61302c Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:12:39 +0800 Subject: [PATCH 11/14] fix: update pattern --- source/tests/pt/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index cfc63c7c4b..336e2a7e2e 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -524,7 +524,7 @@ def setUp(self) -> None: self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) self.config["training"]["validation_data"]["rglob_patterns"] = ( - "water/data/data_1" + "water/*/data_0" ) self.config["training"]["validation_data"]["systems"] = str( Path(__file__).parent From 16fd12d1de088a331bca0446af54a32f9e11ac03 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Jun 2025 05:14:15 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/tests/pt/test_training.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 336e2a7e2e..30705ee6ab 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -523,9 +523,7 @@ def setUp(self) -> None: self.config = json.load(f) self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) - self.config["training"]["validation_data"]["rglob_patterns"] = ( - "water/*/data_0" - ) + self.config["training"]["validation_data"]["rglob_patterns"] = "water/*/data_0" self.config["training"]["validation_data"]["systems"] = str( Path(__file__).parent ) From 7f4c958747735641db7c275bb84a2674af50d24d Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 3 Jun 2025 14:15:35 +0800 Subject: [PATCH 13/14] fix: UT dtype --- source/tests/pt/test_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 30705ee6ab..cf3c010b0b 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -521,9 +521,9 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) - self.config["training"]["training_data"]["rglob_patterns"] = "water/data/data_*" + self.config["training"]["training_data"]["rglob_patterns"] = ["water/data/data_*"] self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) - self.config["training"]["validation_data"]["rglob_patterns"] = "water/*/data_0" + self.config["training"]["validation_data"]["rglob_patterns"] = ["water/*/data_0"] self.config["training"]["validation_data"]["systems"] = str( Path(__file__).parent ) From b8c5db63a4fc39e49c90ef514677650d0c75d682 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Jun 2025 06:17:16 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/tests/pt/test_training.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index cf3c010b0b..3df95e4b14 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -521,9 +521,13 @@ def setUp(self) -> None: input_json = str(Path(__file__).parent / "water/se_atten.json") with open(input_json) as f: self.config = json.load(f) - self.config["training"]["training_data"]["rglob_patterns"] = ["water/data/data_*"] + self.config["training"]["training_data"]["rglob_patterns"] = [ + "water/data/data_*" + ] self.config["training"]["training_data"]["systems"] = str(Path(__file__).parent) - self.config["training"]["validation_data"]["rglob_patterns"] = ["water/*/data_0"] + self.config["training"]["validation_data"]["rglob_patterns"] = [ + "water/*/data_0" + ] self.config["training"]["validation_data"]["systems"] = str( Path(__file__).parent )