From 35d5577a12b75903b2e5e1c9de4681957cbf9ded Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Thu, 26 Dec 2024 01:27:32 +0800 Subject: [PATCH 01/24] Perf: replace unnecessary `torch.split` with indexing (#4505) Some operations only use the first segment of the result tensor of `torch.split`. In this case, all the other segments are created and discarded. This slightly adds an overhead to the training process. ## Summary by CodeRabbit - **Bug Fixes** - Simplified tensor slicing operations in the `RepformerLayer` class and the `nlist_distinguish_types` function, enhancing readability and performance. - **Documentation** - Updated comments for clarity regarding tensor shapes in the `RepformerLayer` class. (cherry picked from commit 3cecca44a587b3d3160ada40b07d86abd734eeb7) --- deepmd/pt/model/descriptor/repformer_layer.py | 2 +- deepmd/pt/utils/nlist.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py index 86b09e9b40..1e2cba66d6 100644 --- a/deepmd/pt/model/descriptor/repformer_layer.py +++ b/deepmd/pt/model/descriptor/repformer_layer.py @@ -1003,7 +1003,7 @@ def _cal_grrg(h2g2: torch.Tensor, axis_neuron: int) -> torch.Tensor: # nb x nloc x 3 x ng2 nb, nloc, _, ng2 = h2g2.shape # nb x nloc x 3 x axis - h2g2m = torch.split(h2g2, axis_neuron, dim=-1)[0] + h2g2m = h2g2[..., :axis_neuron] # nb x nloc x axis x ng2 g1_13 = torch.matmul(torch.transpose(h2g2m, -1, -2), h2g2) / (3.0**1) # nb x nloc x (axisxng2) diff --git a/deepmd/pt/utils/nlist.py b/deepmd/pt/utils/nlist.py index db1e87785b..ec94e8cd60 100644 --- a/deepmd/pt/utils/nlist.py +++ b/deepmd/pt/utils/nlist.py @@ -310,7 +310,7 @@ def nlist_distinguish_types( inlist = torch.gather(nlist, 2, imap) inlist = inlist.masked_fill(~(pick_mask.to(torch.bool)), -1) # nloc x nsel[ii] - ret_nlist.append(torch.split(inlist, [ss, snsel - ss], dim=-1)[0]) + ret_nlist.append(inlist[..., :ss]) return 
torch.concat(ret_nlist, dim=-1) From dfaa2ba4b2cf47c1ccf61fdedee1558aa974d73f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 26 Dec 2024 00:22:39 -0500 Subject: [PATCH 02/24] docs: fix the header of the scaling test table (#4507) Fix #4494. ## Summary by CodeRabbit - **Documentation** - Updated the parallel training documentation for TensorFlow and PyTorch to enhance clarity. - Expanded explanations on parallel training processes and data loading utilities. - Introduced a flowchart to illustrate data flow and modified the scaling tests table format for better understanding. Signed-off-by: Jinzhe Zeng (cherry picked from commit bd2395cf7a40afd90dc0f203583bdb836f06feda) --- doc/train/parallel-training.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md index 9ea92b4751..00df0a63f0 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -27,13 +27,14 @@ In some cases, it won't work well when scaling the learning rate by worker count ### Scaling test Testing `examples/water/se_e2_a` on an 8-GPU host, linear acceleration can be observed with the increasing number of cards. - -| Num of GPU cards | Seconds every 100 samples | Samples per second | Speed up | -| ---------------- | ------------------------- | ------------------ | -------- | -| 1 | 1.4515 | 68.89 | 1.00 | -| 2 | 1.5962 | 62.65\*2 | 1.82 | -| 4 | 1.7635 | 56.71\*4 | 3.29 | -| 8 | 1.7267 | 57.91\*8 | 6.72 | +In this example, the number of samples per batch on a single GPU card ({ref}`batch_size `) is set to `1`. 
+ +| Num of GPU cards | Samples per batch | Seconds every 100 batches | Samples per second | Speed up | +| ---------------- | ----------------- | ------------------------- | ------------------ | -------- | +| 1 | 1 | 1.4515 | 68.89 | 1.00 | +| 2 | 2 | 1.5962 | 62.65\*2 | 1.82 | +| 4 | 4 | 1.7635 | 56.71\*4 | 3.29 | +| 8 | 8 | 1.7267 | 57.91\*8 | 6.72 | ### How to use From a44c6ca935b21acfdb9cad5e4047d2644fb34f06 Mon Sep 17 00:00:00 2001 From: James Misaka Date: Thu, 26 Dec 2024 15:44:30 +0800 Subject: [PATCH 03/24] Fix: Modify docs of DPA models (#4510) Modify docs of DPA models, especially for DPA-1 website ## Summary by CodeRabbit - **Documentation** - Updated DPA-2 model documentation for improved clarity and accessibility. - Changed references in the "se_atten" descriptor documentation to link to a formal publication on Nature. - Revised citations in the fine-tuning documentation to point to the DPA-1 paper on Nature, enhancing the credibility of sources. (cherry picked from commit ff9b75ed57317feb2d30a792b34fe8b35298e97f) --- doc/model/dpa2.md | 2 +- doc/model/train-se-atten.md | 4 ++-- doc/train/finetuning.md | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md index 300876bf05..a041547a14 100644 --- a/doc/model/dpa2.md +++ b/doc/model/dpa2.md @@ -4,7 +4,7 @@ **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: -The DPA-2 model implementation. See https://doi.org/10.1038/s41524-024-01493-2 for more details. +The DPA-2 model implementation. See [DPA-2 paper](https://doi.org/10.1038/s41524-024-01493-2) for more details. Training example: `examples/water/dpa2/input_torch_medium.json`, see [README](../../examples/water/dpa2/README.md) for inputs in different levels. 
diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md index 92a56395f6..504b214737 100644 --- a/doc/model/train-se-atten.md +++ b/doc/model/train-se-atten.md @@ -8,7 +8,7 @@ Here we propose DPA-1, a Deep Potential model with a novel attention mechanism, which is highly effective for representing the conformation and chemical spaces of atomic systems and learning the PES. -See [this paper](https://arxiv.org/abs/2208.08236) for more information. DPA-1 is implemented as a new descriptor `"se_atten"` for model training, which can be used after simply editing the input.json. +See [this paper](https://www.nature.com/articles/s41524-024-01278-7) for more information. DPA-1 is implemented as a new descriptor `"se_atten"` for model training, which can be used after simply editing the input.json. ## Theory @@ -71,7 +71,7 @@ Then layer normalization is added in a residual way to finally obtain the self-a Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`. The notation of `se_atten` is short for the smooth edition of Deep Potential with an attention mechanism. -This descriptor was described in detail in [the DPA-1 paper](https://arxiv.org/abs/2208.08236) and the images above. +This descriptor was described in detail in [the DPA-1 paper](https://www.nature.com/articles/s41524-024-01278-7) and the images above. In this example, we will train a DPA-1 model for a water system. A complete training input script of this example can be found in the directory: diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md index 04d86cfc98..beb6012003 100644 --- a/doc/train/finetuning.md +++ b/doc/train/finetuning.md @@ -9,7 +9,7 @@ to vastly reduce the training cost, while it's not trivial in potential models. 
Compositions and configurations of data samples or even computational parameters in upstream software (such as VASP) may be different between the pre-trained and target datasets, leading to energy shifts or other diversities of training data. -Recently the emerging of methods such as [DPA-1](https://arxiv.org/abs/2208.08236) has brought us to a new stage where we can +Recently the emerging of methods such as [DPA-1](https://www.nature.com/articles/s41524-024-01278-7) has brought us to a new stage where we can perform similar pretraining-finetuning approaches. They can hopefully learn the common knowledge in the pre-trained dataset (especially the `force` information) and thus reduce the computational cost in downstream training tasks. @@ -19,7 +19,7 @@ and thus reduce the computational cost in downstream training tasks. If you have a pre-trained model `pretrained.pb` (here we support models using [`se_atten`](../model/train-se-atten.md) descriptor and [`ener`](../model/train-energy.md) fitting net) on a large dataset (for example, [OC2M](https://github.com/Open-Catalyst-Project/ocp/blob/main/DATASET.md) in -DPA-1 [paper](https://arxiv.org/abs/2208.08236)), a finetuning strategy can be performed by simply running: +DPA-1 [paper](https://www.nature.com/articles/s41524-024-01278-7)), a finetuning strategy can be performed by simply running: ```bash $ dp train input.json --finetune pretrained.pb From 43c8cae30d569cdb8f6af96e947a3a12c3a7a415 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 7 Jan 2025 10:23:36 +0800 Subject: [PATCH 04/24] fix(pt): fix clearing the list in set_eval_descriptor_hook (#4534) Fix #4533. 
## Summary by CodeRabbit - **Bug Fixes** - Improved list clearing mechanism in `DPAtomicModel` class - Enhanced test coverage for descriptor evaluation in `TestDeepPot` Signed-off-by: Jinzhe Zeng (cherry picked from commit c544c94520a153962fc4a623702c17a020be25b2) --- deepmd/pt/model/atomic_model/dp_atomic_model.py | 3 ++- source/tests/infer/test_models.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py index 2cdc97f934..0f34f155ac 100644 --- a/deepmd/pt/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py @@ -69,7 +69,8 @@ def __init__( def set_eval_descriptor_hook(self, enable: bool) -> None: """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" self.enable_eval_descriptor_hook = enable - self.eval_descriptor_list = [] + # = [] does not work; See #4533 + self.eval_descriptor_list.clear() def eval_descriptor(self) -> torch.Tensor: """Evaluate the descriptor.""" diff --git a/source/tests/infer/test_models.py b/source/tests/infer/test_models.py index a6cde3206c..bf1a04766c 100644 --- a/source/tests/infer/test_models.py +++ b/source/tests/infer/test_models.py @@ -159,6 +159,10 @@ def test_descriptor(self) -> None: descpt = self.dp.eval_descriptor(result.coord, result.box, result.atype) expected_descpt = result.descriptor np.testing.assert_almost_equal(descpt.ravel(), expected_descpt.ravel()) + # See #4533 + descpt = self.dp.eval_descriptor(result.coord, result.box, result.atype) + expected_descpt = result.descriptor + np.testing.assert_almost_equal(descpt.ravel(), expected_descpt.ravel()) def test_2frame_atm(self) -> None: for ii, result in enumerate(self.case.results): From 2b7f53cc5c4ef56bd23ee119421bad5475ca763e Mon Sep 17 00:00:00 2001 From: Jia-Xin Zhu <53895049+ChiahsinChu@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:27:11 +0800 Subject: [PATCH 05/24] [fix bug] load 
atomic_*.npy for tf tensor model (#4538) Fix bug mentioned in https://github.com/deepmodeling/deepmd-kit/issues/4536 - **Bug Fixes** - Updated atomic property and weight label naming conventions across the machine learning training and loss components to ensure consistent terminology. - Corrected placeholder key references in the training process to match updated label names. (cherry picked from commit 380efb9837b0b396155a98959d3cb954fc630d92) --- deepmd/tf/loss/tensor.py | 2 +- deepmd/tf/train/trainer.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/deepmd/tf/loss/tensor.py b/deepmd/tf/loss/tensor.py index aca9182ff6..1e90c89cb8 100644 --- a/deepmd/tf/loss/tensor.py +++ b/deepmd/tf/loss/tensor.py @@ -145,7 +145,7 @@ def label_requirement(self) -> list[DataRequirementItem]: # data required data_requirements.append( DataRequirementItem( - "atom_" + self.label_name, + "atomic_" + self.label_name, self.tensor_size, atomic=True, must=False, diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py index c6affdef7b..16e2c27a40 100644 --- a/deepmd/tf/train/trainer.py +++ b/deepmd/tf/train/trainer.py @@ -282,6 +282,13 @@ def _build_network(self, data, suffix="") -> None: tf.int32, [None], name="t_mesh" ) self.place_holders["is_training"] = tf.placeholder(tf.bool) + # update "atomic_" in self.place_holders.keys() with "atom_" + for kk in list(self.place_holders.keys()): + if "atomic_" in kk: + self.place_holders[kk.replace("atomic_", "atom_")] = ( + self.place_holders.pop(kk) + ) + self.model_pred = self.model.build( self.place_holders["coord"], self.place_holders["type"], From cc37fd158c529caa6c1203583bfed6f50162679b Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Tue, 7 Jan 2025 14:52:10 +0800 Subject: [PATCH 06/24] fix: lower `num_workers` to 4 (#4535) For multi-task training in pytorch, each data source will have their own dataloader. 
If the number of workers of dataloaders is large, there will be many (number of tasks * num_workers) worker processes stressing CPU. ## Summary by CodeRabbit - **Performance Optimization** - Adjusted default maximum worker configuration from 8 to 4 CPUs - Reduced potential parallel processing resources for the environment - **Documentation** - Updated documentation to reflect the change in default value for `NUM_WORKERS` from 8 to 4 --------- Signed-off-by: Chun Cai (cherry picked from commit 38dc5c91e2171abda3793b677183a3f72313990f) --- deepmd/pt/utils/env.py | 2 +- doc/env.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py index 81dce669ff..ef2654241b 100644 --- a/deepmd/pt/utils/env.py +++ b/deepmd/pt/utils/env.py @@ -21,7 +21,7 @@ ncpus = len(os.sched_getaffinity(0)) except AttributeError: ncpus = os.cpu_count() -NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(8, ncpus))) +NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus))) # Make sure DDP uses correct device if applicable LOCAL_RANK = os.environ.get("LOCAL_RANK") LOCAL_RANK = int(0 if LOCAL_RANK is None else LOCAL_RANK) diff --git a/doc/env.md b/doc/env.md index 3cf42b724a..4ca7101236 100644 --- a/doc/env.md +++ b/doc/env.md @@ -72,7 +72,7 @@ Default backend. :::{envvar} NUM_WORKERS -**Default**: 8 or the number of cores (whichever is smaller) +**Default**: 4 or the number of cores (whichever is smaller) {{ pytorch_icon }} Number of subprocesses to use for data loading in the PyTorch backend. See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for details. From 5f9bbc46c02a9259d5a702d8aead0783e30934ef Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 10 Jan 2025 16:58:46 +0800 Subject: [PATCH 07/24] feat(tf): support tensor fitting with hybrid descriptor (#4542) Fix #4527. ## Summary by CodeRabbit - **New Features** - Added rotation matrix handling methods to descriptor classes. 
- Introduced validation for model initialization. - **Bug Fixes** - Improved error handling for fitting network configuration. - **Tests** - Added new test suite for dipole hybrid descriptor model validation. - **Refactor** - Enhanced descriptor class functionality with default rotation matrix methods. --------- Signed-off-by: Jinzhe Zeng (cherry picked from commit 65ca05a9117b6d8c79f8bd34c719f33fb15532c7) --- deepmd/tf/descriptor/descriptor.py | 9 +- deepmd/tf/descriptor/hybrid.py | 18 +++ deepmd/tf/model/model.py | 5 + source/tests/tf/test_dipole_hybrid_descrpt.py | 143 ++++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 source/tests/tf/test_dipole_hybrid_descrpt.py diff --git a/deepmd/tf/descriptor/descriptor.py b/deepmd/tf/descriptor/descriptor.py index dd86beb21e..bd1af8c72e 100644 --- a/deepmd/tf/descriptor/descriptor.py +++ b/deepmd/tf/descriptor/descriptor.py @@ -105,7 +105,8 @@ def get_dim_rot_mat_1(self) -> int: int the first dimension of the rotation matrix """ - raise NotImplementedError + # by default, no rotation matrix + return 0 def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]: """Returns neighbor information. 
@@ -534,3 +535,9 @@ def serialize(self, suffix: str = "") -> dict: def input_requirement(self) -> list[DataRequirementItem]: """Return data requirements needed for the model input.""" return [] + + def get_rot_mat(self) -> tf.Tensor: + """Get rotational matrix.""" + nframes = tf.shape(self.dout)[0] + natoms = tf.shape(self.dout)[1] + return tf.zeros([nframes, natoms, 0], dtype=GLOBAL_TF_FLOAT_PRECISION) diff --git a/deepmd/tf/descriptor/hybrid.py b/deepmd/tf/descriptor/hybrid.py index 2ee35d9ebb..57c21f0ee6 100644 --- a/deepmd/tf/descriptor/hybrid.py +++ b/deepmd/tf/descriptor/hybrid.py @@ -492,3 +492,21 @@ def deserialize(cls, data: dict, suffix: str = "") -> "DescrptHybrid": if hasattr(ii, "type_embedding"): raise NotImplementedError("hybrid + type embedding is not supported") return obj + + def get_dim_rot_mat_1(self) -> int: + """Returns the first dimension of the rotation matrix. The rotation is of shape + dim_1 x 3. + + Returns + ------- + int + the first dimension of the rotation matrix + """ + return sum([ii.get_dim_rot_mat_1() for ii in self.descrpt_list]) + + def get_rot_mat(self) -> tf.Tensor: + """Get rotational matrix.""" + all_rot_mat = [] + for ii in self.descrpt_list: + all_rot_mat.append(ii.get_rot_mat()) + return tf.concat(all_rot_mat, axis=2) diff --git a/deepmd/tf/model/model.py b/deepmd/tf/model/model.py index 8991bf1baf..3377ed2d51 100644 --- a/deepmd/tf/model/model.py +++ b/deepmd/tf/model/model.py @@ -668,6 +668,11 @@ def __init__( else: if fitting_net["type"] in ["dipole", "polar"]: fitting_net["embedding_width"] = self.descrpt.get_dim_rot_mat_1() + if fitting_net["embedding_width"] == 0: + raise ValueError( + "This descriptor cannot provide a rotation matrix " + "for a tensorial fitting." 
+ ) self.fitting = Fitting( **fitting_net, descrpt=self.descrpt, diff --git a/source/tests/tf/test_dipole_hybrid_descrpt.py b/source/tests/tf/test_dipole_hybrid_descrpt.py new file mode 100644 index 0000000000..cc500c43ac --- /dev/null +++ b/source/tests/tf/test_dipole_hybrid_descrpt.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import numpy as np + +from deepmd.tf.descriptor.hybrid import ( + DescrptHybrid, +) +from deepmd.tf.env import ( + tf, +) +from deepmd.tf.fit import ( + DipoleFittingSeA, +) +from deepmd.tf.model import ( + DipoleModel, +) + +from .common import ( + DataSystem, + gen_data, + j_loader, +) + +GLOBAL_ENER_FLOAT_PRECISION = tf.float64 +GLOBAL_TF_FLOAT_PRECISION = tf.float64 +GLOBAL_NP_FLOAT_PRECISION = np.float64 + + +class TestModel(tf.test.TestCase): + def setUp(self) -> None: + gen_data() + + def test_model(self) -> None: + jfile = "polar_se_a.json" + jdata = j_loader(jfile) + + systems = jdata["systems"] + set_pfx = "set" + batch_size = 1 + test_size = 1 + rcut = jdata["model"]["descriptor"]["rcut"] + + data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None) + + test_data = data.get_test() + numb_test = 1 + + descrpt = DescrptHybrid( + list=[ + { + "type": "se_e2_a", + "sel": [20, 20], + "rcut_smth": 1.8, + "rcut": 6.0, + "neuron": [2, 4, 8], + "resnet_dt": False, + "axis_neuron": 8, + "precision": "float64", + "type_one_side": True, + "seed": 1, + }, + { + "type": "se_e2_a", + "sel": [20, 20], + "rcut_smth": 1.8, + "rcut": 6.0, + "neuron": [2, 4, 8], + "resnet_dt": False, + "axis_neuron": 8, + "precision": "float64", + "type_one_side": True, + "seed": 1, + }, + { + "type": "se_e3", + "sel": [5, 5], + "rcut_smth": 1.8, + "rcut": 2.0, + "neuron": [2], + "resnet_dt": False, + "precision": "float64", + "seed": 1, + }, + ] + ) + jdata["model"]["fitting_net"].pop("type", None) + jdata["model"]["fitting_net"].pop("fit_diag", None) + jdata["model"]["fitting_net"]["ntypes"] = descrpt.get_ntypes() + 
jdata["model"]["fitting_net"]["dim_descrpt"] = descrpt.get_dim_out() + jdata["model"]["fitting_net"]["embedding_width"] = descrpt.get_dim_rot_mat_1() + fitting = DipoleFittingSeA(**jdata["model"]["fitting_net"], uniform_seed=True) + model = DipoleModel(descrpt, fitting) + + # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = { + "coord": [test_data["coord"]], + "box": [test_data["box"]], + "type": [test_data["type"]], + "natoms_vec": [test_data["natoms_vec"]], + "default_mesh": [test_data["default_mesh"]], + "fparam": [test_data["fparam"]], + } + model._compute_input_stat(input_data) + + t_prop_c = tf.placeholder(tf.float32, [5], name="t_prop_c") + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord") + t_type = tf.placeholder(tf.int32, [None], name="i_type") + t_natoms = tf.placeholder(tf.int32, [model.ntypes + 2], name="i_natoms") + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name="i_box") + t_mesh = tf.placeholder(tf.int32, [None], name="i_mesh") + is_training = tf.placeholder(tf.bool) + t_fparam = None + + model_pred = model.build( + t_coord, + t_type, + t_natoms, + t_box, + t_mesh, + t_fparam, + suffix="dipole_hybrid", + reuse=False, + ) + dipole = model_pred["dipole"] + gdipole = model_pred["global_dipole"] + force = model_pred["force"] + virial = model_pred["virial"] + atom_virial = model_pred["atom_virial"] + + feed_dict_test = { + t_prop_c: test_data["prop_c"], + t_coord: np.reshape(test_data["coord"][:numb_test, :], [-1]), + t_box: test_data["box"][:numb_test, :], + t_type: np.reshape(test_data["type"][:numb_test, :], [-1]), + t_natoms: test_data["natoms_vec"], + t_mesh: test_data["default_mesh"], + is_training: False, + } + + sess = self.cached_session().__enter__() + sess.run(tf.global_variables_initializer()) + [p, gp, f, v, av] = sess.run( + [dipole, gdipole, force, virial, atom_virial], 
feed_dict=feed_dict_test + ) From 4cb16a1fd2cf093b90df64f345a17cc93a3acaf4 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 16 Jan 2025 09:14:42 +0800 Subject: [PATCH 08/24] docs: add `sphinx.configuration` to .readthedocs.yml (#4553) See https://about.readthedocs.com/blog/2024/12/deprecate-config-files-without-sphinx-or-mkdocs-config/ ## Summary by CodeRabbit - **Documentation** - Updated Read the Docs configuration to specify Sphinx documentation configuration file. Signed-off-by: Jinzhe Zeng (cherry picked from commit fdf80497a3f08421aa721e6e52be963656af2031) --- .readthedocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 499411eaa3..7cda715627 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -10,5 +10,7 @@ build: - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH uv pip install -r doc/requirements.txt apt_packages: - inkscape +sphinx: + configuration: doc/conf.py formats: - pdf From 476f34d443645acb1ba8aa21cd8f954a16fde860 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Thu, 16 Jan 2025 19:51:36 +0800 Subject: [PATCH 09/24] Perf: use F.linear for MLP (#4513) It brings <1% speedup. 
## Summary by CodeRabbit - **Refactor** - Simplified linear transformation implementation in the neural network layer - Improved code readability and efficiency in matrix operations (cherry picked from commit 2ba310097253ae3e54b2271dd66774aad4d56575) --- deepmd/pt/model/network/mlp.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/deepmd/pt/model/network/mlp.py b/deepmd/pt/model/network/mlp.py index 31162fe80e..22675d6163 100644 --- a/deepmd/pt/model/network/mlp.py +++ b/deepmd/pt/model/network/mlp.py @@ -8,6 +8,7 @@ import numpy as np import torch import torch.nn as nn +import torch.nn.functional as F from deepmd.pt.utils import ( env, @@ -202,18 +203,14 @@ def forward( ori_prec = xx.dtype if not env.DP_DTYPE_PROMOTION_STRICT: xx = xx.to(self.prec) - yy = ( - torch.matmul(xx, self.matrix) + self.bias - if self.bias is not None - else torch.matmul(xx, self.matrix) - ) - yy = self.activate(yy).clone() + yy = F.linear(xx, self.matrix.t(), self.bias) + yy = self.activate(yy) yy = yy * self.idt if self.idt is not None else yy if self.resnet: if xx.shape[-1] == yy.shape[-1]: - yy += xx + yy = yy + xx elif 2 * xx.shape[-1] == yy.shape[-1]: - yy += torch.concat([xx, xx], dim=-1) + yy = yy + torch.concat([xx, xx], dim=-1) else: yy = yy if not env.DP_DTYPE_PROMOTION_STRICT: From f199a2c75345adc2315ecfda52692e78f0c7a727 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 17 Jan 2025 12:04:54 +0800 Subject: [PATCH 10/24] CI: switch linux_aarch64 to GitHub hosted runners (#4557) See https://github.blog/changelog/2025-01-16-linux-arm64-hosted-runners-now-available-for-free-in-public-repositories-public-preview/ ## Summary by CodeRabbit - **Chores** - Simplified GitHub Actions workflow for wheel building - Streamlined ARM64 build configuration by directly specifying runner - Removed conditional runner determination logic Signed-off-by: Jinzhe Zeng (cherry picked from commit a3242c4ea4ab84de6e7d409fd98eedcdace57f9d) --- 
.github/workflows/build_wheel.yml | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 8788bc929b..8d27e08974 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -14,24 +14,8 @@ concurrency: cancel-in-progress: true jobs: - determine-arm64-runner: - runs-on: ubuntu-latest - permissions: read-all - outputs: - runner: ${{ steps.set-runner.outputs.runner }} - steps: - - name: Determine which runner to use for ARM64 build - id: set-runner - run: | - if [ "${{ github.repository_owner }}" == "deepmodeling" ]; then - echo "runner=[\"Linux\",\"ARM64\"]" >> $GITHUB_OUTPUT - else - echo "runner=\"ubuntu-latest\"" >> $GITHUB_OUTPUT - fi - build_wheels: name: Build wheels for cp${{ matrix.python }}-${{ matrix.platform_id }} - needs: determine-arm64-runner runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -65,7 +49,7 @@ jobs: platform_id: win_amd64 dp_variant: cpu # linux-aarch64 - - os: ${{ fromJson(needs.determine-arm64-runner.outputs.runner) }} + - os: ubuntu-24.04-arm python: 310 platform_id: manylinux_aarch64 dp_variant: cpu From bbbb426b5c0bd9ebf2f7fe2aff6315fb80f88eb1 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 20 Jan 2025 16:24:10 +0800 Subject: [PATCH 11/24] chore: improve neighbor stat log (#4561) ## Summary by CodeRabbit - **Documentation** - Improved logging statements in the `NeighborStat` class for better clarity and detail - Enhanced log messages to include more descriptive information about neighbor distances and sizes --------- Signed-off-by: Jinzhe Zeng Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 1b0d6a174a56a793a48d8abfa71f8b4ca63b30b2) --- deepmd/utils/neighbor_stat.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 
40e629d9db..4c3d24edd8 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -81,8 +81,12 @@ def get_stat(self, data: DeepmdDataSystem) -> tuple[float, np.ndarray]: # do sqrt in the final min_nbor_dist = math.sqrt(min_nbor_dist) - log.info("training data with min nbor dist: " + str(min_nbor_dist)) - log.info("training data with max nbor size: " + str(max_nbor_size)) + log.info( + f"Neighbor statistics: training data with minimal neighbor distance: {min_nbor_dist:f}" + ) + log.info( + f"Neighbor statistics: training data with maximum neighbor size: {max_nbor_size!s} (cutoff radius: {self.rcut:f})" + ) return min_nbor_dist, max_nbor_size @abstractmethod From 4fc1d8946c7dd53b5b29774ea9521aede0efe06e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 21 Jan 2025 08:32:09 +0800 Subject: [PATCH 12/24] fix: fix YAML conversion (#4565) Fix #4564. ## Summary by CodeRabbit - **Bug Fixes** - Improved serialization of NumPy arrays when saving model data to YAML files. - Enhanced metadata handling for array representations during model export. Signed-off-by: Jinzhe Zeng (cherry picked from commit ec19ab3cc8d7852cd65300647967cb35878f31d4) --- deepmd/dpmodel/utils/serialization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/dpmodel/utils/serialization.py b/deepmd/dpmodel/utils/serialization.py index 1c07038a23..5520933753 100644 --- a/deepmd/dpmodel/utils/serialization.py +++ b/deepmd/dpmodel/utils/serialization.py @@ -113,7 +113,9 @@ def save_dp_model(filename: str, model_dict: dict) -> None: "@version": 1, "dtype": x.dtype.name, "value": x.tolist(), - }, + } + if isinstance(x, np.ndarray) + else x, ) with open(filename, "w") as f: yaml.safe_dump( From cfa50648959921b5aa4edec2a56facfe5446ee43 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 24 Jan 2025 18:32:48 +0800 Subject: [PATCH 13/24] fix(cc): remove C++ 17 usage (#4570) Fix #4569. 
## Summary by CodeRabbit - **Build Configuration** - Enhanced flexibility in C++ standard settings based on project dependencies - Added conditional checks for setting C++ standard version - **Code Improvements** - Updated type checking mechanisms in compute methods - Refined method signatures for handling different data types in DeepPotPT and DeepSpinPT classes - **Technical Refinements** - Improved conditional compilation and configuration handling - Introduced more flexible type-specific method overloads --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 11ad7d7455f0215e69a9179b03167ddf9c0b1ea7) --- source/CMakeLists.txt | 4 +++- source/api_cc/src/DeepPotPT.cc | 4 ++-- source/api_cc/src/DeepSpinPT.cc | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ca79860450..1fee62ee8c 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -14,7 +14,9 @@ set(DEEPMD_C_ROOT "" CACHE PATH "Path to imported DeePMD-kit C library") -set(CMAKE_CXX_STANDARD 11) +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) +endif() macro(set_if_higher VARIABLE VALUE) # ${VARIABLE} is a variable name, not a string if(${VARIABLE} LESS "${VALUE}") diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 6910de3ccd..4be03aea18 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -143,7 +143,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, int natoms = atype.size(); auto options = torch::TensorOptions().dtype(torch::kFloat64); torch::ScalarType floatType = torch::kFloat64; - if (std::is_same_v) { + if (std::is_same::value) { options = torch::TensorOptions().dtype(torch::kFloat32); floatType = torch::kFloat32; } @@ -341,7 +341,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, int natoms = atype.size(); auto options = torch::TensorOptions().dtype(torch::kFloat64); 
torch::ScalarType floatType = torch::kFloat64; - if (std::is_same_v) { + if (std::is_same::value) { options = torch::TensorOptions().dtype(torch::kFloat32); floatType = torch::kFloat32; } diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc index aef2d60150..cf9d8010e7 100644 --- a/source/api_cc/src/DeepSpinPT.cc +++ b/source/api_cc/src/DeepSpinPT.cc @@ -145,7 +145,7 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener, int natoms = atype.size(); auto options = torch::TensorOptions().dtype(torch::kFloat64); torch::ScalarType floatType = torch::kFloat64; - if (std::is_same_v) { + if (std::is_same::value) { options = torch::TensorOptions().dtype(torch::kFloat32); floatType = torch::kFloat32; } @@ -365,7 +365,7 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener, int natoms = atype.size(); auto options = torch::TensorOptions().dtype(torch::kFloat64); torch::ScalarType floatType = torch::kFloat64; - if (std::is_same_v) { + if (std::is_same::value) { options = torch::TensorOptions().dtype(torch::kFloat32); floatType = torch::kFloat32; } From a1290ea9adbab05fca826cba442b7eba03113dde Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 3 Feb 2025 00:33:00 +0800 Subject: [PATCH 14/24] chore: bump pytorch to 2.6.0 (#4575) - **Dependency Updates** - Updated PyTorch and libtorch library versions from 2.5.0 to 2.6.0 - Modified download links and version references across development and build configurations - Added import for `torch._dynamo` in test module --------- Signed-off-by: Jinzhe Zeng (cherry picked from commit 1b94e47320ca87b99e82a13645c39b79c0fc3772) --- .devcontainer/download_libtorch.sh | 2 +- .github/workflows/test_cuda.yml | 6 +++--- backend/find_pytorch.py | 2 +- source/tests/pt/__init__.py | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.devcontainer/download_libtorch.sh b/.devcontainer/download_libtorch.sh index d78b559997..ef68a2e615 100755 --- a/.devcontainer/download_libtorch.sh +++ b/.devcontainer/download_libtorch.sh @@ 
-4,5 +4,5 @@ set -ev SCRIPT_PATH=$(dirname $(realpath -s $0)) cd ${SCRIPT_PATH}/.. -wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcpu.zip -O ~/libtorch.zip +wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcpu.zip -O ~/libtorch.zip unzip ~/libtorch.zip diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index 4dbdc5acb9..e8b90be136 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -47,11 +47,11 @@ jobs: && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3 if: false # skip as we use nvidia image - run: python -m pip install -U uv - - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.5.0" "jax[cuda12]" + - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.6.0" "jax[cuda12]" - run: | export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') - source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py + source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit env: DP_VARIANT: cuda DP_ENABLE_NATIVE_OPTIMIZATION: 1 @@ -67,7 +67,7 @@ jobs: run: source/tests/infer/convert-models.sh - name: Download libtorch run: | - wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcu124.zip -O libtorch.zip + wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcu124.zip -O libtorch.zip unzip libtorch.zip - run: | export CMAKE_PREFIX_PATH=$GITHUB_WORKSPACE/libtorch diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index 125fd6a389..a309e3b9e9 100644 --- a/backend/find_pytorch.py +++ 
b/backend/find_pytorch.py @@ -116,7 +116,7 @@ def get_pt_requirement(pt_version: str = "") -> dict: cuda_version = os.environ.get("CUDA_VERSION", "12.2") if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): # CUDA 12.2, cudnn 9 - pt_version = "2.5.0" + pt_version = "2.6.0" elif cuda_version in SpecifierSet(">=11,<12"): # CUDA 11.8, cudnn 8 pt_version = "2.3.1" diff --git a/source/tests/pt/__init__.py b/source/tests/pt/__init__.py index 1a6de0591a..045a752b96 100644 --- a/source/tests/pt/__init__.py +++ b/source/tests/pt/__init__.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import torch +import torch._dynamo torch.set_num_threads(1) torch.set_num_interop_threads(1) From 0bc73e6cedfe99af13f932573f365dcfde45a695 Mon Sep 17 00:00:00 2001 From: Rocco Meli Date: Tue, 4 Feb 2025 15:13:49 +0100 Subject: [PATCH 15/24] Fix version in DeePMDConfigVersion.cmake (#4577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The double quotes end up as follows in `DeePMDConfigVersion.cmake`: ```cmake set(PACKAGE_VERSION "$") ``` This results in the following warning: ``` CMake Error (dev) at /opt/cp2k-toolchain/install/deepmd-kit-3.0.1/lib/cmake/DeePMD/DeePMDConfigVersion.cmake:10: Syntax Warning in cmake code at column 30 Argument not separated from preceding token by whitespace. Call Stack (most recent call first): CMakeLists.txt:703 (find_package) This error is for project developers. Use -Wno-error=dev to suppress it. ``` ## Summary by CodeRabbit - **Bug Fixes** - Improved the project’s version handling to ensure that when no version is provided, a reliable default is used, resulting in more consistent and accurate version information for end-users. 
(cherry picked from commit 8104841a3f6362c4213d61677e52d09a9c90b4e1) --- source/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 1fee62ee8c..eb795cd47a 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -448,7 +448,7 @@ if(BUILD_CPP_IF set(version_file "${generated_dir}/${CMAKE_PROJECT_NAME}ConfigVersion.cmake") write_basic_package_version_file( ${version_file} - VERSION $ + VERSION $ COMPATIBILITY AnyNewerVersion) install( EXPORT ${targets_export_name} From 44e40bf30ab0ce5ec57e0af6c508804e436de5fd Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 5 Feb 2025 10:24:35 +0800 Subject: [PATCH 16/24] fix(pt): detach computed descriptor tensor to prevent OOM (#4547) Fix #4544. ## Summary by CodeRabbit - **Optimization** - Improved descriptor handling in the atomic model to enhance computational efficiency and memory management during backpropagation. Signed-off-by: Jinzhe Zeng (cherry picked from commit 228062ce59a52a57811737916cdf2470cbe91199) --- deepmd/pt/model/atomic_model/dp_atomic_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py index 0f34f155ac..63a18bbd4e 100644 --- a/deepmd/pt/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py @@ -237,7 +237,7 @@ def forward_atomic( ) assert descriptor is not None if self.enable_eval_descriptor_hook: - self.eval_descriptor_list.append(descriptor) + self.eval_descriptor_list.append(descriptor.detach()) # energy, force fit_ret = self.fitting_net( descriptor, From fca0e6efc7925ac1618b42166df6f71ed3e67762 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 7 Feb 2025 10:20:14 +0800 Subject: [PATCH 17/24] fix(pt): throw errors for GPU tensors and the CPU OP library (#4582) In another OP which is not in this PR, @iProzd has triggered a situation where he used the 
CUDA-compiled PyTorch along with the CPU version of customized OP libraries. Then the OP gave incorrect results since it didn't do any computation. ## Summary by CodeRabbit - **Bug Fixes** - Improved error messaging for scenarios where GPU operations are attempted without proper support, ensuring you receive clear notifications while CPU functionality remains unaffected. Signed-off-by: Jinzhe Zeng (cherry picked from commit f01fa531da598654e1dc003a45704272a88554a8) --- source/op/pt/tabulate_multi_device.cc | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/source/op/pt/tabulate_multi_device.cc b/source/op/pt/tabulate_multi_device.cc index 5c710f5c37..feae37af81 100644 --- a/source/op/pt/tabulate_multi_device.cc +++ b/source/op/pt/tabulate_multi_device.cc @@ -61,6 +61,10 @@ void TabulateFusionSeAForward(const torch::Tensor& table_tensor, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, @@ -110,6 +114,10 @@ void TabulateFusionSeAGradForward(const torch::Tensor& table_tensor, deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, dy_dtwo, table, table_info, em_x, em, two_embed, dy, nloc, nnei, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, dy_dtwo, table, @@ -166,6 +174,10 @@ void TabulateFusionSeAGradGradForward(const torch::Tensor& table_tensor, deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, 
table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM TORCH_CHECK(last_layer_size <= 1024, "In the process of model compression, the size of the " @@ -212,6 +224,10 @@ void TabulateFusionSeTForward(const torch::Tensor& table_tensor, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em, @@ -254,6 +270,10 @@ void TabulateFusionSeTGradForward(const torch::Tensor& table_tensor, deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info, @@ -300,6 +320,10 @@ void TabulateFusionSeTGradGradForward(const torch::Tensor& table_tensor, deepmd::tabulate_fusion_se_t_grad_grad_gpu(dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM TORCH_CHECK(last_layer_size <= 1024, "In the process of model compression, the size of the " @@ -340,6 +364,10 @@ void 
TabulateFusionSeRForward(const torch::Tensor& table_tensor, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, nnei, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc, @@ -376,6 +404,10 @@ void TabulateFusionSeRGradForward(const torch::Tensor& table_tensor, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy, @@ -412,6 +444,10 @@ void TabulateFusionSeRGradGradForward(const torch::Tensor& table_tensor, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_grad_gpu( dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); +#else + throw std::runtime_error( + "The input tensor is on the GPU, but the GPU support for the " + "customized OP library is not enabled."); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM TORCH_CHECK(last_layer_size <= 1024, "In the process of model compression, the size of the " From db9071e8830b798b36831685b17b7b4cdf42f359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?= Date: Sat, 8 Feb 2025 01:22:56 -0500 Subject: [PATCH 18/24] use variable to store the bias of atomic polarizability (#4581) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes #4559. 
It uses a variable to store the bias of atomic polarizability, which is the mean of the diagonal terms of the polarizability tensor. ## Summary by CodeRabbit - New Features - Enhanced core processing reliability with improved error management and resource consistency, leading to more stable model performance during inference. - Tests - Refined internal validation checks to better support advanced configurations and ensure consistent outcomes. These improvements contribute to a more robust and reliable experience for end-users. --------- Signed-off-by: Yifan Li李一帆 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> (cherry picked from commit 3ad94ff410ab1ceb5583e7206ac9400effb4714c) --- deepmd/tf/fit/polar.py | 42 +++++++++++++++++++++++++++++-- source/tests/consistent/common.py | 3 +++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py index 2f1400e697..4a240656d4 100644 --- a/deepmd/tf/fit/polar.py +++ b/deepmd/tf/fit/polar.py @@ -27,8 +27,12 @@ from deepmd.tf.loss.tensor import ( TensorLoss, ) +from deepmd.tf.utils.errors import ( + GraphWithoutTensorError, +) from deepmd.tf.utils.graph import ( get_fitting_net_variables_from_graph_def, + get_tensor_by_name_from_graph, ) from deepmd.tf.utils.network import ( one_layer, @@ -423,6 +427,16 @@ def build( atype = input_dict.get("atype", None) nframes = input_dict.get("nframes") start_index = 0 + + with tf.variable_scope("fitting_attr" + suffix, reuse=reuse): + self.t_constant_matrix = tf.get_variable( + "t_constant_matrix", + self.constant_matrix.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.constant_matrix), + ) + inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]) rot_mat = tf.reshape(rot_mat, [-1, self.dim_rot_mat * natoms[0]]) if nframes is None: @@ -446,7 +460,9 
@@ def build( # nframes x nloc_masked constant_matrix = tf.reshape( tf.reshape( - tf.tile(tf.repeat(self.constant_matrix, natoms[2:]), [nframes]), + tf.tile( + tf.repeat(self.t_constant_matrix, natoms[2:]), [nframes] + ), [nframes, -1], )[nloc_mask], [nframes, -1], @@ -498,7 +514,9 @@ def build( # shift and scale sel_type_idx = self.sel_type.index(type_i) final_layer = final_layer * self.scale[sel_type_idx] - final_layer = final_layer + self.constant_matrix[sel_type_idx] * tf.eye( + final_layer = final_layer + tf.slice( + self.t_constant_matrix, [sel_type_idx], [1] + ) * tf.eye( 3, batch_shape=[tf.shape(inputs)[0], natoms[2 + type_i]], dtype=GLOBAL_TF_FLOAT_PRECISION, @@ -545,6 +563,16 @@ def init_variables( self.fitting_net_variables = get_fitting_net_variables_from_graph_def( graph_def, suffix=suffix ) + if self.shift_diag: + try: + self.constant_matrix = get_tensor_by_name_from_graph( + graph, f"fitting_attr{suffix}/t_constant_matrix" + ) + except GraphWithoutTensorError: + warnings.warn( + "You are trying to read a model trained with shift_diag=True, but the mean of the diagonal terms of the polarizability is not stored in the graph. This will lead to wrong inference results. You may train your model with the latest DeePMD-kit to avoid this issue.", + stacklevel=2, + ) def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None: """Receive the mixed precision setting. 
@@ -605,6 +633,15 @@ def serialize(self, suffix: str) -> dict: variables=self.fitting_net_variables, suffix=suffix, ), + "@variables": { + "fparam_avg": None, + "fparam_inv_std": None, + "aparam_avg": None, + "aparam_inv_std": None, + "case_embd": None, + "scale": self.scale.reshape(-1, 1), + "constant_matrix": self.constant_matrix.reshape(-1), + }, "type_map": self.type_map, } return data @@ -632,6 +669,7 @@ def deserialize(cls, data: dict, suffix: str): data["nets"], suffix=suffix, ) + fitting.constant_matrix = data["@variables"]["constant_matrix"].ravel() return fitting diff --git a/source/tests/consistent/common.py b/source/tests/consistent/common.py index 358ac8d542..88f12bb6a8 100644 --- a/source/tests/consistent/common.py +++ b/source/tests/consistent/common.py @@ -326,6 +326,9 @@ def test_tf_consistent_with_ref(self) -> None: data1.pop("@version") data2.pop("@version") + if tf_obj.__class__.__name__.startswith("Polar"): + data1["@variables"].pop("bias_atom_e") + np.testing.assert_equal(data1, data2) for rr1, rr2 in zip(ret1, ret2): np.testing.assert_allclose( From 910b8a7548f347dccb56becc016023a3ea5e8778 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:23:20 +0800 Subject: [PATCH 19/24] Fix: pt tensor loss label name (#4587) To address polar nan loss mentioned in #4586 ## Summary by CodeRabbit - **Refactor** - Adjusted the internal processing order in computation routines to enhance consistency while maintaining the same overall user experience. - Updated model prediction handling to ensure compatibility in shape during statistical computations, reducing potential runtime errors. 
--------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 65facbbce03b11873f74b1ea257d7cdf901fac22) --- deepmd/pt/train/training.py | 10 ++++------ deepmd/pt/utils/stat.py | 4 +++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 0feb7fbbd2..38a41aa726 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -1230,13 +1230,11 @@ def get_loss(loss_params, start_lr, _ntypes, _model): if "mask" in model_output_type: model_output_type.pop(model_output_type.index("mask")) tensor_name = model_output_type[0] - loss_params["tensor_name"] = tensor_name loss_params["tensor_size"] = _model.model_output_def()[tensor_name].output_size - label_name = tensor_name - if label_name == "polarizability": - label_name = "polar" - loss_params["label_name"] = label_name - loss_params["tensor_name"] = label_name + loss_params["label_name"] = tensor_name + if tensor_name == "polarizability": + tensor_name = "polar" + loss_params["tensor_name"] = tensor_name return TensorLoss(**loss_params) elif loss_type == "property": task_dim = _model.get_task_dim() diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py index 1c5e3f1c52..5b4fa77e5f 100644 --- a/deepmd/pt/utils/stat.py +++ b/deepmd/pt/utils/stat.py @@ -469,7 +469,9 @@ def compute_output_stats_global( # subtract the model bias and output the delta bias stats_input = { - kk: merged_output[kk] - model_pred[kk] for kk in keys if kk in merged_output + kk: merged_output[kk] - model_pred[kk].reshape(merged_output[kk].shape) + for kk in keys + if kk in merged_output } bias_atom_e = {} From be468b11ff84ebc1a7361ccba3d0cc3508e7e10a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?= Date: Mon, 10 Feb 2025 08:28:44 -0500 Subject: [PATCH 20/24] Fix UT `test_tf_consistent_with_ref` (#216) * pop variable case_embd to pass the UT for polarizability * 
remove case_embd from tf PolarFittingSeA --- deepmd/tf/fit/polar.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py index 4a240656d4..3bb0fa841e 100644 --- a/deepmd/tf/fit/polar.py +++ b/deepmd/tf/fit/polar.py @@ -638,7 +638,6 @@ def serialize(self, suffix: str) -> dict: "fparam_inv_std": None, "aparam_avg": None, "aparam_inv_std": None, - "case_embd": None, "scale": self.scale.reshape(-1, 1), "constant_matrix": self.constant_matrix.reshape(-1), }, From c2e6b6ad4de720fc02478e8da6777e2c87f8ebef Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Feb 2025 16:51:02 +0800 Subject: [PATCH 21/24] CI: pin jax to 0.5.0 (#4613) ## Summary by CodeRabbit - **Chores** - Updated dependency installation in our CI workflows by enforcing explicit version constraints for key libraries to ensure a consistent and stable testing environment. --------- Signed-off-by: Jinzhe Zeng Signed-off-by: Jinzhe Zeng Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> (cherry picked from commit 47bbd650c602a406e5290a72a512c99e81c36981) --- .github/workflows/test_cc.yml | 2 +- .github/workflows/test_cuda.yml | 2 +- .github/workflows/test_python.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml index f7f3a4f431..da69f01225 100644 --- a/.github/workflows/test_cc.yml +++ b/.github/workflows/test_cc.yml @@ -29,7 +29,7 @@ jobs: - run: python -m pip install uv - name: Install Python dependencies run: | - source/install/uv_with_retry.sh pip install --system tensorflow-cpu + source/install/uv_with_retry.sh pip install --system tensorflow-cpu~=2.18.0 jax==0.5.0 export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py - name: Convert models diff --git a/.github/workflows/test_cuda.yml 
b/.github/workflows/test_cuda.yml index e8b90be136..9870ff6183 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -47,7 +47,7 @@ jobs: && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3 if: false # skip as we use nvidia image - run: python -m pip install -U uv - - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.6.0" "jax[cuda12]" + - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.6.0" "jax[cuda12]==0.5.0" - run: | export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index e30a19c8b1..f2169b2633 100644 --- a/.github/workflows/test_python.yml +++ b/.github/workflows/test_python.yml @@ -25,11 +25,11 @@ jobs: python-version: ${{ matrix.python }} - run: python -m pip install -U uv - run: | - source/install/uv_with_retry.sh pip install --system openmpi tensorflow-cpu + source/install/uv_with_retry.sh pip install --system openmpi tensorflow-cpu~=2.18.0 source/install/uv_with_retry.sh pip install --system torch -i https://download.pytorch.org/whl/cpu export TENSORFLOW_ROOT=$(python -c 'import tensorflow;print(tensorflow.__path__[0])') export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') - source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py + source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py "jax==0.5.0;python_version>='3.10'" source/install/uv_with_retry.sh pip install --system horovod --no-build-isolation env: # Please note that uv has some issues with finding From 2580af87f99c508e529d1631a7a7ee026fe1c894 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 27 Feb 2025 17:50:52 +0800 Subject: [PATCH 22/24] docs: add v3 paper 
citations (#4619) ## Summary by CodeRabbit - **Documentation** - Added a new citation for the "DeePMD-kit v3" article in various documentation files. - Clarified references to distinguish between the "v2" and "v3" publications. - Enhanced bibliography and credits details for improved clarity. - **Chores** - Revised citation formatting across the package to reflect formal publication standards. --------- Signed-off-by: Jinzhe Zeng Signed-off-by: Jinzhe Zeng Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> (cherry picked from commit 918d4deb78edfeb95ddcd74b1fb50c213be55689) --- CITATIONS.bib | 25 +++++++++++++++++++++++++ README.md | 5 ++++- deepmd/utils/summary.py | 1 + doc/credits.rst | 1 + source/lmp/pair_deepmd.cpp | 34 +++++++++++++++++++++++++++++++++- source/lmp/pair_deepspin.cpp | 34 +++++++++++++++++++++++++++++++++- 6 files changed, 97 insertions(+), 3 deletions(-) diff --git a/CITATIONS.bib b/CITATIONS.bib index 52c8045bf3..61f9f34ff5 100644 --- a/CITATIONS.bib +++ b/CITATIONS.bib @@ -40,6 +40,31 @@ @article{Zeng_JChemPhys_2023_v159_p054801 doi = {10.1063/5.0155600}, } +@article{Zeng_arXiv_2025_p2502.19161, + annote = {general purpose}, + author = { + Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu Zhang and Sensen He + and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan Li and Chun Cai and + Chengqian Zhang and Yiming Du and Jia-Xin Zhu and Pinghui Mo and Zhengtao + Huang and Qiyu Zeng and Shaochen Shi and Xuejian Qin and Zhaoxi Yu and + Chenxing Luo and Ye Ding and Yun-Pei Liu and Ruosong Shi and Zhenyu Wang + and Sigbj{\o}rn L{\o}land Bore and Junhan Chang and Zhe Deng and Zhaohan + Ding and Siyuan Han and Wanrun Jiang and Guolin Ke and Zhaoqing Liu and + Denghui Lu and Koki Muraoka and Hananeh Oliaei and Anurag Kumar Singh and + Haohui Que and Weihong Xu and Zhangmancang Xu and Yong-Bin Zhuang and Jiayu + Dai and Timothy J. Giese and Weile Jia and Ben Xu and Darrin M. 
York and + Linfeng Zhang and Han Wang + }, + title = { + {DeePMD-kit v3: A Multiple-Backend Framework for Machine Learning + Potentials} + }, + journal = {arXiv}, + year = 2025, + pages = {2502.19161}, + doi = {10.48550/arXiv.2502.19161}, +} + @article{Lu_CompPhysCommun_2021_v259_p107624, annote = {GPU support}, title = { diff --git a/README.md b/README.md index 18bdfd6560..15e88b218f 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ If you use this code in any future publications, please cite the following publi - Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang. "DeePMD-kit v2: A software package for deep potential models." J. Chem. Phys. 159 (2023): 054801. 
[![doi:10.1063/5.0155600](https://img.shields.io/badge/DOI-10.1063%2F5.0155600-blue)](https://doi.org/10.1063/5.0155600) [![Citations](https://citations.njzjz.win/10.1063/5.0155600)](https://badge.dimensions.ai/details/doi/10.1063/5.0155600) +- Jinzhe Zeng, Duo Zhang, Anyang Peng, Xiangyu Zhang, Sensen He, Yan Wang, Xinzijian Liu, Hangrui Bi, Yifan Li, Chun Cai, Chengqian Zhang, Yiming Du, Jia-Xin Zhu, Pinghui Mo, Zhengtao Huang, Qiyu Zeng, Shaochen Shi, Xuejian Qin, Zhaoxi Yu, Chenxing Luo, Ye Ding, Yun-Pei Liu, Ruosong Shi, Zhenyu Wang, Sigbjørn Løland Bore, Junhan Chang, Zhe Deng, Zhaohan Ding, Siyuan Han, Wanrun Jiang, Guolin Ke, Zhaoqing Liu, Denghui Lu, Koki Muraoka, Hananeh Oliaei, Anurag Kumar Singh, Haohui Que, Weihong Xu, Zhangmancang Xu, Yong-Bin Zhuang, Jiayu Dai, Timothy J. Giese, Weile Jia, Ben Xu, Darrin M. York, Linfeng Zhang, Han Wang. "DeePMD-kit v3: A Multiple-Backend Framework for Machine Learning Potentials." [arXiv:2502.19161](https://arxiv.org/abs/2502.19161). In addition, please follow [the bib file](CITATIONS.bib) to cite the methods you used. @@ -68,7 +69,7 @@ In addition to building up potential energy models, DeePMD-kit can also be used - Non-von-Neumann. - C API to interface with the third-party packages. -See [our latest paper](https://doi.org/10.1063/5.0155600) for details of all features until v2.2.3. +See [our v2 paper](https://doi.org/10.1063/5.0155600) for details of all features until v2.2.3. #### v3 @@ -76,6 +77,8 @@ See [our latest paper](https://doi.org/10.1063/5.0155600) for details of all fea - The DPA-2 model. - Plugin mechanisms for external models. +See [our v3 paper](https://doi.org/10.48550/arXiv.2502.19161) for details of all features until v3.0. + ## Install and use DeePMD-kit Please read the [online documentation](https://deepmd.readthedocs.io/) for how to install and use DeePMD-kit. 
diff --git a/deepmd/utils/summary.py b/deepmd/utils/summary.py index a35dd4db93..f093a5f7bc 100644 --- a/deepmd/utils/summary.py +++ b/deepmd/utils/summary.py @@ -42,6 +42,7 @@ class SummaryPrinter(ABC): "Please read and cite:", "Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)", "Zeng et al, J. Chem. Phys., 159, 054801 (2023)", + "Zeng et al, arxiv:2502.19161", "See https://deepmd.rtfd.io/credits/ for details.", ) diff --git a/doc/credits.rst b/doc/credits.rst index 059746ee0b..2dacc848c9 100644 --- a/doc/credits.rst +++ b/doc/credits.rst @@ -13,6 +13,7 @@ Cite DeePMD-kit and methods Wang_ComputPhysCommun_2018_v228_p178 Zeng_JChemPhys_2023_v159_p054801 + Zeng_arXiv_2025_p2502.19161 - If GPU version is used, diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp index 8127979cd1..d4887573cc 100644 --- a/source/lmp/pair_deepmd.cpp +++ b/source/lmp/pair_deepmd.cpp @@ -50,7 +50,7 @@ static const char cite_user_deepmd_package[] = "energy representation and molecular dynamics}},\n" " pages = {178--184}\n" "}\n" - "@misc{Zeng_JChemPhys_2023_v159_p054801,\n" + "@article{Zeng_JChemPhys_2023_v159_p054801,\n" " title = {{DeePMD-kit v2: A software package for deep potential " "models}},\n" " author = {Jinzhe Zeng and Duo Zhang and Denghui Lu and Pinghui Mo and " @@ -82,6 +82,38 @@ static const char cite_user_deepmd_package[] = " year = 2023,\n" " pages = 054801,\n" " doi = {10.1063/5.0155600},\n" + "}\n" + "@Article{Zeng_arXiv_2025_p2502.19161,\n" + " annote = {general purpose},\n" + " author = {Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu " + "Zhang and Sensen\n" + " He and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan " + "Li and Chun\n" + " Cai and Chengqian Zhang and Yiming Du and Jia-Xin Zhu and " + "Pinghui Mo\n" + " and Zhengtao Huang and Qiyu Zeng and Shaochen Shi and " + "Xuejian Qin and\n" + " Zhaoxi Yu and Chenxing Luo and Ye Ding and Yun-Pei Liu and " + "Ruosong Shi\n" + " and Zhenyu Wang and Sigbj{\\o}rn L{\\o}land Bore 
and Junhan " + "Chang and\n" + " Zhe Deng and Zhaohan Ding and Siyuan Han and Wanrun Jiang " + "and Guolin\n" + " Ke and Zhaoqing Liu and Denghui Lu and Koki Muraoka and " + "Hananeh Oliaei\n" + " and Anurag Kumar Singh and Haohui Que and Weihong Xu and " + "Zhangmancang\n" + " Xu and Yong-Bin Zhuang and Jiayu Dai and Timothy J. Giese " + "and Weile\n" + " Jia and Ben Xu and Darrin M. York and Linfeng Zhang and Han " + "Wang},\n" + " title = {{DeePMD-kit v3: A Multiple-Backend Framework for Machine " + "Learning\n" + " Potentials}},\n" + " journal = {arXiv},\n" + " year = 2025,\n" + " pages = {2502.19161},\n" + " doi = {10.48550/arXiv.2502.19161},\n" "}\n\n"; PairDeepMD::PairDeepMD(LAMMPS *lmp) diff --git a/source/lmp/pair_deepspin.cpp b/source/lmp/pair_deepspin.cpp index 5e7d4474b9..105e98fa70 100644 --- a/source/lmp/pair_deepspin.cpp +++ b/source/lmp/pair_deepspin.cpp @@ -50,7 +50,7 @@ static const char cite_user_deepmd_package[] = "energy representation and molecular dynamics}},\n" " pages = {178--184}\n" "}\n" - "@misc{Zeng_JChemPhys_2023_v159_p054801,\n" + "@article{Zeng_JChemPhys_2023_v159_p054801,\n" " title = {{DeePMD-kit v2: A software package for deep potential " "models}},\n" " author = {Jinzhe Zeng and Duo Zhang and Denghui Lu and Pinghui Mo and " @@ -82,6 +82,38 @@ static const char cite_user_deepmd_package[] = " year = 2023,\n" " pages = 054801,\n" " doi = {10.1063/5.0155600},\n" + "}\n" + "@Article{Zeng_arXiv_2025_p2502.19161,\n" + " annote = {general purpose},\n" + " author = {Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu " + "Zhang and Sensen\n" + " He and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan " + "Li and Chun\n" + " Cai and Chengqian Zhang and Yiming Du and Jia-Xin Zhu and " + "Pinghui Mo\n" + " and Zhengtao Huang and Qiyu Zeng and Shaochen Shi and " + "Xuejian Qin and\n" + " Zhaoxi Yu and Chenxing Luo and Ye Ding and Yun-Pei Liu and " + "Ruosong Shi\n" + " and Zhenyu Wang and Sigbj{\\o}rn L{\\o}land Bore and Junhan " + 
"Chang and\n" + " Zhe Deng and Zhaohan Ding and Siyuan Han and Wanrun Jiang " + "and Guolin\n" + " Ke and Zhaoqing Liu and Denghui Lu and Koki Muraoka and " + "Hananeh Oliaei\n" + " and Anurag Kumar Singh and Haohui Que and Weihong Xu and " + "Zhangmancang\n" + " Xu and Yong-Bin Zhuang and Jiayu Dai and Timothy J. Giese " + "and Weile\n" + " Jia and Ben Xu and Darrin M. York and Linfeng Zhang and Han " + "Wang},\n" + " title = {{DeePMD-kit v3: A Multiple-Backend Framework for Machine " + "Learning\n" + " Potentials}},\n" + " journal = {arXiv},\n" + " year = 2025,\n" + " pages = {2502.19161},\n" + " doi = {10.48550/arXiv.2502.19161},\n" "}\n\n"; PairDeepSpin::PairDeepSpin(LAMMPS *lmp) From 6d6c3fda571eb1d89deab664ca4eb1e40c37dbda Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 1 Mar 2025 15:30:39 +0800 Subject: [PATCH 23/24] fix(array-api): fix xp.where errors (#4624) `xp.where` always requires a bool array as its first input, but previously, the array-api-strict package didn't require it. ## Summary by CodeRabbit - **Refactor** - Enhanced the internal filtering logic by standardizing type handling for exclusion conditions, ensuring more reliable and consistent operations across the system. 
--------- Signed-off-by: Jinzhe Zeng (cherry picked from commit 05ba1bf7a9379987c792f8990a34dd7b2a9a6ab1) --- deepmd/dpmodel/descriptor/dpa1.py | 1 + deepmd/dpmodel/descriptor/repformers.py | 1 + deepmd/dpmodel/descriptor/se_t_tebd.py | 1 + deepmd/dpmodel/fitting/general_fitting.py | 1 + 4 files changed, 4 insertions(+) diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 20a758b170..fffeaae232 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -899,6 +899,7 @@ def call( exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) # nfnl x nnei exclude_mask = xp.reshape(exclude_mask, (nf * nloc, nnei)) + exclude_mask = xp.astype(exclude_mask, xp.bool) # nfnl x nnei nlist = xp.reshape(nlist, (nf * nloc, nnei)) nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1)) diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index ae6b5de511..e15a20926f 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -393,6 +393,7 @@ def call( ): xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) + exclude_mask = xp.astype(exclude_mask, xp.bool) nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1)) # nf x nloc x nnei x 4 dmatrix, diff, sw = self.env_mat.call( diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py index c350e3eb47..ec54fd08aa 100644 --- a/deepmd/dpmodel/descriptor/se_t_tebd.py +++ b/deepmd/dpmodel/descriptor/se_t_tebd.py @@ -682,6 +682,7 @@ def call( exclude_mask = xp.reshape(exclude_mask, (nf * nloc, nnei)) # nfnl x nnei nlist = xp.reshape(nlist, (nf * nloc, nnei)) + exclude_mask = xp.astype(exclude_mask, xp.bool) nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1)) # nfnl x nnei nlist_mask = nlist != -1 diff --git 
a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py index 2958a7d18d..9d51e35fd0 100644 --- a/deepmd/dpmodel/fitting/general_fitting.py +++ b/deepmd/dpmodel/fitting/general_fitting.py @@ -455,6 +455,7 @@ def _call_common( ) # nf x nloc exclude_mask = self.emask.build_type_exclude_mask(atype) + exclude_mask = xp.astype(exclude_mask, xp.bool) # nf x nloc x nod outs = xp.where(exclude_mask[:, :, None], outs, xp.zeros_like(outs)) return {self.var_name: outs} From b59bc33799cd8ef0b3e3d7d12229cc95c24de9de Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Sat, 1 Mar 2025 15:33:50 +0800 Subject: [PATCH 24/24] docs: add PyTorch Profiler support details to TensorBoard documentation (#4615) This PR adds instructions on how to profile with the PyTorch backend. ## Summary by CodeRabbit - **Documentation** - Added a new section detailing the integration of PyTorch Profiler with TensorBoard. - Provided clear instructions on package installation, configuration adjustments, and how to visualize profiling data. - Enhanced the readability of commands and the overall formatting of the training documentation. --------- Co-authored-by: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com> (cherry picked from commit 80d445b209b9c87e47684960f95be085703819cd) --- doc/train/tensorboard.md | 59 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md index 32ecdd0ab2..b2635479ce 100644 --- a/doc/train/tensorboard.md +++ b/doc/train/tensorboard.md @@ -26,42 +26,51 @@ Before running TensorBoard, make sure you have generated summary data in a log directory by modifying the input script, setting {ref}`tensorboard ` to true in the training subsection will enable the TensorBoard data analysis. eg. **water_se_a.json**. 
```json - "training" : { - "systems": ["../data/"], - "stop_batch": 1000000, - "batch_size": 1, - - "seed": 1, - - "_comment": " display and restart", - "_comment": " frequencies counted in batch", - "disp_file": "lcurve.out", - "disp_freq": 100, - "numb_test": 10, - "save_freq": 1000, - "save_ckpt": "model.ckpt", - - "disp_training":true, - "time_training":true, - "tensorboard": true, - "tensorboard_log_dir":"log", - "tensorboard_freq": 1000, - "profiling": false, - "profiling_file":"timeline.json", - "_comment": "that's all" - } +"training": { + "systems": ["../data/"], + "stop_batch": 1000000, + "batch_size": 1, + + "seed": 1, + "_comment": " display and restart", + "_comment": " frequencies counted in batch", + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 10, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + + "disp_training": true, + "time_training": true, + "tensorboard": true, + "tensorboard_log_dir": "log", + "tensorboard_freq": 1000, + "profiling": false, + "profiling_file": "timeline.json", + "_comment": "that's all" +} ``` Once you have event files, run TensorBoard and provide the log directory. This should print that TensorBoard has started. Next, connect to http://tensorboard_server_ip:6006. -TensorBoard requires a logdir to read logs from. For info on configuring TensorBoard, run TensorBoard --help. +TensorBoard requires a logdir to read logs from. For info on configuring TensorBoard, run `tensorboard --help`. One can easily change the log name with "tensorboard_log_dir" and the sampling frequency with "tensorboard_freq". ```bash tensorboard --logdir path/to/logs ``` +## PyTorch Profiler With TensorBoard {{ pytorch_icon }} + +DeePMD-kit has built-in support for [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-profiler-to-record-execution-events).
+The profiler requires extra packages for recording and visualization: +`pip install tensorboard torch-tb-profiler` +Set `"enable_profiler": true` in the training section of the input script, and launch a training task with 10 steps, since the default setting of the profiler scheduler is `wait=1, warmup=1, active=3, repeat=1`. +The profiler will generate recording files in `tensorboard_log_dir`. + +To [visualize the profiling data](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-tensorboard-to-view-results-and-analyze-model-performance), launch TensorBoard (see above) and navigate to the "pytorch_profiler" tab. + ## Examples ### Tracking and visualizing loss metrics(red:train, blue:test)