Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 68 additions & 9 deletions deepmd/dpmodel/descriptor/dpa1.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
NetworkCollection,
PairExcludeMask,
)
from deepmd.dpmodel.utils.env_mat_stat import (
EnvMatStatSe,
)
from deepmd.dpmodel.utils.network import (
LayerNorm,
NativeLayer,
Expand All @@ -48,6 +51,9 @@
from deepmd.utils.data_system import (
DeepmdDataSystem,
)
from deepmd.utils.env_mat_stat import (
StatItem,
)
from deepmd.utils.finetune import (
get_index_between_two_maps,
map_pair_exclude_types,
Expand Down Expand Up @@ -408,10 +414,27 @@ def dim_emb(self):
return self.get_dim_emb()

def compute_input_stats(
self, merged: list[dict], path: Optional[DPPath] = None
) -> NoReturn:
"""Update mean and stddev for descriptor elements."""
raise NotImplementedError
self,
merged: Union[Callable[[], list[dict]], list[dict]],
path: Optional[DPPath] = None,
):
"""
Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.

Parameters
----------
merged : Union[Callable[[], list[dict]], list[dict]]
- list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary mapping `keys` to arrays
originating from the `i`-th data system.
- Callable[[], list[dict]]: A lazy function that returns data samples in the above format
only when needed. Since the sampling process can be slow and memory-intensive,
the lazy function helps by only sampling once.
path : Optional[DPPath]
The path to the stat file.

"""
return self.se_atten.compute_input_stats(merged, path)

def set_stat_mean_and_stddev(
self,
Expand Down Expand Up @@ -842,13 +865,49 @@ def compute_input_stats(
self,
merged: Union[Callable[[], list[dict]], list[dict]],
path: Optional[DPPath] = None,
) -> NoReturn:
"""Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data."""
raise NotImplementedError
) -> None:
"""
Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.

Parameters
----------
merged : Union[Callable[[], list[dict]], list[dict]]
- list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary mapping `keys` to arrays
originating from the `i`-th data system.
- Callable[[], list[dict]]: A lazy function that returns data samples in the above format
only when needed. Since the sampling process can be slow and memory-intensive,
the lazy function helps by only sampling once.
path : Optional[DPPath]
The path to the stat file.

def get_stats(self) -> NoReturn:
"""
env_mat_stat = EnvMatStatSe(self)
if path is not None:
path = path / env_mat_stat.get_hash()
if path is None or not path.is_dir():
if callable(merged):
# only get data once
sampled = merged()
else:
sampled = merged
else:
sampled = []
env_mat_stat.load_or_compute_stats(sampled, path)
self.stats = env_mat_stat.stats
mean, stddev = env_mat_stat()
xp = array_api_compat.array_namespace(self.stddev)
if not self.set_davg_zero:
self.mean = xp.asarray(mean, dtype=self.mean.dtype, copy=True)
self.stddev = xp.asarray(stddev, dtype=self.stddev.dtype, copy=True)

def get_stats(self) -> dict[str, StatItem]:
"""Get the statistics of the descriptor."""
raise NotImplementedError
if self.stats is None:
raise RuntimeError(
"The statistics of the descriptor has not been computed."
)
return self.stats
Comment thread
njzjz marked this conversation as resolved.

def reinit_exclude(
self,
Expand Down
30 changes: 26 additions & 4 deletions deepmd/dpmodel/descriptor/dpa2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from typing import (
Callable,
NoReturn,
Optional,
Union,
Expand Down Expand Up @@ -737,10 +738,31 @@ def dim_emb(self):
return self.get_dim_emb()

def compute_input_stats(
self, merged: list[dict], path: Optional[DPPath] = None
) -> NoReturn:
"""Update mean and stddev for descriptor elements."""
raise NotImplementedError
self,
merged: Union[Callable[[], list[dict]], list[dict]],
path: Optional[DPPath] = None,
):
"""
Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.

Parameters
----------
merged : Union[Callable[[], list[dict]], list[dict]]
- list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary mapping `keys` to arrays
originating from the `i`-th data system.
- Callable[[], list[dict]]: A lazy function that returns data samples in the above format
only when needed. Since the sampling process can be slow and memory-intensive,
the lazy function helps by only sampling once.
path : Optional[DPPath]
The path to the stat file.

"""
descrpt_list = [self.repinit, self.repformers]
if self.use_three_body:
descrpt_list.append(self.repinit_three_body)
for ii, descrpt in enumerate(descrpt_list):
descrpt.compute_input_stats(merged, path)

def set_stat_mean_and_stddev(
self,
Expand Down
9 changes: 4 additions & 5 deletions deepmd/dpmodel/descriptor/dpa3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from typing import (
NoReturn,
Optional,
Union,
)
Expand Down Expand Up @@ -444,11 +443,11 @@ def dim_emb(self):
"""Returns the embedding dimension g2."""
return self.get_dim_emb()

def compute_input_stats(
self, merged: list[dict], path: Optional[DPPath] = None
) -> NoReturn:
def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None):
"""Update mean and stddev for descriptor elements."""
raise NotImplementedError
descrpt_list = [self.repflows]
for ii, descrpt in enumerate(descrpt_list):
descrpt.compute_input_stats(merged, path)

def set_stat_mean_and_stddev(
self,
Expand Down
53 changes: 47 additions & 6 deletions deepmd/dpmodel/descriptor/repflows.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from typing import (
Callable,
NoReturn,
Optional,
Union,
)
Expand All @@ -23,6 +22,9 @@
EnvMat,
PairExcludeMask,
)
from deepmd.dpmodel.utils.env_mat_stat import (
EnvMatStatSe,
)
from deepmd.dpmodel.utils.network import (
NativeLayer,
get_activation_fn,
Expand All @@ -33,6 +35,9 @@
from deepmd.dpmodel.utils.seed import (
child_seed,
)
from deepmd.utils.env_mat_stat import (
StatItem,
)
from deepmd.utils.path import (
DPPath,
)
Expand Down Expand Up @@ -349,13 +354,49 @@ def compute_input_stats(
self,
merged: Union[Callable[[], list[dict]], list[dict]],
path: Optional[DPPath] = None,
) -> NoReturn:
"""Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data."""
raise NotImplementedError
) -> None:
"""
Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.

def get_stats(self) -> NoReturn:
Parameters
----------
merged : Union[Callable[[], list[dict]], list[dict]]
- list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary mapping `keys` to arrays
originating from the `i`-th data system.
- Callable[[], list[dict]]: A lazy function that returns data samples in the above format
only when needed. Since the sampling process can be slow and memory-intensive,
the lazy function helps by only sampling once.
path : Optional[DPPath]
The path to the stat file.

"""
env_mat_stat = EnvMatStatSe(self)
if path is not None:
path = path / env_mat_stat.get_hash()
if path is None or not path.is_dir():
if callable(merged):
# only get data once
sampled = merged()
else:
sampled = merged
else:
sampled = []
env_mat_stat.load_or_compute_stats(sampled, path)
self.stats = env_mat_stat.stats
mean, stddev = env_mat_stat()
xp = array_api_compat.array_namespace(self.stddev)
if not self.set_davg_zero:
self.mean = xp.asarray(mean, dtype=self.mean.dtype, copy=True)
self.stddev = xp.asarray(stddev, dtype=self.stddev.dtype, copy=True)

def get_stats(self) -> dict[str, StatItem]:
"""Get the statistics of the descriptor."""
raise NotImplementedError
if self.stats is None:
raise RuntimeError(
"The statistics of the descriptor has not been computed."
)
return self.stats

def reinit_exclude(
self,
Expand Down
53 changes: 47 additions & 6 deletions deepmd/dpmodel/descriptor/repformers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from typing import (
Callable,
NoReturn,
Optional,
Union,
)
Expand All @@ -23,6 +22,9 @@
EnvMat,
PairExcludeMask,
)
from deepmd.dpmodel.utils.env_mat_stat import (
EnvMatStatSe,
)
from deepmd.dpmodel.utils.network import (
LayerNorm,
NativeLayer,
Expand All @@ -34,6 +36,9 @@
from deepmd.dpmodel.utils.seed import (
child_seed,
)
from deepmd.utils.env_mat_stat import (
StatItem,
)
from deepmd.utils.path import (
DPPath,
)
Expand Down Expand Up @@ -370,13 +375,49 @@ def compute_input_stats(
self,
merged: Union[Callable[[], list[dict]], list[dict]],
path: Optional[DPPath] = None,
) -> NoReturn:
"""Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data."""
raise NotImplementedError
) -> None:
"""
Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.

def get_stats(self) -> NoReturn:
Parameters
----------
merged : Union[Callable[[], list[dict]], list[dict]]
- list[dict]: A list of data samples from various data systems.
Each element, `merged[i]`, is a data dictionary mapping `keys` to arrays
originating from the `i`-th data system.
- Callable[[], list[dict]]: A lazy function that returns data samples in the above format
only when needed. Since the sampling process can be slow and memory-intensive,
the lazy function helps by only sampling once.
path : Optional[DPPath]
The path to the stat file.

"""
env_mat_stat = EnvMatStatSe(self)
if path is not None:
path = path / env_mat_stat.get_hash()
if path is None or not path.is_dir():
if callable(merged):
# only get data once
sampled = merged()
else:
sampled = merged
else:
sampled = []
env_mat_stat.load_or_compute_stats(sampled, path)
self.stats = env_mat_stat.stats
mean, stddev = env_mat_stat()
xp = array_api_compat.array_namespace(self.stddev)
if not self.set_davg_zero:
self.mean = xp.asarray(mean, dtype=self.mean.dtype, copy=True)
self.stddev = xp.asarray(stddev, dtype=self.stddev.dtype, copy=True)

def get_stats(self) -> dict[str, StatItem]:
"""Get the statistics of the descriptor."""
raise NotImplementedError
if self.stats is None:
raise RuntimeError(
"The statistics of the descriptor has not been computed."
)
return self.stats

def reinit_exclude(
self,
Expand Down
Loading