diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 630fb6d86f..06a7603cc0 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -8,6 +8,7 @@
     Path,
 )
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -95,20 +96,23 @@
 
 
 def get_trainer(
-    config,
-    init_model=None,
-    restart_model=None,
-    finetune_model=None,
-    force_load=False,
-    init_frz_model=None,
-    shared_links=None,
-    finetune_links=None,
-):
+    config: dict[str, Any],
+    init_model: Optional[str] = None,
+    restart_model: Optional[str] = None,
+    finetune_model: Optional[str] = None,
+    force_load: bool = False,
+    init_frz_model: Optional[str] = None,
+    shared_links: Optional[dict[str, Any]] = None,
+    finetune_links: Optional[dict[str, Any]] = None,
+) -> training.Trainer:
     multi_task = "model_dict" in config.get("model", {})
 
     def prepare_trainer_input_single(
-        model_params_single, data_dict_single, rank=0, seed=None
-    ):
+        model_params_single: dict[str, Any],
+        data_dict_single: dict[str, Any],
+        rank: int = 0,
+        seed: Optional[int] = None,
+    ) -> tuple[DpLoaderSet, Optional[DpLoaderSet], Optional[DPPath]]:
         training_dataset_params = data_dict_single["training_data"]
         validation_dataset_params = data_dict_single.get("validation_data", None)
         validation_systems = (
diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py
index 13bd4d2bf0..ab022e9625 100644
--- a/deepmd/pt/infer/deep_eval.py
+++ b/deepmd/pt/infer/deep_eval.py
@@ -272,15 +272,15 @@ def get_ntypes_spin(self) -> int:
         """Get the number of spin atom types of this model. Only used in old implement."""
         return 0
 
-    def get_has_spin(self):
+    def get_has_spin(self) -> bool:
         """Check if the model has spin atom types."""
         return self._has_spin
 
-    def get_has_hessian(self):
+    def get_has_hessian(self) -> bool:
         """Check if the model has hessian."""
         return self._has_hessian
 
-    def get_model_branch(self):
+    def get_model_branch(self) -> tuple[dict[str, str], dict[str, dict[str, Any]]]:
         """Get the model branch information."""
         if "model_dict" in self.model_def_script:
             model_alias_dict, model_branch_dict = get_model_dict(
@@ -419,7 +419,7 @@ def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Calla
         """
         if self.auto_batch_size is not None:
 
-            def eval_func(*args, **kwargs):
+            def eval_func(*args: Any, **kwargs: Any) -> Any:
                 return self.auto_batch_size.execute_all(
                     inner_func, numb_test, natoms, *args, **kwargs
                 )
@@ -453,7 +453,7 @@ def _eval_model(
         fparam: Optional[np.ndarray],
         aparam: Optional[np.ndarray],
         request_defs: list[OutputVariableDef],
-    ):
+    ) -> tuple[np.ndarray, ...]:
         model = self.dp.to(DEVICE)
         prec = NP_PRECISION_DICT[RESERVED_PRECISION_DICT[GLOBAL_PT_FLOAT_PRECISION]]
 
@@ -531,7 +531,7 @@ def _eval_model_spin(
         fparam: Optional[np.ndarray],
         aparam: Optional[np.ndarray],
         request_defs: list[OutputVariableDef],
-    ):
+    ) -> tuple[np.ndarray, ...]:
         model = self.dp.to(DEVICE)
 
         nframes = coords.shape[0]
@@ -608,7 +608,9 @@ def _eval_model_spin(
                 )  # this is kinda hacky
         return tuple(results)
 
-    def _get_output_shape(self, odef, nframes, natoms):
+    def _get_output_shape(
+        self, odef: OutputVariableDef, nframes: int, natoms: int
+    ) -> list[int]:
         if odef.category == OutputVariableCategory.DERV_C_REDU:
             # virial
             return [nframes, *odef.shape[:-1], 9]
diff --git a/deepmd/pt/infer/inference.py b/deepmd/pt/infer/inference.py
index dd0e7eaccb..ac11d160aa 100644
--- a/deepmd/pt/infer/inference.py
+++ b/deepmd/pt/infer/inference.py
@@ -3,6 +3,10 @@
 from copy import (
     deepcopy,
 )
+from typing import (
+    Optional,
+    Union,
+)
 
 import torch
 
@@ -25,8 +29,8 @@
 class Tester:
     def __init__(
         self,
-        model_ckpt,
-        head=None,
+        model_ckpt: Union[str, torch.nn.Module],
+        head: Optional[str] = None,
     ) -> None:
         """Construct a DeePMD tester.
 
diff --git a/deepmd/pt/loss/denoise.py b/deepmd/pt/loss/denoise.py
index 574210adb6..c8eeff6185 100644
--- a/deepmd/pt/loss/denoise.py
+++ b/deepmd/pt/loss/denoise.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
 import torch
 import torch.nn.functional as F
 
@@ -13,15 +17,15 @@
 class DenoiseLoss(TaskLoss):
     def __init__(
         self,
-        ntypes,
-        masked_token_loss=1.0,
-        masked_coord_loss=1.0,
-        norm_loss=0.01,
-        use_l1=True,
-        beta=1.00,
-        mask_loss_coord=True,
-        mask_loss_token=True,
-        **kwargs,
+        ntypes: int,
+        masked_token_loss: float = 1.0,
+        masked_coord_loss: float = 1.0,
+        norm_loss: float = 0.01,
+        use_l1: bool = True,
+        beta: float = 1.00,
+        mask_loss_coord: bool = True,
+        mask_loss_token: bool = True,
+        **kwargs: Any,
     ) -> None:
         """Construct a layer to compute loss on coord, and type reconstruction."""
         super().__init__()
@@ -38,7 +42,14 @@ def __init__(
         self.mask_loss_coord = mask_loss_coord
         self.mask_loss_token = mask_loss_token
 
-    def forward(self, model_pred, label, natoms, learning_rate, mae=False):
+    def forward(
+        self,
+        model_pred: dict[str, torch.Tensor],
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float,
+        mae: bool = False,
+    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
         """Return loss on coord and type denoise.
 
         Returns
diff --git a/deepmd/pt/loss/dos.py b/deepmd/pt/loss/dos.py
index 493cc85694..bc77f34437 100644
--- a/deepmd/pt/loss/dos.py
+++ b/deepmd/pt/loss/dos.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -26,8 +29,8 @@ def __init__(
         limit_pref_ados: float = 0.0,
         start_pref_acdf: float = 0.0,
         limit_pref_acdf: float = 0.0,
-        inference=False,
-        **kwargs,
+        inference: bool = False,
+        **kwargs: Any,
     ) -> None:
         r"""Construct a loss for local and global tensors.
 
@@ -85,7 +88,15 @@ def __init__(
             )
         )
 
-    def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float = 0.0,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         """Return loss on local and global tensors.
 
         Parameters
diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py
index 10e2bf9971..cccdc8949e 100644
--- a/deepmd/pt/loss/ener.py
+++ b/deepmd/pt/loss/ener.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -23,7 +24,9 @@
 )
 
 
-def custom_huber_loss(predictions, targets, delta=1.0):
+def custom_huber_loss(
+    predictions: torch.Tensor, targets: torch.Tensor, delta: float = 1.0
+) -> torch.Tensor:
     error = targets - predictions
     abs_error = torch.abs(error)
     quadratic_loss = 0.5 * torch.pow(error, 2)
@@ -35,13 +38,13 @@ def custom_huber_loss(predictions, targets, delta=1.0):
 class EnergyStdLoss(TaskLoss):
     def __init__(
         self,
-        starter_learning_rate=1.0,
-        start_pref_e=0.0,
-        limit_pref_e=0.0,
-        start_pref_f=0.0,
-        limit_pref_f=0.0,
-        start_pref_v=0.0,
-        limit_pref_v=0.0,
+        starter_learning_rate: float = 1.0,
+        start_pref_e: float = 0.0,
+        limit_pref_e: float = 0.0,
+        start_pref_f: float = 0.0,
+        limit_pref_f: float = 0.0,
+        start_pref_v: float = 0.0,
+        limit_pref_v: float = 0.0,
         start_pref_ae: float = 0.0,
         limit_pref_ae: float = 0.0,
         start_pref_pf: float = 0.0,
@@ -52,10 +55,10 @@ def __init__(
         limit_pref_gf: float = 0.0,
         numb_generalized_coord: int = 0,
         use_l1_all: bool = False,
-        inference=False,
-        use_huber=False,
-        huber_delta=0.01,
-        **kwargs,
+        inference: bool = False,
+        use_huber: bool = False,
+        huber_delta: float = 0.01,
+        **kwargs: Any,
     ) -> None:
         r"""Construct a layer to compute loss on energy, force and virial.
 
@@ -149,7 +152,15 @@ def __init__(
                 "Huber loss is not implemented for force with atom_pref, generalized force and relative force. "
             )
 
-    def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         """Return loss on energy and force.
 
         Parameters
@@ -528,10 +539,10 @@ def deserialize(cls, data: dict) -> "TaskLoss":
 class EnergyHessianStdLoss(EnergyStdLoss):
     def __init__(
         self,
-        start_pref_h=0.0,
-        limit_pref_h=0.0,
-        **kwargs,
-    ):
+        start_pref_h: float = 0.0,
+        limit_pref_h: float = 0.0,
+        **kwargs: Any,
+    ) -> None:
         r"""Enable the layer to compute loss on hessian.
 
         Parameters
@@ -549,7 +560,15 @@ def __init__(
         self.start_pref_h = start_pref_h
         self.limit_pref_h = limit_pref_h
 
-    def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         model_pred, loss, more_loss = super().forward(
             input_dict, model, label, natoms, learning_rate, mae=mae
         )
diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py
index 6a926f4051..9b87d4234f 100644
--- a/deepmd/pt/loss/ener_spin.py
+++ b/deepmd/pt/loss/ener_spin.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 import torch.nn.functional as F
@@ -20,21 +23,21 @@
 class EnergySpinLoss(TaskLoss):
     def __init__(
         self,
-        starter_learning_rate=1.0,
-        start_pref_e=0.0,
-        limit_pref_e=0.0,
-        start_pref_fr=0.0,
-        limit_pref_fr=0.0,
-        start_pref_fm=0.0,
-        limit_pref_fm=0.0,
-        start_pref_v=0.0,
-        limit_pref_v=0.0,
+        starter_learning_rate: float = 1.0,
+        start_pref_e: float = 0.0,
+        limit_pref_e: float = 0.0,
+        start_pref_fr: float = 0.0,
+        limit_pref_fr: float = 0.0,
+        start_pref_fm: float = 0.0,
+        limit_pref_fm: float = 0.0,
+        start_pref_v: float = 0.0,
+        limit_pref_v: float = 0.0,
         start_pref_ae: float = 0.0,
         limit_pref_ae: float = 0.0,
         enable_atom_ener_coeff: bool = False,
         use_l1_all: bool = False,
-        inference=False,
-        **kwargs,
+        inference: bool = False,
+        **kwargs: Any,
     ) -> None:
         r"""Construct a layer to compute loss on energy, real force, magnetic force and virial.
 
@@ -93,7 +96,15 @@ def __init__(
         self.use_l1_all = use_l1_all
         self.inference = inference
 
-    def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         """Return energy loss with magnetic labels.
 
         Parameters
diff --git a/deepmd/pt/loss/loss.py b/deepmd/pt/loss/loss.py
index d1777a29b3..13cad6f59b 100644
--- a/deepmd/pt/loss/loss.py
+++ b/deepmd/pt/loss/loss.py
@@ -4,7 +4,9 @@
     abstractmethod,
 )
 from typing import (
+    Any,
     NoReturn,
+    Union,
 )
 
 import torch
@@ -18,11 +20,18 @@
 
 
 class TaskLoss(torch.nn.Module, ABC, make_plugin_registry("loss")):
-    def __init__(self, **kwargs) -> None:
+    def __init__(self, **kwargs: Any) -> None:
         """Construct loss."""
         super().__init__()
 
-    def forward(self, input_dict, model, label, natoms, learning_rate) -> NoReturn:
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: Union[float, torch.Tensor],
+    ) -> NoReturn:
         """Return loss ."""
         raise NotImplementedError
 
diff --git a/deepmd/pt/loss/property.py b/deepmd/pt/loss/property.py
index bbe3403aa2..1cd842650d 100644
--- a/deepmd/pt/loss/property.py
+++ b/deepmd/pt/loss/property.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Union,
 )
 
@@ -23,15 +24,15 @@
 class PropertyLoss(TaskLoss):
     def __init__(
         self,
-        task_dim,
+        task_dim: int,
         var_name: str,
         loss_func: str = "smooth_mae",
-        metric: list = ["mae"],
+        metric: list[str] = ["mae"],
         beta: float = 1.00,
         out_bias: Union[list, None] = None,
         out_std: Union[list, None] = None,
         intensive: bool = False,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         r"""Construct a layer to compute loss on property.
 
@@ -66,7 +67,15 @@ def __init__(
         self.intensive = intensive
         self.var_name = var_name
 
-    def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float = 0.0,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         """Return loss on properties .
 
         Parameters
diff --git a/deepmd/pt/loss/tensor.py b/deepmd/pt/loss/tensor.py
index 0acc3989be..625a9b30bc 100644
--- a/deepmd/pt/loss/tensor.py
+++ b/deepmd/pt/loss/tensor.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -21,9 +24,9 @@ def __init__(
         label_name: str,
         pref_atomic: float = 0.0,
         pref: float = 0.0,
-        inference=False,
+        inference: bool = False,
         enable_atomic_weight: bool = False,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         r"""Construct a loss for local and global tensors.
 
@@ -64,7 +67,15 @@ def __init__(
             "Can not assian zero weight both to `pref` and `pref_atomic`"
         )
 
-    def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False):
+    def forward(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        model: torch.nn.Module,
+        label: dict[str, torch.Tensor],
+        natoms: int,
+        learning_rate: float = 0.0,
+        mae: bool = False,
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]:
         """Return loss on local and global tensors.
 
         Parameters
diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py
index a2cbef3eee..a18834e40b 100644
--- a/deepmd/pt/model/atomic_model/base_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/base_atomic_model.py
@@ -106,7 +106,7 @@ def init_out_stat(self) -> None:
     def set_out_bias(self, out_bias: torch.Tensor) -> None:
         self.out_bias = out_bias
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: torch.Tensor) -> None:
         if key in ["out_bias"]:
             self.out_bias = value
         elif key in ["out_std"]:
@@ -114,7 +114,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> torch.Tensor:
         if key in ["out_bias"]:
             return self.out_bias
         elif key in ["out_std"]:
@@ -296,7 +296,9 @@ def forward(
         )
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["BaseAtomicModel"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -417,7 +419,7 @@ def apply_out_stat(
         self,
         ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Apply the stat to each atomic output.
         The developer may override the method to define how the bias is applied
         to the atomic output of the model.
@@ -438,9 +440,9 @@ def apply_out_stat(
 
     def change_out_bias(
         self,
-        sample_merged,
+        sample_merged: Union[Callable[[], list[dict]], list[dict]],
         stat_file_path: Optional[DPPath] = None,
-        bias_adjust_mode="change-by-statistic",
+        bias_adjust_mode: str = "change-by-statistic",
     ) -> None:
         """Change the output bias according to the input data and the pretrained model.
 
@@ -490,7 +492,13 @@ def change_out_bias(
     def _get_forward_wrapper_func(self) -> Callable[..., torch.Tensor]:
         """Get a forward wrapper of the atomic model for output bias calculation."""
 
-        def model_forward(coord, atype, box, fparam=None, aparam=None):
+        def model_forward(
+            coord: torch.Tensor,
+            atype: torch.Tensor,
+            box: Optional[torch.Tensor],
+            fparam: Optional[torch.Tensor] = None,
+            aparam: Optional[torch.Tensor] = None,
+        ) -> dict[str, torch.Tensor]:
             with (
                 torch.no_grad()
             ):  # it's essential for pure torch forward function to use auto_batchsize
@@ -519,13 +527,13 @@ def model_forward(coord, atype, box, fparam=None, aparam=None):
 
         return model_forward
 
-    def _default_bias(self):
+    def _default_bias(self) -> torch.Tensor:
         ntypes = self.get_ntypes()
         return torch.zeros(
             [self.n_out, ntypes, self.max_out_size], dtype=dtype, device=device
         )
 
-    def _default_std(self):
+    def _default_std(self) -> torch.Tensor:
         ntypes = self.get_ntypes()
         return torch.ones(
             [self.n_out, ntypes, self.max_out_size], dtype=dtype, device=device
diff --git a/deepmd/pt/model/atomic_model/dipole_atomic_model.py b/deepmd/pt/model/atomic_model/dipole_atomic_model.py
index 3796aa2e83..c9badefcad 100644
--- a/deepmd/pt/model/atomic_model/dipole_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dipole_atomic_model.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -12,7 +15,9 @@
 
 
 class DPDipoleAtomicModel(DPAtomicModel):
-    def __init__(self, descriptor, fitting, type_map, **kwargs):
+    def __init__(
+        self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any
+    ) -> None:
         if not isinstance(fitting, DipoleFittingNet):
             raise TypeError(
                 "fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel"
@@ -23,6 +28,6 @@ def apply_out_stat(
         self,
         ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
-    ):
+    ) -> dict[str, torch.Tensor]:
         # dipole not applying bias
         return ret
diff --git a/deepmd/pt/model/atomic_model/dos_atomic_model.py b/deepmd/pt/model/atomic_model/dos_atomic_model.py
index 2af1a4e052..7bc0108fc5 100644
--- a/deepmd/pt/model/atomic_model/dos_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dos_atomic_model.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
 from deepmd.pt.model.task.dos import (
     DOSFittingNet,
 )
@@ -9,7 +13,9 @@
 
 
 class DPDOSAtomicModel(DPAtomicModel):
-    def __init__(self, descriptor, fitting, type_map, **kwargs):
+    def __init__(
+        self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any
+    ) -> None:
         if not isinstance(fitting, DOSFittingNet):
             raise TypeError(
                 "fitting must be an instance of DOSFittingNet for DPDOSAtomicModel"
diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py
index 62c7d78d75..1a34eb986a 100644
--- a/deepmd/pt/model/atomic_model/dp_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py
@@ -2,6 +2,8 @@
 import functools
 import logging
 from typing import (
+    Any,
+    Callable,
     Optional,
 )
 
@@ -47,10 +49,10 @@ class DPAtomicModel(BaseAtomicModel):
 
     def __init__(
         self,
-        descriptor,
-        fitting,
+        descriptor: BaseDescriptor,
+        fitting: BaseFitting,
         type_map: list[str],
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__(type_map, **kwargs)
         ntypes = len(type_map)
@@ -108,7 +110,7 @@ def get_sel(self) -> list[int]:
         """Get the neighbor selection."""
         return self.sel
 
-    def set_case_embd(self, case_idx: int):
+    def set_case_embd(self, case_idx: int) -> None:
         """
         Set the case embedding of this atomic model by the given case_idx,
         typically concatenated with the output of the descriptor and fed into the fitting net.
@@ -128,7 +130,9 @@ def mixed_types(self) -> bool:
         return self.descriptor.mixed_types()
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["DPAtomicModel"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -169,7 +173,7 @@ def serialize(self) -> dict:
         return dd
 
     @classmethod
-    def deserialize(cls, data) -> "DPAtomicModel":
+    def deserialize(cls, data: dict) -> "DPAtomicModel":
         data = data.copy()
         check_version_compatibility(data.pop("@version", 1), 2, 1)
         data.pop("@class", None)
@@ -214,9 +218,9 @@ def enable_compression(
 
     def forward_atomic(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -283,7 +287,7 @@ def get_out_bias(self) -> torch.Tensor:
 
     def compute_or_load_stat(
         self,
-        sampled_func,
+        sampled_func: Callable[[], list[dict]],
         stat_file_path: Optional[DPPath] = None,
         compute_or_load_out_stat: bool = True,
     ) -> None:
@@ -311,7 +315,7 @@ def compute_or_load_stat(
             stat_file_path /= " ".join(self.type_map)
 
         @functools.lru_cache
-        def wrapped_sampler():
+        def wrapped_sampler() -> list[dict]:
             sampled = sampled_func()
             if self.pair_excl is not None:
                 pair_exclude_types = self.pair_excl.get_exclude_types()
diff --git a/deepmd/pt/model/atomic_model/energy_atomic_model.py b/deepmd/pt/model/atomic_model/energy_atomic_model.py
index 6d894b4aab..9f513fc53d 100644
--- a/deepmd/pt/model/atomic_model/energy_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/energy_atomic_model.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
 from deepmd.pt.model.task.ener import (
     EnergyFittingNet,
     EnergyFittingNetDirect,
@@ -11,7 +15,9 @@
 
 
 class DPEnergyAtomicModel(DPAtomicModel):
-    def __init__(self, descriptor, fitting, type_map, **kwargs):
+    def __init__(
+        self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any
+    ) -> None:
         if not (
             isinstance(fitting, EnergyFittingNet)
             or isinstance(fitting, EnergyFittingNetDirect)
diff --git a/deepmd/pt/model/atomic_model/linear_atomic_model.py b/deepmd/pt/model/atomic_model/linear_atomic_model.py
index 46881c73e7..b510448ec3 100644
--- a/deepmd/pt/model/atomic_model/linear_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/linear_atomic_model.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import functools
 from typing import (
+    Any,
+    Callable,
     Optional,
     Union,
 )
@@ -56,7 +58,7 @@ def __init__(
         models: list[BaseAtomicModel],
         type_map: list[str],
         weights: Optional[Union[str, list[float]]] = "mean",
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__(type_map, **kwargs)
         super().init_out_stat()
@@ -135,7 +137,9 @@ def get_type_map(self) -> list[str]:
         return self.type_map
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["LinearEnergyAtomicModel"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -158,7 +162,7 @@ def get_model_rcuts(self) -> list[float]:
     def get_sel(self) -> list[int]:
         return [max([model.get_nsel() for model in self.models])]
 
-    def set_case_embd(self, case_idx: int):
+    def set_case_embd(self, case_idx: int) -> None:
         """
         Set the case embedding of this atomic model by the given case_idx,
         typically concatenated with the output of the descriptor and fed into the fitting net.
@@ -307,7 +311,7 @@ def apply_out_stat(
         self,
         ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Apply the stat to each atomic output.
         The developer may override the method to define how the bias is applied
         to the atomic output of the model.
@@ -471,7 +475,7 @@ def is_aparam_nall(self) -> bool:
 
     def compute_or_load_stat(
         self,
-        sampled_func,
+        sampled_func: Callable[[], list[dict[str, Any]]],
         stat_file_path: Optional[DPPath] = None,
         compute_or_load_out_stat: bool = True,
     ) -> None:
@@ -504,7 +508,7 @@ def compute_or_load_stat(
             stat_file_path /= " ".join(self.type_map)
 
         @functools.lru_cache
-        def wrapped_sampler():
+        def wrapped_sampler() -> list[dict[str, Any]]:
             sampled = sampled_func()
             if self.pair_excl is not None:
                 pair_exclude_types = self.pair_excl.get_exclude_types()
@@ -548,7 +552,7 @@ def __init__(
         sw_rmax: float,
         type_map: list[str],
         smin_alpha: Optional[float] = 0.1,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         models = [dp_model, zbl_model]
         kwargs["models"] = models
@@ -576,7 +580,7 @@ def serialize(self) -> dict:
         )
         return dd
 
-    def set_case_embd(self, case_idx: int):
+    def set_case_embd(self, case_idx: int) -> None:
         """
         Set the case embedding of this atomic model by the given case_idx,
         typically concatenated with the output of the descriptor and fed into the fitting net.
@@ -585,7 +589,7 @@ def set_case_embd(self, case_idx: int):
         self.models[0].set_case_embd(case_idx)
 
     @classmethod
-    def deserialize(cls, data) -> "DPZBLLinearEnergyAtomicModel":
+    def deserialize(cls, data: dict[str, Any]) -> "DPZBLLinearEnergyAtomicModel":
         data = data.copy()
         check_version_compatibility(data.pop("@version", 1), 2, 1)
         models = [
diff --git a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
index 8f73d81d76..b022e6bfc9 100644
--- a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -68,7 +69,7 @@ def __init__(
         rcut: float,
         sel: Union[int, list[int]],
         type_map: list[str],
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__(type_map, **kwargs)
         super().init_out_stat()
@@ -141,7 +142,7 @@ def get_type_map(self) -> list[str]:
     def get_sel(self) -> list[int]:
         return [self.sel]
 
-    def set_case_embd(self, case_idx: int):
+    def set_case_embd(self, case_idx: int) -> None:
         """
         Set the case embedding of this atomic model by the given case_idx,
         typically concatenated with the output of the descriptor and fed into the fitting net.
@@ -175,7 +176,9 @@ def need_sorted_nlist_for_lower(self) -> bool:
         return False
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["PairTabAtomicModel"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -202,7 +205,7 @@ def serialize(self) -> dict:
         return dd
 
     @classmethod
-    def deserialize(cls, data) -> "PairTabAtomicModel":
+    def deserialize(cls, data: dict[str, Any]) -> "PairTabAtomicModel":
         data = data.copy()
         check_version_compatibility(data.pop("@version", 1), 2, 1)
         tab = PairTab.deserialize(data.pop("tab"))
diff --git a/deepmd/pt/model/atomic_model/polar_atomic_model.py b/deepmd/pt/model/atomic_model/polar_atomic_model.py
index 6bd063591f..4484d1945b 100644
--- a/deepmd/pt/model/atomic_model/polar_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/polar_atomic_model.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -12,7 +15,9 @@
 
 
 class DPPolarAtomicModel(DPAtomicModel):
-    def __init__(self, descriptor, fitting, type_map, **kwargs):
+    def __init__(
+        self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any
+    ) -> None:
         if not isinstance(fitting, PolarFittingNet):
             raise TypeError(
                 "fitting must be an instance of PolarFittingNet for DPPolarAtomicModel"
@@ -23,7 +28,7 @@ def apply_out_stat(
         self,
         ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Apply the stat to each atomic output.
 
         Parameters
diff --git a/deepmd/pt/model/atomic_model/property_atomic_model.py b/deepmd/pt/model/atomic_model/property_atomic_model.py
index 3622c9f476..baf9c5b7fc 100644
--- a/deepmd/pt/model/atomic_model/property_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/property_atomic_model.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -12,7 +15,9 @@
 
 
 class DPPropertyAtomicModel(DPAtomicModel):
-    def __init__(self, descriptor, fitting, type_map, **kwargs):
+    def __init__(
+        self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any
+    ) -> None:
         if not isinstance(fitting, PropertyFittingNet):
             raise TypeError(
                 "fitting must be an instance of PropertyFittingNet for DPPropertyAtomicModel"
@@ -31,7 +36,7 @@ def apply_out_stat(
         self,
         ret: dict[str, torch.Tensor],
         atype: torch.Tensor,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Apply the stat to each atomic output.
         In property fitting, each output will be multiplied by label std and then plus the label average value.
 
diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py
index 3b374751c7..c1a3529ae0 100644
--- a/deepmd/pt/model/descriptor/descriptor.py
+++ b/deepmd/pt/model/descriptor/descriptor.py
@@ -5,6 +5,7 @@
     abstractmethod,
 )
 from typing import (
+    Any,
     Callable,
     NoReturn,
     Optional,
@@ -43,7 +44,7 @@ class DescriptorBlock(torch.nn.Module, ABC, make_plugin_registry("DescriptorBloc
 
     local_cluster = False
 
-    def __new__(cls, *args, **kwargs):
+    def __new__(cls, *args: Any, **kwargs: Any) -> "DescriptorBlock":
         if cls is DescriptorBlock:
             try:
                 descrpt_type = kwargs["type"]
@@ -126,7 +127,9 @@ def get_stats(self) -> dict[str, StatItem]:
         """Get the statistics of the descriptor."""
         raise NotImplementedError
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: "DescriptorBlock", shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -178,7 +181,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Calculate DescriptorBlock."""
         pass
 
@@ -192,14 +201,18 @@ def need_sorted_nlist_for_lower(self) -> bool:
 
 
 def make_default_type_embedding(
-    ntypes,
-):
+    ntypes: int,
+) -> tuple[TypeEmbedNet, dict[str, Any]]:
     aux = {}
     aux["tebd_dim"] = 8
     return TypeEmbedNet(ntypes, aux["tebd_dim"]), aux
 
 
-def extend_descrpt_stat(des, type_map, des_with_stat=None) -> None:
+def extend_descrpt_stat(
+    des: DescriptorBlock,
+    type_map: list[str],
+    des_with_stat: Optional[DescriptorBlock] = None,
+) -> None:
     r"""
     Extend the statistics of a descriptor block with types from newly provided `type_map`.
 
diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py
index 16603dc75d..e158dd3725 100644
--- a/deepmd/pt/model/descriptor/dpa1.py
+++ b/deepmd/pt/model/descriptor/dpa1.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -236,8 +237,8 @@ def __init__(
         exclude_types: list[tuple[int, int]] = [],
         env_protection: float = 0.0,
         scaling_factor: int = 1.0,
-        normalize=True,
-        temperature=None,
+        normalize: bool = True,
+        temperature: Optional[float] = None,
         concat_output_tebd: bool = True,
         trainable: bool = True,
         trainable_ln: bool = True,
@@ -250,7 +251,7 @@ def __init__(
         use_tebd_bias: bool = False,
         type_map: Optional[list[str]] = None,
         # not implemented
-        spin=None,
+        spin: Optional[Any] = None,
         type: Optional[str] = None,
     ) -> None:
         super().__init__()
@@ -380,7 +381,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.se_atten.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -404,18 +407,18 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         return self.get_dim_out()
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         return self.get_dim_emb()
 
     def compute_input_stats(
         self,
         merged: Union[Callable[[], list[dict]], list[dict]],
         path: Optional[DPPath] = None,
-    ):
+    ) -> None:
         """
         Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.
 
@@ -448,7 +451,7 @@ def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]:
         return self.se_atten.mean, self.se_atten.stddev
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -548,7 +551,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA1":
             data["use_tebd_bias"] = True
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.se_atten.prec, device=env.DEVICE)
 
         obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize(
@@ -651,7 +654,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -708,10 +717,12 @@ def forward(
 
         return (
             g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+            if rot_mat is not None
+            else None,
             g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if g2 is not None else None,
-            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None,
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     @classmethod
diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py
index 0d6fbd84e5..5858206cc3 100644
--- a/deepmd/pt/model/descriptor/dpa2.py
+++ b/deepmd/pt/model/descriptor/dpa2.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -155,7 +156,7 @@ def __init__(
         """
         super().__init__()
 
-        def init_subclass_params(sub_data, sub_class):
+        def init_subclass_params(sub_data: Any, sub_class: Any) -> Any:
             if isinstance(sub_data, dict):
                 return sub_class(**sub_data)
             elif isinstance(sub_data, sub_class):
@@ -390,7 +391,9 @@ def get_env_protection(self) -> float:
         # the env_protection of repinit is the same as that of the repformer
         return self.repinit.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -422,7 +425,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -477,11 +480,11 @@ def change_type_map(
             repinit_three_body["dstd"] = repinit_three_body["dstd"][remap_index]
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         return self.get_dim_out()
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the embedding dimension g2."""
         return self.get_dim_emb()
 
@@ -656,7 +659,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA2":
         if obj.repinit.dim_out != obj.repformers.dim_in:
             obj.g1_shape_tranform = MLPLayer.deserialize(g1_shape_tranform)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.repinit.prec, device=env.DEVICE)
 
         # deserialize repinit
@@ -711,7 +714,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -820,10 +829,12 @@ def forward(
             g1 = torch.cat([g1, g1_inp], dim=-1)
         return (
             g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+            if rot_mat is not None
+            else None,
+            g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if g2 is not None else None,
+            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None,
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     @classmethod
diff --git a/deepmd/pt/model/descriptor/dpa3.py b/deepmd/pt/model/descriptor/dpa3.py
index b96d130619..2de7851a51 100644
--- a/deepmd/pt/model/descriptor/dpa3.py
+++ b/deepmd/pt/model/descriptor/dpa3.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -122,7 +123,7 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        def init_subclass_params(sub_data, sub_class):
+        def init_subclass_params(sub_data: Any, sub_class: Any) -> Any:
             if isinstance(sub_data, dict):
                 return sub_class(**sub_data)
             elif isinstance(sub_data, sub_class):
@@ -272,7 +273,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.repflows.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -296,7 +299,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -325,11 +328,11 @@ def change_type_map(
         repflow["dstd"] = repflow["dstd"][remap_index]
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         return self.get_dim_out()
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the embedding dimension g2."""
         return self.get_dim_emb()
 
@@ -427,7 +430,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA3":
             type_embedding
         )
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.repflows.prec, device=env.DEVICE)
 
         # deserialize repflow
@@ -452,7 +455,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -509,10 +518,14 @@ def forward(
             node_ebd = torch.cat([node_ebd, node_ebd_inp], dim=-1)
         return (
             node_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            edge_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+            if rot_mat is not None
+            else None,
+            edge_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+            if edge_ebd is not None
+            else None,
+            h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None,
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     @classmethod
diff --git a/deepmd/pt/model/descriptor/env_mat.py b/deepmd/pt/model/descriptor/env_mat.py
index c57ae209fd..0ffdbb7dbb 100644
--- a/deepmd/pt/model/descriptor/env_mat.py
+++ b/deepmd/pt/model/descriptor/env_mat.py
@@ -9,14 +9,14 @@
 
 
 def _make_env_mat(
-    nlist,
-    coord,
+    nlist: torch.Tensor,
+    coord: torch.Tensor,
     rcut: float,
     ruct_smth: float,
     radial_only: bool = False,
     protection: float = 0.0,
     use_exp_switch: bool = False,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Make smooth environment matrix."""
     bsz, natoms, nnei = nlist.shape
     coord = coord.view(bsz, -1, 3)
@@ -49,17 +49,17 @@ def _make_env_mat(
 
 
 def prod_env_mat(
-    extended_coord,
-    nlist,
-    atype,
-    mean,
-    stddev,
+    extended_coord: torch.Tensor,
+    nlist: torch.Tensor,
+    atype: torch.Tensor,
+    mean: torch.Tensor,
+    stddev: torch.Tensor,
     rcut: float,
     rcut_smth: float,
     radial_only: bool = False,
     protection: float = 0.0,
     use_exp_switch: bool = False,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Generate smooth environment matrix from atom coordinates and other context.
 
     Args:
diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py
index e13b014037..545fba7019 100644
--- a/deepmd/pt/model/descriptor/hybrid.py
+++ b/deepmd/pt/model/descriptor/hybrid.py
@@ -45,7 +45,7 @@ class DescrptHybrid(BaseDescriptor, torch.nn.Module):
     def __init__(
         self,
         list: list[Union[BaseDescriptor, dict[str, Any]]],
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__()
         # warning: list is conflict with built-in list
@@ -140,7 +140,7 @@ def get_dim_emb(self) -> int:
         """Returns the output dimension."""
         return sum([descrpt.get_dim_emb() for descrpt in self.descrpt_list])
 
-    def mixed_types(self):
+    def mixed_types(self) -> bool:
         """Returns if the descriptor requires a neighbor list that distinguish different
         atomic types or not.
         """
@@ -164,7 +164,9 @@ def get_env_protection(self) -> float:
             )
         return all_protection[0]
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: "DescrptHybrid", shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -182,7 +184,9 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["DescrptHybrid"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -265,7 +269,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
diff --git a/deepmd/pt/model/descriptor/repflow_layer.py b/deepmd/pt/model/descriptor/repflow_layer.py
index 304e4f68b3..62145958c8 100644
--- a/deepmd/pt/model/descriptor/repflow_layer.py
+++ b/deepmd/pt/model/descriptor/repflow_layer.py
@@ -712,7 +712,7 @@ def forward(
         a_sw: torch.Tensor,  # switch func, nf x nloc x a_nnei
         edge_index: torch.Tensor,  # 2 x n_edge
         angle_index: torch.Tensor,  # 3 x n_angle
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Parameters
         ----------
diff --git a/deepmd/pt/model/descriptor/repflows.py b/deepmd/pt/model/descriptor/repflows.py
index 7445a34a33..69b5e3b593 100644
--- a/deepmd/pt/model/descriptor/repflows.py
+++ b/deepmd/pt/model/descriptor/repflows.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -54,15 +55,15 @@
 if not hasattr(torch.ops.deepmd, "border_op"):
 
     def border_op(
-        argument0,
-        argument1,
-        argument2,
-        argument3,
-        argument4,
-        argument5,
-        argument6,
-        argument7,
-        argument8,
+        argument0: Any,
+        argument1: Any,
+        argument2: Any,
+        argument3: Any,
+        argument4: Any,
+        argument5: Any,
+        argument6: Any,
+        argument7: Any,
+        argument8: Any,
     ) -> torch.Tensor:
         raise NotImplementedError(
             "border_op is not available since customized PyTorch OP library is not built when freezing the model. "
@@ -187,11 +188,11 @@ class DescrptBlockRepflows(DescriptorBlock):
 
     def __init__(
         self,
-        e_rcut,
-        e_rcut_smth,
+        e_rcut: float,
+        e_rcut_smth: float,
         e_sel: int,
-        a_rcut,
-        a_rcut_smth,
+        a_rcut: float,
+        a_rcut_smth: float,
         a_sel: int,
         ntypes: int,
         nlayers: int = 6,
@@ -376,7 +377,7 @@ def get_dim_emb(self) -> int:
         """Returns the embedding dimension e_dim."""
         return self.e_dim
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -384,7 +385,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -409,17 +410,17 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.n_dim
 
     @property
-    def dim_in(self):
+    def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return self.n_dim
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the embedding dimension e_dim."""
         return self.get_dim_emb()
 
@@ -438,7 +439,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         parallel_mode = comm_dict is not None
         if not parallel_mode:
             assert mapping is not None
diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py
index 9715b7479b..32012af92d 100644
--- a/deepmd/pt/model/descriptor/repformer_layer.py
+++ b/deepmd/pt/model/descriptor/repformer_layer.py
@@ -585,12 +585,12 @@ def deserialize(cls, data: dict) -> "LocalAtten":
 class RepformerLayer(torch.nn.Module):
     def __init__(
         self,
-        rcut,
-        rcut_smth,
+        rcut: float,
+        rcut_smth: float,
         sel: int,
         ntypes: int,
-        g1_dim=128,
-        g2_dim=16,
+        g1_dim: int = 128,
+        g2_dim: int = 16,
         axis_neuron: int = 4,
         update_chnnl_2: bool = True,
         update_g1_has_conv: bool = True,
@@ -1141,7 +1141,7 @@ def forward(
         nlist: torch.Tensor,  # nf x nloc x nnei
         nlist_mask: torch.Tensor,  # nf x nloc x nnei
         sw: torch.Tensor,  # switch func, nf x nloc x nnei
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Parameters
         ----------
diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
index 022c7510df..2c383640f1 100644
--- a/deepmd/pt/model/descriptor/repformers.py
+++ b/deepmd/pt/model/descriptor/repformers.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -51,15 +52,15 @@
 if not hasattr(torch.ops.deepmd, "border_op"):
 
     def border_op(
-        argument0,
-        argument1,
-        argument2,
-        argument3,
-        argument4,
-        argument5,
-        argument6,
-        argument7,
-        argument8,
+        argument0: Any,
+        argument1: Any,
+        argument2: Any,
+        argument3: Any,
+        argument4: Any,
+        argument5: Any,
+        argument6: Any,
+        argument7: Any,
+        argument8: Any,
     ) -> torch.Tensor:
         raise NotImplementedError(
             "border_op is not available since customized PyTorch OP library is not built when freezing the model. "
@@ -75,13 +76,13 @@ def border_op(
 class DescrptBlockRepformers(DescriptorBlock):
     def __init__(
         self,
-        rcut,
-        rcut_smth,
+        rcut: float,
+        rcut_smth: float,
         sel: int,
         ntypes: int,
         nlayers: int = 3,
-        g1_dim=128,
-        g2_dim=16,
+        g1_dim: int = 128,
+        g2_dim: int = 16,
         axis_neuron: int = 4,
         direct_dist: bool = False,
         update_g1_has_conv: bool = True,
@@ -336,7 +337,7 @@ def get_dim_emb(self) -> int:
         """Returns the embedding dimension g2."""
         return self.g2_dim
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -344,7 +345,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -369,17 +370,17 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.g1_dim
 
     @property
-    def dim_in(self):
+    def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return self.g1_dim
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the embedding dimension g2."""
         return self.get_dim_emb()
 
@@ -399,7 +400,13 @@ def forward(
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         if comm_dict is None:
             assert mapping is not None
             assert extended_atype_embd is not None
diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py
index f49b5a1276..17fa6a830e 100644
--- a/deepmd/pt/model/descriptor/se_a.py
+++ b/deepmd/pt/model/descriptor/se_a.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import itertools
 from typing import (
+    Any,
     Callable,
     ClassVar,
     Optional,
@@ -93,11 +94,11 @@ def tabulate_fusion_se_a(
 class DescrptSeA(BaseDescriptor, torch.nn.Module):
     def __init__(
         self,
-        rcut,
-        rcut_smth,
-        sel,
-        neuron=[25, 50, 100],
-        axis_neuron=16,
+        rcut: float,
+        rcut_smth: float,
+        sel: Union[list[int], int],
+        neuron: list[int] = [25, 50, 100],
+        axis_neuron: int = 16,
         set_davg_zero: bool = False,
         activation_function: str = "tanh",
         precision: str = "float64",
@@ -110,7 +111,7 @@ def __init__(
         ntypes: Optional[int] = None,  # to be compat with input
         type_map: Optional[list[str]] = None,
         # not implemented
-        spin=None,
+        spin: Optional[Any] = None,
     ) -> None:
         del ntypes
         if spin is not None:
@@ -168,7 +169,7 @@ def get_dim_emb(self) -> int:
         """Returns the output dimension."""
         return self.sea.get_dim_emb()
 
-    def mixed_types(self):
+    def mixed_types(self) -> bool:
         """Returns if the descriptor requires a neighbor list that distinguish different
         atomic types or not.
         """
@@ -186,7 +187,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.sea.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -205,12 +208,12 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.sea.dim_out
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -225,7 +228,7 @@ def compute_input_stats(
         self,
         merged: Union[Callable[[], list[dict]], list[dict]],
         path: Optional[DPPath] = None,
-    ):
+    ) -> None:
         """
         Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.
 
@@ -305,7 +308,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -345,10 +354,12 @@ def forward(
         )
         return (
             g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+            if rot_mat is not None
+            else None,
             None,
             None,
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     def set_stat_mean_and_stddev(
@@ -408,7 +419,7 @@ def deserialize(cls, data: dict) -> "DescrptSeA":
         env_mat = data.pop("env_mat")
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.sea.prec, device=env.DEVICE)
 
         obj.sea["davg"] = t_cvt(variables["davg"])
@@ -455,11 +466,11 @@ class DescrptBlockSeA(DescriptorBlock):
 
     def __init__(
         self,
-        rcut,
-        rcut_smth,
-        sel,
-        neuron=[25, 50, 100],
-        axis_neuron=16,
+        rcut: float,
+        rcut_smth: float,
+        sel: Union[int, list[int]],
+        neuron: list[int] = [25, 50, 100],
+        axis_neuron: int = 16,
         set_davg_zero: bool = False,
         activation_function: str = "tanh",
         precision: str = "float64",
@@ -469,7 +480,7 @@ def __init__(
         type_one_side: bool = True,
         trainable: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         """Construct an embedding net of type `se_a`.
 
@@ -602,7 +613,7 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.filter_neuron[-1] * self.axis_neuron
 
@@ -611,7 +622,7 @@ def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return 0
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: torch.Tensor) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -619,7 +630,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> torch.Tensor:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -729,7 +740,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Calculate decoded embedding for each atom.
 
         Args:
diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py
index 27c5716919..bfcb510810 100644
--- a/deepmd/pt/model/descriptor/se_atten.py
+++ b/deepmd/pt/model/descriptor/se_atten.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -86,12 +87,12 @@ def __init__(
         attn_layer: int = 2,
         attn_dotr: bool = True,
         attn_mask: bool = False,
-        activation_function="tanh",
+        activation_function: str = "tanh",
         precision: str = "float64",
         resnet_dt: bool = False,
-        scaling_factor=1.0,
-        normalize=True,
-        temperature=None,
+        scaling_factor: float = 1.0,
+        normalize: bool = True,
+        temperature: Optional[float] = None,
         smooth: bool = True,
         type_one_side: bool = False,
         exclude_types: list[tuple[int, int]] = [],
@@ -317,7 +318,7 @@ def get_dim_emb(self) -> int:
         """Returns the output dimension of embedding."""
         return self.filter_neuron[-1]
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -325,7 +326,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -350,17 +351,17 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.filter_neuron[-1] * self.axis_neuron
 
     @property
-    def dim_in(self):
+    def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return self.tebd_dim
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the output dimension of embedding."""
         return self.get_dim_emb()
 
@@ -425,10 +426,10 @@ def reinit_exclude(
 
     def enable_compression(
         self,
-        table_data,
-        table_config,
-        lower,
-        upper,
+        table_data: dict,
+        table_config: dict,
+        lower: dict,
+        upper: dict,
     ) -> None:
         net = "filter_net"
         self.compress_info[0] = torch.as_tensor(
@@ -454,7 +455,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -729,11 +736,11 @@ def __init__(
 
     def forward(
         self,
-        input_G,
-        nei_mask,
+        input_G: torch.Tensor,
+        nei_mask: torch.Tensor,
         input_r: Optional[torch.Tensor] = None,
         sw: Optional[torch.Tensor] = None,
-    ):
+    ) -> torch.Tensor:
         """Compute the multi-layer gated self-attention.
 
         Parameters
@@ -753,13 +760,13 @@ def forward(
             out = layer(out, nei_mask, input_r=input_r, sw=sw)
         return out
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: int) -> Any:
         if isinstance(key, int):
             return self.attention_layers[key]
         else:
             raise TypeError(key)
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: int, value: Any) -> None:
         if not isinstance(key, int):
             raise TypeError(key)
         if isinstance(value, self.network_type):
@@ -871,11 +878,11 @@ def __init__(
 
     def forward(
         self,
-        x,
-        nei_mask,
+        x: torch.Tensor,
+        nei_mask: torch.Tensor,
         input_r: Optional[torch.Tensor] = None,
         sw: Optional[torch.Tensor] = None,
-    ):
+    ) -> torch.Tensor:
         residual = x
         x, _ = self.attention_layer(x, nei_mask, input_r=input_r, sw=sw)
         x = residual + x
@@ -989,12 +996,12 @@ def __init__(
 
     def forward(
         self,
-        query,
-        nei_mask,
+        query: torch.Tensor,
+        nei_mask: torch.Tensor,
         input_r: Optional[torch.Tensor] = None,
         sw: Optional[torch.Tensor] = None,
         attnw_shift: float = 20.0,
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Compute the multi-head gated self-attention.
 
         Parameters
diff --git a/deepmd/pt/model/descriptor/se_atten_v2.py b/deepmd/pt/model/descriptor/se_atten_v2.py
index 533d7887e0..5377d919b0 100644
--- a/deepmd/pt/model/descriptor/se_atten_v2.py
+++ b/deepmd/pt/model/descriptor/se_atten_v2.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -56,8 +57,8 @@ def __init__(
         exclude_types: list[tuple[int, int]] = [],
         env_protection: float = 0.0,
         scaling_factor: int = 1.0,
-        normalize=True,
-        temperature=None,
+        normalize: bool = True,
+        temperature: Optional[float] = None,
         concat_output_tebd: bool = True,
         trainable: bool = True,
         trainable_ln: bool = True,
@@ -69,7 +70,7 @@ def __init__(
         use_tebd_bias: bool = False,
         type_map: Optional[list[str]] = None,
         # not implemented
-        spin=None,
+        spin: Optional[Any] = None,
         type: Optional[str] = None,
     ) -> None:
         r"""Construct smooth version of embedding net of type `se_atten_v2`.
@@ -257,7 +258,7 @@ def deserialize(cls, data: dict) -> "DescrptSeAttenV2":
             data["use_tebd_bias"] = True
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.se_atten.prec, device=env.DEVICE)
 
         obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize(
diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py
index 9ce92fb8b4..294323a48c 100644
--- a/deepmd/pt/model/descriptor/se_r.py
+++ b/deepmd/pt/model/descriptor/se_r.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -81,10 +82,10 @@ def tabulate_fusion_se_r(
 class DescrptSeR(BaseDescriptor, torch.nn.Module):
     def __init__(
         self,
-        rcut,
-        rcut_smth,
-        sel,
-        neuron=[25, 50, 100],
+        rcut: float,
+        rcut_smth: float,
+        sel: Union[list[int], int],
+        neuron: list[int] = [25, 50, 100],
         set_davg_zero: bool = False,
         activation_function: str = "tanh",
         precision: str = "float64",
@@ -94,7 +95,7 @@ def __init__(
         trainable: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
         type_map: Optional[list[str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__()
         self.rcut = float(rcut)
@@ -226,7 +227,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.env_protection
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -268,7 +271,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -330,7 +333,7 @@ def get_stats(self) -> dict[str, StatItem]:
             )
         return self.stats
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -338,7 +341,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -424,7 +427,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -575,7 +584,7 @@ def deserialize(cls, data: dict) -> "DescrptSeR":
         env_mat = data.pop("env_mat")
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.prec, device=env.DEVICE)
 
         obj["davg"] = t_cvt(variables["davg"])
diff --git a/deepmd/pt/model/descriptor/se_t.py b/deepmd/pt/model/descriptor/se_t.py
index f3bd0f65ef..c489d0be06 100644
--- a/deepmd/pt/model/descriptor/se_t.py
+++ b/deepmd/pt/model/descriptor/se_t.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import itertools
 from typing import (
+    Any,
     Callable,
     ClassVar,
     Optional,
@@ -146,7 +147,7 @@ def __init__(
         type_map: Optional[list[str]] = None,
         ntypes: Optional[int] = None,  # to be compat with input
         # not implemented
-        spin=None,
+        spin: Optional[dict] = None,
     ) -> None:
         del ntypes
         if spin is not None:
@@ -202,7 +203,7 @@ def get_dim_emb(self) -> int:
         """Returns the output dimension."""
         return self.seat.get_dim_emb()
 
-    def mixed_types(self):
+    def mixed_types(self) -> bool:
         """Returns if the descriptor requires a neighbor list that distinguish different
         atomic types or not.
         """
@@ -220,7 +221,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.seat.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -239,12 +242,12 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.seat.dim_out
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -259,7 +262,7 @@ def compute_input_stats(
         self,
         merged: Union[Callable[[], list[dict]], list[dict]],
         path: Optional[DPPath] = None,
-    ):
+    ) -> None:
         """
         Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.
 
@@ -340,7 +343,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -384,7 +393,7 @@ def forward(
             None,
             None,
             None,
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     def set_stat_mean_and_stddev(
@@ -439,7 +448,7 @@ def deserialize(cls, data: dict) -> "DescrptSeT":
         env_mat = data.pop("env_mat")
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.seat.prec, device=env.DEVICE)
 
         obj.seat["davg"] = t_cvt(variables["davg"])
@@ -648,7 +657,7 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.filter_neuron[-1]
 
@@ -657,7 +666,7 @@ def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return 0
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -665,7 +674,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -733,10 +742,10 @@ def reinit_exclude(
 
     def enable_compression(
         self,
-        table_data,
-        table_config,
-        lower,
-        upper,
+        table_data: dict,
+        table_config: dict,
+        lower: dict,
+        upper: dict,
     ) -> None:
         for embedding_idx, ll in enumerate(self.filter_layers.networks):
             ti = embedding_idx % self.ntypes
@@ -768,7 +777,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
diff --git a/deepmd/pt/model/descriptor/se_t_tebd.py b/deepmd/pt/model/descriptor/se_t_tebd.py
index 3ee7929151..f7de1c3015 100644
--- a/deepmd/pt/model/descriptor/se_t_tebd.py
+++ b/deepmd/pt/model/descriptor/se_t_tebd.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -140,7 +141,7 @@ def __init__(
         type_map: Optional[list[str]] = None,
         concat_output_tebd: bool = True,
         use_econf_tebd: bool = False,
-        use_tebd_bias=False,
+        use_tebd_bias: bool = False,
         smooth: bool = True,
     ) -> None:
         super().__init__()
@@ -242,7 +243,9 @@ def get_env_protection(self) -> float:
         """Returns the protection of building environment matrix."""
         return self.se_ttebd.get_env_protection()
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -266,18 +269,18 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         return self.get_dim_out()
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         return self.get_dim_emb()
 
     def compute_input_stats(
         self,
         merged: Union[Callable[[], list[dict]], list[dict]],
         path: Optional[DPPath] = None,
-    ):
+    ) -> None:
         """
         Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data.
 
@@ -310,7 +313,7 @@ def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]:
         return self.se_ttebd.mean, self.se_ttebd.stddev
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -390,7 +393,7 @@ def deserialize(cls, data: dict) -> "DescrptSeTTebd":
             embeddings_strip = None
         obj = cls(**data)
 
-        def t_cvt(xx):
+        def t_cvt(xx: Any) -> torch.Tensor:
             return torch.tensor(xx, dtype=obj.se_ttebd.prec, device=env.DEVICE)
 
         obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize(
@@ -412,7 +415,13 @@ def forward(
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
@@ -472,7 +481,7 @@ def forward(
             None,
             None,
             None,
-            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION),
+            sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None,
         )
 
     @classmethod
@@ -520,7 +529,7 @@ def __init__(
         tebd_dim: int = 8,
         tebd_input_mode: str = "concat",
         set_davg_zero: bool = True,
-        activation_function="tanh",
+        activation_function: str = "tanh",
         precision: str = "float64",
         resnet_dt: bool = False,
         exclude_types: list[tuple[int, int]] = [],
@@ -631,7 +640,7 @@ def get_dim_emb(self) -> int:
         """Returns the output dimension of embedding."""
         return self.filter_neuron[-1]
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ("avg", "data_avg", "davg"):
             self.mean = value
         elif key in ("std", "data_std", "dstd"):
@@ -639,7 +648,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ("avg", "data_avg", "davg"):
             return self.mean
         elif key in ("std", "data_std", "dstd"):
@@ -664,17 +673,17 @@ def get_env_protection(self) -> float:
         return self.env_protection
 
     @property
-    def dim_out(self):
+    def dim_out(self) -> int:
         """Returns the output dimension of this descriptor."""
         return self.filter_neuron[-1]
 
     @property
-    def dim_in(self):
+    def dim_in(self) -> int:
         """Returns the atomic input dimension of this descriptor."""
         return self.tebd_dim
 
     @property
-    def dim_emb(self):
+    def dim_emb(self) -> int:
         """Returns the output dimension of embedding."""
         return self.get_dim_emb()
 
@@ -744,7 +753,13 @@ def forward(
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
         type_embedding: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
         """Compute the descriptor.
 
         Parameters
diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py
index 8d451f087f..1be46e084a 100644
--- a/deepmd/pt/model/model/__init__.py
+++ b/deepmd/pt/model/model/__init__.py
@@ -14,6 +14,7 @@
 import copy
 import json
 from typing import (
+    Any,
     Optional,
 )
 
@@ -75,7 +76,7 @@
 )
 
 
-def _get_standard_model_components(model_params, ntypes):
+def _get_standard_model_components(model_params: dict, ntypes: int) -> tuple:
     if "type_embedding" in model_params:
         raise ValueError(
             "In the PyTorch backend, type_embedding is not at the model level, but within the descriptor. See type embedding documentation for details."
@@ -102,7 +103,7 @@ def _get_standard_model_components(model_params, ntypes):
     return descriptor, fitting, fitting_net["type"]
 
 
-def get_spin_model(model_params):
+def get_spin_model(model_params: dict) -> SpinModel:
     model_params = copy.deepcopy(model_params)
     if not model_params["spin"]["use_spin"] or isinstance(
         model_params["spin"]["use_spin"][0], int
@@ -138,7 +139,7 @@ def get_spin_model(model_params):
     return SpinEnergyModel(backbone_model=backbone_model, spin=spin)
 
 
-def get_linear_model(model_params):
+def get_linear_model(model_params: dict) -> LinearEnergyModel:
     model_params = copy.deepcopy(model_params)
     weights = model_params.get("weights", "mean")
     list_of_models = []
@@ -178,7 +179,7 @@ def get_linear_model(model_params):
     )
 
 
-def get_zbl_model(model_params):
+def get_zbl_model(model_params: dict) -> DPZBLModel:
     model_params = copy.deepcopy(model_params)
     ntypes = len(model_params["type_map"])
     descriptor, fitting, _ = _get_standard_model_components(model_params, ntypes)
@@ -209,7 +210,7 @@ def get_zbl_model(model_params):
     return model
 
 
-def _can_be_converted_to_float(value) -> Optional[bool]:
+def _can_be_converted_to_float(value: Any) -> Optional[bool]:
     try:
         float(value)
         return True
@@ -218,7 +219,9 @@ def _can_be_converted_to_float(value) -> Optional[bool]:
         return False
 
 
-def _convert_preset_out_bias_to_array(preset_out_bias, type_map):
+def _convert_preset_out_bias_to_array(
+    preset_out_bias: Optional[dict], type_map: list[str]
+) -> Optional[dict]:
     if preset_out_bias is not None:
         for kk in preset_out_bias:
             if len(preset_out_bias[kk]) != len(type_map):
@@ -241,7 +244,7 @@ def _convert_preset_out_bias_to_array(preset_out_bias, type_map):
     return preset_out_bias
 
 
-def get_standard_model(model_params):
+def get_standard_model(model_params: dict) -> BaseModel:
     model_params_old = model_params
     model_params = copy.deepcopy(model_params)
     ntypes = len(model_params["type_map"])
@@ -284,7 +287,7 @@ def get_standard_model(model_params):
     return model
 
 
-def get_model(model_params):
+def get_model(model_params: dict) -> Any:
     model_type = model_params.get("type", "standard")
     if model_type == "standard":
         if "spin" in model_params:
diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py
index ce949baec1..de089e7de7 100644
--- a/deepmd/pt/model/model/dipole_model.py
+++ b/deepmd/pt/model/model/dipole_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -28,13 +29,13 @@ class DipoleModel(DPModelCommon, DPDipoleModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         DPModelCommon.__init__(self)
         DPDipoleModel_.__init__(self, *args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, Any]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "dipole": out_def_data["dipole"],
@@ -54,8 +55,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -91,15 +92,15 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/dos_model.py b/deepmd/pt/model/model/dos_model.py
index afc867f10c..a68735984f 100644
--- a/deepmd/pt/model/model/dos_model.py
+++ b/deepmd/pt/model/model/dos_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -28,13 +29,13 @@ class DOSModel(DPModelCommon, DPDOSModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         DPModelCommon.__init__(self)
         DPDOSModel_.__init__(self, *args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, Any]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "atom_dos": out_def_data["dos"],
@@ -46,8 +47,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -81,15 +82,15 @@ def get_numb_dos(self) -> int:
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/dp_linear_model.py b/deepmd/pt/model/model/dp_linear_model.py
index ca0819b61e..b71c8a10c3 100644
--- a/deepmd/pt/model/model/dp_linear_model.py
+++ b/deepmd/pt/model/model/dp_linear_model.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
 import torch
 
+from deepmd.dpmodel.output_def import (
+    OutputVariableDef,
+)
 from deepmd.pt.model.atomic_model import (
     LinearEnergyAtomicModel,
 )
@@ -31,12 +35,12 @@ class LinearEnergyModel(DPLinearModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, OutputVariableDef]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "atom_energy": out_def_data["energy"],
@@ -56,8 +60,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -90,15 +94,15 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/dp_model.py b/deepmd/pt/model/model/dp_model.py
index 17ce9372e5..875dc0dca0 100644
--- a/deepmd/pt/model/model/dp_model.py
+++ b/deepmd/pt/model/model/dp_model.py
@@ -47,11 +47,12 @@ def update_sel(
         )
         return local_jdata_cpy, min_nbor_dist
 
-    def get_fitting_net(self):
+    # sadly, use -> BaseFitting here will not make torchscript happy
+    def get_fitting_net(self):  # noqa: ANN201
         """Get the fitting network."""
         return self.atomic_model.fitting_net
 
-    def get_descriptor(self):
+    def get_descriptor(self):  # noqa: ANN201
         """Get the descriptor."""
         return self.atomic_model.descriptor
 
diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py
index 4269f4e183..7f84d8abec 100644
--- a/deepmd/pt/model/model/dp_zbl_model.py
+++ b/deepmd/pt/model/model/dp_zbl_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -31,12 +32,12 @@ class DPZBLModel(DPZBLModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, Any]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "atom_energy": out_def_data["energy"],
@@ -56,8 +57,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -90,15 +91,15 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py
index 062fa86d7e..dfe68d537f 100644
--- a/deepmd/pt/model/model/ener_model.py
+++ b/deepmd/pt/model/model/ener_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -31,14 +32,14 @@ class EnergyModel(DPModelCommon, DPEnergyModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         DPModelCommon.__init__(self)
         DPEnergyModel_.__init__(self, *args, **kwargs)
         self._hessian_enabled = False
 
-    def enable_hessian(self):
+    def enable_hessian(self) -> None:
         self.__class__ = make_hessian_model(type(self))
         self.hess_fitting_def = super(type(self), self).atomic_output_def()
         self.requires_hessian("energy")
@@ -70,7 +71,7 @@ def get_observed_type_list(self) -> list[str]:
                 observed_type_list.append(type_map[i])
         return observed_type_list
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, Any]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "atom_energy": out_def_data["energy"],
@@ -92,8 +93,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -133,15 +134,15 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/frozen.py b/deepmd/pt/model/model/frozen.py
index 27284ec276..2a63b093db 100644
--- a/deepmd/pt/model/model/frozen.py
+++ b/deepmd/pt/model/model/frozen.py
@@ -2,6 +2,7 @@
 import json
 import tempfile
 from typing import (
+    Any,
     NoReturn,
     Optional,
 )
@@ -32,7 +33,7 @@ class FrozenModel(BaseModel):
         The path to the frozen model
     """
 
-    def __init__(self, model_file: str, **kwargs) -> None:
+    def __init__(self, model_file: str, **kwargs: Any) -> None:
         super().__init__(**kwargs)
         self.model_file = model_file
         if model_file.endswith(".pth"):
@@ -116,8 +117,8 @@ def need_sorted_nlist_for_lower(self) -> bool:
     @torch.jit.export
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
diff --git a/deepmd/pt/model/model/make_hessian_model.py b/deepmd/pt/model/model/make_hessian_model.py
index 000b9abea4..b84e63ebd7 100644
--- a/deepmd/pt/model/model/make_hessian_model.py
+++ b/deepmd/pt/model/model/make_hessian_model.py
@@ -2,6 +2,7 @@
 import copy
 import math
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -11,9 +12,12 @@
 from deepmd.dpmodel import (
     get_hessian_name,
 )
+from deepmd.dpmodel.output_def import (
+    FittingOutputDef,
+)
 
 
-def make_hessian_model(T_Model):
+def make_hessian_model(T_Model: type) -> type:
     """Make a model that can compute Hessian.
 
     LIMITATION: this model is not jitable due to the restrictions of torch jit script.
@@ -34,8 +38,8 @@ def make_hessian_model(T_Model):
     class CM(T_Model):
         def __init__(
             self,
-            *args,
-            **kwargs,
+            *args: Any,
+            **kwargs: Any,
         ) -> None:
             super().__init__(
                 *args,
@@ -54,14 +58,14 @@ def requires_hessian(
                 if kk in keys:
                     self.hess_fitting_def[kk].r_hessian = True
 
-        def atomic_output_def(self):
+        def atomic_output_def(self) -> FittingOutputDef:
             """Get the fitting output def."""
             return self.hess_fitting_def
 
         def forward_common(
             self,
-            coord,
-            atype,
+            coord: torch.Tensor,
+            atype: torch.Tensor,
             box: Optional[torch.Tensor] = None,
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
@@ -159,9 +163,9 @@ def _cal_hessian_all(
 
         def _cal_hessian_one_component(
             self,
-            ci,
-            coord,
-            atype,
+            ci: int,
+            coord: torch.Tensor,
+            atype: torch.Tensor,
             box: Optional[torch.Tensor] = None,
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
@@ -195,8 +199,8 @@ def __init__(
 
         def __call__(
             self,
-            xx,
-        ):
+            xx: torch.Tensor,
+        ) -> torch.Tensor:
             ci = self.ci
             atype, box, fparam, aparam = self.atype, self.box, self.fparam, self.aparam
             res = super(CM, self.obj).forward_common(
diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py
index b9335df747..e282ebd83a 100644
--- a/deepmd/pt/model/model/make_model.py
+++ b/deepmd/pt/model/model/make_model.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
+    Callable,
     Optional,
 )
 
@@ -39,7 +41,7 @@
 )
 
 
-def make_model(T_AtomicModel: type[BaseAtomicModel]):
+def make_model(T_AtomicModel: type[BaseAtomicModel]) -> type:
     """Make a model as a derived class of an atomic model.
 
     The model provide two interfaces.
@@ -65,10 +67,10 @@ def make_model(T_AtomicModel: type[BaseAtomicModel]):
     class CM(BaseModel):
         def __init__(
             self,
-            *args,
+            *args: Any,
             # underscore to prevent conflict with normal inputs
             atomic_model_: Optional[T_AtomicModel] = None,
-            **kwargs,
+            **kwargs: Any,
         ) -> None:
             super().__init__(*args, **kwargs)
             if atomic_model_ is not None:
@@ -80,7 +82,7 @@ def __init__(
             self.global_pt_float_precision = GLOBAL_PT_FLOAT_PRECISION
             self.global_pt_ener_float_precision = GLOBAL_PT_ENER_FLOAT_PRECISION
 
-        def model_output_def(self):
+        def model_output_def(self) -> ModelOutputDef:
             """Get the output def for the model."""
             return ModelOutputDef(self.atomic_output_def())
 
@@ -129,8 +131,8 @@ def enable_compression(
         # cannot use the name forward. torch script does not work
         def forward_common(
             self,
-            coord,
-            atype,
+            coord: torch.Tensor,
+            atype: torch.Tensor,
             box: Optional[torch.Tensor] = None,
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
@@ -206,8 +208,8 @@ def set_out_bias(self, out_bias: torch.Tensor) -> None:
 
         def change_out_bias(
             self,
-            merged,
-            bias_adjust_mode="change-by-statistic",
+            merged: Any,
+            bias_adjust_mode: str = "change-by-statistic",
         ) -> None:
             """Change the output bias of atomic model according to the input data and the pretrained model.
 
@@ -233,16 +235,16 @@ def change_out_bias(
 
         def forward_common_lower(
             self,
-            extended_coord,
-            extended_atype,
-            nlist,
+            extended_coord: torch.Tensor,
+            extended_atype: torch.Tensor,
+            nlist: torch.Tensor,
             mapping: Optional[torch.Tensor] = None,
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
             do_atomic_virial: bool = False,
             comm_dict: Optional[dict[str, torch.Tensor]] = None,
             extra_nlist_sort: bool = False,
-        ):
+        ) -> dict[str, torch.Tensor]:
             """Return model prediction. Lower interface that takes
             extended atomic coordinates and types, nlist, and mapping
             as input, and returns the predictions on the extended region.
@@ -383,7 +385,7 @@ def format_nlist(
             extended_atype: torch.Tensor,
             nlist: torch.Tensor,
             extra_nlist_sort: bool = False,
-        ):
+        ) -> torch.Tensor:
             """Format the neighbor list.
 
             1. If the number of neighbors in the `nlist` is equal to sum(self.sel),
@@ -434,7 +436,7 @@ def _format_nlist(
             nlist: torch.Tensor,
             nnei: int,
             extra_nlist_sort: bool = False,
-        ):
+        ) -> torch.Tensor:
             n_nf, n_nloc, n_nnei = nlist.shape
             # nf x nall x 3
             extended_coord = extended_coord.view([n_nf, -1, 3])
@@ -496,7 +498,7 @@ def do_grad_c(
             return self.atomic_model.do_grad_c(var_name)
 
         def change_type_map(
-            self, type_map: list[str], model_with_new_type_stat=None
+            self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
         ) -> None:
             """Change the type related params to new ones, according to `type_map` and the original one in the model.
             If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -512,10 +514,10 @@ def serialize(self) -> dict:
             return self.atomic_model.serialize()
 
         @classmethod
-        def deserialize(cls, data) -> "CM":
+        def deserialize(cls, data: Any) -> "CM":
             return cls(atomic_model_=T_AtomicModel.deserialize(data))
 
-        def set_case_embd(self, case_idx: int):
+        def set_case_embd(self, case_idx: int) -> None:
             self.atomic_model.set_case_embd(case_idx)
 
         @torch.jit.export
@@ -572,9 +574,9 @@ def atomic_output_def(self) -> FittingOutputDef:
 
         def compute_or_load_stat(
             self,
-            sampled_func,
+            sampled_func: Callable[[], Any],
             stat_file_path: Optional[DPPath] = None,
-        ):
+        ) -> None:
             """Compute or load the statistics."""
             return self.atomic_model.compute_or_load_stat(sampled_func, stat_file_path)
 
@@ -605,8 +607,8 @@ def need_sorted_nlist_for_lower(self) -> bool:
 
         def forward(
             self,
-            coord,
-            atype,
+            coord: torch.Tensor,
+            atype: torch.Tensor,
             box: Optional[torch.Tensor] = None,
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py
index bc2e12174d..e3cf7bde17 100644
--- a/deepmd/pt/model/model/model.py
+++ b/deepmd/pt/model/model/model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     NoReturn,
     Optional,
 )
@@ -18,7 +19,7 @@
 
 
 class BaseModel(torch.nn.Module, make_base_model()):
-    def __init__(self, *args, **kwargs) -> None:
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
         """Construct a basic model for different tasks."""
         torch.nn.Module.__init__(self)
         self.model_def_script = ""
@@ -28,7 +29,7 @@ def __init__(self, *args, **kwargs) -> None:
 
     def compute_or_load_stat(
         self,
-        sampled_func,
+        sampled_func: Any,
         stat_file_path: Optional[DPPath] = None,
     ) -> NoReturn:
         """
@@ -71,6 +72,6 @@ def get_min_nbor_dist(self) -> Optional[float]:
         return self.min_nbor_dist.item()
 
     @torch.jit.export
-    def get_ntypes(self):
+    def get_ntypes(self) -> int:
         """Returns the number of element types."""
         return len(self.get_type_map())
diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py
index ad9b7a6619..18eac5d24c 100644
--- a/deepmd/pt/model/model/polar_model.py
+++ b/deepmd/pt/model/model/polar_model.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
 import torch
 
+from deepmd.dpmodel.output_def import (
+    OutputVariableDef,
+)
 from deepmd.pt.model.atomic_model import (
     DPPolarAtomicModel,
 )
@@ -28,13 +32,13 @@ class PolarModel(DPModelCommon, DPPolarModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         DPModelCommon.__init__(self)
         DPPolarModel_.__init__(self, *args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, OutputVariableDef]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "polar": out_def_data["polarizability"],
@@ -46,8 +50,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -75,15 +79,15 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/property_model.py b/deepmd/pt/model/model/property_model.py
index 7c50c75ff1..0931862ae8 100644
--- a/deepmd/pt/model/model/property_model.py
+++ b/deepmd/pt/model/model/property_model.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
 import torch
 
+from deepmd.dpmodel.output_def import (
+    OutputVariableDef,
+)
 from deepmd.pt.model.atomic_model import (
     DPPropertyAtomicModel,
 )
@@ -28,13 +32,13 @@ class PropertyModel(DPModelCommon, DPPropertyModel_):
 
     def __init__(
         self,
-        *args,
-        **kwargs,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         DPModelCommon.__init__(self)
         DPPropertyModel_.__init__(self, *args, **kwargs)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, OutputVariableDef]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             f"atom_{self.get_var_name()}": out_def_data[self.get_var_name()],
@@ -46,8 +50,8 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -86,15 +90,15 @@ def get_var_name(self) -> str:
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/spin_model.py b/deepmd/pt/model/model/spin_model.py
index ac94668039..bd7158fb8f 100644
--- a/deepmd/pt/model/model/spin_model.py
+++ b/deepmd/pt/model/model/spin_model.py
@@ -4,6 +4,8 @@
     deepcopy,
 )
 from typing import (
+    Any,
+    Callable,
     Optional,
 )
 
@@ -38,7 +40,7 @@ class SpinModel(torch.nn.Module):
 
     def __init__(
         self,
-        backbone_model,
+        backbone_model: DPAtomicModel,
         spin: Spin,
     ) -> None:
         super().__init__()
@@ -48,7 +50,9 @@ def __init__(
         self.virtual_scale_mask = to_torch_tensor(self.spin.get_virtual_scale_mask())
         self.spin_mask = to_torch_tensor(self.spin.get_spin_mask())
 
-    def process_spin_input(self, coord, atype, spin):
+    def process_spin_input(
+        self, coord: torch.Tensor, atype: torch.Tensor, spin: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Generate virtual coordinates and types, concat into the input."""
         nframes, nloc = atype.shape
         coord = coord.reshape(nframes, nloc, 3)
@@ -62,12 +66,12 @@ def process_spin_input(self, coord, atype, spin):
 
     def process_spin_input_lower(
         self,
-        extended_coord,
-        extended_atype,
-        extended_spin,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        extended_spin: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         """
         Add `extended_spin` into `extended_coord` to generate virtual atoms, and extend `nlist` and `mapping`.
         Note that the final `extended_coord_updated` with shape [nframes, nall + nall, 3] has the following order:
@@ -103,8 +107,12 @@ def process_spin_input_lower(
         )
 
     def process_spin_output(
-        self, atype, out_tensor, add_mag: bool = True, virtual_scale: bool = True
-    ):
+        self,
+        atype: torch.Tensor,
+        out_tensor: torch.Tensor,
+        add_mag: bool = True,
+        virtual_scale: bool = True,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Split the output both real and virtual atoms, and scale the latter.
         add_mag: whether to add magnetic tensor onto the real tensor.
@@ -132,12 +140,12 @@ def process_spin_output(
 
     def process_spin_output_lower(
         self,
-        extended_atype,
-        extended_out_tensor,
+        extended_atype: torch.Tensor,
+        extended_out_tensor: torch.Tensor,
         nloc: int,
         add_mag: bool = True,
         virtual_scale: bool = True,
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Split the extended output of both real and virtual atoms with switch, and scale the latter.
         add_mag: whether to add magnetic tensor onto the real tensor.
@@ -177,7 +185,7 @@ def process_spin_output_lower(
         return extended_out_real, extended_out_mag, atomic_mask > 0.0
 
     @staticmethod
-    def extend_nlist(extended_atype, nlist):
+    def extend_nlist(extended_atype: torch.Tensor, nlist: torch.Tensor) -> torch.Tensor:
         nframes, nloc, nnei = nlist.shape
         nall = extended_atype.shape[1]
         nlist_mask = nlist != -1
@@ -207,7 +215,7 @@ def extend_nlist(extended_atype, nlist):
         return extended_nlist
 
     @staticmethod
-    def expand_aparam(aparam, nloc: int):
+    def expand_aparam(aparam: torch.Tensor, nloc: int) -> torch.Tensor:
         """Expand the atom parameters for virtual atoms if necessary."""
         nframes, natom, numb_aparam = aparam.shape
         if natom == nloc:  # good
@@ -239,22 +247,22 @@ def get_type_map(self) -> list[str]:
         return tmap[:ntypes]
 
     @torch.jit.export
-    def get_ntypes(self):
+    def get_ntypes(self) -> int:
         """Returns the number of element types."""
         return len(self.get_type_map())
 
     @torch.jit.export
-    def get_rcut(self):
+    def get_rcut(self) -> float:
         """Get the cut-off radius."""
         return self.backbone_model.get_rcut()
 
     @torch.jit.export
-    def get_dim_fparam(self):
+    def get_dim_fparam(self) -> int:
         """Get the number (dimension) of frame parameters of this atomic model."""
         return self.backbone_model.get_dim_fparam()
 
     @torch.jit.export
-    def get_dim_aparam(self):
+    def get_dim_aparam(self) -> int:
         """Get the number (dimension) of atomic parameters of this atomic model."""
         return self.backbone_model.get_dim_aparam()
 
@@ -320,7 +328,7 @@ def need_sorted_nlist_for_lower(self) -> bool:
         """Returns whether the model needs sorted nlist when using `forward_lower`."""
         return self.backbone_model.need_sorted_nlist_for_lower()
 
-    def model_output_def(self):
+    def model_output_def(self) -> ModelOutputDef:
         """Get the output def for the model."""
         model_output_type = self.backbone_model.model_output_type()
         if "mask" in model_output_type:
@@ -330,7 +338,7 @@ def model_output_def(self):
         backbone_model_atomic_output_def[var_name].magnetic = True
         return ModelOutputDef(backbone_model_atomic_output_def)
 
-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> Any:
         """Get attribute from the wrapped model."""
         if (
             name == "backbone_model"
@@ -343,7 +351,7 @@ def __getattr__(self, name):
 
     def compute_or_load_stat(
         self,
-        sampled_func,
+        sampled_func: Callable[[], list[dict[str, Any]]],
         stat_file_path: Optional[DPPath] = None,
     ) -> None:
         """
@@ -363,7 +371,7 @@ def compute_or_load_stat(
         """
 
         @functools.lru_cache
-        def spin_sampled_func():
+        def spin_sampled_func() -> list[dict[str, Any]]:
             sampled = sampled_func()
             spin_sampled = []
             for sys in sampled:
@@ -389,9 +397,9 @@ def spin_sampled_func():
 
     def forward_common(
         self,
-        coord,
-        atype,
-        spin,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
+        spin: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -437,17 +445,17 @@ def forward_common(
 
     def forward_common_lower(
         self,
-        extended_coord,
-        extended_atype,
-        extended_spin,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        extended_spin: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
         extra_nlist_sort: bool = False,
-    ):
+    ) -> dict[str, torch.Tensor]:
         nframes, nloc = nlist.shape[:2]
         (
             extended_coord_updated,
@@ -506,7 +514,7 @@ def serialize(self) -> dict:
         }
 
     @classmethod
-    def deserialize(cls, data) -> "SpinModel":
+    def deserialize(cls, data: dict[str, Any]) -> "SpinModel":
         backbone_model_obj = make_model(DPAtomicModel).deserialize(
             data["backbone_model"]
         )
@@ -524,12 +532,12 @@ class SpinEnergyModel(SpinModel):
 
     def __init__(
         self,
-        backbone_model,
+        backbone_model: DPAtomicModel,
         spin: Spin,
     ) -> None:
         super().__init__(backbone_model, spin)
 
-    def translated_output_def(self):
+    def translated_output_def(self) -> dict[str, Any]:
         out_def_data = self.model_output_def().get_data()
         output_def = {
             "atom_energy": out_def_data["energy"],
@@ -545,9 +553,9 @@ def translated_output_def(self):
 
     def forward(
         self,
-        coord,
-        atype,
-        spin,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
+        spin: torch.Tensor,
         box: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
@@ -575,16 +583,16 @@ def forward(
     @torch.jit.export
     def forward_lower(
         self,
-        extended_coord,
-        extended_atype,
-        extended_spin,
-        nlist,
+        extended_coord: torch.Tensor,
+        extended_atype: torch.Tensor,
+        extended_spin: torch.Tensor,
+        nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         model_ret = self.forward_common_lower(
             extended_coord,
             extended_atype,
diff --git a/deepmd/pt/model/model/transform_output.py b/deepmd/pt/model/model/transform_output.py
index fb05bc385b..cd88e4cb40 100644
--- a/deepmd/pt/model/model/transform_output.py
+++ b/deepmd/pt/model/model/transform_output.py
@@ -20,7 +20,7 @@
 def atomic_virial_corr(
     extended_coord: torch.Tensor,
     atom_energy: torch.Tensor,
-):
+) -> torch.Tensor:
     nall = extended_coord.shape[1]
     nloc = atom_energy.shape[1]
     coord, _ = torch.split(extended_coord, [nloc, nall - nloc], dim=1)
@@ -72,7 +72,7 @@ def task_deriv_one(
     do_virial: bool = True,
     do_atomic_virial: bool = False,
     create_graph: bool = True,
-):
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     faked_grad = torch.ones_like(energy)
     lst = torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad])
     extended_force = torch.autograd.grad(
@@ -102,7 +102,7 @@ def task_deriv_one(
 def get_leading_dims(
     vv: torch.Tensor,
     vdef: OutputVariableDef,
-):
+) -> list[int]:
     """Get the dimensions of nf x nloc."""
     vshape = vv.shape
     return list(vshape[: (len(vshape) - len(vdef.shape))])
@@ -116,7 +116,7 @@ def take_deriv(
     do_virial: bool = False,
     do_atomic_virial: bool = False,
     create_graph: bool = True,
-):
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     size = 1
     for ii in vdef.shape:
         size *= ii
diff --git a/deepmd/pt/model/network/init.py b/deepmd/pt/model/network/init.py
index 53e2c70892..6bdff61eea 100644
--- a/deepmd/pt/model/network/init.py
+++ b/deepmd/pt/model/network/init.py
@@ -18,19 +18,36 @@
 # functions that use `with torch.no_grad()`. The JIT doesn't support context
 # managers, so these need to be implemented as builtins. Using these wrappers
 # lets us keep those builtins small and reusable.
-def _no_grad_uniform_(tensor, a, b, generator=None):
+def _no_grad_uniform_(
+    tensor: torch.Tensor,
+    a: float,
+    b: float,
+    generator: _Optional[torch.Generator] = None,
+) -> torch.Tensor:
     with torch.no_grad():
         return tensor.uniform_(a, b, generator=generator)
 
 
-def _no_grad_normal_(tensor, mean, std, generator=None):
+def _no_grad_normal_(
+    tensor: torch.Tensor,
+    mean: float,
+    std: float,
+    generator: _Optional[torch.Generator] = None,
+) -> torch.Tensor:
     with torch.no_grad():
         return tensor.normal_(mean, std, generator=generator)
 
 
-def _no_grad_trunc_normal_(tensor, mean, std, a, b, generator=None):
+def _no_grad_trunc_normal_(
+    tensor: torch.Tensor,
+    mean: float,
+    std: float,
+    a: float,
+    b: float,
+    generator: _Optional[torch.Generator] = None,
+) -> torch.Tensor:
     # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
-    def norm_cdf(x):
+    def norm_cdf(x: float) -> float:
         # Computes standard normal cumulative distribution function
         return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
 
@@ -65,17 +82,17 @@ def norm_cdf(x):
         return tensor
 
 
-def _no_grad_zero_(tensor):
+def _no_grad_zero_(tensor: torch.Tensor) -> torch.Tensor:
     with torch.no_grad():
         return tensor.zero_()
 
 
-def _no_grad_fill_(tensor, val):
+def _no_grad_fill_(tensor: torch.Tensor, val: float) -> torch.Tensor:
     with torch.no_grad():
         return tensor.fill_(val)
 
 
-def calculate_gain(nonlinearity, param=None):
+def calculate_gain(nonlinearity: str, param: _Optional[float] = None) -> float:
     r"""Return the recommended gain value for the given nonlinearity function.
 
     The values are as follows:
@@ -146,7 +163,7 @@ def calculate_gain(nonlinearity, param=None):
         raise ValueError(f"Unsupported nonlinearity {nonlinearity}")
 
 
-def _calculate_fan_in_and_fan_out(tensor):
+def _calculate_fan_in_and_fan_out(tensor: torch.Tensor) -> tuple[int, int]:
     dimensions = tensor.dim()
     if dimensions < 2:
         raise ValueError(
@@ -167,7 +184,7 @@ def _calculate_fan_in_and_fan_out(tensor):
     return fan_in, fan_out
 
 
-def _calculate_correct_fan(tensor, mode):
+def _calculate_correct_fan(tensor: torch.Tensor, mode: str) -> int:
     mode = mode.lower()
     valid_modes = ["fan_in", "fan_out"]
     if mode not in valid_modes:
@@ -290,7 +307,7 @@ def kaiming_uniform_(
     mode: str = "fan_in",
     nonlinearity: str = "leaky_relu",
     generator: _Optional[torch.Generator] = None,
-):
+) -> Tensor:
     r"""Fill the input `Tensor` with values using a Kaiming uniform distribution.
 
     The method is described in `Delving deep into rectifiers: Surpassing
@@ -348,7 +365,7 @@ def kaiming_normal_(
     mode: str = "fan_in",
     nonlinearity: str = "leaky_relu",
     generator: _Optional[torch.Generator] = None,
-):
+) -> Tensor:
     r"""Fill the input `Tensor` with values using a Kaiming normal distribution.
 
     The method is described in `Delving deep into rectifiers: Surpassing
diff --git a/deepmd/pt/model/network/layernorm.py b/deepmd/pt/model/network/layernorm.py
index 89bd16d569..fdf31d0ffd 100644
--- a/deepmd/pt/model/network/layernorm.py
+++ b/deepmd/pt/model/network/layernorm.py
@@ -30,14 +30,14 @@
 device = env.DEVICE
 
 
-def empty_t(shape, precision):
+def empty_t(shape: tuple[int, ...], precision: torch.dtype) -> torch.Tensor:
     return torch.empty(shape, dtype=precision, device=device)
 
 
 class LayerNorm(nn.Module):
     def __init__(
         self,
-        num_in,
+        num_in: int,
         eps: float = 1e-5,
         uni_init: bool = True,
         bavg: float = 0.0,
@@ -141,7 +141,7 @@ def deserialize(cls, data: dict) -> "LayerNorm":
         )
         prec = PRECISION_DICT[obj.precision]
 
-        def check_load_param(ss):
+        def check_load_param(ss: str) -> Optional[nn.Parameter]:
             return (
                 nn.Parameter(data=to_torch_tensor(nl[ss]))
                 if nl[ss] is not None
diff --git a/deepmd/pt/model/network/mlp.py b/deepmd/pt/model/network/mlp.py
index ea07f617d4..a850c85a9b 100644
--- a/deepmd/pt/model/network/mlp.py
+++ b/deepmd/pt/model/network/mlp.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     ClassVar,
     Optional,
     Union,
@@ -43,7 +44,7 @@
 )
 
 
-def empty_t(shape, precision):
+def empty_t(shape: tuple[int, ...], precision: torch.dtype) -> torch.Tensor:
     return torch.empty(shape, dtype=precision, device=device)
 
 
@@ -72,8 +73,8 @@ def deserialize(cls, data: dict) -> "Identity":
 class MLPLayer(nn.Module):
     def __init__(
         self,
-        num_in,
-        num_out,
+        num_in: int,
+        num_out: int,
         bias: bool = True,
         use_timestep: bool = False,
         activation_function: Optional[str] = None,
@@ -132,7 +133,7 @@ def __init__(
     def check_type_consistency(self) -> None:
         precision = self.precision
 
-        def check_var(var) -> None:
+        def check_var(var: Optional[torch.Tensor]) -> None:
             if var is not None:
                 # assertion "float64" == "double" would fail
                 assert PRECISION_DICT[var.dtype.name] is PRECISION_DICT[precision]
@@ -164,7 +165,7 @@ def _default_normal_init(
             normal_(self.idt.data, mean=0.1, std=0.001, generator=generator)
 
     def _trunc_normal_init(
-        self, scale=1.0, generator: Optional[torch.Generator] = None
+        self, scale: float = 1.0, generator: Optional[torch.Generator] = None
     ) -> None:
         # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
         TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978
@@ -176,7 +177,7 @@ def _trunc_normal_init(
     def _glorot_uniform_init(self, generator: Optional[torch.Generator] = None) -> None:
         xavier_uniform_(self.matrix, gain=1, generator=generator)
 
-    def _zero_init(self, use_bias=True) -> None:
+    def _zero_init(self, use_bias: bool = True) -> None:
         with torch.no_grad():
             self.matrix.fill_(0.0)
             if use_bias and self.bias is not None:
@@ -266,7 +267,7 @@ def deserialize(cls, data: dict) -> "MLPLayer":
         )
         prec = PRECISION_DICT[obj.precision]
 
-        def check_load_param(ss):
+        def check_load_param(ss: str) -> Optional[nn.Parameter]:
             return (
                 nn.Parameter(data=to_torch_tensor(nl[ss]))
                 if nl[ss] is not None
@@ -283,7 +284,7 @@ def check_load_param(ss):
 
 
 class MLP(MLP_):
-    def __init__(self, *args, **kwargs) -> None:
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
         self.layers = torch.nn.ModuleList(self.layers)
 
@@ -304,7 +305,7 @@ class NetworkCollection(DPNetworkCollection, nn.Module):
         "fitting_network": FittingNet,
     }
 
-    def __init__(self, *args, **kwargs) -> None:
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
         # init both two base classes
         DPNetworkCollection.__init__(self, *args, **kwargs)
         nn.Module.__init__(self)
diff --git a/deepmd/pt/model/network/network.py b/deepmd/pt/model/network/network.py
index 71f335e446..d95741b05c 100644
--- a/deepmd/pt/model/network/network.py
+++ b/deepmd/pt/model/network/network.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Final,
     Optional,
     Union,
@@ -32,7 +33,7 @@
 )
 
 
-def Tensor(*shape):
+def Tensor(*shape: int) -> torch.Tensor:
     return torch.empty(shape, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE)
 
 
@@ -41,12 +42,12 @@ class SimpleLinear(nn.Module):
 
     def __init__(
         self,
-        num_in,
-        num_out,
-        bavg=0.0,
-        stddev=1.0,
-        use_timestep=False,
-        activate=None,
+        num_in: int,
+        num_out: int,
+        bavg: float = 0.0,
+        stddev: float = 1.0,
+        use_timestep: bool = False,
+        activate: Optional[str] = None,
         bias: bool = True,
     ) -> None:
         """Construct a linear layer.
@@ -74,7 +75,7 @@ def __init__(
             self.idt = nn.Parameter(data=Tensor(1, num_out))
             nn.init.normal_(self.idt.data, mean=0.1, std=0.001)
 
-    def forward(self, inputs):
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
         """Return X*W+b."""
         xw = torch.matmul(inputs, self.matrix)
         hidden = xw + self.bias if self.bias is not None else xw
@@ -121,7 +122,7 @@ def __init__(
         else:
             raise ValueError("Invalid init method.")
 
-    def _trunc_normal_init(self, scale=1.0) -> None:
+    def _trunc_normal_init(self, scale: float = 1.0) -> None:
         # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
         TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978
         _, fan_in = self.weight.shape
@@ -132,7 +133,7 @@ def _trunc_normal_init(self, scale=1.0) -> None:
     def _glorot_uniform_init(self) -> None:
         nn.init.xavier_uniform_(self.weight, gain=1)
 
-    def _zero_init(self, use_bias=True) -> None:
+    def _zero_init(self, use_bias: bool = True) -> None:
         with torch.no_grad():
             self.weight.fill_(0.0)
             if use_bias:
@@ -144,13 +145,19 @@ def _normal_init(self) -> None:
 
 
 class NonLinearHead(nn.Module):
-    def __init__(self, input_dim, out_dim, activation_fn, hidden=None) -> None:
+    def __init__(
+        self,
+        input_dim: int,
+        out_dim: int,
+        activation_fn: str,
+        hidden: Optional[int] = None,
+    ) -> None:
         super().__init__()
         hidden = input_dim if not hidden else hidden
         self.linear1 = SimpleLinear(input_dim, hidden, activate=activation_fn)
         self.linear2 = SimpleLinear(hidden, out_dim)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.linear1(x)
         x = self.linear2(x)
         return x
@@ -159,7 +166,13 @@ def forward(self, x):
 class MaskLMHead(nn.Module):
     """Head for masked language modeling."""
 
-    def __init__(self, embed_dim, output_dim, activation_fn, weight=None) -> None:
+    def __init__(
+        self,
+        embed_dim: int,
+        output_dim: int,
+        activation_fn: str,
+        weight: Optional[torch.Tensor] = None,
+    ) -> None:
         super().__init__()
         self.dense = SimpleLinear(embed_dim, embed_dim)
         self.activation_fn = ActivationFn(activation_fn)
@@ -174,7 +187,12 @@ def __init__(self, embed_dim, output_dim, activation_fn, weight=None) -> None:
             torch.zeros(output_dim, dtype=env.GLOBAL_PT_FLOAT_PRECISION)  # pylint: disable=no-explicit-dtype,no-explicit-device
         )
 
-    def forward(self, features, masked_tokens: Optional[torch.Tensor] = None, **kwargs):
+    def forward(
+        self,
+        features: torch.Tensor,
+        masked_tokens: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ) -> torch.Tensor:
         # Only project the masked tokens while training,
         # saves both memory and computation
         if masked_tokens is not None:
@@ -190,7 +208,13 @@ def forward(self, features, masked_tokens: Optional[torch.Tensor] = None, **kwar
 
 class ResidualDeep(nn.Module):
     def __init__(
-        self, type_id, embedding_width, neuron, bias_atom_e, out_dim=1, resnet_dt=False
+        self,
+        type_id: int,
+        embedding_width: int,
+        neuron: list[int],
+        bias_atom_e: float,
+        out_dim: int = 1,
+        resnet_dt: bool = False,
     ) -> None:
         """Construct a filter on the given element as neighbor.
 
@@ -221,7 +245,7 @@ def __init__(
             bias_atom_e = 0
         self.final_layer = SimpleLinear(self.neuron[-1], self.out_dim, bias_atom_e)
 
-    def forward(self, inputs):
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
         """Calculate decoded embedding for each atom.
 
         Args:
@@ -244,15 +268,15 @@ def forward(self, inputs):
 class TypeEmbedNet(nn.Module):
     def __init__(
         self,
-        type_nums,
-        embed_dim,
-        bavg=0.0,
-        stddev=1.0,
-        precision="default",
+        type_nums: int,
+        embed_dim: int,
+        bavg: float = 0.0,
+        stddev: float = 1.0,
+        precision: str = "default",
         seed: Optional[Union[int, list[int]]] = None,
-        use_econf_tebd=False,
+        use_econf_tebd: bool = False,
         use_tebd_bias: bool = False,
-        type_map=None,
+        type_map: Optional[list[str]] = None,
         trainable: bool = True,
     ) -> None:
         """Construct a type embedding net."""
@@ -278,7 +302,7 @@ def __init__(
         )
         # nn.init.normal_(self.embedding.weight[:-1], mean=bavg, std=stddev)
 
-    def forward(self, atype):
+    def forward(self, atype: torch.Tensor) -> torch.Tensor:
         """
         Args:
             atype: Type of each input, [nframes, nloc] or [nframes, nloc, nnei].
@@ -290,7 +314,7 @@ def forward(self, atype):
         """
         return torch.embedding(self.embedding(atype.device), atype)
 
-    def get_full_embedding(self, device: torch.device):
+    def get_full_embedding(self, device: torch.device) -> torch.Tensor:
         """
         Get the type embeddings of all types.
 
@@ -307,7 +331,9 @@ def get_full_embedding(self, device: torch.device):
         """
         return self.embedding(device)
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: Any, shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -324,7 +350,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None:
             raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -409,7 +435,7 @@ def __init__(
         for param in self.parameters():
             param.requires_grad = trainable
 
-    def forward(self, device: torch.device):
+    def forward(self, device: torch.device) -> torch.Tensor:
         """Caulate type embedding network.
 
         Returns
@@ -431,7 +457,7 @@ def forward(self, device: torch.device):
         return embed
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -493,7 +519,7 @@ def change_type_map(
         self.ntypes = len(type_map)
 
     @classmethod
-    def deserialize(cls, data: dict):
+    def deserialize(cls, data: dict) -> "TypeEmbedNetConsistent":
         """Deserialize the model.
 
         Parameters
diff --git a/deepmd/pt/model/network/utils.py b/deepmd/pt/model/network/utils.py
index 34af976b76..7af8b7c032 100644
--- a/deepmd/pt/model/network/utils.py
+++ b/deepmd/pt/model/network/utils.py
@@ -57,7 +57,7 @@ def get_graph_index(
     a_nlist_mask: torch.Tensor,
     nall: int,
     use_loc_mapping: bool = True,
-):
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Get the index mapping for edge graph and angle graph, ready in `aggregate` or `index_select`.
 
diff --git a/deepmd/pt/model/task/denoise.py b/deepmd/pt/model/task/denoise.py
index fc9e8943e9..50cae4fb12 100644
--- a/deepmd/pt/model/task/denoise.py
+++ b/deepmd/pt/model/task/denoise.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -26,12 +27,12 @@
 class DenoiseNet(Fitting):
     def __init__(
         self,
-        feature_dim,
-        ntypes,
-        attn_head=8,
-        prefactor=[0.5, 0.5],
-        activation_function="gelu",
-        **kwargs,
+        feature_dim: int,
+        ntypes: int,
+        attn_head: int = 8,
+        prefactor: list[float] = [0.5, 0.5],
+        activation_function: str = "gelu",
+        **kwargs: Any,
     ) -> None:
         """Construct a denoise net.
 
@@ -71,7 +72,7 @@ def __init__(
                 self.pair2coord_proj.append(_pair2coord_proj)
             self.pair2coord_proj = torch.nn.ModuleList(self.pair2coord_proj)
 
-    def output_def(self):
+    def output_def(self) -> FittingOutputDef:
         return FittingOutputDef(
             [
                 OutputVariableDef(
@@ -93,13 +94,13 @@ def output_def(self):
 
     def forward(
         self,
-        pair_weights,
-        diff,
-        nlist_mask,
-        features,
-        sw,
+        pair_weights: torch.Tensor,
+        diff: torch.Tensor,
+        nlist_mask: torch.Tensor,
+        features: torch.Tensor,
+        sw: torch.Tensor,
         masked_tokens: Optional[torch.Tensor] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Calculate the updated coord.
         Args:
         - coord: Input noisy coord with shape [nframes, nloc, 3].
diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py
index 65b64220ae..b2d8b598b9 100644
--- a/deepmd/pt/model/task/dipole.py
+++ b/deepmd/pt/model/task/dipole.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -93,7 +94,7 @@ def __init__(
         r_differentiable: bool = True,
         c_differentiable: bool = True,
         type_map: Optional[list[str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         self.embedding_width = embedding_width
         self.r_differentiable = r_differentiable
@@ -117,7 +118,7 @@ def __init__(
             **kwargs,
         )
 
-    def _net_out_dim(self):
+    def _net_out_dim(self) -> int:
         """Set the FittingNet output dim."""
         return self.embedding_width
 
@@ -181,7 +182,7 @@ def forward(
         h2: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         nframes, nloc, _ = descriptor.shape
         assert gr is not None, "Must provide the rotation matrix for dipole fitting."
         # cast the input to internal precsion
diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py
index 07351b33f6..9027bfe288 100644
--- a/deepmd/pt/model/task/ener.py
+++ b/deepmd/pt/model/task/ener.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -56,7 +57,7 @@ def __init__(
         mixed_types: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
         type_map: Optional[list[str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__(
             "energy",
@@ -102,15 +103,15 @@ def serialize(self) -> dict:
 class EnergyFittingNetDirect(Fitting):
     def __init__(
         self,
-        ntypes,
-        dim_descrpt,
-        neuron,
-        bias_atom_e=None,
-        out_dim=1,
-        resnet_dt=True,
-        use_tebd=True,
-        return_energy=False,
-        **kwargs,
+        ntypes: int,
+        dim_descrpt: int,
+        neuron: list[int],
+        bias_atom_e: Optional[torch.Tensor] = None,
+        out_dim: int = 1,
+        resnet_dt: bool = True,
+        use_tebd: bool = True,
+        return_energy: bool = False,
+        **kwargs: Any,
     ) -> None:
         """Construct a fitting net for energy.
 
@@ -160,7 +161,7 @@ def __init__(
                 filter_layers.append(one)
         self.filter_layers = torch.nn.ModuleList(filter_layers)
 
-    def output_def(self):
+    def output_def(self) -> FittingOutputDef:
         return FittingOutputDef(
             [
                 OutputVariableDef(
@@ -187,7 +188,7 @@ def deserialize(self) -> "EnergyFittingNetDirect":
         raise NotImplementedError
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         raise NotImplementedError
 
diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py
index 22bbf6165b..841c494e88 100644
--- a/deepmd/pt/model/task/fitting.py
+++ b/deepmd/pt/model/task/fitting.py
@@ -4,6 +4,7 @@
     abstractmethod,
 )
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -50,12 +51,14 @@
 class Fitting(torch.nn.Module, BaseFitting):
     # plugin moved to BaseFitting
 
-    def __new__(cls, *args, **kwargs):
+    def __new__(cls, *args: Any, **kwargs: Any) -> "Fitting":
         if cls is Fitting:
             return BaseFitting.__new__(BaseFitting, *args, **kwargs)
         return super().__new__(cls)
 
-    def share_params(self, base_class, shared_level, resume=False) -> None:
+    def share_params(
+        self, base_class: "Fitting", shared_level: int, resume: bool = False
+    ) -> None:
         """
         Share the parameters of self to the base_class with shared_level during multitask training.
         If not start from checkpoint (resume is False),
@@ -227,7 +230,7 @@ def __init__(
         remove_vaccum_contribution: Optional[list[bool]] = None,
         type_map: Optional[list[str]] = None,
         use_aparam_as_mask: bool = False,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__()
         self.var_name = var_name
@@ -339,7 +342,9 @@ def reinit_exclude(
         self.emask = AtomExcludeMask(self.ntypes, self.exclude_types)
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self,
+        type_map: list[str],
+        model_with_new_type_stat: Optional["GeneralFitting"] = None,
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -443,7 +448,7 @@ def get_type_map(self) -> list[str]:
         """Get the name to each type of atoms."""
         return self.type_map
 
-    def set_case_embd(self, case_idx: int):
+    def set_case_embd(self, case_idx: int) -> None:
         """
         Set the case embedding of this fitting net by the given case_idx,
         typically concatenated with the output of the descriptor and fed into the fitting net.
@@ -455,7 +460,7 @@ def set_case_embd(self, case_idx: int):
     def set_return_middle_output(self, return_middle_output: bool = True) -> None:
         self.eval_return_middle_output = return_middle_output
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: torch.Tensor) -> None:
         if key in ["bias_atom_e"]:
             value = value.view([self.ntypes, self._net_out_dim()])
             self.bias_atom_e = value
@@ -474,7 +479,7 @@ def __setitem__(self, key, value) -> None:
         else:
             raise KeyError(key)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> torch.Tensor:
         if key in ["bias_atom_e"]:
             return self.bias_atom_e
         elif key in ["fparam_avg"]:
@@ -493,7 +498,7 @@ def __getitem__(self, key):
             raise KeyError(key)
 
     @abstractmethod
-    def _net_out_dim(self):
+    def _net_out_dim(self) -> int:
         """Set the FittingNet output dim."""
         pass
 
@@ -512,7 +517,7 @@ def _forward_common(
         h2: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         # cast the input to internal precsion
         xx = descriptor.to(self.prec)
         fparam = fparam.to(self.prec) if fparam is not None else None
diff --git a/deepmd/pt/model/task/invar_fitting.py b/deepmd/pt/model/task/invar_fitting.py
index c2f888e1fa..f7233352f8 100644
--- a/deepmd/pt/model/task/invar_fitting.py
+++ b/deepmd/pt/model/task/invar_fitting.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -103,7 +104,7 @@ def __init__(
         atom_ener: Optional[list[Optional[torch.Tensor]]] = None,
         type_map: Optional[list[str]] = None,
         use_aparam_as_mask: bool = False,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         self.dim_out = dim_out
         self.atom_ener = atom_ener
@@ -131,7 +132,7 @@ def __init__(
             **kwargs,
         )
 
-    def _net_out_dim(self):
+    def _net_out_dim(self) -> int:
         """Set the FittingNet output dim."""
         return self.dim_out
 
@@ -170,7 +171,7 @@ def forward(
         h2: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         """Based on embedding net output, alculate total energy.
 
         Args:
diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py
index a326802918..5f8ad3c73a 100644
--- a/deepmd/pt/model/task/polarizability.py
+++ b/deepmd/pt/model/task/polarizability.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -98,7 +99,7 @@ def __init__(
         scale: Optional[Union[list[float], float]] = None,
         shift_diag: bool = True,
         type_map: Optional[list[str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         self.embedding_width = embedding_width
         self.fit_diag = fit_diag
@@ -142,7 +143,7 @@ def __init__(
             **kwargs,
         )
 
-    def _net_out_dim(self):
+    def _net_out_dim(self) -> int:
         """Set the FittingNet output dim."""
         return (
             self.embedding_width
@@ -150,20 +151,20 @@ def _net_out_dim(self):
             else self.embedding_width * self.embedding_width
         )
 
-    def __setitem__(self, key, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         if key in ["constant_matrix"]:
             self.constant_matrix = value
         else:
             super().__setitem__(key, value)
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
         if key in ["constant_matrix"]:
             return self.constant_matrix
         else:
             return super().__getitem__(key)
 
     def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat=None
+        self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None
     ) -> None:
         """Change the type related params to new ones, according to `type_map` and the original one in the model.
         If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
@@ -232,7 +233,7 @@ def forward(
         h2: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-    ):
+    ) -> dict[str, torch.Tensor]:
         nframes, nloc, _ = descriptor.shape
         assert gr is not None, (
             "Must provide the rotation matrix for polarizability fitting."
diff --git a/deepmd/pt/model/task/property.py b/deepmd/pt/model/task/property.py
index 5ef0cd0233..23069ea0da 100644
--- a/deepmd/pt/model/task/property.py
+++ b/deepmd/pt/model/task/property.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -91,7 +92,7 @@ def __init__(
         mixed_types: bool = True,
         trainable: Union[bool, list[bool]] = True,
         seed: Optional[int] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         self.task_dim = task_dim
         self.intensive = intensive
diff --git a/deepmd/pt/model/task/type_predict.py b/deepmd/pt/model/task/type_predict.py
index e4a980c3ea..5c1b064d07 100644
--- a/deepmd/pt/model/task/type_predict.py
+++ b/deepmd/pt/model/task/type_predict.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
 )
 
@@ -15,7 +16,11 @@
 
 class TypePredictNet(Fitting):
     def __init__(
-        self, feature_dim, ntypes, activation_function="gelu", **kwargs
+        self,
+        feature_dim: int,
+        ntypes: int,
+        activation_function: str = "gelu",
+        **kwargs: Any,
     ) -> None:
         """Construct a type predict net.
 
@@ -34,7 +39,9 @@ def __init__(
             weight=None,
         )
 
-    def forward(self, features, masked_tokens: Optional[torch.Tensor] = None):
+    def forward(
+        self, features: torch.Tensor, masked_tokens: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
         """Calculate the predicted logits.
         Args:
         - features: Input features with shape [nframes, nloc, feature_dim].
diff --git a/deepmd/pt/optimizer/LKF.py b/deepmd/pt/optimizer/LKF.py
index c342960e5b..aeb1120bff 100644
--- a/deepmd/pt/optimizer/LKF.py
+++ b/deepmd/pt/optimizer/LKF.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 import math
+from typing import (
+    Any,
+    Optional,
+)
 
 import torch
 import torch.distributed as dist
@@ -9,7 +13,7 @@
 )
 
 
-def distribute_indices(total_length, num_workers):
+def distribute_indices(total_length: int, num_workers: int) -> list[tuple[int, int]]:
     indices_per_worker = total_length // num_workers
     remainder = total_length % num_workers
 
@@ -27,10 +31,10 @@ def distribute_indices(total_length, num_workers):
 class LKFOptimizer(Optimizer):
     def __init__(
         self,
-        params,
-        kalman_lambda=0.98,
-        kalman_nue=0.9987,
-        block_size=5120,
+        params: Any,
+        kalman_lambda: float = 0.98,
+        kalman_nue: float = 0.9987,
+        block_size: int = 5120,
     ) -> None:
         defaults = {"lr": 0.1, "kalman_nue": kalman_nue, "block_size": block_size}
 
@@ -158,13 +162,13 @@ def __init_P(self) -> None:
         self._state.setdefault("weights_num", len(P))
         self._state.setdefault("params_packed_index", params_packed_index)
 
-    def __get_blocksize(self):
+    def __get_blocksize(self) -> int:
         return self.param_groups[0]["block_size"]
 
-    def __get_nue(self):
+    def __get_nue(self) -> float:
         return self.param_groups[0]["kalman_nue"]
 
-    def __split_weights(self, weight):
+    def __split_weights(self, weight: torch.Tensor) -> list[torch.Tensor]:
         block_size = self.__get_blocksize()
         param_num = weight.nelement()
         res = []
@@ -179,7 +183,9 @@ def __split_weights(self, weight):
                     res.append(weight[i * block_size :])
         return res
 
-    def __update(self, H, error, weights) -> None:
+    def __update(
+        self, H: torch.Tensor, error: torch.Tensor, weights: torch.Tensor
+    ) -> None:
         P = self._state.get("P")
         kalman_lambda = self._state.get("kalman_lambda")
         weights_num = self._state.get("weights_num")
@@ -253,10 +259,10 @@ def __update(self, H, error, weights) -> None:
                         i += 1
                     param.data = tmp_weight.reshape(param.data.T.shape).T.contiguous()
 
-    def set_grad_prefactor(self, grad_prefactor) -> None:
+    def set_grad_prefactor(self, grad_prefactor: float) -> None:
         self.grad_prefactor = grad_prefactor
 
-    def step(self, error) -> None:
+    def step(self, error: torch.Tensor) -> None:
         params_packed_index = self._state.get("params_packed_index")
 
         weights = []
@@ -313,7 +319,7 @@ def step(self, error) -> None:
 
         self.__update(H, error, weights)
 
-    def get_device_id(self, index):
+    def get_device_id(self, index: int) -> Optional[int]:
         for i, (start, end) in enumerate(self.dindex):
             if start <= index < end:
                 return i
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 8f7c763d0f..ab98389426 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -3,6 +3,7 @@
 import logging
 import time
 from collections.abc import (
+    Generator,
     Iterable,
 )
 from copy import (
@@ -13,6 +14,8 @@
 )
 from typing import (
     Any,
+    Callable,
+    Optional,
 )
 
 import numpy as np
@@ -50,6 +53,7 @@
     dp_random,
 )
 from deepmd.pt.utils.dataloader import (
+    DpLoaderSet,
     get_sampler_from_params,
 )
 from deepmd.pt.utils.env import (
@@ -92,16 +96,16 @@ class Trainer:
     def __init__(
         self,
         config: dict[str, Any],
-        training_data,
-        stat_file_path=None,
-        validation_data=None,
-        init_model=None,
-        restart_model=None,
-        finetune_model=None,
-        force_load=False,
-        shared_links=None,
-        finetune_links=None,
-        init_frz_model=None,
+        training_data: DpLoaderSet,
+        stat_file_path: Optional[str] = None,
+        validation_data: Optional[DpLoaderSet] = None,
+        init_model: Optional[str] = None,
+        restart_model: Optional[str] = None,
+        finetune_model: Optional[str] = None,
+        force_load: bool = False,
+        shared_links: Optional[dict[str, str]] = None,
+        finetune_links: Optional[dict[str, str]] = None,
+        init_frz_model: Optional[str] = None,
     ) -> None:
         """Construct a DeePMD trainer.
 
@@ -151,7 +155,7 @@ def __init__(
         )
         self.lcurve_should_print_header = True
 
-        def get_opt_param(params):
+        def get_opt_param(params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
             opt_type = params.get("opt_type", "Adam")
             opt_param = {
                 "kf_blocksize": params.get("kf_blocksize", 5120),
@@ -163,7 +167,7 @@ def get_opt_param(params):
             }
             return opt_type, opt_param
 
-        def cycle_iterator(iterable: Iterable):
+        def cycle_iterator(iterable: Iterable) -> Generator[Any, None, None]:
             """
             Produces an infinite iterator by repeatedly cycling through the given iterable.
 
@@ -179,8 +183,20 @@ def cycle_iterator(iterable: Iterable):
                     it = iter(iterable)
                 yield from it
 
-        def get_data_loader(_training_data, _validation_data, _training_params):
-            def get_dataloader_and_iter(_data, _params):
+        def get_data_loader(
+            _training_data: DpLoaderSet,
+            _validation_data: Optional[DpLoaderSet],
+            _training_params: dict[str, Any],
+        ) -> tuple[
+            DataLoader,
+            Generator[Any, None, None],
+            Optional[DataLoader],
+            Optional[Generator[Any, None, None]],
+            int,
+        ]:
+            def get_dataloader_and_iter(
+                _data: DpLoaderSet, _params: dict[str, Any]
+            ) -> tuple[DataLoader, Generator[Any, None, None]]:
                 _sampler = get_sampler_from_params(_data, _params)
                 if _sampler is None:
                     log.warning(
@@ -227,21 +243,21 @@ def get_dataloader_and_iter(_data, _params):
             )
 
         def single_model_stat(
-            _model,
-            _data_stat_nbatch,
-            _training_data,
-            _validation_data,
-            _stat_file_path,
-            _data_requirement,
-            finetune_has_new_type=False,
-        ):
+            _model: Any,
+            _data_stat_nbatch: int,
+            _training_data: DpLoaderSet,
+            _validation_data: Optional[DpLoaderSet],
+            _stat_file_path: Optional[str],
+            _data_requirement: list[DataRequirementItem],
+            finetune_has_new_type: bool = False,
+        ) -> Callable[[], Any]:
             _data_requirement += get_additional_data_requirement(_model)
             _training_data.add_data_requirement(_data_requirement)
             if _validation_data is not None:
                 _validation_data.add_data_requirement(_data_requirement)
 
             @functools.lru_cache
-            def get_sample():
+            def get_sample() -> Any:
                 sampled = make_stat_input(
                     _training_data.systems,
                     _training_data.dataloaders,
@@ -258,7 +274,7 @@ def get_sample():
                     _stat_file_path.root.close()
             return get_sample
 
-        def get_lr(lr_params):
+        def get_lr(lr_params: dict[str, Any]) -> LearningRateExp:
             assert lr_params.get("type", "exp") == "exp", (
                 "Only learning rate `exp` is supported!"
             )
@@ -496,11 +512,11 @@ def get_lr(lr_params):
                     state_dict = pretrained_model_wrapper.state_dict()
 
                     def collect_single_finetune_params(
-                        _model_key,
-                        _finetune_rule_single,
-                        _new_state_dict,
-                        _origin_state_dict,
-                        _random_state_dict,
+                        _model_key: str,
+                        _finetune_rule_single: Any,
+                        _new_state_dict: dict[str, Any],
+                        _origin_state_dict: dict[str, Any],
+                        _random_state_dict: dict[str, Any],
                     ) -> None:
                         _new_fitting = _finetune_rule_single.get_random_fitting()
                         _model_key_from = _finetune_rule_single.get_model_branch()
@@ -561,10 +577,10 @@ def collect_single_finetune_params(
                 if finetune_model is not None:
 
                     def single_model_finetune(
-                        _model,
-                        _finetune_rule_single,
-                        _sample_func,
-                    ):
+                        _model: Any,
+                        _finetune_rule_single: Any,
+                        _sample_func: Callable,
+                    ) -> Any:
                         _model = model_change_out_bias(
                             _model,
                             _sample_func,
@@ -619,7 +635,7 @@ def single_model_finetune(
 
         # TODO add lr warmups for multitask
         # author: iProzd
-        def warm_up_linear(step, warmup_steps):
+        def warm_up_linear(step: int, warmup_steps: int) -> float:
             if step < warmup_steps:
                 return step / warmup_steps
             else:
@@ -712,7 +728,7 @@ def run(self) -> None:
             )
             prof.start()
 
-        def step(_step_id, task_key="Default") -> None:
+        def step(_step_id: int, task_key: str = "Default") -> None:
             if self.multi_task:
                 model_index = dp_random.choice(
                     np.arange(self.num_model, dtype=np.int_),
@@ -786,7 +802,7 @@ def step(_step_id, task_key="Default") -> None:
                         else self.wrapper
                     )
 
-                    def fake_model():
+                    def fake_model() -> dict:
                         return model_pred
 
                     _, loss, more_loss = module.loss[task_key](
@@ -861,7 +877,9 @@ def fake_model():
 
                 if self.disp_avg:
 
-                    def log_loss_train(_loss, _more_loss, _task_key="Default"):
+                    def log_loss_train(
+                        _loss: Any, _more_loss: Any, _task_key: str = "Default"
+                    ) -> dict:
                         results = {}
                         if not self.multi_task:
                             # Use accumulated average loss for single task
@@ -884,7 +902,9 @@ def log_loss_train(_loss, _more_loss, _task_key="Default"):
                         return results
                 else:
 
-                    def log_loss_train(_loss, _more_loss, _task_key="Default"):
+                    def log_loss_train(
+                        _loss: Any, _more_loss: Any, _task_key: str = "Default"
+                    ) -> dict:
                         results = {}
                         rmse_val = {
                             item: _more_loss[item]
@@ -895,7 +915,7 @@ def log_loss_train(_loss, _more_loss, _task_key="Default"):
                             results[item] = rmse_val[item]
                         return results
 
-                def log_loss_valid(_task_key="Default"):
+                def log_loss_valid(_task_key: str = "Default") -> dict:
                     single_results = {}
                     sum_natoms = 0
                     if not self.multi_task:
@@ -1171,7 +1191,7 @@ def log_loss_valid(_task_key="Default"):
                     f"The profiling trace has been saved to: {self.profiling_file}"
                 )
 
-    def save_model(self, save_path, lr=0.0, step=0) -> None:
+    def save_model(self, save_path: str, lr: float = 0.0, step: int = 0) -> None:
         module = (
             self.wrapper.module
             if dist.is_available() and dist.is_initialized()
@@ -1196,7 +1216,9 @@ def save_model(self, save_path, lr=0.0, step=0) -> None:
             checkpoint_files.sort(key=lambda x: x.stat().st_mtime)
             checkpoint_files[0].unlink()
 
-    def get_data(self, is_train=True, task_key="Default"):
+    def get_data(
+        self, is_train: bool = True, task_key: str = "Default"
+    ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
         if is_train:
             iterator = self.training_data
         else:
@@ -1240,7 +1262,9 @@ def get_data(self, is_train=True, task_key="Default"):
         log_dict["sid"] = batch_data["sid"]
         return input_dict, label_dict, log_dict
 
-    def print_header(self, fout, train_results, valid_results) -> None:
+    def print_header(
+        self, fout: Any, train_results: dict[str, Any], valid_results: dict[str, Any]
+    ) -> None:
         train_keys = sorted(train_results.keys())
         print_str = ""
         print_str += "# {:5s}".format("step")
@@ -1272,7 +1296,12 @@ def print_header(self, fout, train_results, valid_results) -> None:
         fout.flush()
 
     def print_on_training(
-        self, fout, step_id, cur_lr, train_results, valid_results
+        self,
+        fout: Any,
+        step_id: int,
+        cur_lr: float,
+        train_results: dict,
+        valid_results: dict,
     ) -> None:
         train_keys = sorted(train_results.keys())
         print_str = ""
@@ -1304,7 +1333,7 @@ def print_on_training(
         fout.flush()
 
 
-def get_additional_data_requirement(_model):
+def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]:
     additional_data_requirement = []
     if _model.get_dim_fparam() > 0:
         fparam_requirement_items = [
@@ -1331,12 +1360,14 @@ def get_additional_data_requirement(_model):
     return additional_data_requirement
 
 
-def whether_hessian(loss_params):
+def whether_hessian(loss_params: dict[str, Any]) -> bool:
     loss_type = loss_params.get("type", "ener")
     return loss_type == "ener" and loss_params.get("start_pref_h", 0.0) > 0.0
 
 
-def get_loss(loss_params, start_lr, _ntypes, _model):
+def get_loss(
+    loss_params: dict[str, Any], start_lr: float, _ntypes: int, _model: Any
+) -> TaskLoss:
     loss_type = loss_params.get("type", "ener")
     if whether_hessian(loss_params):
         loss_params["starter_learning_rate"] = start_lr
@@ -1379,8 +1410,8 @@ def get_loss(loss_params, start_lr, _ntypes, _model):
 
 
 def get_single_model(
-    _model_params,
-):
+    _model_params: dict[str, Any],
+) -> Any:
     if "use_srtab" in _model_params:
         model = get_zbl_model(deepcopy(_model_params)).to(DEVICE)
     else:
@@ -1389,10 +1420,10 @@ def get_single_model(
 
 
 def get_model_for_wrapper(
-    _model_params,
-    resuming=False,
-    _loss_params=None,
-):
+    _model_params: dict[str, Any],
+    resuming: bool = False,
+    _loss_params: Optional[dict[str, Any]] = None,
+) -> Any:
     if "model_dict" not in _model_params:
         if _loss_params is not None and whether_hessian(_loss_params):
             _model_params["hessian_mode"] = True
@@ -1415,7 +1446,7 @@ def get_model_for_wrapper(
     return _model
 
 
-def get_case_embd_config(_model_params):
+def get_case_embd_config(_model_params: dict[str, Any]) -> tuple[bool, dict[str, int]]:
     assert "model_dict" in _model_params, (
         "Only support setting case embedding for multi-task model!"
     )
@@ -1440,10 +1471,10 @@ def get_case_embd_config(_model_params):
 
 
 def model_change_out_bias(
-    _model,
-    _sample_func,
-    _bias_adjust_mode="change-by-statistic",
-):
+    _model: Any,
+    _sample_func: Callable[[], Any],
+    _bias_adjust_mode: str = "change-by-statistic",
+) -> Any:
     old_bias = deepcopy(_model.get_out_bias())
     _model.change_out_bias(
         _sample_func,
diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py
index 9a2cbff295..392f928b0d 100644
--- a/deepmd/pt/train/wrapper.py
+++ b/deepmd/pt/train/wrapper.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 from typing import (
+    Any,
     Optional,
     Union,
 )
@@ -19,8 +20,8 @@ def __init__(
         self,
         model: Union[torch.nn.Module, dict],
         loss: Union[torch.nn.Module, dict] = None,
-        model_params=None,
-        shared_links=None,
+        model_params: Optional[dict[str, Any]] = None,
+        shared_links: Optional[dict[str, Any]] = None,
     ) -> None:
         """Construct a DeePMD model wrapper.
 
@@ -59,7 +60,7 @@ def __init__(
                     self.loss[task_key] = loss[task_key]
         self.inference_only = self.loss is None
 
-    def share_params(self, shared_links, resume=False) -> None:
+    def share_params(self, shared_links: dict[str, Any], resume: bool = False) -> None:
         """
         Share the parameters of classes following rules defined in shared_links during multitask training.
         If not start from checkpoint (resume is False),
@@ -138,18 +139,18 @@ def share_params(self, shared_links, resume=False) -> None:
 
     def forward(
         self,
-        coord,
-        atype,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
         spin: Optional[torch.Tensor] = None,
         box: Optional[torch.Tensor] = None,
         cur_lr: Optional[torch.Tensor] = None,
         label: Optional[torch.Tensor] = None,
         task_key: Optional[torch.Tensor] = None,
-        inference_only=False,
-        do_atomic_virial=False,
+        inference_only: bool = False,
+        do_atomic_virial: bool = False,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-    ):
+    ) -> tuple[Any, Any, Any]:
         if not self.multi_task:
             task_key = "Default"
         else:
diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index bc771b41d4..c434341ab9 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -4,6 +4,11 @@
 from multiprocessing.dummy import (
     Pool,
 )
+from typing import (
+    Any,
+    Optional,
+    Union,
+)
 
 import h5py
 import numpy as np
@@ -45,7 +50,7 @@
 torch.multiprocessing.set_sharing_strategy("file_system")
 
 
-def setup_seed(seed) -> None:
+def setup_seed(seed: Union[int, list[int], tuple[int, ...]]) -> None:
     if isinstance(seed, (list, tuple)):
         mixed_seed = mix_entropy(seed)
     else:
@@ -75,11 +80,11 @@ class DpLoaderSet(Dataset):
 
     def __init__(
         self,
-        systems,
-        batch_size,
-        type_map,
-        seed=None,
-        shuffle=True,
+        systems: Union[str, list[str]],
+        batch_size: int,
+        type_map: Optional[list[str]],
+        seed: Optional[int] = None,
+        shuffle: bool = True,
     ) -> None:
         if seed is not None:
             setup_seed(seed)
@@ -87,7 +92,7 @@ def __init__(
             with h5py.File(systems) as file:
                 systems = [os.path.join(systems, item) for item in file.keys()]
 
-        def construct_dataset(system):
+        def construct_dataset(system: str) -> DeepmdDataSetForLoader:
             return DeepmdDataSetForLoader(
                 system=system,
                 type_map=type_map,
@@ -180,7 +185,7 @@ def construct_dataset(system):
             for item in self.dataloaders:
                 self.iters.append(iter(item))
 
-    def set_noise(self, noise_settings) -> None:
+    def set_noise(self, noise_settings: dict[str, Any]) -> None:
         # noise_settings['noise_type'] # "trunc_normal", "normal", "uniform"
         # noise_settings['noise'] # float, default 1.0
         # noise_settings['noise_mode'] # "prob", "fix_num"
@@ -193,7 +198,7 @@ def set_noise(self, noise_settings) -> None:
     def __len__(self) -> int:
         return len(self.dataloaders)
 
-    def __getitem__(self, idx):
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
         # log.warning(str(torch.distributed.get_rank())+" idx: "+str(idx)+" index: "+str(self.index[idx]))
         with torch.device("cpu"):
             try:
@@ -231,7 +236,7 @@ def print_summary(
             )
 
 
-def collate_batch(batch):
+def collate_batch(batch: list[dict[str, Any]]) -> dict[str, Any]:
     example = batch[0]
     result = {}
     for key in example.keys():
@@ -251,7 +256,9 @@ def collate_batch(batch):
     return result
 
 
-def get_weighted_sampler(training_data, prob_style, sys_prob=False):
+def get_weighted_sampler(
+    training_data: Any, prob_style: str, sys_prob: bool = False
+) -> WeightedRandomSampler:
     if sys_prob is False:
         if prob_style == "prob_uniform":
             prob_v = 1.0 / float(training_data.__len__())
@@ -276,7 +283,7 @@ def get_weighted_sampler(training_data, prob_style, sys_prob=False):
     return sampler
 
 
-def get_sampler_from_params(_data, _params):
+def get_sampler_from_params(_data: Any, _params: dict[str, Any]) -> Any:
     if (
         "sys_probs" in _params and _params["sys_probs"] is not None
     ):  # use sys_probs first
diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py
index 3043839308..2cbe47cc3e 100644
--- a/deepmd/pt/utils/dataset.py
+++ b/deepmd/pt/utils/dataset.py
@@ -2,6 +2,7 @@
 
 
 from typing import (
+    Any,
     Optional,
 )
 
@@ -34,7 +35,7 @@ def __init__(self, system: str, type_map: Optional[list[str]] = None) -> None:
     def __len__(self) -> int:
         return self._data_system.nframes
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> dict[str, Any]:
         """Get a frame from the selected system."""
         b_data = self._data_system.get_item_torch(index)
         b_data["natoms"] = self._natoms_vec
diff --git a/deepmd/pt/utils/env_mat_stat.py b/deepmd/pt/utils/env_mat_stat.py
index 23e8627bcd..1f89c09621 100644
--- a/deepmd/pt/utils/env_mat_stat.py
+++ b/deepmd/pt/utils/env_mat_stat.py
@@ -200,7 +200,7 @@ def get_hash(self) -> str:
             }
         )
 
-    def __call__(self):
+    def __call__(self) -> tuple[np.ndarray, np.ndarray]:
         avgs = self.get_avg()
         stds = self.get_std()
 
diff --git a/deepmd/pt/utils/exclude_mask.py b/deepmd/pt/utils/exclude_mask.py
index 0a99c0777f..cf39220f1b 100644
--- a/deepmd/pt/utils/exclude_mask.py
+++ b/deepmd/pt/utils/exclude_mask.py
@@ -32,10 +32,10 @@ def reinit(
         )
         self.type_mask = to_torch_tensor(self.type_mask).view([-1])
 
-    def get_exclude_types(self):
+    def get_exclude_types(self) -> list[int]:
         return self.exclude_types
 
-    def get_type_mask(self):
+    def get_type_mask(self) -> torch.Tensor:
         return self.type_mask
 
     def forward(
@@ -98,7 +98,7 @@ def reinit(
         self.type_mask = to_torch_tensor(self.type_mask).view([-1])
         self.no_exclusion = len(self._exclude_types) == 0
 
-    def get_exclude_types(self):
+    def get_exclude_types(self) -> set[tuple[int, int]]:
         return self._exclude_types
 
     # may have a better place for this method...
diff --git a/deepmd/pt/utils/finetune.py b/deepmd/pt/utils/finetune.py
index 77b6a37acc..0e86c9aa6c 100644
--- a/deepmd/pt/utils/finetune.py
+++ b/deepmd/pt/utils/finetune.py
@@ -3,6 +3,9 @@
 from copy import (
     deepcopy,
 )
+from typing import (
+    Any,
+)
 
 import torch
 
@@ -20,13 +23,13 @@
 
 
 def get_finetune_rule_single(
-    _single_param_target,
-    _model_param_pretrained,
-    from_multitask=False,
-    model_branch="Default",
-    model_branch_from="",
-    change_model_params=False,
-):
+    _single_param_target: dict[str, Any],
+    _model_param_pretrained: dict[str, Any],
+    from_multitask: bool = False,
+    model_branch: str = "Default",
+    model_branch_from: str = "",
+    change_model_params: bool = False,
+) -> tuple[dict[str, Any], FinetuneRuleItem]:
     single_config = deepcopy(_single_param_target)
     new_fitting = False
     model_branch_chosen = "Default"
@@ -86,8 +89,11 @@ def get_finetune_rule_single(
 
 
 def get_finetune_rules(
-    finetune_model, model_config, model_branch="", change_model_params=True
-):
+    finetune_model: str,
+    model_config: dict[str, Any],
+    model_branch: str = "",
+    change_model_params: bool = True,
+) -> tuple[dict[str, Any], dict[str, FinetuneRuleItem]]:
     """
     Get fine-tuning rules and (optionally) change the model_params according to the pretrained one.
 
diff --git a/deepmd/pt/utils/multi_task.py b/deepmd/pt/utils/multi_task.py
index 6c397400bf..87b020c17b 100644
--- a/deepmd/pt/utils/multi_task.py
+++ b/deepmd/pt/utils/multi_task.py
@@ -2,6 +2,10 @@
 from copy import (
     deepcopy,
 )
+from typing import (
+    Any,
+    Optional,
+)
 
 from deepmd.pt.model.descriptor import (
     BaseDescriptor,
@@ -11,7 +15,9 @@
 )
 
 
-def preprocess_shared_params(model_config):
+def preprocess_shared_params(
+    model_config: dict[str, Any],
+) -> tuple[dict[str, Any], dict[str, Any]]:
     """Preprocess the model params for multitask model, and generate the links dict for further sharing.
 
     Args:
@@ -97,7 +103,11 @@ def preprocess_shared_params(model_config):
     type_map_keys = []
 
     def replace_one_item(
-        params_dict, key_type, key_in_dict, suffix="", index=None
+        params_dict: dict[str, Any],
+        key_type: str,
+        key_in_dict: str,
+        suffix: str = "",
+        index: Optional[int] = None,
     ) -> None:
         shared_type = key_type
         shared_key = key_in_dict
@@ -155,7 +165,7 @@ def replace_one_item(
     return model_config, shared_links
 
 
-def get_class_name(item_key, item_params):
+def get_class_name(item_key: str, item_params: dict[str, Any]) -> type:
     if item_key == "descriptor":
         return BaseDescriptor.get_class_by_type(item_params.get("type", "se_e2_a"))
     elif item_key == "fitting_net":
diff --git a/deepmd/pt/utils/neighbor_stat.py b/deepmd/pt/utils/neighbor_stat.py
index 64ad695827..b0e9eca141 100644
--- a/deepmd/pt/utils/neighbor_stat.py
+++ b/deepmd/pt/utils/neighbor_stat.py
@@ -171,7 +171,7 @@ def _execute(
         coord: np.ndarray,
         atype: np.ndarray,
         cell: Optional[np.ndarray],
-    ):
+    ) -> tuple[np.ndarray, np.ndarray]:
         """Execute the operation.
 
         Parameters
diff --git a/deepmd/pt/utils/nlist.py b/deepmd/pt/utils/nlist.py
index af84151829..8023645f8c 100644
--- a/deepmd/pt/utils/nlist.py
+++ b/deepmd/pt/utils/nlist.py
@@ -16,13 +16,13 @@
 
 
 def extend_input_and_build_neighbor_list(
-    coord,
-    atype,
+    coord: torch.Tensor,
+    atype: torch.Tensor,
     rcut: float,
     sel: list[int],
     mixed_types: bool = False,
     box: Optional[torch.Tensor] = None,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     nframes, nloc = atype.shape[:2]
     if box is not None:
         box_gpu = box.to(coord.device, non_blocking=True)
@@ -292,7 +292,7 @@ def nlist_distinguish_types(
     nlist: torch.Tensor,
     atype: torch.Tensor,
     sel: list[int],
-):
+) -> torch.Tensor:
     """Given a nlist that does not distinguish atom types, return a nlist that
     distinguish atom types.
 
@@ -414,7 +414,7 @@ def extend_coord_with_ghosts(
     cell: Optional[torch.Tensor],
     rcut: float,
     cell_cpu: Optional[torch.Tensor] = None,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Extend the coordinates of the atoms by appending peridoc images.
     The number of images is large enough to ensure all the neighbors
     within rcut are appended.
diff --git a/deepmd/pt/utils/preprocess.py b/deepmd/pt/utils/preprocess.py
index 7161bac692..0cc31b5d7a 100644
--- a/deepmd/pt/utils/preprocess.py
+++ b/deepmd/pt/utils/preprocess.py
@@ -6,7 +6,9 @@
 log = logging.getLogger(__name__)
 
 
-def compute_smooth_weight(distance, rmin: float, rmax: float):
+def compute_smooth_weight(
+    distance: torch.Tensor, rmin: float, rmax: float
+) -> torch.Tensor:
     """Compute smooth weight for descriptor elements."""
     if rmin >= rmax:
         raise ValueError("rmin should be less than rmax.")
@@ -17,7 +19,7 @@ def compute_smooth_weight(distance, rmin: float, rmax: float):
     return vv
 
 
-def compute_exp_sw(distance, rmin: float, rmax: float):
+def compute_exp_sw(distance: torch.Tensor, rmin: float, rmax: float) -> torch.Tensor:
     """Compute the exponential switch function for neighbor update."""
     if rmin >= rmax:
         raise ValueError("rmin should be less than rmax.")
diff --git a/deepmd/pt/utils/region.py b/deepmd/pt/utils/region.py
index 3272434995..21af694c2c 100644
--- a/deepmd/pt/utils/region.py
+++ b/deepmd/pt/utils/region.py
@@ -68,7 +68,7 @@ def to_face_distance(
     return dist.view(list(cshape[:-2]) + [3])  # noqa:RUF005
 
 
-def b_to_face_distance(cell):
+def b_to_face_distance(cell: torch.Tensor) -> torch.Tensor:
     volume = torch.linalg.det(cell)
     c_yz = torch.cross(cell[:, 1], cell[:, 2], dim=-1)
     _h2yz = volume / torch.linalg.norm(c_yz, dim=-1)
diff --git a/deepmd/pt/utils/spin.py b/deepmd/pt/utils/spin.py
index 285dcaf93e..74ddb5ca13 100644
--- a/deepmd/pt/utils/spin.py
+++ b/deepmd/pt/utils/spin.py
@@ -4,10 +4,10 @@
 
 
 def concat_switch_virtual(
-    extended_tensor,
-    extended_tensor_virtual,
+    extended_tensor: torch.Tensor,
+    extended_tensor_virtual: torch.Tensor,
     nloc: int,
-):
+) -> torch.Tensor:
     """
     Concat real and virtual extended tensors, and switch all the local ones to the first nloc * 2 atoms.
     - [:, :nloc]: original nloc real atoms.
diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py
index cf6892b49d..cbab88de44 100644
--- a/deepmd/pt/utils/stat.py
+++ b/deepmd/pt/utils/stat.py
@@ -4,6 +4,7 @@
     defaultdict,
 )
 from typing import (
+    Any,
     Callable,
     Optional,
     Union,
@@ -35,7 +36,9 @@
 log = logging.getLogger(__name__)
 
 
-def make_stat_input(datasets, dataloaders, nbatches):
+def make_stat_input(
+    datasets: list[Any], dataloaders: list[Any], nbatches: int
+) -> dict[str, Any]:
     """Pack data for statistics.
 
     Args:
@@ -127,9 +130,9 @@ def _save_to_file(
 
 
 def _post_process_stat(
-    out_bias,
-    out_std,
-):
+    out_bias: torch.Tensor,
+    out_std: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Post process the statistics.
 
     For global statistics, we do not have the std for each type of atoms,
@@ -151,7 +154,7 @@ def _compute_model_predict(
     sampled: Union[Callable[[], list[dict]], list[dict]],
     keys: list[str],
     model_forward: Callable[..., torch.Tensor],
-):
+) -> dict[str, list[torch.Tensor]]:
     auto_batch_size = AutoBatchSize()
     model_predict = {kk: [] for kk in keys}
     for system in sampled:
@@ -165,7 +168,7 @@ def _compute_model_predict(
         fparam = system.get("fparam", None)
         aparam = system.get("aparam", None)
 
-        def model_forward_auto_batch_size(*args, **kwargs):
+        def model_forward_auto_batch_size(*args: Any, **kwargs: Any) -> Any:
             return auto_batch_size.execute_all(
                 model_forward,
                 nframes,
@@ -214,7 +217,7 @@ def _make_preset_out_bias(
 def _fill_stat_with_global(
     atomic_stat: Union[np.ndarray, None],
     global_stat: np.ndarray,
-):
+) -> Union[np.ndarray, None]:
     """This function is used to fill atomic stat with global stat.
 
     Parameters
@@ -247,7 +250,7 @@ def compute_output_stats(
     model_forward: Optional[Callable[..., torch.Tensor]] = None,
     stats_distinguish_types: bool = True,
     intensive: bool = False,
-):
+) -> dict[str, Any]:
     """
     Compute the output statistics (e.g. energy bias) for the fitting net from packed data.
 
@@ -414,7 +417,7 @@ def compute_output_stats_global(
     model_pred: Optional[dict[str, np.ndarray]] = None,
     stats_distinguish_types: bool = True,
     intensive: bool = False,
-):
+) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
     """This function only handle stat computation from reduced global labels."""
     # return directly if model predict is empty for global
     if model_pred == {}:
@@ -522,7 +525,7 @@ def compute_output_stats_global(
         }
     atom_numbs = {kk: merged_natoms[kk].sum(-1) for kk in bias_atom_e.keys()}
 
-    def rmse(x):
+    def rmse(x: np.ndarray) -> float:
         return np.sqrt(np.mean(np.square(x)))
 
     for kk in bias_atom_e.keys():
@@ -541,7 +544,7 @@ def compute_output_stats_atomic(
     ntypes: int,
     keys: list[str],
     model_pred: Optional[dict[str, np.ndarray]] = None,
-):
+) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
     # get label dict from sample; for each key, only picking the system with atomic labels.
     outputs = {
         kk: [
diff --git a/deepmd/pt/utils/tabulate.py b/deepmd/pt/utils/tabulate.py
index db743ff98c..b155a897da 100644
--- a/deepmd/pt/utils/tabulate.py
+++ b/deepmd/pt/utils/tabulate.py
@@ -3,6 +3,9 @@
 from functools import (
     cached_property,
 )
+from typing import (
+    Any,
+)
 
 import numpy as np
 import torch
@@ -48,7 +51,7 @@ class DPTabulate(BaseTabulate):
 
     def __init__(
         self,
-        descrpt,
+        descrpt: Any,
         neuron: list[int],
         type_one_side: bool = False,
         exclude_types: list[list[int]] = [],
@@ -113,7 +116,7 @@ def __init__(
         self.data_type = self._get_data_type()
         self.last_layer_size = self._get_last_layer_size()
 
-    def _make_data(self, xx, idx):
+    def _make_data(self, xx: np.ndarray, idx: int) -> Any:
         """Generate tabulation data for the given input.
 
         Parameters
@@ -282,12 +285,12 @@ def _make_data(self, xx, idx):
         d2 = dy2.detach().cpu().numpy().astype(self.data_type)
         return vv, dd, d2
 
-    def _layer_0(self, x, w, b):
+    def _layer_0(self, x: torch.Tensor, w: np.ndarray, b: np.ndarray) -> torch.Tensor:
         w = torch.from_numpy(w).to(env.DEVICE)
         b = torch.from_numpy(b).to(env.DEVICE)
         return self.activation_fn(torch.matmul(x, w) + b)
 
-    def _layer_1(self, x, w, b):
+    def _layer_1(self, x: torch.Tensor, w: np.ndarray, b: np.ndarray) -> torch.Tensor:
         w = torch.from_numpy(w).to(env.DEVICE)
         b = torch.from_numpy(b).to(env.DEVICE)
         t = torch.cat([x, x], dim=1)
@@ -310,7 +313,7 @@ def _get_descrpt_type(self) -> str:
             return "T"
         raise RuntimeError(f"Unsupported descriptor {self.descrpt}")
 
-    def _get_layer_size(self):
+    def _get_layer_size(self) -> int:
         # get the number of layers in EmbeddingNet
         layer_size = 0
         basic_size = 0
@@ -417,10 +420,10 @@ def _get_network_variable(self, var_name: str) -> dict:
                 raise RuntimeError("Unsupported descriptor")
         return result
 
-    def _get_bias(self):
+    def _get_bias(self) -> Any:
         return self._get_network_variable("b")
 
-    def _get_matrix(self):
+    def _get_matrix(self) -> Any:
         return self._get_network_variable("w")
 
     def _convert_numpy_to_tensor(self) -> None:
@@ -435,7 +438,7 @@ def _n_all_excluded(self) -> int:
 
 
 # customized op
-def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int):
+def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int) -> torch.Tensor:
     if functype == 1:
         return 1 - y * y
 
@@ -465,7 +468,7 @@ def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int):
         raise ValueError(f"Unsupported function type: {functype}")
 
 
-def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int):
+def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int) -> torch.Tensor:
     if functype == 1:
         return -2 * y * (1 - y * y)
 
@@ -494,7 +497,7 @@ def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int):
 
 def unaggregated_dy_dx_s(
     y: torch.Tensor, w_np: np.ndarray, xbar: torch.Tensor, functype: int
-):
+) -> torch.Tensor:
     w = torch.from_numpy(w_np).to(env.DEVICE)
     y = y.to(env.DEVICE)
     xbar = xbar.to(env.DEVICE)
@@ -520,7 +523,7 @@ def unaggregated_dy2_dx_s(
     w_np: np.ndarray,
     xbar: torch.Tensor,
     functype: int,
-):
+) -> torch.Tensor:
     w = torch.from_numpy(w_np).to(env.DEVICE)
     y = y.to(env.DEVICE)
     dy = dy.to(env.DEVICE)
@@ -549,7 +552,7 @@ def unaggregated_dy_dx(
     dy_dx: torch.Tensor,
     ybar: torch.Tensor,
     functype: int,
-):
+) -> torch.Tensor:
     w = torch.from_numpy(w_np).to(env.DEVICE)
     if z.dim() != 2:
         raise ValueError("z tensor must have 2 dimensions")
@@ -587,7 +590,7 @@ def unaggregated_dy2_dx(
     dy2_dx: torch.Tensor,
     ybar: torch.Tensor,
     functype: int,
-):
+) -> torch.Tensor:
     w = torch.from_numpy(w_np).to(env.DEVICE)
     if z.dim() != 2:
         raise ValueError("z tensor must have 2 dimensions")
diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py
index 054dc3c80b..d06e2c1640 100644
--- a/deepmd/pt/utils/utils.py
+++ b/deepmd/pt/utils/utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Any,
     Optional,
     Union,
     overload,
@@ -69,7 +70,7 @@ def silut_double_backward(
 
 
 class SiLUTScript(torch.nn.Module):
-    def __init__(self, threshold: float = 3.0):
+    def __init__(self, threshold: float = 3.0) -> None:
         super().__init__()
         self.threshold = threshold
 
@@ -81,14 +82,20 @@ def __init__(self, threshold: float = 3.0):
         self.const_val = float(threshold * sigmoid_threshold)
         self.get_script_code()
 
-    def get_script_code(self):
+    def get_script_code(self) -> None:
         silut_forward_script = torch.jit.script(silut_forward)
         silut_backward_script = torch.jit.script(silut_backward)
         silut_double_backward_script = torch.jit.script(silut_double_backward)
 
         class SiLUTFunction(torch.autograd.Function):
             @staticmethod
-            def forward(ctx, x, threshold, slope, const_val):
+            def forward(
+                ctx: Any,
+                x: torch.Tensor,
+                threshold: float,
+                slope: float,
+                const_val: float,
+            ) -> torch.Tensor:
                 ctx.save_for_backward(x)
                 ctx.threshold = threshold
                 ctx.slope = slope
@@ -96,7 +103,9 @@ def forward(ctx, x, threshold, slope, const_val):
                 return silut_forward_script(x, threshold, slope, const_val)
 
             @staticmethod
-            def backward(ctx, grad_output):
+            def backward(
+                ctx: Any, grad_output: torch.Tensor
+            ) -> tuple[torch.Tensor, None, None, None]:
                 (x,) = ctx.saved_tensors
                 threshold = ctx.threshold
                 slope = ctx.slope
@@ -106,7 +115,13 @@ def backward(ctx, grad_output):
 
         class SiLUTGradFunction(torch.autograd.Function):
             @staticmethod
-            def forward(ctx, x, grad_output, threshold, slope):
+            def forward(
+                ctx: Any,
+                x: torch.Tensor,
+                grad_output: torch.Tensor,
+                threshold: float,
+                slope: float,
+            ) -> torch.Tensor:
                 ctx.threshold = threshold
                 ctx.slope = slope
                 grad_input = silut_backward_script(x, grad_output, threshold, slope)
@@ -114,7 +129,9 @@ def forward(ctx, x, grad_output, threshold, slope):
                 return grad_input
 
             @staticmethod
-            def backward(ctx, grad_grad_output):
+            def backward(
+                ctx: Any, grad_grad_output: torch.Tensor
+            ) -> tuple[torch.Tensor, torch.Tensor]:
                 (x, grad_output) = ctx.saved_tensors
                 threshold = ctx.threshold
                 slope = ctx.slope
@@ -126,21 +143,21 @@ def backward(ctx, grad_grad_output):
 
         self.SiLUTFunction = SiLUTFunction
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.SiLUTFunction.apply(x, self.threshold, self.slope, self.const_val)
 
 
 class SiLUT(torch.nn.Module):
-    def __init__(self, threshold=3.0):
+    def __init__(self, threshold: float = 3.0) -> None:
         super().__init__()
 
-        def sigmoid(x):
+        def sigmoid(x: float) -> float:
             return 1 / (1 + np.exp(-x))
 
-        def silu(x):
+        def silu(x: float) -> float:
             return x * sigmoid(x)
 
-        def silu_grad(x):
+        def silu_grad(x: float) -> float:
             sig = sigmoid(x)
             return sig + x * sig * (1 - sig)
 
@@ -212,8 +229,8 @@ def to_numpy_array(xx: None) -> None: ...
 
 
 def to_numpy_array(
-    xx,
-):
+    xx: Optional[torch.Tensor],
+) -> Optional[np.ndarray]:
     if xx is None:
         return None
     assert xx is not None
@@ -239,8 +256,8 @@ def to_torch_tensor(xx: None) -> None: ...
 
 
 def to_torch_tensor(
-    xx,
-):
+    xx: Optional[np.ndarray],
+) -> Optional[torch.Tensor]:
     if xx is None:
         return None
     assert xx is not None
@@ -259,7 +276,7 @@ def to_torch_tensor(
     return torch.tensor(xx, dtype=prec, device=DEVICE)
 
 
-def dict_to_device(sample_dict) -> None:
+def dict_to_device(sample_dict: dict[str, Any]) -> None:
     for key in sample_dict:
         if isinstance(sample_dict[key], list):
             sample_dict[key] = [item.to(DEVICE) for item in sample_dict[key]]
@@ -280,7 +297,7 @@ def dict_to_device(sample_dict) -> None:
 XSHIFT = 16
 
 
-def hashmix(value: int, hash_const: list[int]):
+def hashmix(value: int, hash_const: list[int]) -> int:
     value ^= INIT_A
     hash_const[0] *= MULT_A
     value *= INIT_A
@@ -291,7 +308,7 @@ def hashmix(value: int, hash_const: list[int]):
     return value
 
 
-def mix(x: int, y: int):
+def mix(x: int, y: int) -> int:
     result = MIX_MULT_L * x - MIX_MULT_R * y
     # prevent overflow
     result &= 0xFFFF_FFFF_FFFF_FFFF
diff --git a/pyproject.toml b/pyproject.toml
index ab35e881f1..b86c460fbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -391,7 +391,6 @@ ignore = [
     "D401", # TODO: first line should be in imperative mood
     "D404", # TODO: first word of the docstring should not be This
 ]
-ignore-init-module-imports = true
 
 exclude = [
     "source/3rdparty/**",
@@ -424,7 +423,7 @@ runtime-evaluated-base-classes = ["torch.nn.Module"]
 "backend/**" = ["ANN"]
 "data/**" = ["ANN"]
 "deepmd/tf/**" = ["TID253", "ANN"]
-"deepmd/pt/**" = ["TID253", "ANN"]
+"deepmd/pt/**" = ["TID253"]
 "deepmd/jax/**" = ["TID253", "ANN"]
 "deepmd/pd/**" = ["TID253", "ANN"]
 "deepmd/dpmodel/**" = ["ANN"]