diff --git a/.devcontainer/download_libtorch.sh b/.devcontainer/download_libtorch.sh
index d78b559997..ef68a2e615 100755
--- a/.devcontainer/download_libtorch.sh
+++ b/.devcontainer/download_libtorch.sh
@@ -4,5 +4,5 @@ set -ev
 SCRIPT_PATH=$(dirname $(realpath -s $0))
 cd ${SCRIPT_PATH}/..
 
-wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcpu.zip -O ~/libtorch.zip
+wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcpu.zip -O ~/libtorch.zip
 unzip ~/libtorch.zip
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 8788bc929b..8d27e08974 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -14,24 +14,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  determine-arm64-runner:
-    runs-on: ubuntu-latest
-    permissions: read-all
-    outputs:
-      runner: ${{ steps.set-runner.outputs.runner }}
-    steps:
-      - name: Determine which runner to use for ARM64 build
-        id: set-runner
-        run: |
-          if [ "${{ github.repository_owner }}" == "deepmodeling" ]; then
-            echo "runner=[\"Linux\",\"ARM64\"]" >> $GITHUB_OUTPUT
-          else
-            echo "runner=\"ubuntu-latest\"" >> $GITHUB_OUTPUT
-          fi
-
   build_wheels:
     name: Build wheels for cp${{ matrix.python }}-${{ matrix.platform_id }}
-    needs: determine-arm64-runner
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -65,7 +49,7 @@ jobs:
             platform_id: win_amd64
             dp_variant: cpu
           # linux-aarch64
-          - os: ${{ fromJson(needs.determine-arm64-runner.outputs.runner) }}
+          - os: ubuntu-24.04-arm
             python: 310
             platform_id: manylinux_aarch64
             dp_variant: cpu
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index f7f3a4f431..da69f01225 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -29,7 +29,7 @@ jobs:
     - run: python -m pip install uv
     - name: Install Python dependencies
       run: |
-        source/install/uv_with_retry.sh pip install --system tensorflow-cpu
+        source/install/uv_with_retry.sh pip install --system tensorflow-cpu~=2.18.0 jax==0.5.0
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
         source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py
     - name: Convert models
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index 4dbdc5acb9..9870ff6183 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -47,11 +47,11 @@ jobs:
          && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
       if: false  # skip as we use nvidia image
     - run: python -m pip install -U uv
-    - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.5.0" "jax[cuda12]"
+    - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.6.0" "jax[cuda12]==0.5.0"
     - run: |
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py
+        source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit
       env:
         DP_VARIANT: cuda
         DP_ENABLE_NATIVE_OPTIMIZATION: 1
@@ -67,7 +67,7 @@ jobs:
       run: source/tests/infer/convert-models.sh
     - name: Download libtorch
       run: |
-         wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcu124.zip -O libtorch.zip
+         wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.6.0%2Bcu124.zip -O libtorch.zip
          unzip libtorch.zip
     - run: |
         export CMAKE_PREFIX_PATH=$GITHUB_WORKSPACE/libtorch
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index e30a19c8b1..f2169b2633 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -25,11 +25,11 @@ jobs:
         python-version: ${{ matrix.python }}
     - run: python -m pip install -U uv
     - run: |
-        source/install/uv_with_retry.sh pip install --system openmpi tensorflow-cpu
+        source/install/uv_with_retry.sh pip install --system openmpi tensorflow-cpu~=2.18.0
         source/install/uv_with_retry.sh pip install --system torch -i https://download.pytorch.org/whl/cpu
         export TENSORFLOW_ROOT=$(python -c 'import tensorflow;print(tensorflow.__path__[0])')
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
-        source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py
+        source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py "jax==0.5.0;python_version>='3.10'"
         source/install/uv_with_retry.sh pip install --system horovod --no-build-isolation
       env:
         # Please note that uv has some issues with finding
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 499411eaa3..7cda715627 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -10,5 +10,7 @@ build:
       - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH uv pip install -r doc/requirements.txt
   apt_packages:
     - inkscape
+sphinx:
+  configuration: doc/conf.py
 formats:
   - pdf
diff --git a/CITATIONS.bib b/CITATIONS.bib
index 52c8045bf3..61f9f34ff5 100644
--- a/CITATIONS.bib
+++ b/CITATIONS.bib
@@ -40,6 +40,31 @@ @article{Zeng_JChemPhys_2023_v159_p054801
   doi          = {10.1063/5.0155600},
 }
 
+@article{Zeng_arXiv_2025_p2502.19161,
+  annote       = {general purpose},
+  author       = {
+    Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu Zhang and Sensen He
+    and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan Li and Chun Cai and
+    Chengqian Zhang and Yiming Du and Jia-Xin Zhu and Pinghui Mo and Zhengtao
+    Huang and Qiyu Zeng and Shaochen Shi and Xuejian Qin and Zhaoxi Yu and
+    Chenxing Luo and Ye Ding and Yun-Pei Liu and Ruosong Shi and Zhenyu Wang
+    and Sigbj{\o}rn L{\o}land Bore and Junhan Chang and Zhe Deng and Zhaohan
+    Ding and Siyuan Han and Wanrun Jiang and Guolin Ke and Zhaoqing Liu and
+    Denghui Lu and Koki Muraoka and Hananeh Oliaei and Anurag Kumar Singh and
+    Haohui Que and Weihong Xu and Zhangmancang Xu and Yong-Bin Zhuang and Jiayu
+    Dai and Timothy J. Giese and Weile Jia and Ben Xu and Darrin M. York and
+    Linfeng Zhang and Han Wang
+  },
+  title        = {
+    {DeePMD-kit v3: A Multiple-Backend Framework for Machine Learning
+    Potentials}
+  },
+  journal      = {arXiv},
+  year         = 2025,
+  pages        = {2502.19161},
+  doi          = {10.48550/arXiv.2502.19161},
+}
+
 @article{Lu_CompPhysCommun_2021_v259_p107624,
   annote       = {GPU support},
   title        = {
diff --git a/README.md b/README.md
index 18bdfd6560..15e88b218f 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ If you use this code in any future publications, please cite the following publi
 - Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang. "DeePMD-kit v2: A software package for deep potential models." J. Chem. Phys. 159 (2023): 054801.
   [![doi:10.1063/5.0155600](https://img.shields.io/badge/DOI-10.1063%2F5.0155600-blue)](https://doi.org/10.1063/5.0155600)
   [![Citations](https://citations.njzjz.win/10.1063/5.0155600)](https://badge.dimensions.ai/details/doi/10.1063/5.0155600)
+- Jinzhe Zeng, Duo Zhang, Anyang Peng, Xiangyu Zhang, Sensen He, Yan Wang, Xinzijian Liu, Hangrui Bi, Yifan Li, Chun Cai, Chengqian Zhang, Yiming Du, Jia-Xin Zhu, Pinghui Mo, Zhengtao Huang, Qiyu Zeng, Shaochen Shi, Xuejian Qin, Zhaoxi Yu, Chenxing Luo, Ye Ding, Yun-Pei Liu, Ruosong Shi, Zhenyu Wang, Sigbjørn Løland Bore, Junhan Chang, Zhe Deng, Zhaohan Ding, Siyuan Han, Wanrun Jiang, Guolin Ke, Zhaoqing Liu, Denghui Lu, Koki Muraoka, Hananeh Oliaei, Anurag Kumar Singh, Haohui Que, Weihong Xu, Zhangmancang Xu, Yong-Bin Zhuang, Jiayu Dai, Timothy J. Giese, Weile Jia, Ben Xu, Darrin M. York, Linfeng Zhang, Han Wang. "DeePMD-kit v3: A Multiple-Backend Framework for Machine Learning Potentials." [arXiv:2502.19161](https://arxiv.org/abs/2502.19161).
 
 In addition, please follow [the bib file](CITATIONS.bib) to cite the methods you used.
 
@@ -68,7 +69,7 @@ In addition to building up potential energy models, DeePMD-kit can also be used
 - Non-von-Neumann.
 - C API to interface with the third-party packages.
 
-See [our latest paper](https://doi.org/10.1063/5.0155600) for details of all features until v2.2.3.
+See [our v2 paper](https://doi.org/10.1063/5.0155600) for details of all features up to v2.2.3.
 
 #### v3
 
@@ -76,6 +77,8 @@ See [our latest paper](https://doi.org/10.1063/5.0155600) for details of all fea
 - The DPA-2 model.
 - Plugin mechanisms for external models.
 
+See [our v3 paper](https://doi.org/10.48550/arXiv.2502.19161) for details of all features up to v3.0.
+
 ## Install and use DeePMD-kit
 
 Please read the [online documentation](https://deepmd.readthedocs.io/) for how to install and use DeePMD-kit.
diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py
index 125fd6a389..a309e3b9e9 100644
--- a/backend/find_pytorch.py
+++ b/backend/find_pytorch.py
@@ -116,7 +116,7 @@ def get_pt_requirement(pt_version: str = "") -> dict:
         cuda_version = os.environ.get("CUDA_VERSION", "12.2")
         if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"):
             # CUDA 12.2, cudnn 9
-            pt_version = "2.5.0"
+            pt_version = "2.6.0"
         elif cuda_version in SpecifierSet(">=11,<12"):
             # CUDA 11.8, cudnn 8
             pt_version = "2.3.1"
diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 20a758b170..fffeaae232 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -899,6 +899,7 @@ def call(
         exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext)
         # nfnl x nnei
         exclude_mask = xp.reshape(exclude_mask, (nf * nloc, nnei))
+        exclude_mask = xp.astype(exclude_mask, xp.bool)
         # nfnl x nnei
         nlist = xp.reshape(nlist, (nf * nloc, nnei))
         nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1))
diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py
index ae6b5de511..e15a20926f 100644
--- a/deepmd/dpmodel/descriptor/repformers.py
+++ b/deepmd/dpmodel/descriptor/repformers.py
@@ -393,6 +393,7 @@ def call(
     ):
         xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext)
         exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext)
+        exclude_mask = xp.astype(exclude_mask, xp.bool)
         nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1))
         # nf x nloc x nnei x 4
         dmatrix, diff, sw = self.env_mat.call(
diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py
index c350e3eb47..ec54fd08aa 100644
--- a/deepmd/dpmodel/descriptor/se_t_tebd.py
+++ b/deepmd/dpmodel/descriptor/se_t_tebd.py
@@ -682,6 +682,7 @@ def call(
         exclude_mask = xp.reshape(exclude_mask, (nf * nloc, nnei))
         # nfnl x nnei
         nlist = xp.reshape(nlist, (nf * nloc, nnei))
+        exclude_mask = xp.astype(exclude_mask, xp.bool)
         nlist = xp.where(exclude_mask, nlist, xp.full_like(nlist, -1))
         # nfnl x nnei
         nlist_mask = nlist != -1
diff --git a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py
index 2958a7d18d..9d51e35fd0 100644
--- a/deepmd/dpmodel/fitting/general_fitting.py
+++ b/deepmd/dpmodel/fitting/general_fitting.py
@@ -455,6 +455,7 @@ def _call_common(
         )
         # nf x nloc
         exclude_mask = self.emask.build_type_exclude_mask(atype)
+        exclude_mask = xp.astype(exclude_mask, xp.bool)
         # nf x nloc x nod
         outs = xp.where(exclude_mask[:, :, None], outs, xp.zeros_like(outs))
         return {self.var_name: outs}
diff --git a/deepmd/dpmodel/utils/serialization.py b/deepmd/dpmodel/utils/serialization.py
index 1c07038a23..5520933753 100644
--- a/deepmd/dpmodel/utils/serialization.py
+++ b/deepmd/dpmodel/utils/serialization.py
@@ -113,7 +113,9 @@ def save_dp_model(filename: str, model_dict: dict) -> None:
                 "@version": 1,
                 "dtype": x.dtype.name,
                 "value": x.tolist(),
-            },
+            }
+            if isinstance(x, np.ndarray)
+            else x,
         )
         with open(filename, "w") as f:
             yaml.safe_dump(
diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py
index 2cdc97f934..63a18bbd4e 100644
--- a/deepmd/pt/model/atomic_model/dp_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py
@@ -69,7 +69,8 @@ def __init__(
     def set_eval_descriptor_hook(self, enable: bool) -> None:
         """Set the hook for evaluating descriptor and clear the cache for descriptor list."""
         self.enable_eval_descriptor_hook = enable
-        self.eval_descriptor_list = []
+        # rebinding with "= []" does not work; clear the list in place (see #4533)
+        self.eval_descriptor_list.clear()
 
     def eval_descriptor(self) -> torch.Tensor:
         """Evaluate the descriptor."""
@@ -236,7 +237,7 @@ def forward_atomic(
         )
         assert descriptor is not None
         if self.enable_eval_descriptor_hook:
-            self.eval_descriptor_list.append(descriptor)
+            self.eval_descriptor_list.append(descriptor.detach())
         # energy, force
         fit_ret = self.fitting_net(
             descriptor,
diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py
index 86b09e9b40..1e2cba66d6 100644
--- a/deepmd/pt/model/descriptor/repformer_layer.py
+++ b/deepmd/pt/model/descriptor/repformer_layer.py
@@ -1003,7 +1003,7 @@ def _cal_grrg(h2g2: torch.Tensor, axis_neuron: int) -> torch.Tensor:
         # nb x nloc x 3 x ng2
         nb, nloc, _, ng2 = h2g2.shape
         # nb x nloc x 3 x axis
-        h2g2m = torch.split(h2g2, axis_neuron, dim=-1)[0]
+        h2g2m = h2g2[..., :axis_neuron]
         # nb x nloc x axis x ng2
         g1_13 = torch.matmul(torch.transpose(h2g2m, -1, -2), h2g2) / (3.0**1)
         # nb x nloc x (axisxng2)
diff --git a/deepmd/pt/model/network/mlp.py b/deepmd/pt/model/network/mlp.py
index 31162fe80e..22675d6163 100644
--- a/deepmd/pt/model/network/mlp.py
+++ b/deepmd/pt/model/network/mlp.py
@@ -8,6 +8,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from deepmd.pt.utils import (
     env,
@@ -202,18 +203,14 @@ def forward(
         ori_prec = xx.dtype
         if not env.DP_DTYPE_PROMOTION_STRICT:
             xx = xx.to(self.prec)
-        yy = (
-            torch.matmul(xx, self.matrix) + self.bias
-            if self.bias is not None
-            else torch.matmul(xx, self.matrix)
-        )
-        yy = self.activate(yy).clone()
+        yy = F.linear(xx, self.matrix.t(), self.bias)
+        yy = self.activate(yy)
         yy = yy * self.idt if self.idt is not None else yy
         if self.resnet:
             if xx.shape[-1] == yy.shape[-1]:
-                yy += xx
+                yy = yy + xx
             elif 2 * xx.shape[-1] == yy.shape[-1]:
-                yy += torch.concat([xx, xx], dim=-1)
+                yy = yy + torch.concat([xx, xx], dim=-1)
             else:
                 yy = yy
         if not env.DP_DTYPE_PROMOTION_STRICT:
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 0feb7fbbd2..38a41aa726 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -1230,13 +1230,11 @@ def get_loss(loss_params, start_lr, _ntypes, _model):
         if "mask" in model_output_type:
             model_output_type.pop(model_output_type.index("mask"))
         tensor_name = model_output_type[0]
-        loss_params["tensor_name"] = tensor_name
         loss_params["tensor_size"] = _model.model_output_def()[tensor_name].output_size
-        label_name = tensor_name
-        if label_name == "polarizability":
-            label_name = "polar"
-        loss_params["label_name"] = label_name
-        loss_params["tensor_name"] = label_name
+        loss_params["label_name"] = tensor_name
+        if tensor_name == "polarizability":
+            tensor_name = "polar"
+        loss_params["tensor_name"] = tensor_name
         return TensorLoss(**loss_params)
     elif loss_type == "property":
         task_dim = _model.get_task_dim()
diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py
index 81dce669ff..ef2654241b 100644
--- a/deepmd/pt/utils/env.py
+++ b/deepmd/pt/utils/env.py
@@ -21,7 +21,7 @@
     ncpus = len(os.sched_getaffinity(0))
 except AttributeError:
     ncpus = os.cpu_count()
-NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(8, ncpus)))
+NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus)))
 # Make sure DDP uses correct device if applicable
 LOCAL_RANK = os.environ.get("LOCAL_RANK")
 LOCAL_RANK = int(0 if LOCAL_RANK is None else LOCAL_RANK)
diff --git a/deepmd/pt/utils/nlist.py b/deepmd/pt/utils/nlist.py
index db1e87785b..ec94e8cd60 100644
--- a/deepmd/pt/utils/nlist.py
+++ b/deepmd/pt/utils/nlist.py
@@ -310,7 +310,7 @@ def nlist_distinguish_types(
         inlist = torch.gather(nlist, 2, imap)
         inlist = inlist.masked_fill(~(pick_mask.to(torch.bool)), -1)
         # nloc x nsel[ii]
-        ret_nlist.append(torch.split(inlist, [ss, snsel - ss], dim=-1)[0])
+        ret_nlist.append(inlist[..., :ss])
     return torch.concat(ret_nlist, dim=-1)
 
 
diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py
index 1c5e3f1c52..5b4fa77e5f 100644
--- a/deepmd/pt/utils/stat.py
+++ b/deepmd/pt/utils/stat.py
@@ -469,7 +469,9 @@ def compute_output_stats_global(
         # subtract the model bias and output the delta bias
 
         stats_input = {
-            kk: merged_output[kk] - model_pred[kk] for kk in keys if kk in merged_output
+            kk: merged_output[kk] - model_pred[kk].reshape(merged_output[kk].shape)
+            for kk in keys
+            if kk in merged_output
         }
 
     bias_atom_e = {}
diff --git a/deepmd/tf/descriptor/descriptor.py b/deepmd/tf/descriptor/descriptor.py
index dd86beb21e..bd1af8c72e 100644
--- a/deepmd/tf/descriptor/descriptor.py
+++ b/deepmd/tf/descriptor/descriptor.py
@@ -105,7 +105,8 @@ def get_dim_rot_mat_1(self) -> int:
         int
             the first dimension of the rotation matrix
         """
-        raise NotImplementedError
+        # by default, no rotation matrix
+        return 0
 
     def get_nlist(self) -> tuple[tf.Tensor, tf.Tensor, list[int], list[int]]:
         """Returns neighbor information.
@@ -534,3 +535,9 @@ def serialize(self, suffix: str = "") -> dict:
     def input_requirement(self) -> list[DataRequirementItem]:
         """Return data requirements needed for the model input."""
         return []
+
+    def get_rot_mat(self) -> tf.Tensor:
+        """Get rotational matrix; by default an empty (nframes, natoms, 0) tensor."""
+        nframes = tf.shape(self.dout)[0]
+        natoms = tf.shape(self.dout)[1]
+        return tf.zeros([nframes, natoms, 0], dtype=GLOBAL_TF_FLOAT_PRECISION)
diff --git a/deepmd/tf/descriptor/hybrid.py b/deepmd/tf/descriptor/hybrid.py
index 2ee35d9ebb..57c21f0ee6 100644
--- a/deepmd/tf/descriptor/hybrid.py
+++ b/deepmd/tf/descriptor/hybrid.py
@@ -492,3 +492,21 @@ def deserialize(cls, data: dict, suffix: str = "") -> "DescrptHybrid":
             if hasattr(ii, "type_embedding"):
                 raise NotImplementedError("hybrid + type embedding is not supported")
         return obj
+
+    def get_dim_rot_mat_1(self) -> int:
+        """Returns the first dimension of the rotation matrix, summed over all
+        descriptors in descrpt_list. The rotation is of shape dim_1 x 3.
+
+        Returns
+        -------
+        int
+            the first dimension of the rotation matrix
+        """
+        return sum([ii.get_dim_rot_mat_1() for ii in self.descrpt_list])
+
+    def get_rot_mat(self) -> tf.Tensor:
+        """Get rotational matrices of all sub-descriptors, concatenated on axis 2."""
+        all_rot_mat = []
+        for ii in self.descrpt_list:
+            all_rot_mat.append(ii.get_rot_mat())
+        return tf.concat(all_rot_mat, axis=2)
diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py
index 2f1400e697..3bb0fa841e 100644
--- a/deepmd/tf/fit/polar.py
+++ b/deepmd/tf/fit/polar.py
@@ -27,8 +27,12 @@
 from deepmd.tf.loss.tensor import (
     TensorLoss,
 )
+from deepmd.tf.utils.errors import (
+    GraphWithoutTensorError,
+)
 from deepmd.tf.utils.graph import (
     get_fitting_net_variables_from_graph_def,
+    get_tensor_by_name_from_graph,
 )
 from deepmd.tf.utils.network import (
     one_layer,
@@ -423,6 +427,16 @@ def build(
         atype = input_dict.get("atype", None)
         nframes = input_dict.get("nframes")
         start_index = 0
+
+        with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
+            self.t_constant_matrix = tf.get_variable(
+                "t_constant_matrix",
+                self.constant_matrix.shape,
+                dtype=GLOBAL_TF_FLOAT_PRECISION,
+                trainable=False,
+                initializer=tf.constant_initializer(self.constant_matrix),
+            )
+
         inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]])
         rot_mat = tf.reshape(rot_mat, [-1, self.dim_rot_mat * natoms[0]])
         if nframes is None:
@@ -446,7 +460,9 @@ def build(
                 # nframes x nloc_masked
                 constant_matrix = tf.reshape(
                     tf.reshape(
-                        tf.tile(tf.repeat(self.constant_matrix, natoms[2:]), [nframes]),
+                        tf.tile(
+                            tf.repeat(self.t_constant_matrix, natoms[2:]), [nframes]
+                        ),
                         [nframes, -1],
                     )[nloc_mask],
                     [nframes, -1],
@@ -498,7 +514,9 @@ def build(
                 # shift and scale
                 sel_type_idx = self.sel_type.index(type_i)
                 final_layer = final_layer * self.scale[sel_type_idx]
-                final_layer = final_layer + self.constant_matrix[sel_type_idx] * tf.eye(
+                final_layer = final_layer + tf.slice(
+                    self.t_constant_matrix, [sel_type_idx], [1]
+                ) * tf.eye(
                     3,
                     batch_shape=[tf.shape(inputs)[0], natoms[2 + type_i]],
                     dtype=GLOBAL_TF_FLOAT_PRECISION,
@@ -545,6 +563,16 @@ def init_variables(
         self.fitting_net_variables = get_fitting_net_variables_from_graph_def(
             graph_def, suffix=suffix
         )
+        if self.shift_diag:
+            try:
+                self.constant_matrix = get_tensor_by_name_from_graph(
+                    graph, f"fitting_attr{suffix}/t_constant_matrix"
+                )
+            except GraphWithoutTensorError:
+                warnings.warn(
+                    "You are trying to read a model trained with shift_diag=True, but the mean of the diagonal terms of the polarizability is not stored in the graph. This will lead to wrong inference results. You may train your model with the latest DeePMD-kit to avoid this issue.",
+                    stacklevel=2,
+                )
 
     def enable_mixed_precision(self, mixed_prec: Optional[dict] = None) -> None:
         """Receive the mixed precision setting.
@@ -605,6 +633,14 @@ def serialize(self, suffix: str) -> dict:
                 variables=self.fitting_net_variables,
                 suffix=suffix,
             ),
+            "@variables": {
+                "fparam_avg": None,
+                "fparam_inv_std": None,
+                "aparam_avg": None,
+                "aparam_inv_std": None,
+                "scale": self.scale.reshape(-1, 1),
+                "constant_matrix": self.constant_matrix.reshape(-1),
+            },
             "type_map": self.type_map,
         }
         return data
@@ -632,6 +668,7 @@ def deserialize(cls, data: dict, suffix: str):
             data["nets"],
             suffix=suffix,
         )
+        fitting.constant_matrix = data["@variables"]["constant_matrix"].ravel()
         return fitting
 
 
diff --git a/deepmd/tf/loss/tensor.py b/deepmd/tf/loss/tensor.py
index aca9182ff6..1e90c89cb8 100644
--- a/deepmd/tf/loss/tensor.py
+++ b/deepmd/tf/loss/tensor.py
@@ -145,7 +145,7 @@ def label_requirement(self) -> list[DataRequirementItem]:
         # data required
         data_requirements.append(
             DataRequirementItem(
-                "atom_" + self.label_name,
+                "atomic_" + self.label_name,
                 self.tensor_size,
                 atomic=True,
                 must=False,
diff --git a/deepmd/tf/model/model.py b/deepmd/tf/model/model.py
index 8991bf1baf..3377ed2d51 100644
--- a/deepmd/tf/model/model.py
+++ b/deepmd/tf/model/model.py
@@ -668,6 +668,11 @@ def __init__(
         else:
             if fitting_net["type"] in ["dipole", "polar"]:
                 fitting_net["embedding_width"] = self.descrpt.get_dim_rot_mat_1()
+                if fitting_net["embedding_width"] == 0:
+                    raise ValueError(
+                        "This descriptor cannot provide a rotation matrix "
+                        "for a tensorial fitting."
+                    )
             self.fitting = Fitting(
                 **fitting_net,
                 descrpt=self.descrpt,
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
index c6affdef7b..16e2c27a40 100644
--- a/deepmd/tf/train/trainer.py
+++ b/deepmd/tf/train/trainer.py
@@ -282,6 +282,13 @@ def _build_network(self, data, suffix="") -> None:
             tf.int32, [None], name="t_mesh"
         )
         self.place_holders["is_training"] = tf.placeholder(tf.bool)
+        # rename placeholder keys containing "atomic_" so that they use "atom_" instead
+        for kk in list(self.place_holders.keys()):
+            if "atomic_" in kk:
+                self.place_holders[kk.replace("atomic_", "atom_")] = (
+                    self.place_holders.pop(kk)
+                )
+
         self.model_pred = self.model.build(
             self.place_holders["coord"],
             self.place_holders["type"],
diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index 40e629d9db..4c3d24edd8 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -81,8 +81,12 @@ def get_stat(self, data: DeepmdDataSystem) -> tuple[float, np.ndarray]:
 
         # do sqrt in the final
         min_nbor_dist = math.sqrt(min_nbor_dist)
-        log.info("training data with min nbor dist: " + str(min_nbor_dist))
-        log.info("training data with max nbor size: " + str(max_nbor_size))
+        log.info(
+            f"Neighbor statistics: training data with minimal neighbor distance: {min_nbor_dist:f}"
+        )
+        log.info(
+            f"Neighbor statistics: training data with maximum neighbor size: {max_nbor_size!s} (cutoff radius: {self.rcut:f})"
+        )
         return min_nbor_dist, max_nbor_size
 
     @abstractmethod
diff --git a/deepmd/utils/summary.py b/deepmd/utils/summary.py
index a35dd4db93..f093a5f7bc 100644
--- a/deepmd/utils/summary.py
+++ b/deepmd/utils/summary.py
@@ -42,6 +42,7 @@ class SummaryPrinter(ABC):
         "Please read and cite:",
         "Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)",
         "Zeng et al, J. Chem. Phys., 159, 054801 (2023)",
+        "Zeng et al, arxiv:2502.19161",
         "See https://deepmd.rtfd.io/credits/ for details.",
     )
 
diff --git a/doc/credits.rst b/doc/credits.rst
index 059746ee0b..2dacc848c9 100644
--- a/doc/credits.rst
+++ b/doc/credits.rst
@@ -13,6 +13,7 @@ Cite DeePMD-kit and methods
 
    Wang_ComputPhysCommun_2018_v228_p178
    Zeng_JChemPhys_2023_v159_p054801
+   Zeng_arXiv_2025_p2502.19161
 
 - If GPU version is used,
 
diff --git a/doc/env.md b/doc/env.md
index 3cf42b724a..4ca7101236 100644
--- a/doc/env.md
+++ b/doc/env.md
@@ -72,7 +72,7 @@ Default backend.
 
 :::{envvar} NUM_WORKERS
 
-**Default**: 8 or the number of cores (whichever is smaller)
+**Default**: 4 or the number of cores (whichever is smaller)
 
 {{ pytorch_icon }} Number of subprocesses to use for data loading in the PyTorch backend.
 See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for details.
diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md
index 300876bf05..a041547a14 100644
--- a/doc/model/dpa2.md
+++ b/doc/model/dpa2.md
@@ -4,7 +4,7 @@
 **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }}
 :::
 
-The DPA-2 model implementation. See https://doi.org/10.1038/s41524-024-01493-2 for more details.
+The DPA-2 model implementation. See the [DPA-2 paper](https://doi.org/10.1038/s41524-024-01493-2) for more details.
 
 Training example: `examples/water/dpa2/input_torch_medium.json`, see [README](../../examples/water/dpa2/README.md) for inputs in different levels.
 
diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md
index 92a56395f6..504b214737 100644
--- a/doc/model/train-se-atten.md
+++ b/doc/model/train-se-atten.md
@@ -8,7 +8,7 @@
 
 Here we propose DPA-1, a Deep Potential model with a novel attention mechanism, which is highly effective for representing the conformation and chemical spaces of atomic systems and learning the PES.
 
-See [this paper](https://arxiv.org/abs/2208.08236) for more information. DPA-1 is implemented as a new descriptor `"se_atten"` for model training, which can be used after simply editing the input.json.
+See [this paper](https://www.nature.com/articles/s41524-024-01278-7) for more information. DPA-1 is implemented as a new descriptor `"se_atten"` for model training, which can be used after simply editing the input.json.
 
 ## Theory
 
@@ -71,7 +71,7 @@ Then layer normalization is added in a residual way to finally obtain the self-a
 Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`.
 
 The notation of `se_atten` is short for the smooth edition of Deep Potential with an attention mechanism.
-This descriptor was described in detail in [the DPA-1 paper](https://arxiv.org/abs/2208.08236) and the images above.
+This descriptor was described in detail in [the DPA-1 paper](https://www.nature.com/articles/s41524-024-01278-7) and the images above.
 
 In this example, we will train a DPA-1 model for a water system. A complete training input script of this example can be found in the directory:
 
diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md
index 04d86cfc98..beb6012003 100644
--- a/doc/train/finetuning.md
+++ b/doc/train/finetuning.md
@@ -9,7 +9,7 @@ to vastly reduce the training cost, while it's not trivial in potential models.
 Compositions and configurations of data samples or even computational parameters in upstream software (such as VASP)
 may be different between the pre-trained and target datasets, leading to energy shifts or other diversities of training data.
 
-Recently the emerging of methods such as [DPA-1](https://arxiv.org/abs/2208.08236) has brought us to a new stage where we can
+Recently the emergence of methods such as [DPA-1](https://www.nature.com/articles/s41524-024-01278-7) has brought us to a new stage where we can
 perform similar pretraining-finetuning approaches.
 They can hopefully learn the common knowledge in the pre-trained dataset (especially the `force` information)
 and thus reduce the computational cost in downstream training tasks.
@@ -19,7 +19,7 @@ and thus reduce the computational cost in downstream training tasks.
 If you have a pre-trained model `pretrained.pb`
 (here we support models using [`se_atten`](../model/train-se-atten.md) descriptor and [`ener`](../model/train-energy.md) fitting net)
 on a large dataset (for example, [OC2M](https://github.com/Open-Catalyst-Project/ocp/blob/main/DATASET.md) in
-DPA-1 [paper](https://arxiv.org/abs/2208.08236)), a finetuning strategy can be performed by simply running:
+DPA-1 [paper](https://www.nature.com/articles/s41524-024-01278-7)), a finetuning strategy can be performed by simply running:
 
 ```bash
 $ dp train input.json --finetune pretrained.pb
diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md
index 9ea92b4751..00df0a63f0 100644
--- a/doc/train/parallel-training.md
+++ b/doc/train/parallel-training.md
@@ -27,13 +27,14 @@ In some cases, it won't work well when scaling the learning rate by worker count
 ### Scaling test
 
 Testing `examples/water/se_e2_a` on an 8-GPU host, linear acceleration can be observed with the increasing number of cards.
-
-| Num of GPU cards | Seconds every 100 samples | Samples per second | Speed up |
-| ---------------- | ------------------------- | ------------------ | -------- |
-| 1                | 1.4515                    | 68.89              | 1.00     |
-| 2                | 1.5962                    | 62.65\*2           | 1.82     |
-| 4                | 1.7635                    | 56.71\*4           | 3.29     |
-| 8                | 1.7267                    | 57.91\*8           | 6.72     |
+In this example, the number of samples per batch on a single GPU card ({ref}`batch_size <training/training_data/batch_size>`) is set to `1`.
+
+| Num of GPU cards | Samples per batch | Seconds every 100 batches | Samples per second | Speed up |
+| ---------------- | ----------------- | ------------------------- | ------------------ | -------- |
+| 1                | 1                 | 1.4515                    | 68.89              | 1.00     |
+| 2                | 2                 | 1.5962                    | 62.65\*2           | 1.82     |
+| 4                | 4                 | 1.7635                    | 56.71\*4           | 3.29     |
+| 8                | 8                 | 1.7267                    | 57.91\*8           | 6.72     |
 
 ### How to use
 
diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md
index 32ecdd0ab2..b2635479ce 100644
--- a/doc/train/tensorboard.md
+++ b/doc/train/tensorboard.md
@@ -26,42 +26,51 @@ Before running TensorBoard, make sure you have generated summary data in a log
 directory by modifying the input script, setting {ref}`tensorboard <training/tensorboard>` to true in the training subsection will enable the TensorBoard data analysis. eg. **water_se_a.json**.
 
 ```json
-    "training" : {
-	"systems":	["../data/"],
-	"stop_batch":	1000000,
-	"batch_size":	1,
-
-	"seed":		1,
-
-	"_comment": " display and restart",
-	"_comment": " frequencies counted in batch",
-	"disp_file":	"lcurve.out",
-	"disp_freq":	100,
-	"numb_test":	10,
-	"save_freq":	1000,
-	"save_ckpt":	"model.ckpt",
-
-	"disp_training":true,
-	"time_training":true,
-	"tensorboard":	true,
-	"tensorboard_log_dir":"log",
-	"tensorboard_freq": 1000,
-	"profiling":	false,
-	"profiling_file":"timeline.json",
-	"_comment":	"that's all"
-    }
+"training": {
+   "systems": ["../data/"],
+   "stop_batch": 1000000,
+   "batch_size": 1,
+
+   "seed": 1,
+   "_comment": " display and restart",
+   "_comment": " frequencies counted in batch",
+   "disp_file": "lcurve.out",
+   "disp_freq": 100,
+   "numb_test": 10,
+   "save_freq": 1000,
+   "save_ckpt": "model.ckpt",
+
+   "disp_training": true,
+   "time_training": true,
+   "tensorboard": true,
+   "tensorboard_log_dir": "log",
+   "tensorboard_freq": 1000,
+   "profiling": false,
+   "profiling_file": "timeline.json",
+   "_comment": "that's all"
+}
 ```
 
 Once you have event files, run TensorBoard and provide the log directory. This
 should print that TensorBoard has started. Next, connect to http://tensorboard_server_ip:6006.
 
-TensorBoard requires a logdir to read logs from. For info on configuring TensorBoard, run TensorBoard --help.
+TensorBoard requires a logdir to read logs from. For info on configuring TensorBoard, run `tensorboard --help`.
 One can easily change the log name with "tensorboard_log_dir" and the sampling frequency with "tensorboard_freq".
 
 ```bash
 tensorboard --logdir path/to/logs
 ```
 
+## PyTorch Profiler With TensorBoard {{ pytorch_icon }}
+
+DeePMD-kit has built-in support for [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-profiler-to-record-execution-events).
+The profiler requires extra packages for recording and visualization:
+`pip install tensorboard torch-tb-profiler`
+Set `"enable_profiler": true` in the training section of the input script, and launch a training task with 10 steps, since the default setting of the profiler scheduler is `wait=1, warmup=1, active=3, repeat=1`.
+The profiler will generate recording files in `tensorboard_log_dir`.
+
+To [visualize the profiling data](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#use-tensorboard-to-view-results-and-analyze-model-performance), launch TensorBoard (see above) and navigate to the "pytorch_profiler" tab.
+
 ## Examples
 
 ### Tracking and visualizing loss metrics(red:train, blue:test)
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index ca79860450..eb795cd47a 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -14,7 +14,9 @@ set(DEEPMD_C_ROOT
     ""
     CACHE PATH "Path to imported DeePMD-kit C library")
 
-set(CMAKE_CXX_STANDARD 11)
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 11)
+endif()
 macro(set_if_higher VARIABLE VALUE)
   # ${VARIABLE} is a variable name, not a string
   if(${VARIABLE} LESS "${VALUE}")
@@ -446,7 +448,7 @@ if(BUILD_CPP_IF
   set(version_file "${generated_dir}/${CMAKE_PROJECT_NAME}ConfigVersion.cmake")
   write_basic_package_version_file(
     ${version_file}
-    VERSION $<IF:${GIT_SUMM}?${GIT_SUMM}:"0.0.0">
+    VERSION $<IF:${GIT_SUMM}?${GIT_SUMM}:0.0.0>
     COMPATIBILITY AnyNewerVersion)
   install(
     EXPORT ${targets_export_name}
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index 6910de3ccd..4be03aea18 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -143,7 +143,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
   int natoms = atype.size();
   auto options = torch::TensorOptions().dtype(torch::kFloat64);
   torch::ScalarType floatType = torch::kFloat64;
-  if (std::is_same_v<VALUETYPE, float>) {
+  if (std::is_same<VALUETYPE, float>::value) {
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
@@ -341,7 +341,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
   int natoms = atype.size();
   auto options = torch::TensorOptions().dtype(torch::kFloat64);
   torch::ScalarType floatType = torch::kFloat64;
-  if (std::is_same_v<VALUETYPE, float>) {
+  if (std::is_same<VALUETYPE, float>::value) {
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc
index aef2d60150..cf9d8010e7 100644
--- a/source/api_cc/src/DeepSpinPT.cc
+++ b/source/api_cc/src/DeepSpinPT.cc
@@ -145,7 +145,7 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener,
   int natoms = atype.size();
   auto options = torch::TensorOptions().dtype(torch::kFloat64);
   torch::ScalarType floatType = torch::kFloat64;
-  if (std::is_same_v<VALUETYPE, float>) {
+  if (std::is_same<VALUETYPE, float>::value) {
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
@@ -365,7 +365,7 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener,
   int natoms = atype.size();
   auto options = torch::TensorOptions().dtype(torch::kFloat64);
   torch::ScalarType floatType = torch::kFloat64;
-  if (std::is_same_v<VALUETYPE, float>) {
+  if (std::is_same<VALUETYPE, float>::value) {
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 8127979cd1..d4887573cc 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -50,7 +50,7 @@ static const char cite_user_deepmd_package[] =
     "energy representation and molecular dynamics}},\n"
     "  pages = {178--184}\n"
     "}\n"
-    "@misc{Zeng_JChemPhys_2023_v159_p054801,\n"
+    "@article{Zeng_JChemPhys_2023_v159_p054801,\n"
     "  title  = {{DeePMD-kit v2: A software package for deep potential "
     "models}},\n"
     "  author =   {Jinzhe Zeng and Duo Zhang and Denghui Lu and Pinghui Mo and "
@@ -82,6 +82,38 @@ static const char cite_user_deepmd_package[] =
     "  year =    2023,\n"
     "  pages  =   054801,\n"
     "  doi =      {10.1063/5.0155600},\n"
+    "}\n"
+    "@Article{Zeng_arXiv_2025_p2502.19161,\n"
+    "  annote       = {general purpose},\n"
+    "    author =   {Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu "
+    "Zhang and Sensen\n"
+    "             He and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan "
+    "Li and Chun\n"
+    "             Cai and Chengqian Zhang and Yiming Du and Jia-Xin Zhu and "
+    "Pinghui Mo\n"
+    "             and Zhengtao Huang and Qiyu Zeng and Shaochen Shi and "
+    "Xuejian Qin and\n"
+    "             Zhaoxi Yu and Chenxing Luo and Ye Ding and Yun-Pei Liu and "
+    "Ruosong Shi\n"
+    "             and Zhenyu Wang and Sigbj{\\o}rn L{\\o}land Bore and Junhan "
+    "Chang and\n"
+    "             Zhe Deng and Zhaohan Ding and Siyuan Han and Wanrun Jiang "
+    "and Guolin\n"
+    "             Ke and Zhaoqing Liu and Denghui Lu and Koki Muraoka and "
+    "Hananeh Oliaei\n"
+    "             and Anurag Kumar Singh and Haohui Que and Weihong Xu and "
+    "Zhangmancang\n"
+    "             Xu and Yong-Bin Zhuang and Jiayu Dai and Timothy J. Giese "
+    "and Weile\n"
+    "             Jia and Ben Xu and Darrin M. York and Linfeng Zhang and Han "
+    "Wang},\n"
+    "    title =    {{DeePMD-kit v3: A Multiple-Backend Framework for Machine "
+    "Learning\n"
+    "             Potentials}},\n"
+    "    journal =  {arXiv},\n"
+    "    year =     2025,\n"
+    "    pages =    {2502.19161},\n"
+    "    doi =      {10.48550/arXiv.2502.19161},\n"
     "}\n\n";
 
 PairDeepMD::PairDeepMD(LAMMPS *lmp)
diff --git a/source/lmp/pair_deepspin.cpp b/source/lmp/pair_deepspin.cpp
index 5e7d4474b9..105e98fa70 100644
--- a/source/lmp/pair_deepspin.cpp
+++ b/source/lmp/pair_deepspin.cpp
@@ -50,7 +50,7 @@ static const char cite_user_deepmd_package[] =
     "energy representation and molecular dynamics}},\n"
     "  pages = {178--184}\n"
     "}\n"
-    "@misc{Zeng_JChemPhys_2023_v159_p054801,\n"
+    "@article{Zeng_JChemPhys_2023_v159_p054801,\n"
     "  title  = {{DeePMD-kit v2: A software package for deep potential "
     "models}},\n"
     "  author =   {Jinzhe Zeng and Duo Zhang and Denghui Lu and Pinghui Mo and "
@@ -82,6 +82,38 @@ static const char cite_user_deepmd_package[] =
     "  year =    2023,\n"
     "  pages  =   054801,\n"
     "  doi =      {10.1063/5.0155600},\n"
+    "}\n"
+    "@Article{Zeng_arXiv_2025_p2502.19161,\n"
+    "  annote       = {general purpose},\n"
+    "    author =   {Jinzhe Zeng and Duo Zhang and Anyang Peng and Xiangyu "
+    "Zhang and Sensen\n"
+    "             He and Yan Wang and Xinzijian Liu and Hangrui Bi and Yifan "
+    "Li and Chun\n"
+    "             Cai and Chengqian Zhang and Yiming Du and Jia-Xin Zhu and "
+    "Pinghui Mo\n"
+    "             and Zhengtao Huang and Qiyu Zeng and Shaochen Shi and "
+    "Xuejian Qin and\n"
+    "             Zhaoxi Yu and Chenxing Luo and Ye Ding and Yun-Pei Liu and "
+    "Ruosong Shi\n"
+    "             and Zhenyu Wang and Sigbj{\\o}rn L{\\o}land Bore and Junhan "
+    "Chang and\n"
+    "             Zhe Deng and Zhaohan Ding and Siyuan Han and Wanrun Jiang "
+    "and Guolin\n"
+    "             Ke and Zhaoqing Liu and Denghui Lu and Koki Muraoka and "
+    "Hananeh Oliaei\n"
+    "             and Anurag Kumar Singh and Haohui Que and Weihong Xu and "
+    "Zhangmancang\n"
+    "             Xu and Yong-Bin Zhuang and Jiayu Dai and Timothy J. Giese "
+    "and Weile\n"
+    "             Jia and Ben Xu and Darrin M. York and Linfeng Zhang and Han "
+    "Wang},\n"
+    "    title =    {{DeePMD-kit v3: A Multiple-Backend Framework for Machine "
+    "Learning\n"
+    "             Potentials}},\n"
+    "    journal =  {arXiv},\n"
+    "    year =     2025,\n"
+    "    pages =    {2502.19161},\n"
+    "    doi =      {10.48550/arXiv.2502.19161},\n"
     "}\n\n";
 
 PairDeepSpin::PairDeepSpin(LAMMPS *lmp)
diff --git a/source/op/pt/tabulate_multi_device.cc b/source/op/pt/tabulate_multi_device.cc
index 5c710f5c37..feae37af81 100644
--- a/source/op/pt/tabulate_multi_device.cc
+++ b/source/op/pt/tabulate_multi_device.cc
@@ -61,6 +61,10 @@ void TabulateFusionSeAForward(const torch::Tensor& table_tensor,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
                                      two_embed, nloc, nnei, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
@@ -110,6 +114,10 @@ void TabulateFusionSeAGradForward(const torch::Tensor& table_tensor,
     deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, dy_dtwo, table,
                                           table_info, em_x, em, two_embed, dy,
                                           nloc, nnei, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, dy_dtwo, table,
@@ -166,6 +174,10 @@ void TabulateFusionSeAGradGradForward(const torch::Tensor& table_tensor,
     deepmd::tabulate_fusion_se_a_grad_grad_gpu(
         dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem,
         dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     TORCH_CHECK(last_layer_size <= 1024,
                 "In the process of model compression, the size of the "
@@ -212,6 +224,10 @@ void TabulateFusionSeTForward(const torch::Tensor& table_tensor,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em,
                                      nloc, nnei_i, nnei_j, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em,
@@ -254,6 +270,10 @@ void TabulateFusionSeTGradForward(const torch::Tensor& table_tensor,
     deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info,
                                           em_x, em, dy, nloc, nnei_i, nnei_j,
                                           last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info,
@@ -300,6 +320,10 @@ void TabulateFusionSeTGradGradForward(const torch::Tensor& table_tensor,
     deepmd::tabulate_fusion_se_t_grad_grad_gpu(dz_dy, table, table_info, em_x,
                                                em, dz_dy_dem_x, dz_dy_dem, nloc,
                                                nnei_i, nnei_j, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     TORCH_CHECK(last_layer_size <= 1024,
                 "In the process of model compression, the size of the "
@@ -340,6 +364,10 @@ void TabulateFusionSeRForward(const torch::Tensor& table_tensor,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc,
                                      nnei, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc,
@@ -376,6 +404,10 @@ void TabulateFusionSeRGradForward(const torch::Tensor& table_tensor,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy,
                                           nloc, nnei, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   } else if (device == "CPU") {
     deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy,
@@ -412,6 +444,10 @@ void TabulateFusionSeRGradGradForward(const torch::Tensor& table_tensor,
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     deepmd::tabulate_fusion_se_r_grad_grad_gpu(
         dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size);
+#else
+    throw std::runtime_error(
+        "The input tensor is on the GPU, but the GPU support for the "
+        "customized OP library is not enabled.");
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     TORCH_CHECK(last_layer_size <= 1024,
                 "In the process of model compression, the size of the "
diff --git a/source/tests/consistent/common.py b/source/tests/consistent/common.py
index 358ac8d542..88f12bb6a8 100644
--- a/source/tests/consistent/common.py
+++ b/source/tests/consistent/common.py
@@ -326,6 +326,9 @@ def test_tf_consistent_with_ref(self) -> None:
         data1.pop("@version")
         data2.pop("@version")
 
+        if tf_obj.__class__.__name__.startswith("Polar"):
+            data1["@variables"].pop("bias_atom_e")
+
         np.testing.assert_equal(data1, data2)
         for rr1, rr2 in zip(ret1, ret2):
             np.testing.assert_allclose(
diff --git a/source/tests/infer/test_models.py b/source/tests/infer/test_models.py
index a6cde3206c..bf1a04766c 100644
--- a/source/tests/infer/test_models.py
+++ b/source/tests/infer/test_models.py
@@ -159,6 +159,10 @@ def test_descriptor(self) -> None:
             descpt = self.dp.eval_descriptor(result.coord, result.box, result.atype)
             expected_descpt = result.descriptor
             np.testing.assert_almost_equal(descpt.ravel(), expected_descpt.ravel())
+            # See #4533
+            descpt = self.dp.eval_descriptor(result.coord, result.box, result.atype)
+            expected_descpt = result.descriptor
+            np.testing.assert_almost_equal(descpt.ravel(), expected_descpt.ravel())
 
     def test_2frame_atm(self) -> None:
         for ii, result in enumerate(self.case.results):
diff --git a/source/tests/pt/__init__.py b/source/tests/pt/__init__.py
index 1a6de0591a..045a752b96 100644
--- a/source/tests/pt/__init__.py
+++ b/source/tests/pt/__init__.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import torch
+import torch._dynamo
 
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
diff --git a/source/tests/tf/test_dipole_hybrid_descrpt.py b/source/tests/tf/test_dipole_hybrid_descrpt.py
new file mode 100644
index 0000000000..cc500c43ac
--- /dev/null
+++ b/source/tests/tf/test_dipole_hybrid_descrpt.py
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import numpy as np
+
+from deepmd.tf.descriptor.hybrid import (
+    DescrptHybrid,
+)
+from deepmd.tf.env import (
+    tf,
+)
+from deepmd.tf.fit import (
+    DipoleFittingSeA,
+)
+from deepmd.tf.model import (
+    DipoleModel,
+)
+
+from .common import (
+    DataSystem,
+    gen_data,
+    j_loader,
+)
+
+GLOBAL_ENER_FLOAT_PRECISION = tf.float64
+GLOBAL_TF_FLOAT_PRECISION = tf.float64
+GLOBAL_NP_FLOAT_PRECISION = np.float64
+
+
+class TestModel(tf.test.TestCase):
+    def setUp(self) -> None:
+        gen_data()
+
+    def test_model(self) -> None:
+        jfile = "polar_se_a.json"
+        jdata = j_loader(jfile)
+
+        systems = jdata["systems"]
+        set_pfx = "set"
+        batch_size = 1
+        test_size = 1
+        rcut = jdata["model"]["descriptor"]["rcut"]
+
+        data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None)
+
+        test_data = data.get_test()
+        numb_test = 1
+
+        descrpt = DescrptHybrid(
+            list=[
+                {
+                    "type": "se_e2_a",
+                    "sel": [20, 20],
+                    "rcut_smth": 1.8,
+                    "rcut": 6.0,
+                    "neuron": [2, 4, 8],
+                    "resnet_dt": False,
+                    "axis_neuron": 8,
+                    "precision": "float64",
+                    "type_one_side": True,
+                    "seed": 1,
+                },
+                {
+                    "type": "se_e2_a",
+                    "sel": [20, 20],
+                    "rcut_smth": 1.8,
+                    "rcut": 6.0,
+                    "neuron": [2, 4, 8],
+                    "resnet_dt": False,
+                    "axis_neuron": 8,
+                    "precision": "float64",
+                    "type_one_side": True,
+                    "seed": 1,
+                },
+                {
+                    "type": "se_e3",
+                    "sel": [5, 5],
+                    "rcut_smth": 1.8,
+                    "rcut": 2.0,
+                    "neuron": [2],
+                    "resnet_dt": False,
+                    "precision": "float64",
+                    "seed": 1,
+                },
+            ]
+        )
+        jdata["model"]["fitting_net"].pop("type", None)
+        jdata["model"]["fitting_net"].pop("fit_diag", None)
+        jdata["model"]["fitting_net"]["ntypes"] = descrpt.get_ntypes()
+        jdata["model"]["fitting_net"]["dim_descrpt"] = descrpt.get_dim_out()
+        jdata["model"]["fitting_net"]["embedding_width"] = descrpt.get_dim_rot_mat_1()
+        fitting = DipoleFittingSeA(**jdata["model"]["fitting_net"], uniform_seed=True)
+        model = DipoleModel(descrpt, fitting)
+
+        # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']])
+        input_data = {
+            "coord": [test_data["coord"]],
+            "box": [test_data["box"]],
+            "type": [test_data["type"]],
+            "natoms_vec": [test_data["natoms_vec"]],
+            "default_mesh": [test_data["default_mesh"]],
+            "fparam": [test_data["fparam"]],
+        }
+        model._compute_input_stat(input_data)
+
+        t_prop_c = tf.placeholder(tf.float32, [5], name="t_prop_c")
+        t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord")
+        t_type = tf.placeholder(tf.int32, [None], name="i_type")
+        t_natoms = tf.placeholder(tf.int32, [model.ntypes + 2], name="i_natoms")
+        t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name="i_box")
+        t_mesh = tf.placeholder(tf.int32, [None], name="i_mesh")
+        is_training = tf.placeholder(tf.bool)
+        t_fparam = None
+
+        model_pred = model.build(
+            t_coord,
+            t_type,
+            t_natoms,
+            t_box,
+            t_mesh,
+            t_fparam,
+            suffix="dipole_hybrid",
+            reuse=False,
+        )
+        dipole = model_pred["dipole"]
+        gdipole = model_pred["global_dipole"]
+        force = model_pred["force"]
+        virial = model_pred["virial"]
+        atom_virial = model_pred["atom_virial"]
+
+        feed_dict_test = {
+            t_prop_c: test_data["prop_c"],
+            t_coord: np.reshape(test_data["coord"][:numb_test, :], [-1]),
+            t_box: test_data["box"][:numb_test, :],
+            t_type: np.reshape(test_data["type"][:numb_test, :], [-1]),
+            t_natoms: test_data["natoms_vec"],
+            t_mesh: test_data["default_mesh"],
+            is_training: False,
+        }
+
+        sess = self.cached_session().__enter__()
+        sess.run(tf.global_variables_initializer())
+        [p, gp, f, v, av] = sess.run(
+            [dipole, gdipole, force, virial, atom_virial], feed_dict=feed_dict_test
+        )