From c95f731f2a4319fe2f596a600e49b44c3b0e9482 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Mon, 8 Sep 2025 20:58:16 +0800
Subject: [PATCH 01/11] delete original outisli branch for format problems but
 picked some vital changes

---
 CLAUDE.md                   |  306 +++++
 deepmd/pt/train/training.py |    5 +-
 outisli/DPA3.md             | 2097 +++++++++++++++++++++++++++++++++++
 outisli/install.md          |  179 +++
 source/lmp/builtin.cmake    |    1 +
 5 files changed, 2587 insertions(+), 1 deletion(-)
 create mode 100644 CLAUDE.md
 create mode 100644 outisli/DPA3.md
 create mode 100644 outisli/install.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000000..ad1443d8b9
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,306 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+DeePMD-kit is a deep learning package for many-body potential energy representation and molecular dynamics. It supports multiple backends (TensorFlow, PyTorch, JAX, Paddle) and interfaces with various MD packages (LAMMPS, i-PI, AMBER, GROMACS, etc.).
+
+## Development Commands
+
+### Building and Installation
+- **Standard build**: `pip install .`
+- **With GPU support**: Set `DP_VARIANT=cuda` before building; select the backends to build with environment variables like `DP_ENABLE_PYTORCH=1`, `DP_ENABLE_TENSORFLOW=1`, etc.
+- **From source**: Uses scikit-build-core with CMake - see `source/CMakeLists.txt`
+- **C++ library**: Built automatically as part of the Python package
+
+### Testing
+- **Run all tests**: `pytest source/tests`
+- **Run specific backend tests**: `pytest source/tests/tf/`, `pytest source/tests/pt/`, etc.
+- **GPU tests**: `tox -e gpu` or set `DP_VARIANT=cuda`
+- **Individual test**: `pytest source/tests/path/to/test_file.py::test_name`
+- **With coverage**: `pytest --cov=deepmd`
+
+### Code Quality
+- **Linting**: `ruff check .`
+- **Formatting**: `ruff format .`
+- **Type checking**: No specific type checker configured in the project
+
+### Backend-Specific Commands
+- **TensorFlow**: Requires TF 2.19.0, automatically enabled with certain flags
+- **PyTorch**: Enable with `DP_ENABLE_PYTORCH=1`
+- **JAX**: Enable with `DP_ENABLE_JAX=1` (requires Python >= 3.10)
+- **Paddle**: Enable with `DP_ENABLE_PADDLE=1`
+
+## Architecture Overview
+
+### Multi-Backend Design
+The codebase is organized around a modular backend system in `deepmd/backend/`:
+- `backend.py`: Core backend management logic
+- `tensorflow.py`, `pytorch.py`, `jax.py`, `paddle.py`: Backend-specific implementations
+- `suffix.py`: Model file suffix handling for different backends
+
+### Core Components
+
+#### 1. Model Architecture (`deepmd/dpmodel/`)
+Framework-agnostic model implementations:
+- `atomic_model/`: Atomic-level model components
+- `descriptor/`: Environment descriptors (se_a, se_atten, dpa1/2/3, etc.)
+- `fitting/`: Fitting networks for energy, forces, etc.
+- `model/`: Complete model definitions
+
+### DPA3 Descriptor Implementation
+
+#### DPA3 Architecture Overview
+DPA3 (Deep Potential - Atomic Environment Representation with 3-body interactions) is an advanced descriptor that combines node, edge, and angle information for more accurate atomic environment representation.
+
+**Key Components**:
+- **Main Descriptor**: `DescrptDPA3` in `deepmd/pt/model/descriptor/dpa3.py`
+- **RepFlow Block**: `DescrptBlockRepflows` in `deepmd/pt/model/descriptor/repflows.py`
+- **RepFlow Layer**: `RepFlowLayer` in `deepmd/pt/model/descriptor/repflow_layer.py`
+
+#### DPA3 Initialization and Forward Pass
+**Initialization** (`dpa3.py:105-171`):
+- Processes RepFlow parameters
+- Creates type embedding network (`TypeEmbedNetConsistent`)
+- Initializes RepFlow blocks with edge/angle embedding networks
+- Sets up multiple RepFlow layers for iterative refinement
+
+**Forward Pass** (`dpa3.py:430-498`):
+1. **Type Embedding**: Computes atomic type embeddings
+2. **RepFlow Processing**: Multi-layer node/edge/angle information processing
+3. **Output**: Returns node descriptors, rotation matrices, edge embeddings, and switch functions
+
+**DPA3 Output Variables**:
+- `node_ebd`: Node descriptors [nf, nloc, n_dim] - main atomic environment representation
+- `rot_mat`: Rotation matrices [nf, nloc, e_dim, 3] - for SE(3) equivariance
+- `edge_ebd`: Edge embeddings [nf, nloc, nnei, e_dim] - pairwise interactions
+- `h2`: Pair representation [nf, nloc, nnei, 3] - rotationally equivariant per-neighbor data
+- `sw`: Switch functions [nf, nloc, nnei] - smooth cutoff boundaries
+
+#### RepFlow Implementation
+**RepFlow Block** (`repflows.py:77-200`):
+- Edge embedding network for distance information
+- Angle embedding network for angular information
+- Multiple RepFlow layers for iterative updates
+- Support for message compression and multi-head attention
+
+**Key Parameters**:
+- `e_rcut`/`e_rcut_smth`: Edge cutoff and smoothing radii
+- `a_rcut`/`a_rcut_smth`: Angle cutoff and smoothing radii  
+- `n_dim`/`e_dim`/`a_dim`: Node/edge/angle representation dimensions
+- `nlayers`: Number of RepFlow layers
+- `update_style`: Residual connection strategies (res_residual, res_update, etc.)
+
+#### CLI Usage and Training Flow
+**Training Command**: `dp --pt train input.json` (plain `dp train` uses the default TensorFlow backend)
+
+**Execution Flow**:
+1. **Entry Point**: `deepmd.pt.entrypoints.main.train()` (`main.py:237-248`)
+2. **Configuration Loading**: JSON parsing and multi-task handling
+3. **Neighbor Statistics**: Automatic selection parameter computation
+4. **Trainer Creation**: `get_trainer()` with model initialization
+5. **Model Building**: DPA3 descriptor creation via `get_model()`
+
+#### Precision Control
+DPA3 supports two levels of precision control:
+
+**Environment Variable Control**:
+```bash
+export DP_INTERFACE_PREC=high  # Default: float64 interface
+export DP_INTERFACE_PREC=low   # Lower memory: float32 interface
+```
+
+**Model Parameter Control**:
+```json
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "precision": "float32",
+      "repflow": {
+        "precision": "float32"
+      }
+    }
+  }
+}
+```
+
+#### Inference System
+**Main Classes**:
+- `DeepEval`: Universal inference interface (`deepmd/pt/infer/deep_eval.py:75`)
+- `Tester`: Testing and inference utility (`deepmd/pt/infer/inference.py:25`)
+
+**Inference Flow**:
+1. **Model Loading**: State dict loading and multi-task handling
+2. **JIT Compilation**: Optional TorchScript optimization
+3. **Batch Processing**: Automatic batch sizing for memory optimization
+4. **Execution**: DPA3 descriptor computation in evaluation mode
+
+**Performance Optimizations**:
+- **JIT Compilation**: `torch.jit.script()` for graph optimization
+- **Auto-batching**: Dynamic batch size adjustment based on memory
+- **Multi-device**: CPU/GPU support with automatic device selection
+- **Model Freezing**: `dp freeze` for deployment-optimized models
+
+#### Configuration Example
+```json
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "repflow": {
+        "e_rcut": 6.0,
+        "e_sel": 120,
+        "a_rcut": 4.0,
+        "a_sel": 40,
+        "n_dim": 128,
+        "e_dim": 64,
+        "a_dim": 32,
+        "nlayers": 3,
+        "update_style": "res_residual"
+      },
+      "concat_output_tebd": true,
+      "precision": "float32"
+    }
+  }
+}
+```
+
+#### Energy Summation Mechanism
+DPA3 implements a two-stage energy calculation:
+1. **Atomic Energy**: Each atom's local environment energy computed in fitting networks
+2. **System Energy**: Atomic energies summed to get total system energy
+
+**Key Files**:
+- Atomic energy: `deepmd/pt/model/task/fitting.py:473-614`
+- Energy summation: `deepmd/pt/model/model/transform_output.py:153-192`
+
+#### 2. Backend-Specific Implementations
+- `deepmd/tf/`: TensorFlow backend (original implementation)
+- `deepmd/pt/`: PyTorch backend 
+- `deepmd/jax/`: JAX backend
+- `deepmd/pd/`: Paddle backend
+
+Each backend implements similar interfaces:
+- Descriptor variants optimized for the framework
+- Training and inference modules
+- Model serialization/loading
+
+#### 3. Inference (`deepmd/infer/`)
+High-level inference interfaces:
+- `deep_pot.py`: Main potential energy model interface
+- `deep_eval.py`: Generic evaluation interface
+- Backend-specific inference modules
+
+#### 4. Training (`deepmd/*/train/`)
+Backend-specific training implementations:
+- Training loops and optimization
+- Data loading and preprocessing
+- Checkpoint management
+
+#### 5. Entry Points (`deepmd/entrypoints/`)
+Command-line interface commands:
+- `main.py`: Main CLI dispatcher
+- Training, testing, conversion utilities
+- Model analysis and documentation tools
+
+#### 6. C++ Integration (`source/`)
+- `lib/`: Core computational library with CUDA/ROCm support
+- `api_cc/`: C++ API for external integration
+- `api_c/`: C API wrapper
+- `lmp/`: LAMMPS plugin integration
+- `op/`: Custom operators for different frameworks
+
+### PyTorch Backend Data Processing
+
+#### Two-Level DataLoader Architecture
+The PyTorch backend uses a unique two-level DataLoader system for efficient multi-system data management:
+
+- **System Level**: Each data system has its own DataLoader (num_workers=0 to avoid thread explosion)
+- **Training Level**: Master DataLoader handles sampling and batching across systems (num_workers=NUM_WORKERS)
+
+**Key Components**:
+- `DeepmdData`: Raw data loading from HDF5/.npy files (`deepmd/utils/data.py`)
+- `DpLoaderSet`: System-level DataLoader collection (`deepmd/pt/utils/dataloader.py`)
+- `DeepmdDataSetForLoader`: PyTorch Dataset wrapper
+- `collate_batch`: Batch processing function for variable-sized systems
+
+**Data Flow**:
+```
+Raw Data (HDF5/.npy) → DeepmdData → System DataLoaders → DpLoaderSet → Training DataLoader → Model Input
+```
+
+### DPAtomicModel Hierarchy
+
+#### Class Structure
+```text
+BaseAtomicModel (base_atomic_model.py:52)
+    ↓
+DPAtomicModel (dp_atomic_model.py:34) - registered as "standard"
+    ↓
+Specific Models (Energy, Dipole, Polar, DOS, Property)
+```
+
+**Key Features**:
+- **Unified Interface**: Consistent API for different physical properties
+- **Atomic-Level Forward Pass**: `forward_atomic()` method handles descriptor computation and fitting
+- **Multi-Task Support**: Supports training multiple properties simultaneously
+- **Automatic Differentiation**: Force and virial computation through autograd
+
+**Key Files**:
+- Base class: `deepmd/pt/model/atomic_model/dp_atomic_model.py:34`
+- Energy model: `deepmd/pt/model/atomic_model/energy_atomic_model.py:13`
+- Dipole model: `deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`
+
+### Key Design Patterns
+
+#### Backend Abstraction
+The code uses a sophisticated backend system that allows:
+- Runtime backend selection
+- Model conversion between backends
+- Consistent APIs across frameworks
+
+#### Descriptor-Based Architecture
+Models are built from:
+1. **Descriptors**: Local atomic environment representations
+2. **Fitting Networks**: Map descriptors to physical quantities
+3. **Models**: Combine descriptors and fitting for complete potentials
+
+#### Multi-Task Learning
+Support for training multiple properties simultaneously:
+- Energy, forces, virial
+- Dipole moments, polarizability
+- DOS, electronic properties
+- Spin systems
+
+## Working with the Code
+
+### Adding New Features
+1. **Framework-agnostic**: Add to `deepmd/dpmodel/` first
+2. **Backend implementations**: Extend each backend in `deepmd/*/`
+3. **C++ optimization**: Add performance-critical code to `source/lib/`
+4. **Tests**: Add backend-specific tests in `source/tests/*/`
+
+### Model Development
+- Use existing descriptors as templates in `deepmd/dpmodel/descriptor/`
+- Extend fitting networks in `deepmd/dpmodel/fitting/`
+- Model composition follows patterns in `deepmd/dpmodel/model/`
+
+### Performance Considerations
+- C++ library handles neighbor lists and environment matrices
+- Custom operators optimized for GPU acceleration
+- Automatic mixed precision support where available
+
+### Common Pitfalls
+- Backend-specific imports are banned at module level (use runtime imports)
+- Model compatibility requires careful version management
+- GPU builds require specific CUDA/ROCm versions
+
+## File Structure Conventions
+
+- **Public APIs**: In `deepmd/` top-level modules
+- **Implementation details**: In subdirectories like `dpmodel/`, `utils/`
+- **Backend code**: Separated into `tf/`, `pt/`, `jax/`, `pd/` directories
+- **Tests**: Organized by backend in `source/tests/*/`
+- **Examples**: In `examples/` directory with input configurations
\ No newline at end of file
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 0dfbe94b6b..c12c12b28b 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -20,6 +20,9 @@
 import numpy as np
 import torch
 
+from deepmd.pt.utils import (
+    env,
+)
 from deepmd.common import (
     symlink_prefix_files,
 )
@@ -836,7 +839,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
                         self.gradient_max_norm,
                         error_if_nonfinite=True,
                     )
-                with torch.device("cpu"):
+                with torch.device(env.DEVICE):
                     self.optimizer.step()
                 self.scheduler.step()
             elif self.opt_type == "LKF":
diff --git a/outisli/DPA3.md b/outisli/DPA3.md
new file mode 100644
index 0000000000..b5ed63016c
--- /dev/null
+++ b/outisli/DPA3.md
@@ -0,0 +1,2097 @@
+# DeepMD 源码导读与 DPA3 PyTorch 实现技术文档
+
+## 概述
+
+DPA3 是 DeePMD-kit 中基于 PyTorch 实现的高级原子环境描述符。它通过结合节点、边和角度信息，构建了更加精确的原子环境表示。
+
+请注意，该文档由 AI 生成，仅经过大致检查，可能存在出入，仅供阅读 deepmd-kit 源码的参考与指引。且代码行号基于作者本地格式化后的代码，因此与 GitHub 上源代码存在一定差异。
+
+### 文档结构
+
+本文档按照 DPA3 的实际使用流程和技术架构组织，包含以下主要部分：
+
+- **第一部分：快速开始** - 从 CLI 使用到基本配置的快速入门
+- **第二部分：系统架构** - DPA3 的整体设计和组件关系
+- **第三部分：详细实现** - 核心算法和技术实现细节
+- **第四部分：数据处理系统** - PyTorch 后端的数据处理架构
+- **第五部分：推理和部署** - 模型部署和集成方案
+
+---
+
+## 第一部分：快速开始
+
+### 1.1 CLI 入口和基本使用
+
+#### 1.1.1 命令行入口流程
+
+当用户执行 `dp --pt train input.json` 命令时，程序执行以下流程：
+
+1. **主入口点解析**: `deepmd.main.parse_args()` 解析命令行参数
+2. **后端选择**: 根据 `backend` 参数选择 PyTorch 后端
+3. **训练函数调用**: 调用 `deepmd.pt.entrypoints.main.train()`
+
+**关键文件位置**:
+
+- `deepmd/pt/entrypoints/main.py:237-248` - train 函数定义
+- `deepmd/entrypoints/main.py:41-91` - 主入口点分发逻辑
+
+#### 1.1.2 训练初始化流程
+
+在 `train()` 函数中，程序按以下步骤初始化：
+
+```python
+# 1. 配置文件加载和解析
+with open(input_file) as fin:
+    config = json.load(fin)
+
+# 2. 多任务模型处理
+multi_task = "model_dict" in config["model"]
+if multi_task:
+    config["model"], shared_links = preprocess_shared_params(config["model"])
+
+# 3. 邻居统计计算
+if not skip_neighbor_stat:
+    min_nbor_dist, trainer = update_sel(config, model_branch)
+
+# 4. 训练器创建
+trainer = get_trainer(
+    config, init_model, restart, finetune, force_load, init_frz_model,
+    shared_links=shared_links, finetune_links=finetune_links
+)
+```
+
+**关键代码位置**: `deepmd/pt/entrypoints/main.py:322-331`
+
+#### 1.1.3 模型构建流程
+
+训练器初始化后，通过 `get_model()` 函数构建模型：
+
+1. **模型解析**: 根据配置文件中的 `descriptor` 类型选择对应的描述符
+2. **DPA3 初始化**: 当 descriptor 类型为 `"dpa3"` 时，创建 `DescrptDPA3` 实例
+3. **模型组装**: 将描述符与拟合网络组合成完整模型
+
+**关键文件位置**:
+
+- `deepmd/pt/train/training.py:91-100` - Trainer 类定义
+- `deepmd/pt/model/model/model.py` - BaseModel 类和模型工厂函数
+
+### 1.2 基本配置示例
+
+```json
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "repflow": {
+        "e_rcut": 6.0,
+        "e_sel": 200,
+        "a_rcut": 5.0,
+        "a_sel": 60,
+        "n_dim": 128,
+        "e_dim": 64,
+        "a_dim": 32,
+        "nlayers": 6,
+        "a_compress_rate": 2,
+        "update_angle": true,
+        "update_style": "res_residual"
+      },
+      "concat_output_tebd": true,
+      "precision": "float32"
+    }
+  }
+}
+```
+
+### 1.3 精度控制配置
+
+DPA3 提供了两种精度控制机制，分别控制不同的计算层面：
+
+#### 1.3.1 环境变量精度控制 (DP_INTERFACE_PREC)
+
+**作用范围**: 全局接口精度控制，影响输入/输出数据类型
+
+**设置方式**:
+
+```bash
+# 高精度模式 (默认)
+export DP_INTERFACE_PREC=high
+
+# 低精度模式
+export DP_INTERFACE_PREC=low
+```
+
+**精度影响**:
+
+- `high`: `GLOBAL_NP_FLOAT_PRECISION = np.float64`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
+- `low`: `GLOBAL_NP_FLOAT_PRECISION = np.float32`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
+
+**文件位置**: `deepmd/env.py:33-48`
+
+#### 1.3.2 模型参数精度控制 (precision)
+
+**作用范围**: 模型组件参数精度，影响神经网络权重和计算精度
+
+**配置位置**: input.json 中的各组件参数
+
+**可选值**:
+
+- `"float64"`: 双精度浮点数
+- `"float32"`: 单精度浮点数
+- `"float16"`: 半精度浮点数
+- `"default"`: 使用系统默认精度
+
+**配置示例**:
+
+```jsonc
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "precision": "float32", // 描述符精度
+      "repflow": {
+        "precision": "float32" // RepFlow组件精度
+      }
+    },
+    "fitting_net": {
+      "precision": "float32" // 拟合网络精度
+    }
+  }
+}
+```
+
+#### 1.3.3 精度控制的工作机制
+
+**文件位置**: `deepmd/pt/model/model/make_model.py:327-337`
+
+在模型执行过程中，精度控制按以下流程工作：
+
+1. **输入类型检测**: `input_type_cast()` 检测输入数据精度
+2. **全局精度转换**: 将输入数据转换为 `GLOBAL_PT_FLOAT_PRECISION`
+3. **模型计算**: 使用模型组件指定的精度进行计算
+4. **输出类型转换**: `output_type_cast()` 将输出转换回输入精度
+
+**关键代码**:
+
+```python
+def input_type_cast(self, coord, box=None, fparam=None, aparam=None):
+    """Cast the input data to global float type."""
+    input_prec = self.reverse_precision_dict[coord.dtype]
+    if input_prec == self.reverse_precision_dict[self.global_pt_float_precision]:
+        return coord, box, fparam, aparam, input_prec
+    else:
+        # 转换为全局精度
+        pp = self.global_pt_float_precision
+        return coord.to(pp), box.to(pp) if box is not None else None, ...
+```
+
+#### 1.3.4 精度设置的最佳实践
+
+**内存敏感场景**:
+
+```bash
+# 使用低精度接口 + 模型单精度
+export DP_INTERFACE_PREC=low
+# 模型配置中使用 "precision": "float32"
+```
+
+**高精度要求场景**:
+
+```bash
+# 使用高精度接口 + 模型双精度
+export DP_INTERFACE_PREC=high
+# 模型配置中使用 "precision": "float64"
+```
+
+**平衡性能和精度**:
+
+```bash
+# 高精度接口保证数据精度，模型使用单精度提高计算效率
+export DP_INTERFACE_PREC=high
+# 模型配置中使用 "precision": "float32"
+```
+
+#### 1.3.5 精度设置的注意事项
+
+1. **兼容性**: `DP_INTERFACE_PREC` 影响整个 DeePMD-kit 的接口，而 `precision` 参数只影响特定模型组件
+2. **性能**: 降低精度通常可以提高计算速度和减少内存使用
+3. **数值稳定性**: 高精度有助于数值稳定性，特别是在训练初期
+4. **能量精度**: 能量相关计算始终使用 `GLOBAL_ENER_FLOAT_PRECISION`，通常为 float64，因此模型在推理输出的时候默认还是双精度（即 LAMMPS 调用时）
+
+### 1.4 快速训练和推理
+
+#### 1.4.1 训练命令
+
+```bash
+# 基本训练（未指定后端时默认使用 TensorFlow 后端）
+dp train input.json
+
+# 指定后端
+dp --pt train input.json
+```
+
+#### 1.4.2 推理命令
+
+```bash
+# 模型测试
+dp test -m dpa3_model.pt -s test_data
+
+# 模型冻结
+dp --pt freeze -c dpa3_model.pt -o frozen_model.pth
+```
+
+---
+
+## 第二部分：系统架构
+
+### 2.1 整体架构设计
+
+DPA3 采用了模块化的设计架构，从数据输入到模型输出的完整流程：
+
+```
+数据输入层
+├── 原始坐标 (coord)
+├── 原子类型 (atype)
+├── 周期边界 (box)
+└── 邻居列表 (nlist)
+    ↓
+数据处理层
+├── DeepmdData (数据加载)
+├── DpLoaderSet (系统级DataLoader)
+└── 训练级DataLoader (采样和批处理)
+    ↓
+DPA3 描述符层
+├── DescrptDPA3 (主描述符)
+│   ├── TypeEmbedNet (类型嵌入)
+│   └── DescrptBlockRepflows (RepFlow块)
+│       ├── 边嵌入网络
+│       ├── 角度嵌入网络
+│       └── RepFlow层列表
+└── 输出处理
+    ↓
+拟合网络层
+├── 能量拟合
+├── 力拟合
+└── 维里拟合
+```
+
+### 2.2 核心组件关系
+
+#### 2.2.1 类继承关系
+
+```python
+@BaseDescriptor.register("dpa3")
+class DescrptDPA3(BaseDescriptor, torch.nn.Module):
+    """DPA3 描述符实现"""
+
+@DescriptorBlock.register("se_repflow")
+class DescrptBlockRepflows(DescriptorBlock):
+    """RepFlow 描述符块"""
+
+class RepFlowLayer(torch.nn.Module):
+    """单个 RepFlow 层"""
+```
+
+#### 2.2.2 组件交互流程
+
+1. **输入处理**: 接收扩展坐标、原子类型和邻居列表
+2. **类型嵌入**: 计算原子类型的嵌入向量
+3. **RepFlow 处理**: 多层节点、边、角信息迭代更新
+4. **输出生成**: 生成最终的原子环境描述符
+
+### 2.3 数据流架构
+
+#### 2.3.1 两级 DataLoader 架构
+
+```
+原始数据 (HDF5/.npy 文件)
+    ↓
+DeepmdData (数据系统加载)
+    ↓
+系统级 DataLoaders (每个系统一个 DataLoader, num_workers=0)
+    ↓
+DpLoaderSet (系统级 DataLoader 集合)
+    ↓
+训练级 DataLoader (采样和批处理, num_workers=NUM_WORKERS)
+    ↓
+模型输入 (coord, atype, box, fparam, aparam)
+```
+
+#### 2.3.2 数据变换流程
+
+1. **单帧加载**: `DeepmdDataSetForLoader.__getitem__()` 加载单个构型
+2. **批处理合并**: `collate_batch()` 组合多个帧
+3. **设备转移**: 数据移动到 GPU/CPU
+4. **输入分离**: 模型输入与标签分离
+
+### 2.4 DPAtomicModel 层次结构
+
+DPAtomicModel 是 DeePMD-kit PyTorch 后端的核心原子模型基类，它继承自 BaseAtomicModel 并为各种物理性质的预测提供了统一的接口。
+
+#### 2.4.1 类继承层次
+
+```text
+# 基础层次结构
+BaseAtomicModel (base_atomic_model.py:52)
+    ↓
+DPAtomicModel (dp_atomic_model.py:34) - 注册为 "standard"
+    ↓
+具体预测模型 (Energy, Dipole, Polar, DOS, Property)
+```
+
+**核心基类定义** (`deepmd/pt/model/atomic_model/dp_atomic_model.py:34`):
+
+```python
+@BaseAtomicModel.register("standard")
+class DPAtomicModel(BaseAtomicModel):
+    """Model give atomic prediction of some physical property.
+    
+    Parameters
+    ----------
+    descriptor
+            Descriptor
+    fitting_net
+            Fitting net
+    type_map
+            Mapping atom type to the name (str) of the type.
+    """
+```
+
+#### 2.4.2 具体派生模型
+
+**能量模型** (`deepmd/pt/model/atomic_model/energy_atomic_model.py:13`):
+```python
+class DPEnergyAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not (isinstance(fitting, EnergyFittingNet) or 
+                isinstance(fitting, EnergyFittingNetDirect) or 
+                isinstance(fitting, InvarFitting)):
+            raise TypeError("fitting must be an instance of EnergyFittingNet, "
+                          "EnergyFittingNetDirect or InvarFitting for DPEnergyAtomicModel")
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+```
+
+**偶极矩模型** (`deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`):
+```python
+class DPDipoleAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not isinstance(fitting, DipoleFittingNet):
+            raise TypeError("fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel")
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+    
+    def apply_out_stat(self, ret: dict[str, torch.Tensor], atype: torch.Tensor):
+        # dipole not applying bias
+        return ret
+```
+
+**极化率模型** (`deepmd/pt/model/atomic_model/polar_atomic_model.py:14`):
+```python
+class DPPolarAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not isinstance(fitting, PolarFittingNet):
+            raise TypeError("fitting must be an instance of PolarFittingNet for DPPolarAtomicModel")
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+```
+
+#### 2.4.3 DPAtomicModel 核心功能
+
+**原子级前向传播** (`dp_atomic_model.py:205-265`):
+```python
+def forward_atomic(self,
+                  extended_coord,
+                  extended_atype,
+                  nlist,
+                  mapping: Optional[torch.Tensor] = None,
+                  fparam: Optional[torch.Tensor] = None,
+                  aparam: Optional[torch.Tensor] = None,
+                  comm_dict: Optional[dict[str, torch.Tensor]] = None) -> dict[str, torch.Tensor]:
+    """Return atomic prediction.
+    
+    Parameters
+    ----------
+    extended_coord
+            coordinates in extended region
+    extended_atype
+            atomic type in extended region
+    nlist
+            neighbor list. nf x nloc x nsel
+    mapping
+            maps the extended indices to local indices
+    fparam
+            frame parameter. nf x ndf
+    aparam
+            atomic parameter. nf x nloc x nda
+    
+    Returns
+    -------
+    result_dict
+            the result dict, defined by the `FittingOutputDef`.
+    """
+    # 1. 数据类型转换和梯度设置
+    nframes, nloc, nnei = nlist.shape
+    atype = extended_atype[:, :nloc]
+    if self.do_grad_r() or self.do_grad_c():
+        extended_coord.requires_grad_(True)
+    
+    # 2. 描述符计算
+    descriptor, rot_mat, g2, h2, sw = self.descriptor(
+        extended_coord, extended_atype, nlist,
+        mapping=mapping, comm_dict=comm_dict)
+    
+    # 3. 拟合网络计算
+    fit_ret = self.fitting_net(
+        descriptor, atype, gr=rot_mat, g2=g2, h2=h2,
+        fparam=fparam, aparam=aparam)
+    
+    return fit_ret
+```
+
+**模型工厂集成** (`deepmd/pt/model/model/__init__.py`):
+```python
+def get_model(model_params):
+    model_type = model_params.get("type", "standard")
+    if model_type == "standard":
+        if "spin" in model_params:
+            return get_spin_model(model_params)
+        elif "use_srtab" in model_params:
+            return get_zbl_model(model_params)
+        else:
+            return get_standard_model(model_params)
+    # ... 其他模型类型
+```
+
+#### 2.4.4 在整体系统中的作用
+
+1. **模型创建**: 通过 `get_model()` 函数根据配置参数创建适当的 DPAtomicModel 实例
+2. **训练集成**: 在 `Trainer` 类中被包装用于训练过程
+3. **推理支持**: 在 `DeepEval` 类中用于模型推理和部署
+4. **多任务支持**: 支持多种物理性质的联合训练和预测
+
+DPAtomicModel 通过统一的接口和灵活的设计，为 DPA3 描述符与各种拟合网络的组合提供了标准化的实现框架。
+
+---
+
+## 第三部分：详细实现
+
+### 3.1 DPA3 核心实现
+
+#### 3.1.1 初始化过程 (`__init__`)
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:105-171`
+
+```python
+def __init__(self,
+             ntypes: int,
+             repflow: Union[RepFlowArgs, dict],
+             concat_output_tebd: bool = False,
+             activation_function: str = "silu",
+             precision: str = "float64",
+             exclude_types: list[tuple[int, int]] = [],
+             env_protection: float = 0.0,
+             trainable: bool = True,
+             seed: Optional[Union[int, list[int]]] = None,
+             use_econf_tebd: bool = False,
+             use_tebd_bias: bool = False,
+             use_loc_mapping: bool = True,
+             type_map: Optional[list[str]] = None):
+```
+
+**关键组件初始化**:
+
+1. **RepFlow 参数处理**:
+
+   ```python
+   self.repflow_args = init_subclass_params(repflow, RepFlowArgs)
+   ```
+
+2. **类型嵌入网络**:
+
+   ```python
+   self.type_embedding = TypeEmbedNetConsistent(
+       ntypes=ntypes,
+       embedding_dim=tebd_dim,
+       precision=precision,
+       seed=child_seed(seed, 0),
+       use_econf_tebd=use_econf_tebd,
+       type_map=type_map
+   )
+   ```
+
+3. **RepFlow 块创建**:
+   ```python
+   self.repflows = DescrptBlockRepflows(
+       self.repflow_args.e_rcut,
+       self.repflow_args.e_rcut_smth,
+       self.repflow_args.e_sel,
+       self.repflow_args.a_rcut,
+       self.repflow_args.a_rcut_smth,
+       self.repflow_args.a_sel,
+       ntypes=ntypes,
+       n_dim=self.repflow_args.n_dim,
+       e_dim=self.repflow_args.e_dim,
+       a_dim=self.repflow_args.a_dim,
+       # ... 其他参数
+   )
+   ```
+
+#### 3.1.2 前向传播过程 (`forward`)
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:430-498`
+
+**输入参数**:
+
+- `extended_coord`: 扩展坐标 [nf × (nall × 3)]
+- `extended_atype`: 扩展原子类型 [nf × nall]
+- `nlist`: 邻居列表 [nf × nloc × nnei]
+- `mapping`: 索引映射 (可选)
+- `comm_dict`: 并行通信数据 (可选)
+
+**处理流程**:
+
+```python
+def forward(self, extended_coord, extended_atype, nlist,
+            mapping=None, comm_dict=None):
+    # 1. 数据类型转换
+    extended_coord = extended_coord.to(dtype=self.prec)
+    nframes, nloc, nnei = nlist.shape
+    nall = extended_coord.view(nframes, -1).shape[1] // 3
+
+    # 2. 类型嵌入计算
+    if not parallel_mode and self.use_loc_mapping:
+        node_ebd_ext = self.type_embedding(extended_atype[:, :nloc])
+    else:
+        node_ebd_ext = self.type_embedding(extended_atype)
+    node_ebd_inp = node_ebd_ext[:, :nloc, :]
+
+    # 3. RepFlow 计算
+    node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows(
+        nlist, extended_coord, extended_atype, node_ebd_ext,
+        mapping, comm_dict=comm_dict
+    )
+
+    # 4. 输出拼接处理
+    if self.concat_output_tebd:
+        node_ebd = torch.cat([node_ebd, node_ebd_inp], dim=-1)
+
+    return node_ebd, rot_mat, edge_ebd, h2, sw
+```
+
+**输出说明**:
+
+- `node_ebd`: 节点描述符 [nf × nloc × n_dim]
+- `rot_mat`: 旋转矩阵 [nf × nloc × e_dim × 3]
+- `edge_ebd`: 边嵌入 [nf × nloc × nnei × e_dim]
+- `h2`: 对表示 [nf × nloc × nnei × 3]
+- `sw`: 平滑开关函数 [nf × nloc × nnei]
+
+### 3.2 RepFlow 块实现
+
+#### 3.2.1 初始化组件
+
+**文件位置**: `deepmd/pt/model/descriptor/repflows.py:77-200`
+
+```python
+class DescrptBlockRepflows(DescriptorBlock):
+    def __init__(self,
+                 e_rcut: float = 6.0,
+                 e_rcut_smth: float = 0.5,
+                 e_sel: int = 120,
+                 a_rcut: float = 4.0,
+                 a_rcut_smth: float = 0.5,
+                 a_sel: int = 40,
+                 n_dim: int = 128,
+                 e_dim: int = 16,
+                 a_dim: int = 64,
+                 nlayers: int = 3,
+                 # ... 其他参数
+                ):
+```
+
+**关键组件**:
+
+1. **边嵌入网络**:
+
+   ```python
+   self.edge_embd = MLPLayer(
+       1, e_dim, activation=activation_function,
+       precision=precision, seed=child_seed(seed, 1)
+   )
+   ```
+
+2. **角度嵌入网络**:
+
+   ```python
+   self.angle_embd = MLPLayer(
+       1, a_dim, activation=activation_function,
+       precision=precision, seed=child_seed(seed, 2)
+   )
+   ```
+
+3. **RepFlow 层列表**:
+   ```python
+   self.layers = torch.nn.ModuleList()
+   for ii in range(nlayers):
+       self.layers.append(
+           RepFlowLayer(e_rcut, e_rcut_smth, e_sel, a_rcut, a_rcut_smth, a_sel,
+                       ntypes, n_dim, e_dim, a_dim, ...)
+       )
+   ```
+
+#### 3.2.2 前向传播流程
+
+**文件位置**: `deepmd/pt/model/descriptor/repflows.py:429-647`
+
+```python
+def forward(self, nlist, extended_coord, extended_atype,
+            extended_atype_embd=None, mapping=None, comm_dict=None):
+    # 1. 环境矩阵计算
+    dmatrix, diff, sw = prod_env_mat(
+        extended_coord, nlist, self.e_rcut, self.e_rcut_smth,
+        protection=self.env_protection
+    )
+
+    # 2. 边和角度邻居列表处理
+    # 生成边邻居列表和角度邻居列表
+
+    # 3. 嵌入计算
+    edge_input = dmatrix.unsqueeze(-1)  # [nf, nloc, nnei, 1]
+    edge_ebd = self.act(self.edge_embd(edge_input))
+
+    # 4. 角度信息计算
+    angle_input = ...  # 计算角度信息
+    angle_ebd = self.angle_embd(angle_input)
+
+    # 5. RepFlow 层迭代
+    for idx, ll in enumerate(self.layers):
+        node_ebd, edge_ebd, angle_ebd = ll.forward(
+            node_ebd, edge_ebd, angle_ebd,
+            nlist, extended_coord, extended_atype, ...
+        )
+
+    return node_ebd, edge_ebd, h2, rot_mat, sw
+```
+
+### 3.3 RepFlow 层实现
+
+#### 3.3.1 层初始化
+
+**文件位置**: `deepmd/pt/model/descriptor/repflow_layer.py:38-200`
+
+```python
+class RepFlowLayer(torch.nn.Module):
+    def __init__(self,
+                 e_rcut: float,
+                 e_rcut_smth: float,
+                 e_sel: int,
+                 a_rcut: float,
+                 a_rcut_smth: float,
+                 a_sel: int,
+                 ntypes: int,
+                 n_dim: int = 128,
+                 e_dim: int = 16,
+                 a_dim: int = 64,
+                 # ... 其他参数
+                ):
+```
+
+#### 3.3.2 主要功能
+
+1. **节点更新**: 基于边和角度信息更新节点表示
+2. **边更新**: 基于节点和角度信息更新边表示
+3. **角度更新**: 基于节点和边信息更新角度表示
+4. **残差连接**: 支持多种残差连接策略
+
+### 3.4 关键依赖和支持模块
+
+#### 3.4.1 网络组件
+
+- **MLP 网络**: `deepmd/pt/model/network/mlp.py`
+
+  - `MLPLayer`: 多层感知机实现
+  - `TypeEmbedNet`: 类型嵌入网络
+  - `TypeEmbedNetConsistent`: 一致性类型嵌入网络
+
+- **网络工具**: `deepmd/pt/model/network/network.py`
+  - 激活函数
+  - 网络初始化工具
+  - 图操作工具函数
+
+#### 3.4.2 工具函数
+
+- **环境矩阵**: `deepmd/pt/model/descriptor/env_mat.py`
+
+  - `prod_env_mat`: 环境矩阵计算
+  - 距离和角度计算
+
+- **邻居列表**: `deepmd/pt/utils/nlist.py`
+
+  - 邻居列表生成和处理
+  - 排除掩码处理
+
+- **环境配置**: `deepmd/pt/utils/env.py`
+  - 设备配置
+  - 数据精度设置
+  - 并行计算配置
+
+#### 3.4.3 统计和预处理
+
+- **环境矩阵统计**: `deepmd/pt/utils/env_mat_stat.py`
+
+  - 邻居统计
+  - 数据预处理
+
+- **排除掩码**: `deepmd/pt/utils/exclude_mask.py`
+  - 原子类型排除处理
+  - 掩码生成
+
+### 3.5 PyTorch 后端能量求和机制
+
+#### 3.5.1 深度势能原理的实现
+
+根据深度势能的基本原理，系统的总能量等于系统中每个原子局部环境能量的总和。这一原理在 PyTorch 后端中通过**分离的两阶段计算**得到精确实现，确保了模型的物理正确性和能量守恒。
+
+**核心公式**:
+```
+E_total = Σ E_i
+```
+其中 E_i 是第 i 个原子的局部环境能量。
+
+#### 3.5.2 原子级能量计算阶段
+
+**文件位置**: `deepmd/pt/model/task/fitting.py:473-614`
+
+在拟合网络的 `_forward_common` 方法中，每个原子的能量被独立计算：
+
+```python
+def _forward_common(self, descriptor, atype, ...):
+    # descriptor shape: [nf, nloc, nd] - 原子环境描述符
+    nf, nloc, nd = xx.shape
+    
+    # 初始化输出张量
+    outs = torch.zeros((nf, nloc, net_dim_out), dtype=self.prec, device=descriptor.device)
+    
+    if self.mixed_types:
+        # 混合类型模式：统一网络处理所有原子类型
+        atom_property = self.filter_layers.networks[0](xx)  # 神经网络计算
+        outs = outs + atom_property + self.bias_atom_e[atype].to(self.prec)
+    else:
+        # 非混合类型模式：每种原子类型使用独立网络
+        for type_i, ll in enumerate(self.filter_layers.networks):
+            mask = (atype == type_i).unsqueeze(-1)
+            mask = torch.tile(mask, (1, 1, net_dim_out))
+            atom_property = ll(xx)  # 特定类型的神经网络计算
+            atom_property = atom_property + self.bias_atom_e[type_i].to(self.prec)
+            atom_property = torch.where(mask, atom_property, 0.0)
+            outs = outs + atom_property
+    
+    # 应用排除掩码
+    mask = self.emask(atype).to(torch.bool)
+    outs = torch.where(mask[:, :, None], outs, 0.0)
+    
+    # 返回原子级能量，shape: [nf, nloc, net_dim_out]
+    results.update({self.var_name: outs})
+    return results
+```
+
+**关键特征**:
+- **原子级输出**: 网络输出为 `[nf, nloc, net_dim_out]`，每个原子都有独立的能量贡献
+- **类型特定处理**: 支持混合类型和非混合类型两种计算模式
+- **局部环境原理**: 每个原子的能量只依赖于其局部环境描述符，符合深度势能的核心思想
+- **类型偏置**: 每种原子类型都有特定的偏置能量 `bias_atom_e`
+
+#### 3.5.3 系统能量求和阶段
+
+**文件位置**: `deepmd/pt/model/model/transform_output.py:153-192`
+
+**重要发现**: 原子级能量到系统能量的转换是在 `fit_output_to_model_output` 函数中完成的，而不是在拟合网络中！
+
+```python
+def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
+    redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
+    model_ret = dict(fit_ret.items())
+    
+    for kk, vv in fit_ret.items():
+        vdef = fit_output_def[kk]
+        shap = vdef.shape  # 对于能量，shap = [1]
+        atom_axis = -(len(shap) + 1)  # atom_axis = -2 (原子维度)
+        
+        if vdef.reducible:
+            kk_redu = get_reduce_name(kk)  # "energy" -> "energy_redu"
+            if vdef.intensive:
+                # 强度性质：计算平均原子能量
+                model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis)
+            else:
+                # 广延性质：计算总和
+                model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis)
+            
+            # 力和维里的自动微分计算
+            if vdef.r_differentiable:
+                kk_derv_r, kk_derv_c = get_deriv_name(kk)
+                dr, dc = take_deriv(vv, model_ret[kk_redu], vdef, coord_ext, ...)
+                model_ret[kk_derv_r] = dr
+                if vdef.c_differentiable:
+                    model_ret[kk_derv_c] = dc
+                    model_ret[kk_derv_c + "_redu"] = torch.sum(model_ret[kk_derv_c].to(redu_prec), dim=1)
+    
+    return model_ret
+```
+
+**能量求和详解**:
+- **输入**: `vv` shape `[nf, nloc, 1]` - 原子级能量
+- **求和操作**: 能量是广延量（`vdef.intensive` 为 False），因此执行 `torch.sum(vv, dim=-2)` 对原子维度求和
+- **输出**: `energy_redu` shape `[nf, 1]` - 系统能量
+- **物理意义**: 系统能量 = 所有原子局部环境能量之和（E_total = Σ E_i）
+- **求和策略**: 通过 `vdef.intensive` 控制使用求和还是求平均
+
+#### 3.5.4 损失函数中的能量处理
+
+**文件位置**: `deepmd/pt/loss/ener.py:319-329`
+
+在训练过程中，能量损失按原子数量归一化：
+
+```python
+def forward(self, model_pred, label, natoms, ...):
+    # 系统能量预测值
+    energy_pred = model_pred["energy"]  # shape: [nf, 1]
+    energy_label = label["energy"]      # shape: [nf, 1]
+    
+    # 计算能量损失
+    l2_ener_loss = torch.mean(torch.square(energy_pred - energy_label))
+    
+    # 按原子数量归一化 (per atom loss)
+    atom_norm = 1.0 / natoms
+    loss += atom_norm * (pref_e * l2_ener_loss)
+```
+
+**归一化策略**:
+- **原子级归一化**: `atom_norm = 1.0 / natoms` 确保损失是 per atom 的
+- **训练稳定性**: 防止大系统主导训练过程
+- **物理一致性**: 保持能量与原子数量的线性关系
+
+#### 3.5.5 完整的能量计算数据流
+
+```
+原子坐标和类型 [nf × natoms × 3], [nf × natoms]
+    ↓
+DPA3 描述符计算 (dpa3.py:430-498)
+    ↓
+原子环境表示 [nf × natoms × n_dim]
+    ↓
+拟合网络计算 (fitting.py:473-614)
+    ↓
+原子级能量 [nf × natoms × 1]  ← 每个原子的局部环境能量
+    ↓
+能量求和变换 (transform_output.py:170-175)
+    ↓
+系统能量 [nf × 1]  ← torch.sum(dim=-2) 对原子维度求和
+    ↓
+损失计算 (ener.py:319-329)
+    ↓
+Per Atom 归一化损失 [scalar]
+```
+
+#### 3.5.6 关键设计特点
+
+**分离式计算架构**:
+1. **原子能量计算**: 在 `_forward_common` 中计算每个原子的局部环境能量
+2. **系统能量聚合**: 在 `fit_output_to_model_output` 中将原子能量聚合成系统能量
+3. **自动微分支持**: 力的计算通过自动微分实现，保持梯度传递
+
+**灵活的求和策略**:
+- **求平均**: `torch.mean()` 用于强度性质（`vdef.intensive` 为 True）的约化
+- **求总和**: `torch.sum()` 用于能量等广延性质（`vdef.intensive` 为 False）的约化
+- **精度控制**: 使用 `redu_prec` 确保数值稳定性
+
+**物理正确性保证**:
+- **局部性原理**: 每个原子的能量只依赖于其局部环境
+- **可加性**: 系统能量严格等于原子能量之和
+- **不变性**: 保持旋转和平移不变性
+
+**计算效率优化**:
+- **并行计算**: 原子级能量计算可以完全并行化
+- **批处理**: 支持多帧同时处理
+- **内存效率**: 分离的计算阶段减少内存占用
+
+### 3.6 DPA3描述符输出变量详解
+
+在DPA3描述符的forward方法中，输出的变量包含了原子环境表示的完整信息。这些变量对于理解描述符的工作原理和调试模型行为非常重要。
+
+#### 3.6.1 输出变量概述
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:430-498`
+
+DPA3描述符的forward方法返回五个核心变量：
+
+```python
+def forward(self, extended_coord, extended_atype, nlist,
+            mapping=None, comm_dict=None):
+    # ... 计算过程 ...
+    return node_ebd, rot_mat, edge_ebd, h2, sw
+```
+
+#### 3.6.2 变量详细说明
+
+**node_ebd: 节点描述符**
+- **形状**: `[nf, nloc, n_dim]`
+- **含义**: 主要的原子环境描述符，包含每个原子的环境信息
+- **作用**: 直接输入拟合网络计算原子级能量
+
+**rot_mat: 旋转矩阵**
+- **形状**: `[nf, nloc, e_dim, 3]`
+- **含义**: 旋转矩阵用于坐标变换，保持旋转不变性
+- **作用**: 
+  - 将局部坐标转换到全局坐标系
+  - 确保描述符在分子旋转时的不变性
+  - 支持SE(3)等变变换
+
+**edge_ebd: 边嵌入**
+- **形状**: `[nf, nloc, nnei, e_dim]`
+- **含义**: 原子间边的嵌入表示
+- **作用**: 描述原子间的成键信息和相互作用
+
+**h2: 角度信息**
+- **形状**: `[nf, nloc, nnei, 3]`
+- **含义**: 三体角度相关信息
+- **作用**: 描述原子间的角度关系，支持3-body相互作用建模
+
+**sw: 平滑开关函数**
+- **形状**: `[nf, nloc, nnei]`
+- **含义**: 用于平滑截止边界的开关函数
+- **作用**: 在cutoff半径处平滑过渡到零，避免能量和力的不连续跳跃
+
+#### 3.6.3 变量在模型中的应用
+
+**在拟合网络中的使用** (`deepmd/pt/model/task/fitting.py:473-614`):
+
+```python
+def _forward_common(self, descriptor, atype, ...):
+    # descriptor是node_ebd [nf, nloc, nd]
+    nf, nloc, nd = descriptor.shape
+    
+    # 计算原子级能量
+    atom_property = self.filter_layers.networks[0](descriptor)
+    # ...
+    return {self.var_name: outs}  # outs shape [nf, nloc, net_dim_out]
+```
+
+#### 3.6.4 输出变量的数据流
+
+```
+扩展坐标和原子类型
+    ↓
+环境矩阵计算 (prod_env_mat)
+    ↓
+RepFlow边和角度处理
+    ↓
+edge_ebd, h2, sw ← 中间表示
+    ↓
+RepFlow层迭代更新
+    ↓
+node_ebd, rot_mat ← 最终描述符输出
+    ↓
+拟合网络处理
+    ↓
+原子级能量和性质预测
+```
+
+### 3.7 代码修改和功能增强历史
+
+#### 3.7.1 process_systems函数增强
+
+**修改位置**: `deepmd/utils/data_system.py`
+
+**核心修改**: 增强了`process_systems`函数，支持列表输入的递归搜索功能，每个字符串项都会进行递归子目录查找，同时保持向后兼容性。
+
+#### 3.7.2 功能验证
+
+- **向后兼容性**: 字符串输入行为保持完全一致
+- **新功能测试**: 列表中的字符串项正确进行递归搜索
+- **错误处理**: 边界条件和异常情况处理正确
+
+---
+
+## 第四部分：数据处理系统
+
+### 4.1 数据处理架构概述
+
+DeePMD-kit PyTorch 后端采用了独特的两级 DataLoader 架构，实现了高效的多系统数据管理和训练优化。这种架构专门为处理大规模分子动力学数据而设计，支持多数据源并行加载和智能批处理。
+
+**架构优势**:
+
+- **效率**: 系统级和训练级分离，避免线程爆炸
+- **灵活性**: 支持多种数据源和采样策略
+- **可扩展性**: 天然支持分布式训练和多 GPU
+- **稳定性**: 完善的错误处理和数据验证
+
+### 4.2 原始数据加载
+
+#### 4.2.1 数据文件结构
+
+**文件位置**: `deepmd/utils/data.py` - `DeepmdData` 类
+
+**数据来源**:
+
+- **HDF5 文件**: 高效存储大规模分子动力学数据
+- **.npy 文件**: NumPy 数组格式，存储单个属性
+- **系统目录**: 每个训练数据源独立的目录结构
+
+**目录结构**:
+
+```
+system_path/
+├── type_map.raw          # 原子类型映射
+├── set.0/                # 第一个数据集
+│   ├── coord.npy        # 原子坐标 [nframes × natoms × 3]
+│   ├── box.npy          # 周期边界条件 [nframes × 9]
+│   ├── energy.npy       # 系统能量 [nframes]
+│   ├── force.npy        # 原子力 [nframes × natoms × 3]
+│   └── virial.npy       # 系统维里 [nframes × 9]
+├── set.1/                # 第二个数据集
+└── ...
+```
+
+#### 4.2.2 数据加载过程
+
+**初始化过程** (`data.py:50-122`):
+
+```python
+class DeepmdData:
+    def __init__(self,
+                 systems: Union[str, List[str]],
+                 batch_size: int = 1,
+                 test_size: int = 0,
+                 shuffle_test: bool = True,
+                 type_map: Optional[List[str]] = None,
+                 modifier=None):
+        """
+        初始化数据系统
+
+        Args:
+            systems: 系统路径或路径列表
+            batch_size: 批处理大小
+            test_size: 测试集大小
+            shuffle_test: 是否打乱测试集
+            type_map: 原子类型映射
+            modifier: 数据修改器
+        """
+        # 1. 系统路径处理
+        self.system_dirs = self._get_system_dirs(systems)
+
+        # 2. 类型映射加载
+        self.type_map = self._load_type_map()
+
+        # 3. 数据需求定义
+        self.data_dict = {
+            "coord": {"ndof": 3, "atomic": True, "must": True},
+            "box": {"ndof": 9, "atomic": False, "must": self.pbc},
+            "energy": {"ndof": 1, "atomic": False, "must": False},
+            "force": {"ndof": 3, "atomic": True, "must": False},
+            # ... 其他属性
+        }
+
+        # 4. 数据集加载
+        self._load_all_sets()
+```
+
+**数据集加载** (`data.py:233-280`):
+
+```python
+def _load_set(self, set_path: str):
+    """加载单个数据集"""
+    # 1. 扫描数据文件
+    data_files = glob.glob(os.path.join(set_path, "*.npy"))
+
+    # 2. 加载必需属性
+    coord_data = np.load(os.path.join(set_path, "coord.npy"))
+    box_data = np.load(os.path.join(set_path, "box.npy"))
+
+    # 3. 加载可选属性
+    if os.path.exists(os.path.join(set_path, "energy.npy")):
+        energy_data = np.load(os.path.join(set_path, "energy.npy"))
+
+    # 4. 数据验证和预处理
+    self._validate_data(coord_data, box_data, energy_data)
+
+    return {
+        "coord": coord_data,
+        "box": box_data,
+        "energy": energy_data,
+        # ... 其他属性
+    }
+```
+
+#### 4.2.3 数据预处理
+
+**数据格式转换** (`data.py:300-315`):
+
+```python
+def reformat_data_torch(self, data_dict: dict) -> dict:
+    """将数据转换为 PyTorch 格式"""
+    reformatted = {}
+
+    for key, value in data_dict.items():
+        if key in self.data_dict:
+            info = self.data_dict[key]
+            if info["atomic"]:
+                # 原子级属性: [nframes × natoms × ndof]
+                reformatted[key] = torch.tensor(value, dtype=torch.float32)
+            else:
+                # 系统级属性: [nframes × ndof]
+                reformatted[key] = torch.tensor(value, dtype=torch.float32)
+
+    return reformatted
+```
+
+### 4.3 系统级 DataLoader 创建
+
+#### 4.3.1 DpLoaderSet 架构
+
+**文件位置**: `deepmd/pt/utils/dataloader.py` - `DpLoaderSet` 类
+
+**系统级 DataLoader 概述**:
+
+- **目的**: 为每个数据系统创建独立的 DataLoader
+- **特点**: 每个 DataLoader 负责处理一个系统的数据加载和批处理
+- **优势**: 避免线程爆炸，提高内存使用效率
+
+**初始化过程** (`dataloader.py:76-174`):
+
+```python
+class DpLoaderSet:
+    def __init__(self,
+                 systems: List[str],
+                 batch_size: Union[int, str, List[int]],
+                 type_map: List[str],
+                 shuffle: bool = True,
+                 dist: bool = False):
+        """
+        初始化系统级 DataLoader 集合
+
+        Args:
+            systems: 系统路径列表
+            batch_size: 批处理大小 (可以是自动、固定值或列表)
+            type_map: 原子类型映射
+            shuffle: 是否打乱数据
+            dist: 是否使用分布式训练
+        """
+        # 1. 系统数据初始化
+        self.systems = []
+        self.batch_sizes = []
+
+        for system_path in systems:
+            # 创建系统数据对象
+            system_data = DeepmdData(
+                system_path,
+                batch_size=1,  # 系统级批处理在 DataLoader 中处理
+                type_map=type_map
+            )
+
+            # 转换为 PyTorch 数据集
+            torch_dataset = DeepmdDataSetForLoader(system_data)
+            self.systems.append(torch_dataset)
+
+            # 计算系统级批处理大小
+            if isinstance(batch_size, str) and batch_size == "auto":
+                # 自动批处理: 基于原子数量优化
+                system_batch_size = self._calculate_auto_batch_size(system_data)
+            else:
+                system_batch_size = batch_size
+
+            self.batch_sizes.append(system_batch_size)
+
+        # 2. 创建系统级 DataLoaders
+        self.dataloaders = []
+        for system, batch_size in zip(self.systems, self.batch_sizes):
+            system_dataloader = self._create_system_dataloader(
+                system, batch_size, shuffle, dist
+            )
+            self.dataloaders.append(system_dataloader)
+```
+
+#### 4.3.2 系统级 DataLoader 创建
+
+**创建过程** (`dataloader.py:157-166`):
+
+```python
+def _create_system_dataloader(self, system, batch_size, shuffle, dist):
+    """创建单个系统级 DataLoader"""
+
+    # 分布式采样器
+    if dist and dist.is_available() and dist.is_initialized():
+        system_sampler = DistributedSampler(
+            system,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=shuffle
+        )
+    else:
+        system_sampler = None
+
+    # 创建 DataLoader
+    system_dataloader = DataLoader(
+        dataset=system,
+        batch_size=int(batch_size),
+        num_workers=0,  # 关键: 避免线程爆炸
+        sampler=system_sampler,
+        collate_fn=collate_batch,  # 数据批处理函数
+        shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle,
+    )
+
+    return system_dataloader
+```
+
+**为什么 num_workers=0**:
+
+- **线程管理**: 避免创建过多进程导致系统资源耗尽
+- **内存效率**: 每个系统都有独立的 DataLoader，多进程会导致内存爆炸
+- **稳定性**: 减少进程间通信的复杂性
+- **性能**: 在系统级 DataLoader 中，数据加载相对较快，不需要多进程加速
+
+#### 4.3.3 自动批处理计算
+
+**自动批处理算法** (`dataloader.py:200-220`):
+
+```python
+def _calculate_auto_batch_size(self, system_data: DeepmdData) -> int:
+    """基于系统特征计算最优批处理大小"""
+
+    # 1. 获取系统统计信息
+    natoms = system_data.get_natoms()
+    nframes = system_data.get_nframes()
+
+    # 2. 计算内存需求
+    memory_per_frame = natoms * 3 * 4  # 坐标内存 (float32)
+    memory_per_frame += natoms * 4     # 原子类型内存 (int32)
+    memory_per_frame += 9 * 4         # 盒子内存 (float32)
+
+    # 3. 基于可用内存计算批处理大小
+    available_memory = self._get_available_memory()
+    safe_memory = available_memory * 0.7  # 70% 安全阈值
+
+    batch_size = int(safe_memory / memory_per_frame)
+    batch_size = max(1, min(batch_size, 32))  # 限制在 1-32 之间
+
+    return batch_size
+```
+
+### 4.4 数据变换管道
+
+#### 4.4.1 数据集类实现
+
+**文件位置**: `deepmd/pt/utils/dataloader.py` - `DeepmdDataSetForLoader` 类
+
+**数据集类功能** (`dataloader.py:18-32`):
+
+```python
+class DeepmdDataSetForLoader(torch.utils.data.Dataset):
+    """将 DeepmdData 转换为 PyTorch Dataset"""
+
+    def __init__(self, dp_data: DeepmdData):
+        self.dp_data = dp_data
+        self.nframes = dp_data.get_nframes()
+
+    def __len__(self):
+        """返回数据集大小"""
+        return self.nframes
+
+    def __getitem__(self, idx: int):
+        """获取单个数据帧"""
+        # 1. 获取原始数据
+        frame_data = self.dp_data.get_item(idx)
+
+        # 2. 添加帧 ID
+        frame_data["fid"] = idx
+
+        # 3. 添加系统 ID (如果有多个系统)
+        if hasattr(self, "sid"):
+            frame_data["sid"] = self.sid
+
+        return frame_data
+```
+
+#### 4.4.2 批处理函数实现
+
+**核心批处理函数** (`dataloader.py:223-238`):
+
+```python
+def collate_batch(batch: List[dict]) -> dict:
+    """
+    将多个数据帧合并为批处理
+
+    Args:
+        batch: 数据帧列表，每个元素是一个字典
+
+    Returns:
+        批处理数据字典
+    """
+    example = batch[0]
+    result = {}
+
+    for key in example.keys():
+        if "find_" in key:
+            # 查找键保持为单值
+            result[key] = batch[0][key]
+        elif key == "fid":
+            # 帧 ID 转换为列表
+            result[key] = [d[key] for d in batch]
+        elif key == "type":
+            # 跳过 type 键，作为 atype 处理
+            continue
+        else:
+            # 其他键进行张量批处理
+            result[key] = collate_tensor_fn(
+                [torch.as_tensor(d[key]) for d in batch]
+            )
+
+    return result
+```
+
+**张量批处理函数** (`dataloader.py:240-250`):
+
+```python
+def collate_tensor_fn(tensors: List[torch.Tensor]) -> torch.Tensor:
+    """将张量列表合并为单个张量"""
+
+    if len(tensors) == 0:
+        return torch.tensor([])
+
+    # 检查张量形状是否一致
+    shapes = [t.shape for t in tensors]
+    if len(set(shapes)) == 1:
+        # 形状一致，直接堆叠
+        return torch.stack(tensors, dim=0)
+    else:
+        # 形状不一致，填充到最大形状
+        max_shape = [max(dim) for dim in zip(*shapes)]
+        padded_tensors = []
+
+        for tensor in tensors:
+            padding = [(0, max_dim - curr_dim)
+                      for max_dim, curr_dim in zip(max_shape, tensor.shape)]
+            padded_tensor = torch.nn.functional.pad(tensor, padding)
+            padded_tensors.append(padded_tensor)
+
+        return torch.stack(padded_tensors, dim=0)
+```
+
+### 4.5 训练级 DataLoader 数据流
+
+#### 4.5.1 训练级 DataLoader 创建
+
+**文件位置**: `deepmd/pt/train/training.py` - `get_data_loader()` 函数
+
+**训练级 DataLoader 概述**:
+
+- **目的**: 管理训练过程中的数据采样和批处理
+- **特点**: 包装系统级 DataLoader 集合，提供统一的数据接口
+- **优势**: 支持多系统采样、分布式训练和无限循环
+
+**创建过程** (`training.py:177-214`):
+
+```python
+def get_data_loader(_training_data, _validation_data, _training_params):
+    """创建训练和验证数据加载器"""
+
+    def get_dataloader_and_iter(_data, _params):
+        """创建单个数据加载器和迭代器"""
+
+        # 1. 采样器配置
+        _sampler = get_sampler_from_params(_data, _params)
+        if _sampler is None:
+            log.warning("Sampler not specified!")
+
+        # 2. 创建训练级 DataLoader
+        _dataloader = DataLoader(
+            _data,                              # DpLoaderSet 实例
+            sampler=_sampler,                   # 采样器
+            batch_size=None,                   # 单系统批处理
+            num_workers=NUM_WORKERS if dist.is_available() else 0,
+            drop_last=False,                   # 不丢弃最后一个不完整批次
+            collate_fn=lambda batch: batch,     # 防止额外转换
+            pin_memory=True,                    # 锁页内存优化
+        )
+
+        # 3. 创建无限循环迭代器
+        _data_iter = cycle_iterator(_dataloader)
+        return _dataloader, _data_iter
+
+    # 创建训练和验证数据加载器
+    training_dataloader, training_data_iter = get_dataloader_and_iter(
+        _training_data, _training_params["training_data"]
+    )
+
+    validation_dataloader, validation_data_iter = get_dataloader_and_iter(
+        _validation_data, _training_params["validation_data"]
+    )
+
+    return training_dataloader, training_data_iter, validation_dataloader, validation_data_iter
+```
+
+#### 4.5.2 采样器配置
+
+**采样器创建** (`training.py:266-277`):
+
+```python
+def get_sampler_from_params(_data, _params):
+    """基于参数创建采样器"""
+
+    # 1. 获取采样概率
+    if "prob_sys_size" in _params and _params["prob_sys_size"]:
+        # 基于系统大小的采样概率
+        prob = _data.get_sys_prob()
+    elif "prob" in _params:
+        # 用户定义的采样概率
+        prob = _params["prob"]
+    else:
+        # 均匀采样
+        prob = None
+
+    # 2. 创建采样器
+    if prob is not None:
+        sampler = WeightedRandomSampler(
+            weights=prob,
+            num_samples=len(prob),
+            replacement=True
+        )
+    else:
+        sampler = None
+
+    return sampler
+```
+
+**系统概率计算** (`dataloader.py:300-320`):
+
+```python
+def get_sys_prob(self) -> List[float]:
+    """计算系统采样概率"""
+
+    # 1. 获取每个系统的帧数
+    system_sizes = [len(system) for system in self.systems]
+
+    # 2. 基于帧数计算概率
+    total_frames = sum(system_sizes)
+    prob = [size / total_frames for size in system_sizes]
+
+    return prob
+```
+
+#### 4.5.3 无限循环迭代器
+
+**迭代器实现** (`training.py:150-160`):
+
+```python
+def cycle_iterator(dataloader):
+    """创建无限循环的数据迭代器"""
+
+    while True:
+        # 1. 重置迭代器
+        data_iter = iter(dataloader)
+
+        # 2. 遍历所有数据
+        try:
+            while True:
+                batch = next(data_iter)
+                yield batch
+        except StopIteration:
+            # 3. 重新开始循环
+            continue
+```
+
+### 4.6 最终数据提交给模型
+
+#### 4.6.1 数据获取和预处理
+
+**文件位置**: `deepmd/pt/train/training.py` - `Trainer.get_data()` 方法
+
+**数据获取过程** (`training.py:950-990`):
+
+```python
+def get_data(self, is_train=True, task_key="Default"):
+    """获取训练数据并预处理"""
+
+    # 1. 选择数据迭代器
+    if is_train:
+        iterator = self.training_data_iters[task_key]
+    else:
+        iterator = self.validation_data_iters[task_key]
+
+    # 2. 获取下一个批次
+    batch_data = next(iterator)
+
+    # 3. 数据类型和设备转换
+    for key in batch_data.keys():
+        if key not in ["sid", "fid", "box", "find_*"]:
+            # 移动到目标设备
+            batch_data[key] = batch_data[key].to(
+                env.DEVICE, non_blocking=True
+            )
+
+    # 4. 分离输入和标签
+    input_dict, label_dict, log_dict = self._separate_inputs_labels(batch_data)
+
+    return input_dict, label_dict, log_dict
+```
+
+**输入标签分离** (`training.py:1000-1020`):
+
+```python
+def _separate_inputs_labels(self, batch_data: dict) -> tuple:
+    """分离模型输入和标签"""
+
+    # 1. 定义输入键
+    input_keys = ["coord", "atype", "spin", "box", "fparam", "aparam"]
+
+    # 2. 创建输入字典
+    input_dict = {}
+    for key in input_keys:
+        if key in batch_data:
+            input_dict[key] = batch_data[key]
+
+    # 3. 创建标签字典
+    label_dict = {}
+    for key, value in batch_data.items():
+        if key not in input_keys and key not in ["sid", "fid"]:
+            label_dict[key] = value
+
+    # 4. 创建日志字典
+    log_dict = {
+        "natoms": batch_data.get("natoms", None),
+        "find_energy": batch_data.get("find_energy", False),
+        "find_force": batch_data.get("find_force", False),
+    }
+
+    return input_dict, label_dict, log_dict
+```
+
+#### 4.6.2 模型输入提交
+
+**模型执行过程** (`training.py:611-704`):
+
+```python
+def step(self, task_key="Default", **kwargs):
+    """执行单个训练步骤"""
+
+    # 1. 获取数据
+    input_dict, label_dict, log_dict = self.get_data(
+        is_train=True, task_key=task_key
+    )
+
+    # 2. 前向传播
+    with torch.cuda.amp.autocast(enabled=self.mixed_precision):
+        model_pred, loss, more_loss = self.wrapper(
+            **input_dict,
+            cur_lr=self.get_cur_lr(),
+            label=label_dict,
+            task_key=task_key
+        )
+
+    # 3. 反向传播 (在 autocast 上下文之外执行)
+    self.optimizer.zero_grad()
+    loss.backward()
+
+    # 4. 梯度裁剪
+    if self.grad_clip > 0:
+        torch.nn.utils.clip_grad_norm_(
+            self.wrapper.parameters(), self.grad_clip
+        )
+
+    # 5. 参数更新
+    self.optimizer.step()
+
+    # 6. 记录损失
+    self.record_loss(loss, more_loss, log_dict)
+
+    return loss, more_loss
+```
+
+### 4.7 数据流程优化特性
+
+#### 4.7.1 内存优化策略
+
+**内存管理**:
+
+- **锁页内存**: 使用 `pin_memory=True` 提高 GPU 数据传输效率
+- **自动批处理**: 基于系统特征动态调整批处理大小
+- **设备管理**: 智能设备选择和内存分配
+
+**NUM_WORKERS 配置** (`env.py:26-31`):
+
+```python
+# 环境变量配置
+NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus)))
+
+# 多进程方法检查
+if multiprocessing.get_start_method() != "fork":
+    log.warning("NUM_WORKERS > 0 is not supported with spawn or forkserver start method. Setting NUM_WORKERS to 0.")
+    NUM_WORKERS = 0
+```
+
+#### 4.7.2 性能优化特性
+
+**分布式训练支持**:
+
+- **数据并行**: 支持多 GPU 数据并行训练
+- **分布式采样**: `DistributedSampler` 确保数据均匀分布
+- **梯度同步**: 自动梯度同步和参数更新
+
+**数据增强**:
+
+- **随机打乱**: 支持训练数据随机打乱
+- **加权采样**: 基于系统大小的智能采样
+- **多任务支持**: 支持多任务学习的数据管理
+
+### 4.8 数据流程监控和调试
+
+#### 4.8.1 数据统计信息
+
+**数据统计** (`dataloader.py:400-420`):
+
+```python
+def get_data_statistics(self) -> dict:
+    """获取数据统计信息"""
+
+    stats = {
+        "num_systems": len(self.systems),
+        "total_frames": sum(len(sys) for sys in self.systems),
+        "batch_sizes": self.batch_sizes,
+        "system_sizes": [len(sys) for sys in self.systems],
+        "memory_usage": self._estimate_memory_usage(),
+    }
+
+    return stats
+```
+
+#### 4.8.2 数据验证和错误处理
+
+**数据验证** (`data.py:400-420`):
+
+```python
+def validate_data(self, coord_data, box_data, energy_data=None):
+    """验证数据完整性"""
+
+    # 1. 检查数据形状
+    nframes = coord_data.shape[0]
+    assert box_data.shape[0] == nframes, "Box data frame count mismatch"
+
+    # 2. 检查原子数量一致性
+    natoms = coord_data.shape[1] // 3
+    assert natoms > 0, "Invalid atom count"
+
+    # 3. 检查数值范围
+    assert torch.isfinite(coord_data).all(), "Invalid coordinate values"
+    assert torch.isfinite(box_data).all(), "Invalid box values"
+
+    # 4. 检查能量数据
+    if energy_data is not None:
+        assert energy_data.shape[0] == nframes, "Energy data frame count mismatch"
+        assert torch.isfinite(energy_data).all(), "Invalid energy values"
+```
+
+---
+
+## 第五部分：推理和部署
+
+### 5.1 推理架构概述
+
+DPA3 的推理系统采用分层设计，支持多种部署方式和性能优化策略。推理过程的核心是通过 `DeepEval` 类实现的，它提供了统一的接口来加载训练好的 DPA3 模型并进行高效的原子环境计算。
+
+**推理架构组件**:
+
+```
+用户接口层 (CLI / Python API)
+    ↓
+DeepEval (统一推理接口)
+    ↓
+ModelWrapper (模型包装器)
+    ↓
+DPA3 Descriptor (原子环境计算)
+    ↓
+PyTorch JIT / 原生执行 (计算后端)
+```
+
+### 5.2 推理入口点和接口
+
+#### 5.2.1 Python API 接口
+
+**主要推理类**:
+
+- `DeepEval`: 通用推理接口 (`deepmd/pt/infer/deep_eval.py:75`)
+- `Tester`: 测试和推理工具 (`deepmd/pt/infer/inference.py:25`)
+
+**基本使用方法**:
+
+```python
+from deepmd.pt.infer import DeepEval
+
+# 加载模型
+evaluator = DeepEval("dpa3_model.pt", output_def)
+
+# 执行推理
+result = evaluator.eval(
+    coords=coordinates,      # [nframes x natoms x 3]
+    cells=cell_parameters,    # [nframes x 9] (可选)
+    atom_types=atom_types,    # [natoms] 或 [nframes x natoms]
+    atomic=False             # 是否计算原子级贡献
+)
+```
+
+#### 5.2.2 CLI 推理命令
+
+**测试命令**:
+
+```bash
+dp test -m dpa3_model.pt -s test_data
+```
+
+**模型冻结**:
+
+```bash
+dp freeze -m dpa3_model.pt -o frozen_model.pth
+```
+
+### 5.3 模型加载和初始化
+
+#### 5.3.1 模型加载过程
+
+**文件位置**: `deepmd/pt/infer/deep_eval.py:96-161`
+
+```python
+def __init__(self, model_file: str, output_def: ModelOutputDef,
+             auto_batch_size: Union[bool, int, AutoBatchSize] = True,
+             neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None,
+             head: Optional[Union[str, int]] = None,
+             no_jit: bool = False):
+
+    # 1. 加载模型检查点
+    state_dict = torch.load(model_file, map_location=env.DEVICE, weights_only=True)
+
+    # 2. 处理多任务模型
+    if self.multi_task:
+        # 选择指定的任务头
+        model_params = self.input_param["model_dict"][head]
+
+    # 3. 重建模型架构
+    model = get_model(self.input_param).to(DEVICE)
+
+    # 4. JIT 编译优化
+    if not self.input_param.get("hessian_mode") and not no_jit:
+        model = torch.jit.script(model)
+
+    # 5. 包装和加载权重
+    self.dp = ModelWrapper(model)
+    self.dp.load_state_dict(state_dict)
+    self.dp.eval()  # 设置为评估模式
+```
+
+#### 5.3.2 多任务模型支持
+
+对于包含多个任务的 DPA3 模型，推理时需要指定具体的任务头：
+
+```python
+# 多任务模型推理
+evaluator = DeepEval("multi_task_model.pt", output_def, head="task_name")
+```
+
+### 5.4 推理执行流程
+
+#### 5.4.1 主要推理方法
+
+**文件位置**: `deepmd/pt/infer/deep_eval.py:394-462`
+
+**标准推理流程**:
+
+```python
+def _eval_model(self, coords, cells, atom_types, fparam, aparam, request_defs):
+    # 1. 数据预处理
+    coord_input = torch.tensor(coords.reshape([nframes, natoms, 3]),
+                               dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
+    type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE)
+
+    # 2. 可选参数处理
+    box_input = torch.tensor(cells.reshape([nframes, 3, 3]),
+                             dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) if cells is not None else None
+
+    # 3. 执行模型推理
+    batch_output = model(
+        coord_input,
+        type_input,
+        box=box_input,
+        do_atomic_virial=do_atomic_virial,
+        fparam=fparam_input,
+        aparam=aparam_input
+    )
+
+    # 4. 后处理和返回结果
+    return self._process_output(batch_output, request_defs)
+```
+
+#### 5.4.2 DPA3 在推理中的执行
+
+在推理过程中，DPA3 描述符的 `forward` 方法被调用来计算原子环境表示：
+
+1. **输入数据**: 接收扩展坐标、原子类型和邻居列表
+2. **类型嵌入**: 计算原子类型嵌入向量
+3. **RepFlow 计算**: 通过多层 RepFlow 处理节点、边和角度信息
+4. **输出生成**: 生成最终的原子环境描述符
+
+### 5.5 性能优化特性
+
+#### 5.5.1 自动批处理
+
+**实现位置**: `deepmd/pt/infer/deep_eval.py:351-375`
+
+```python
+def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable:
+    if self.auto_batch_size is not None:
+        def eval_func(*args, **kwargs):
+            return self.auto_batch_size.execute_all(inner_func, numb_test, natoms, *args, **kwargs)
+    else:
+        eval_func = inner_func
+    return eval_func
+```
+
+**自动批处理优势**:
+
+- **内存优化**: 根据可用内存自动调整批处理大小
+- **性能平衡**: 在内存使用和计算效率之间找到最佳平衡
+- **适应性**: 能够根据不同的硬件配置自动调整
+
+#### 5.5.2 JIT 编译优化
+
+**JIT 编译过程**:
+
+```python
+# 模型加载时自动进行 JIT 编译
+if not self.input_param.get("hessian_mode") and not no_jit:
+    model = torch.jit.script(model)
+```
+
+**JIT 优化效果**:
+
+- **计算图优化**: 将 Python 代码编译为优化的计算图
+- **内存分配优化**: 减少动态内存分配开销
+- **算子融合**: 将多个操作融合为单个高效算子
+
+#### 5.5.3 设备优化
+
+**多设备支持**:
+
+- **CPU 推理**: 适用于小规模模型和内存受限环境
+- **GPU 推理**: 大规模并行计算，显著提升推理速度
+- **多 GPU**: 支持模型并行和数据并行
+
+**设备选择策略**:
+
+```python
+# 自动选择最佳计算设备
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+```
+
+### 5.6 推理部署选项
+
+#### 5.6.1 模型格式
+
+**支持的模型格式**:
+
+1. **.pt 文件**: PyTorch 标准检查点格式
+
+   - 包含完整的模型权重和配置信息
+   - 支持多任务模型和元数据
+
+2. **.pth 文件**: TorchScript 冻结模型
+   - 经过 JIT 编译优化的模型
+   - 部署时无需重新编译，加载更快
+
+#### 5.6.2 冻结模型生成
+
+**文件位置**: `deepmd/pt/entrypoints/main.py:344-358`
+
+```python
+def freeze(model: str, output: str = "frozen_model.pth", head: Optional[str] = None):
+    # 1. 加载原始模型
+    model = inference.Tester(model, head=head).model
+
+    # 2. 设置为评估模式
+    model.eval()
+
+    # 3. JIT 脚本编译
+    model = torch.jit.script(model)
+
+    # 4. 保存冻结模型
+    torch.jit.save(model, output, extra_files={})
+```
+
+**冻结模型优势**:
+
+- **部署简化**: 无需依赖原始模型定义代码
+- **加载速度**: 避免了模型重建的开销
+- **版本兼容**: 提供更好的版本兼容性
+
+### 5.7 高级推理功能
+
+#### 5.7.1 描述符提取
+
+**方法**: `eval_descriptor()`
+**位置**: `deepmd/pt/infer/deep_eval.py:633-687`
+
+```python
+def eval_descriptor(self, coords, cells, atom_types, fparam=None, aparam=None):
+    """提取 DPA3 原子环境描述符"""
+    # 返回原始的 DPA3 描述符输出
+    # 可用于分析和可视化原子环境表示
+```
+
+#### 5.7.2 类型嵌入分析
+
+**方法**: `eval_typeebd()`
+**位置**: `deepmd/pt/infer/deep_eval.py:565-632`
+
+```python
+def eval_typeebd(self):
+    """评估类型嵌入网络输出"""
+    # 返回原子类型的嵌入向量
+    # 用于分析类型表示的特征空间
+```
+
+#### 5.7.3 拟合网络分析
+
+**方法**: `eval_fitting_last_layer()`
+**位置**: `deepmd/pt/infer/deep_eval.py:688-730`
+
+```python
+def eval_fitting_last_layer(self, coords, cells, atom_types, fparam=None, aparam=None):
+    """评估拟合网络最后一层的输入"""
+    # 用于调试和分析拟合过程
+```
+
+### 5.8 推理性能监控
+
+#### 5.8.1 性能指标
+
+**模型大小分析**:
+
+```python
+def get_model_size(self) -> dict:
+    """获取模型参数统计"""
+    return {
+        "descriptor": sum_param_des,      # 描述符参数数量
+        "fitting-net": sum_param_fit,     # 拟合网络参数数量
+        "total": sum_param_des + sum_param_fit  # 总参数数量
+    }
+```
+
+#### 5.8.2 内存使用优化
+
+**内存管理策略**:
+
+1. **梯度禁用**: 推理时自动禁用梯度计算
+2. **批处理优化**: 通过自动批处理控制内存使用
+3. **设备内存管理**: 自动管理 GPU 内存分配和释放
+
+### 5.9 推理部署最佳实践
+
+#### 5.9.1 模型选择建议
+
+**小规模系统** (原子数 < 1000):
+
+- 使用标准的 .pt 格式
+- 启用 JIT 编译优化
+- CPU 推理通常足够
+
+**中等规模系统** (原子数 1000-10000):
+
+- 推荐使用冻结的 .pth 格式
+- 启用 GPU 推理
+- 调整自动批处理参数
+
+**大规模系统** (原子数 > 10000):
+
+- 必须使用 GPU 推理
+- 考虑多 GPU 并行
+- 优化邻居列表计算
+
+#### 5.9.2 配置优化
+
+**内存优化配置**:
+
+```python
+# 内存敏感环境
+evaluator = DeepEval("model.pt", output_def,
+                    auto_batch_size=False)  # 禁用自动批处理
+
+# 性能优化配置
+evaluator = DeepEval("model.pt", output_def,
+                    auto_batch_size=1024)  # 设置固定批处理大小
+```
+
+#### 5.9.3 错误处理和调试
+
+**常见推理问题**:
+
+1. **内存不足**: 减少批处理大小或使用 CPU
+2. **设备不匹配**: 确保模型和数据在同一设备上
+3. **版本兼容**: 使用冻结模型避免版本问题
+
+---
+
+## 总结
+
+DPA3 作为 DeePMD-kit 中最先进的原子环境描述符之一，通过结合节点、边和角度信息，提供了更加精确和全面的原子环境表示。其模块化的设计、丰富的配置选项和优秀的性能优化特性，使其能够广泛应用于各种分子动力学模拟任务中。
+
+### 技术特点总结
+
+**架构优势**:
+
+- **模块化设计**: 清晰的组件分离，易于扩展和维护
+- **高效数据处理**: 两级 DataLoader 架构，避免线程爆炸
+- **并行计算支持**: 天然支持多 GPU 和分布式训练
+- **性能优化**: JIT 编译、自动批处理、内存优化
+
+**核心创新**:
+
+- **RepFlow 架构**: 结合节点、边、角信息的统一表示
+- **3-body 相互作用**: 显式建模三体相互作用，提高精度
+- **动态更新策略**: 多种残差连接策略，优化信息流动
+- **智能压缩**: 角度消息压缩，减少计算开销
+
+### 使用建议
+
+**新手用户**:
+
+- 从基本配置开始，逐步调整参数
+- 使用自动批处理和默认优化选项
+- 关注训练收敛和基本性能指标
+
+**高级用户**:
+
+- 深入调整 RepFlow 参数优化性能
+- 利用分布式训练处理大规模数据
+- 自定义采样策略和损失函数
+
+**生产环境**:
+
+- 使用冻结模型确保部署稳定性
+- 监控推理性能和资源使用
+- 定期验证模型精度和稳定性
+
+### 未来发展方向
+
+**功能扩展**:
+
+- 支持更高阶的相互作用
+- 自适应邻居选择策略
+- 注意力机制集成
+
+**性能优化**:
+
+- 混合精度训练完善
+- 模型量化和压缩
+- 硬件特定优化
+
+**应用拓展**:
+
+- 多尺度建模支持
+- 在线学习和增量更新
+- 可解释性增强
+
+无论是学术研究还是工业应用，DPA3 都能够为用户提供可靠的深度学习势能解决方案。
diff --git a/outisli/install.md b/outisli/install.md
new file mode 100644
index 0000000000..20869e5446
--- /dev/null
+++ b/outisli/install.md
@@ -0,0 +1,179 @@
+鉴于从源码安装 `DeePMD-kit` 的门槛较高，许多用户很少尝试；然而从源码安装的灵活性最高。为进一步推广并减少可能的坑，笔者在此根据自己的安装流程并结合官方文档，给出一个适用性较广的安装教程，各位可自行尝试。
+
+本教程适用于 Linux(with NVIDIA GPU) 及 Mac(with Apple Silicon)
+
+Since some users may find installing `DeePMD-kit` from source to be challenging and rarely attempt it, this guide aims to make the process more accessible. Installing from source offers the highest flexibility. To promote this method and reduce potential pitfalls, I have compiled a broadly applicable installation tutorial based on my own experience and the official documentation. You are encouraged to try it out.
+
+This tutorial is applicable to Linux (with NVIDIA GPU) and Mac (with Apple Silicon).
+
+> 注：
+>
+> 1. 安装过程不强制要求`sudo`权限
+> 2. 若在有`sudo`权限的电脑上，可自行安装 CUDA Toolkit 以及 mpi（可选）
+> 3. 在 HPC 集群上可通过 `source` 或 `module` 方式加载 CUDA Toolkit 以及 mpi 环境
+> 4. 默认安装在用户 home 目录 Software 目录下，若需要修改路径，请修改教程中涉及路径的命令
+> 5. 本教程需有一定计算机（linux）操作常识，若遇到问题，可以评论沟通或询问 AI
+
+> Notes:
+>
+> 1. The installation process does not strictly require `sudo` privileges.
+> 2. If you have `sudo` privileges, you may install CUDA Toolkit and MPI (optional) yourself.
+> 3. On HPC clusters, you can load CUDA Toolkit and MPI environments using `source` or `module` commands.
+> 4. By default, the installation path is set to the user's home directory under the Software folder. If you wish to change the path, please modify the relevant commands in the tutorial.
+> 5. This tutorial assumes some basic knowledge of computer (Linux) operations. If you encounter any issues, feel free to comment or ask AI for help.
+
+# 0. Preparation (Optional)
+## 0.1 CUDA Toolkit
+
+```shell
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+# for Ubuntu 24.04 LTS
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+# for WSL
+wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+sudo apt install cuda-toolkit-12-8 -y
+
+#config cuda
+export CUDA_PATH=/usr/local/cuda
+export CUDA_HOME=/usr/local/cuda
+export PATH=$PATH:$CUDA_HOME/bin
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/extras/CUPTI/lib64
+```
+
+You can also use CUDA 12.6 or 12.9.
+
+## 0.2 Intel® oneAPI Toolkit
+
+```shell
+wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/e6ff8e9c-ee28-47fb-abd7-5c524c983e1c/l_BaseKit_p_2024.2.1.100_offline.sh
+sudo sh ./l_BaseKit_p_2024.2.1.100_offline.sh -a --silent --cli --eula accept
+
+wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/d461a695-6481-426f-a22f-b5644cd1fa8b/l_HPCKit_p_2024.2.1.79_offline.sh
+sudo sh ./l_HPCKit_p_2024.2.1.79_offline.sh -a --silent --cli --eula accept
+
+# load intel oneapi
+source /opt/intel/oneapi/setvars.sh --force > /dev/null
+```
+
+# 1. Install Backend’s Python interface
+
+## 1.1 Use Miniforge (Conda/mamba)
+
+```shell
+# 0. no need for HPC
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+
+# 1. Preparation
+# 1.1 Get source code
+# or git clone https://github.com/deepmodeling/deepmd-kit.git && cd deepmd-kit && git checkout devel
+# 下述链接为笔者自己的fork，时不时增加一些小改进，欢迎star
+git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli
+
+# 1.2 Create virtual environment
+# optional if you installed miniforge: alias conda="mamba"
+conda update -n base -c conda-forge conda -y ; conda update -n base -c conda-forge mamba -y
+conda deactivate && conda env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; conda create -n dpmd gcc=13 gxx=13 cmake python=3.13 -c conda-forge -y && conda activate dpmd && pip install --upgrade pip && pip install uv
+
+# 1.3 (Optional) install openmpi if you do not have mpi
+conda install openmpi -c conda-forge
+
+# 2.1 Install pytorch
+uv pip install -U torch --index-url https://download.pytorch.org/whl/cu129
+
+# 2.2 (Optional) Install tensorflow
+uv pip install -U tensorflow
+
+# 2.3 (Optional) Install jax
+uv pip install -U jax-ai-stack "jax[cuda]"
+
+# 3. Install deepmd-kit
+export DP_VARIANT="cuda" DP_ENABLE_PYTORCH=1 DP_ENABLE_TENSORFLOW=1 DP_ENABLE_PADDLE=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 CUDAToolkit_ROOT="/usr/local/cuda" CUDA_HOME="/usr/local/cuda" CUDA_PATH="/usr/local/cuda" CUDA_VERSION=12.9 && pip install -e . -v
+
+# 4. Install other useful packages
+uv pip install -U dpdata pymatgen freud-analysis tensorboard torch-tb-profiler seaborn ipykernel nglview git+https://gitlab.com/1041176461/ase-abacus.git
+```
+
+## 1.2 For Mac
+
+```shell
+# 1. Preparation
+# 1.1 Get source code
+# or git clone https://github.com/deepmodeling/deepmd-kit.git
+git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli
+# 1.2 Create virtual environment
+conda update -n base -c conda-forge conda -y ; conda update -n base -c conda-forge mamba -y
+conda deactivate && conda env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; mamba create -n dpmd compilers llvm-openmp python=3.13 -c conda-forge -y && mamba activate dpmd && pip install --upgrade pip && pip install uv
+
+# 2. Install pytorch
+uv pip install -U torch
+
+# 3. Install deepmd-kit
+export DP_ENABLE_PYTORCH=1 DP_ENABLE_PADDLE=0 DP_ENABLE_TENSORFLOW=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && uv pip install -e . -v
+
+# 4. Install other useful packages
+uv pip install -U dpdata pymatgen freud-analysis tensorboard torch-tb-profiler seaborn ipykernel nglview git+https://gitlab.com/1041176461/ase-abacus.git
+```
+
+# 2. Install the C++ interface
+
+If you do not need to use DeePMD-kit with LAMMPS or i-PI, the Python interface installed in the previous section is sufficient and you can safely skip this section.
+
+```shell
+# 0. (Optional) for reinstall
+export software="$HOME/Software"
+rm -rfv $software/deepmd-kit_cpp $software/deepmd-kit/source/build
+
+# 1. Environment Variables
+export deepmd_source_dir=$(pwd) && mkdir -p ../deepmd-kit_cpp && cd ../deepmd-kit_cpp && export deepmd_root=$(pwd) && cd ../deepmd-kit && cd source && mkdir -p build && cd build
+# export deepmd_source_dir="$software/deepmd-kit"
+# export deepmd_root="$software/deepmd-kit_cpp"
+
+# 2. CMake (choose either one)
+# 2.1 Option 1: use pytorch only
+cmake -DENABLE_PYTORCH=ON -DUSE_PT_PYTHON_LIBS=ON -DUSE_CUDA_TOOLKIT=ON -DENABLE_NATIVE_OPTIMIZATION=ON -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
+
+# 2.2 Option 2: use pytorch & tensorflow & jax
+cmake -DENABLE_TENSORFLOW=ON -DUSE_TF_PYTHON_LIBS=ON -DENABLE_PYTORCH=ON -DUSE_PT_PYTHON_LIBS=ON -DENABLE_JAX=ON -DUSE_CUDA_TOOLKIT=ON -DENABLE_NATIVE_OPTIMIZATION=ON -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
+
+# 3. Install
+make -j && make install
+```
+
+# 3. Install LAMMPS’s DeePMD-kit module (built-in mode)
+
+_Before following this section, the [DeePMD-kit C++ interface](https://docs.deepmodeling.com/projects/deepmd/en/master/install/install-from-source.html) should have been installed_ (see Section 2).
+
+```shell
+# 0.
+export software="$HOME/Software"
+make lammps && rm -rf $software/lammps
+
+# 1. Install requirements
+# Or conda install
+mamba install jpeg libpng zlib -c conda-forge -y
+
+# 2. Download Lammps
+cd $software && mkdir -p lammps && cd lammps && wget https://gh-proxy.com/github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz && tar xzf stable_22Jul2025.tar.gz && cd lammps-stable_22Jul2025 && mkdir -p build && cd build
+# wget https://github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz
+
+# 3. Compile
+echo "include($deepmd_source_dir/source/lmp/builtin.cmake)" >> ../cmake/CMakeLists.txt && export TORCH_CMAKE_DIR=$(python -c "import torch; print(torch.utils.cmake_prefix_path)") && export TF_LIB_PATH=$(find $CONDA_PREFIX -name "libtensorflow_framework.so.2" | xargs dirname)
+cmake -DLAMMPS_INSTALL_RPATH=ON -DBUILD_SHARED_LIBS=yes -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH" ../cmake
+make -j && make install
+
+# test
+$deepmd_root/bin/lmp -h
+```
+
+# 4. DPGEN2
+
+```shell
+# alias conda="mamba"
+export software="$HOME/Software"
+cd $software
+git clone git@github.com:OutisLi/dpgen2.git
+cd dpgen2 && conda activate dpmd && pip install uv dpdispatcher && uv pip install -e . -v
+```
diff --git a/source/lmp/builtin.cmake b/source/lmp/builtin.cmake
index e051e5c24a..e92468370b 100644
--- a/source/lmp/builtin.cmake
+++ b/source/lmp/builtin.cmake
@@ -57,6 +57,7 @@ configure_file("${CMAKE_CURRENT_LIST_DIR}/deepmd_version.h.in"
 
 file(GLOB DEEPMD_LMP_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
 
+find_package(Torch REQUIRED)
 find_package(DeePMD REQUIRED)
 target_sources(
   lammps

From 20e27f658963398ab1f861048f8555b21219f0d7 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Mon, 8 Sep 2025 21:02:12 +0800
Subject: [PATCH 02/11] add new folder outisli in doc

---
 {outisli => doc/outisli}/DPA3.md    | 0
 {outisli => doc/outisli}/install.md | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename {outisli => doc/outisli}/DPA3.md (100%)
 rename {outisli => doc/outisli}/install.md (100%)

diff --git a/outisli/DPA3.md b/doc/outisli/DPA3.md
similarity index 100%
rename from outisli/DPA3.md
rename to doc/outisli/DPA3.md
diff --git a/outisli/install.md b/doc/outisli/install.md
similarity index 100%
rename from outisli/install.md
rename to doc/outisli/install.md

From d9c4b90be76fda0611a147fce37d9c475cf40f8f Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 11 Sep 2025 14:01:24 +0800
Subject: [PATCH 03/11] add description for compression

---
 doc/outisli/DPA3.md     | 422 +++++++++++++++++++++++++++++++++-
 doc/outisli/compress.md | 493 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 908 insertions(+), 7 deletions(-)
 create mode 100644 doc/outisli/compress.md

diff --git a/doc/outisli/DPA3.md b/doc/outisli/DPA3.md
index b5ed63016c..68055772b5 100644
--- a/doc/outisli/DPA3.md
+++ b/doc/outisli/DPA3.md
@@ -323,21 +323,286 @@ DpLoaderSet (系统级 DataLoader 集合)
 3. **设备转移**: 数据移动到 GPU/CPU
 4. **输入分离**: 模型输入与标签分离
 
-### 2.4 DPAtomicModel 层次结构
+### 2.4 DeePMD-kit 模型架构完整层次结构
 
-DPAtomicModel 是 DeePMD-kit PyTorch 后端的核心原子模型基类，它继承自 BaseAtomicModel 并为各种物理性质的预测提供了统一的接口。
+#### 2.4.1 模型架构的设计理念
 
-#### 2.4.1 类继承层次
+DeePMD-kit 采用分层次、模块化的设计，从底层的原子级计算到顶层的完整模型，每一层都有明确的职责和功能分工。理解这个层次结构对于掌握 DeePMD-kit 的工作原理至关重要。
+
+#### 2.4.2 完整的模型层次结构
+
+##### 2.4.2.1 原子模型层次 (AtomicModel)
+
+**最基础的计算单元** - 负责原子级别的物理量计算：
 
 ```python
-# 基础层次结构
-BaseAtomicModel (base_atomic_model.py:52)
+# 抽象基类层
+ABC + PluginVariant + make_plugin_registry("atomic model")
+    ↓
+BaseAtomicModel_ (由 make_base_atomic_model() 动态生成)
+    ↓  
+BaseAtomicModel (deepmd/dpmodel/atomic_model/base_atomic_model.py:42)
+    ↓
+DPAtomicModel (deepmd/dpmodel/atomic_model/dp_atomic_model.py:29) - 注册为 "standard"
+    ↓
+具体的物理属性原子模型:
+├── DPEnergyAtomicModel (能量模型)
+├── DPDipoleAtomicModel (偶极子模型) 
+├── DPPolarAtomicModel (极化率模型)
+├── DPDOSAtomicModel (态密度模型)
+└── DPPropertyAtomicModel (通用属性模型)
+```
+
+**作用和用途**:
+- **核心计算单元**: 包含描述器(Descriptor) + 拟合网络(Fitting)
+- **原子级预测**: 负责单个原子的能量/力等物理量预测
+- **不直接用于训练**: 作为组件被更高层模型调用
+- **物理计算核心**: 所有物理计算都在这里发生
+
+##### 2.4.2.2 完整模型层次 (Model)
+
+**真正用于训练和推理的完整模型**：
+
+```python
+# 抽象基类层
+ABC + PluginVariant + make_plugin_registry("model")
+    ↓
+BaseBaseModel (由 make_base_model() 动态生成)
+    ↓
+BaseModel (deepmd/dpmodel/model/base_model.py:175)
+    ↓
+DPModelCommon (提供公共方法如 update_sel 等)
     ↓
-DPAtomicModel (dp_atomic_model.py:34) - 注册为 "standard"
+通过 make_model(T_AtomicModel) 动态生成的模型类
     ↓
-具体预测模型 (Energy, Dipole, Polar, DOS, Property)
+具体的完整模型实现:
+├── EnergyModel (deepmd/pt/model/model/ener_model.py:30) - 注册为 "ener" 
+├── DipoleModel - 注册为 "dipole"
+├── PolarModel - 注册为 "polar" 
+├── DOSModel - 注册为 "dos"
+└── PropertyModel - 注册为 "property"
 ```
 
+**作用和用途**:
+- **训练和推理接口**: `dp train input.json` 时创建的就是这个模型
+- **系统级功能**: 封装原子模型，添加邻居列表构建、坐标变换、批处理等
+- **梯度计算**: 自动计算力和应力
+- **输出格式转换**: 将原子级输出转换为标准格式
+
+##### 2.4.2.3 特殊模型层次 (LinearModel/ZBLModel)
+
+**线性组合和特殊模型**：
+
+```python
+BaseAtomicModel
+    ↓
+LinearEnergyAtomicModel (deepmd/dpmodel/atomic_model/linear_atomic_model.py:42) - 注册为 "linear"
+    ↓
+DPZBLLinearEnergyAtomicModel (线性组合DP和ZBL模型)
+    ↓
+通过 make_model(DPZBLLinearEnergyAtomicModel) 生成完整模型
+    ↓
+DPZBLModel (deepmd/dpmodel/model/dp_zbl_model.py:28) - 注册为 "zbl"
+```
+
+**作用和用途**:
+- **模型组合**: 线性组合多个原子模型
+- **物理修正**: DPZBLModel 结合深度势能和 ZBL 势函数
+- **特殊应用**: 处理短程排斥等特殊物理场景
+
+#### 2.4.3 模型创建和使用流程
+
+##### 2.4.3.1 训练时的模型创建流程
+
+```python
+# 1. 用户配置
+"model": {"type": "ener"}  # input.json 中
+
+# 2. 训练脚本执行
+dp train input.json
+  ↓
+# 3. 模型工厂创建 (deepmd/pt/entrypoints/main.py:248)
+model = get_model(model_params)  # 返回 EnergyModel 实例
+  ↓  
+# 4. EnergyModel 初始化流程
+# 4a. 创建 DPEnergyAtomicModel 实例（原子级计算核心）
+# 4b. 通过 make_model() 包装成完整模型（添加系统级功能）
+# 4c. 继承 DPModelCommon（添加公共方法）
+  ↓
+# 5. 训练循环中的调用
+model_pred = model.forward(coord, atype, box, ...)  # EnergyModel.forward()，返回预测值；损失由损失函数另行计算
+  ↓
+# 6. 内部调用链
+# forward() -> forward_common() -> forward_common_lower() -> forward_atomic()
+```
+
+##### 2.4.3.2 推理时的模型加载流程
+
+```python
+# 1. 模型加载
+model = torch.jit.load("frozen_model.pth")  # 实际是 EnergyModel 的实例
+  ↓
+# 2. 推理调用
+output = model(coord, atype, box)  # EnergyModel.forward()
+  ↓  
+# 3. 返回标准格式
+{"energy": ..., "force": ..., "virial": ...}
+```
+
+#### 2.4.4 设计模式和架构优势
+
+##### 2.4.4.1 核心设计模式
+
+**1. 工厂模式**
+- `make_base_atomic_model()`: 动态生成原子模型基类
+- `make_base_model()`: 动态生成最终模型基类  
+- `make_model(T_AtomicModel)`: 将原子模型包装成完整模型
+
+**2. 注册机制**
+- 使用 `@BaseAtomicModel.register()` 和 `@BaseModel.register()` 注册不同类型的模型
+- 支持通过字符串名称动态创建模型实例
+
+**3. 组合模式**
+- **DPAtomicModel**: 由描述器(Descriptor) + 拟合网络(Fitting) 组成
+- **LinearEnergyAtomicModel**: 线性组合多个原子模型
+- **DPZBLLinearEnergyAtomicModel**: 特殊的线性组合，结合DP模型和ZBL势函数
+
+**4. 多后端支持**
+每个后端(PyTorch/TensorFlow/JAX/Paddle)都有相应的实现，遵循相同的接口但针对特定框架优化。
+
+##### 2.4.4.2 架构优势
+
+**模块化**:
+- 描述器和拟合网络可以独立开发和组合
+- 不同物理量的预测可以共享相同的框架
+
+**可扩展性**: 
+- 容易添加新的物理属性或模型类型
+- 支持自定义描述器和拟合网络
+
+**多后端支持**: 
+- 同一套接口支持不同的深度学习框架
+- 代码复用和维护效率高
+
+**类型安全**: 
+- 通过注册机制确保模型类型的正确性
+- 编译时类型检查和运行时验证
+
+#### 2.4.7 模型压缩功能 (enable_compression)
+
+模型压缩是DeePMD-kit中一个重要的性能优化功能，通过表格化(tabulation)的方式来加速模型推理，特别适用于生产环境的部署。
+
+##### 2.4.7.1 压缩功能调用链
+
+**压缩入口点** (`deepmd/pt/entrypoints/compress.py:75`):
+```python
+model.enable_compression(
+    extrapolate,    # 外推尺度
+    stride,         # 步长1
+    stride * 10,    # 步长2
+)
+```
+
+**压缩方法层次**:
+```
+顶层模型压缩 (make_model.py:246-266)
+    ↓
+model.enable_compression()
+    ↓
+self.atomic_model.enable_compression(
+    self.get_min_nbor_dist(),  # 获取最小邻居距离
+    table_extrapolate,
+    table_stride_1,
+    table_stride_2, 
+    check_frequency,
+)
+    ↓
+原子模型和描述符的具体压缩实现
+```
+
+##### 2.4.7.2 压缩参数说明
+
+**关键参数**:
+- `table_extrapolate`: 模型外推的尺度参数，控制表格的外推范围
+- `table_stride_1`: 第一个表格的均匀步长，影响近程精度
+- `table_stride_2`: 第二个表格的均匀步长，影响远程精度
+- `check_frequency`: 溢出检查频率，用于数值稳定性监控
+- `get_min_nbor_dist()`: 动态获取训练数据中的最小邻居距离
+
+##### 2.4.7.3 压缩机制的实现原理
+
+**表格化加速**:
+1. **距离离散化**: 将连续的原子间距离离散化为表格索引
+2. **预计算存储**: 预先计算并存储常用距离范围内的描述符值
+3. **插值查表**: 推理时通过插值查表替代复杂的神经网络计算
+4. **内存换时间**: 牺牲一定内存空间换取显著的计算速度提升
+
+**多级表格策略**:
+- **近程高精度**: `table_stride_1` 控制近程的高精度表格
+- **远程适中精度**: `table_stride_2` 控制远程的适中精度表格
+- **平滑过渡**: 两个表格之间实现平滑过渡，避免不连续性
+
+##### 2.4.7.4 压缩的应用场景和优势
+
+**适用场景**:
+- **生产环境部署**: MD模拟中需要高频调用模型推理
+- **大规模系统**: 原子数量庞大，计算资源有限
+- **实时仿真**: 对推理速度有严格要求的应用
+
+**性能优势**:
+- **推理加速**: 可实现数倍到数十倍的推理速度提升
+- **内存可控**: 表格大小可通过步长参数灵活控制
+- **精度平衡**: 在速度和精度之间找到最优平衡点
+
+##### 2.4.7.5 压缩功能的使用建议
+
+**参数调优策略**:
+```python
+# 高精度场景 - 较小的步长，更高的精度
+model.enable_compression(
+    table_extrapolate=5.0,
+    table_stride_1=0.005,    # 更小的近程步长
+    table_stride_2=0.05,     # 更小的远程步长
+)
+
+# 高性能场景 - 较大的步长，更快的速度
+model.enable_compression(
+    table_extrapolate=3.0,
+    table_stride_1=0.02,     # 较大的近程步长
+    table_stride_2=0.2,      # 较大的远程步长
+)
+```
+
+**最佳实践**:
+1. **测试验证**: 压缩后务必验证模型精度是否满足要求
+2. **参数调优**: 根据具体应用场景调整步长参数
+3. **内存监控**: 关注压缩后的内存使用情况
+4. **性能测试**: 定量测试压缩带来的性能提升效果
+
+#### 2.4.8 在实际使用中的角色分工
+
+**对用户而言**:
+- **只需关心最终模型**: EnergyModel、DipoleModel 等
+- **配置简单**: 通过 JSON 配置文件指定模型类型
+- **接口统一**: 所有模型都使用相同的训练和推理接口
+
+**对开发者而言**:
+- **清晰的层次**: 每一层都有明确的职责
+- **易于扩展**: 在正确的层级添加新功能
+- **代码复用**: 通过工厂模式避免重复代码
+
+#### 2.4.9 模型架构总结
+
+**对用户而言**:
+- **只需关心最终模型**: EnergyModel、DipoleModel 等
+- **配置简单**: 通过 JSON 配置文件指定模型类型
+- **接口统一**: 所有模型都使用相同的训练和推理接口
+
+**对开发者而言**:
+- **清晰的层次**: 每一层都有明确的职责
+- **易于扩展**: 在正确的层级添加新功能
+- **代码复用**: 通过工厂模式避免重复代码
+
 **核心基类定义** (`deepmd/pt/model/atomic_model/dp_atomic_model.py:34`):
 
 ```python
@@ -356,6 +621,149 @@ class DPAtomicModel(BaseAtomicModel):
     """
 ```
 
+#### 2.4.6 Forward 方法的多层次架构
+
+DeePMD-kit 中存在多个不同的 forward 方法，每个都有特定的用途和调用层级。理解这些 forward 方法的分工和调用关系对于理解模型的执行流程至关重要。
+
+##### 2.4.6.1 Forward 方法层级结构
+
+**1. 用户接口层** - `forward()`
+```python
+# deepmd/pt/model/model/ener_model.py:94
+def forward(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
+```
+**用途**: 
+- **最高级的用户接口**，训练和推理时直接调用的方法
+- 接收原始的坐标、原子类型、盒子信息
+- 返回标准的物理量格式 `{"energy": ..., "force": ..., "virial": ...}`
+
+**什么时候使用**:
+- 训练时的损失函数计算
+- 推理时的预测
+- LAMMPS等MD引擎调用的接口
+
+**2. 坐标处理层** - `forward_common()`
+```python
+# deepmd/pt/model/model/make_model.py:152
+def forward_common(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
+```
+**用途**:
+- **处理坐标变换和邻居列表构建**
+- 将原始坐标转换为扩展坐标(包含ghost原子)
+- 构建邻居列表
+- 调用底层的`forward_common_lower()`
+
+**内部工作流程**:
+```python
+# 1. 坐标标准化和扩展
+extended_coord, extended_atype, mapping = extend_coord_with_ghosts(...)
+# 2. 构建邻居列表  
+nlist = build_neighbor_list(...)
+# 3. 调用底层计算
+model_ret = self.forward_common_lower(extended_coord, extended_atype, nlist, ...)
+```
+
+**3. 底层计算层** - `forward_common_lower()`
+```python
+# deepmd/pt/model/model/make_model.py:278
+def forward_common_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+**用途**:
+- **真正的模型计算逻辑**
+- 接收已处理好的扩展坐标和邻居列表
+- 调用原子模型进行实际计算
+- 处理输出的格式转换和reduction操作
+
+**4. 外部接口层** - `forward_lower()`
+```python  
+# deepmd/pt/model/model/ener_model.py:135
+def forward_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+**用途**:
+- **提供给外部程序的底层接口** (如LAMMPS插件)
+- 外部程序已经准备好了邻居列表，不需要DeePMD重新构建
+- 直接调用`forward_common_lower()`
+- 返回扩展区域的结果(不做reduction)
+
+**5. 原子级计算层** - `forward_atomic()`
+```python
+# deepmd/pt/model/atomic_model/dp_atomic_model.py:273  
+def forward_atomic(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+**用途**:
+- **最底层的原子级计算**
+- 描述器(Descriptor)计算原子环境表示
+- 拟合网络(Fitting)预测原子能量/力等
+- 返回原子级的预测结果
+
+##### 2.4.6.2 Forward 方法调用关系链
+
+**训练/推理时的完整调用链:**
+```python
+# 用户调用
+model.forward(coord, atype, box)
+  ↓
+# 坐标处理 
+model.forward_common(coord, atype, box)
+  ↓  
+# 坐标扩展 + 邻居列表构建
+extended_coord, nlist = preprocess(...)
+  ↓
+# 底层计算
+model.forward_common_lower(extended_coord, extended_atype, nlist)
+  ↓
+# 原子模型计算  
+atomic_ret = self.atomic_model.forward_atomic(extended_coord, extended_atype, nlist)
+  ↓
+# 输出转换和reduction
+return transform_output(atomic_ret)
+```
+
+**LAMMPS等外部程序调用:**
+```python
+# 外部程序已经有邻居列表
+model.forward_lower(extended_coord, extended_atype, nlist, mapping)
+  ↓
+# 直接底层计算
+model.forward_common_lower(extended_coord, extended_atype, nlist, mapping)  
+  ↓
+# 原子模型计算
+atomic_ret = self.atomic_model.forward_atomic(...)
+```
+
+##### 2.4.6.3 设计多层次 Forward 的原因
+
+**1. 性能优化**
+- `forward_lower()`: 外部程序可以复用邻居列表，避免重复计算
+- `forward_common_lower()`: 批处理时可以直接使用预构建的数据
+
+**2. 接口灵活性** 
+- `forward()`: 简单易用的高级接口
+- `forward_lower()`: 高性能的底层接口
+
+**3. 代码复用**
+- `forward_common()`: 坐标处理逻辑可以被多种模型复用
+- `forward_atomic()`: 原子级计算与系统级处理分离
+
+**4. 调试和测试**
+- 可以单独测试每个层级的功能
+- 便于定位性能瓶颈
+
+##### 2.4.6.4 实际使用建议
+
+**对于普通用户**:
+- **只需关心 `forward()`**: 训练和推理的标准接口
+- **偶尔使用 `forward_lower()`**: 如果你要写MD插件或需要高性能推理
+
+**对于开发者**:
+- **`forward_common` 系列**: 理解内部实现和优化的关键
+- **`forward_atomic()`**: 自定义原子模型时需要实现的核心方法
+
+**性能优化场景**:
+- **外部邻居列表**: 使用 `forward_lower()` 避免重复计算
+- **批处理优化**: 直接调用 `forward_common_lower()` 处理预处理好的数据
+- **调试分析**: 单独调用 `forward_atomic()` 分析原子级计算
+
 #### 2.4.2 具体派生模型
 
 **能量模型** (`deepmd/pt/model/atomic_model/energy_atomic_model.py:13`):
diff --git a/doc/outisli/compress.md b/doc/outisli/compress.md
new file mode 100644
index 0000000000..682f857611
--- /dev/null
+++ b/doc/outisli/compress.md
@@ -0,0 +1,493 @@
+# DeepMD-kit 模型压缩功能详细分析
+
+## 概述
+
+DeepMD-kit 的 compress 功能是一种模型优化技术，通过**表格化推理**（tabulated inference）、**算子融合**（operator merging）和**精确邻域索引**（precise neighbor indexing）三种技术来提高模型的推理性能。这些技术共同作用，可以显著减少内存使用和计算开销，同时保持模型精度的损失在可接受范围内。
+
+## 理论基础
+
+### 表格化推理
+
+压缩的核心思想是将神经网络推理替换为查表操作。对于输入维度为 1 的神经网络函数，可以使用分段多项式拟合来近似网络输出。
+
+#### 五次多项式拟合
+
+对于每个区间 $[x_l, x_{l+1})$，使用五次多项式来近似神经网络输出：
+
+```math
+g^l_m(x) = a^l_m (x - x_l)^5 + b^l_m (x - x_l)^4 + c^l_m (x - x_l)^3 + d^l_m (x - x_l)^2 + e^l_m (x - x_l) + f^l_m
+```
+
+多项式系数通过以下公式计算：
+
+```math
+a^l_m = \frac{1}{2\Delta x_l^5}[12h_{m,l}-6(y'_{m,l+1}+y'_{m,l})\Delta x_l + (y''_{m,l+1}-y''_{m,l})\Delta x_l^2]
+```
+
+```math
+b^l_m = \frac{1}{2\Delta x_l^4}[-30h_{m,l} +(14y'_{m,l+1}+16y'_{m,l})\Delta x_l + (-2y''_{m,l+1}+3y''_{m,l})\Delta x_l^2]
+```
+
+```math
+c^l_m = \frac{1}{2\Delta x_l^3}[20h_{m,l}-(8y'_{m,l+1}+12y'_{m,l})\Delta x_l + (y''_{m,l+1}-3y''_{m,l})\Delta x_l^2]
+```
+
+```math
+d^l_m = \frac{1}{2}y''_{m,l}
+```
+
+```math
+e^l_m = y'_{m,l}
+```
+
+```math
+f^l_m = y_{m,l}
+```
+
+其中：
+
+- $\Delta x_l = x_{l+1} - x_l$ 为区间长度
+- $h_{m,l} = y_{m,l+1} - y_{m,l}$
+- $y_{m,l} = y_m(x_l)$, $y'_{m,l} = y'_m(x_l)$, $y''_{m,l} = y''_m(x_l)$ 分别是函数值、一阶导数和二阶导数
+
+## PyTorch 后端实现流程
+
+### 总体架构
+
+PyTorch 后端的 compress 功能主要涉及以下组件：
+
+1. **入口函数**: `deepmd/pt/entrypoints/compress.py`
+2. **核心算法**: `deepmd/pt/utils/tabulate.py` 中的 `DPTabulate` 类
+3. **模型接口**: `deepmd/pt/model/model/make_model.py`
+4. **描述符接口**: `deepmd/pt/model/descriptor/se_a.py`
+5. **C++自定义 OP**: `source/op/pt/tabulate_multi_device.cc`
+
+### 详细实现流程
+
+#### 1. 入口函数执行
+
+```python
+def enable_compression(
+    input_file: str,
+    output: str,
+    stride: float = 0.01,
+    extrapolate: int = 5,
+    check_frequency: int = -1,
+    training_script: Optional[str] = None,
+) -> None:
+    # 1. 加载原始模型
+    saved_model = torch.jit.load(input_file, map_location="cpu")
+    model_def_script = json.loads(saved_model.model_def_script)
+    model = get_model(model_def_script)
+    model.load_state_dict(saved_model.state_dict())
+
+    # 2. 计算最小邻域距离（如果需要）
+    if model.get_min_nbor_dist() is None:
+        # 从训练数据计算最小邻域距离
+        # ... 计算逻辑
+
+    # 3. 启用压缩
+    model.enable_compression(
+        extrapolate,
+        stride,
+        stride * 10,
+        check_frequency,
+    )
+
+    # 4. 保存压缩模型
+    model = torch.jit.script(model)
+    torch.jit.save(model, output)
+```
+
+#### 2. 模型压缩启用
+
+模型的 `enable_compression` 方法调用：
+
+```python
+def enable_compression(
+    self,
+    table_extrapolate: float = 5,
+    table_stride_1: float = 0.01,
+    table_stride_2: float = 0.1,
+    check_frequency: int = -1,
+) -> None:
+    self.atomic_model.enable_compression(
+        self.get_min_nbor_dist(),
+        table_extrapolate,
+        table_stride_1,
+        table_stride_2,
+        check_frequency,
+    )
+```
+
+#### 3. 原子模型压缩启用
+
+原子模型调用描述符的压缩方法：
+
+```python
+def enable_compression(
+    self,
+    min_nbor_dist: float,
+    table_extrapolate: float = 5,
+    table_stride_1: float = 0.01,
+    table_stride_2: float = 0.1,
+    check_frequency: int = -1,
+) -> None:
+    self.descriptor.enable_compression(
+        min_nbor_dist,
+        table_extrapolate,
+        table_stride_1,
+        table_stride_2,
+        check_frequency,
+    )
+```
+
+#### 4. 描述符压缩实现
+
+SE_A 描述符的压缩实现：
+
+```python
+def enable_compression(
+    self,
+    min_nbor_dist: float,
+    table_extrapolate: float = 5,
+    table_stride_1: float = 0.01,
+    table_stride_2: float = 0.1,
+    check_frequency: int = -1,
+) -> None:
+    if self.compress:
+        raise ValueError("Compression is already enabled.")
+
+    data = self.serialize()
+    self.table = DPTabulate(
+        self,
+        data["neuron"],
+        data["type_one_side"],
+        data["exclude_types"],
+        ActivationFn(data["activation_function"]),
+    )
+
+    self.table_config = [
+        table_extrapolate,
+        table_stride_1,
+        table_stride_2,
+        check_frequency,
+    ]
+
+    self.lower, self.upper = self.table.build(
+        min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2
+    )
+
+    self.sea.enable_compression(
+        self.table.data, self.table_config, self.lower, self.upper
+    )
+```
+
+### DPTabulate 类详解
+
+`DPTabulate` 类是压缩功能的核心，继承自 `BaseTabulate`。
+
+#### 初始化过程
+
+```python
+def __init__(
+    self,
+    descrpt: Any,
+    neuron: list[int],
+    type_one_side: bool = False,
+    exclude_types: list[list[int]] = [],
+    activation_fn: ActivationFn = ActivationFn("tanh"),
+) -> None:
+    super().__init__(
+        descrpt,
+        neuron,
+        type_one_side,
+        exclude_types,
+        True,  # is_pt=True
+    )
+    self.descrpt_type = self._get_descrpt_type()
+    # ... 初始化各种参数
+```
+
+#### 构建表格过程
+
+`build` 方法是表格化实现的核心：
+
+```python
+def build(
+    self, min_nbor_dist: float, extrapolate: float, stride0: float, stride1: float
+) -> tuple[dict[str, int], dict[str, int]]:
+    # 1. 计算环境矩阵范围
+    lower, upper = self._get_env_mat_range(min_nbor_dist)
+
+    # 2. 根据描述符类型构建表格
+    if self.descrpt_type == "A":
+        # SE_A描述符的处理逻辑
+        for ii in range(self.table_size):
+            # 计算表格范围
+            xx = np.arange(ll, uu, stride0, dtype=self.data_type)
+            xx = np.append(xx, np.arange(uu, extrapolate * uu, stride1, dtype=self.data_type))
+            xx = np.append(xx, np.array([extrapolate * uu], dtype=self.data_type))
+
+            # 构建子表格
+            self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
+
+    # 3. 转换数据格式
+    self._convert_numpy_to_tensor()
+    return self.lower, self.upper
+```
+
+#### \_build_lower 方法详解
+
+这个方法实现具体的表格构建算法：
+
+```python
+def _build_lower(
+    self,
+    net: int,
+    xx: np.ndarray,
+    idx: int,
+    upper: float,
+    lower: float,
+    stride0: int,
+    stride1: int,
+    extrapolate: bool,
+    nspline: int,
+) -> None:
+    # 1. 计算函数值、导数
+    vv, dd, d2 = self._make_data(xx, idx)
+
+    # 2. 初始化表格数据存储
+    self.data[net] = np.zeros([nspline, 6 * self.last_layer_size], dtype=self.data_type)
+
+    # 3. 计算区间长度
+    tt = np.full((nspline, self.last_layer_size), stride1)
+    tt[: int((upper - lower) / stride0), :] = stride0
+
+    # 4. 计算函数值差分
+    hh = vv[1 : nspline + 1, : self.last_layer_size] - vv[:nspline, : self.last_layer_size]
+
+    # 5. 计算五次多项式系数
+    # 常数项 f^l_m = y_{m,l}
+    self.data[net][:, : 6 * self.last_layer_size : 6] = vv[:nspline, : self.last_layer_size]
+
+    # 一次项系数 e^l_m = y'_{m,l}
+    self.data[net][:, 1 : 6 * self.last_layer_size : 6] = dd[:nspline, : self.last_layer_size]
+
+    # 二次项系数 d^l_m = 0.5 * y''_{m,l}
+    self.data[net][:, 2 : 6 * self.last_layer_size : 6] = 0.5 * d2[:nspline, : self.last_layer_size]
+
+    # 三次项系数 c^l_m
+    self.data[net][:, 3 : 6 * self.last_layer_size : 6] = (
+        1 / (2 * tt * tt * tt)
+    ) * (
+        20 * hh
+        - (8 * dd[1 : nspline + 1, : self.last_layer_size] + 12 * dd[:nspline, : self.last_layer_size]) * tt
+        - (3 * d2[:nspline, : self.last_layer_size] - d2[1 : nspline + 1, : self.last_layer_size]) * tt * tt
+    )
+
+    # 四次项系数 b^l_m
+    self.data[net][:, 4 : 6 * self.last_layer_size : 6] = (
+        1 / (2 * tt * tt * tt * tt)
+    ) * (
+        -30 * hh
+        + (14 * dd[1 : nspline + 1, : self.last_layer_size] + 16 * dd[:nspline, : self.last_layer_size]) * tt
+        + (3 * d2[:nspline, : self.last_layer_size] - 2 * d2[1 : nspline + 1, : self.last_layer_size]) * tt * tt
+    )
+
+    # 五次项系数 a^l_m
+    self.data[net][:, 5 : 6 * self.last_layer_size : 6] = (
+        1 / (2 * tt * tt * tt * tt * tt)
+    ) * (
+        12 * hh
+        - 6 * (dd[1 : nspline + 1, : self.last_layer_size] + dd[:nspline, : self.last_layer_size]) * tt
+        + (d2[1 : nspline + 1, : self.last_layer_size] - d2[:nspline, : self.last_layer_size]) * tt * tt
+    )
+
+    # 6. 记录边界信息
+    self.upper[net] = upper
+    self.lower[net] = lower
+```
+
+#### \_make_data 方法详解
+
+这个方法通过前向传播计算函数值和导数：
+
+```python
+def _make_data(self, xx: np.ndarray, idx: int) -> Any:
+    xx = torch.from_numpy(xx).view(-1, 1).to(env.DEVICE)
+
+    for layer in range(self.layer_size):
+        if layer == 0:
+            # 第一层特殊处理
+            xbar = torch.matmul(xx, torch.from_numpy(self.matrix["layer_" + str(layer + 1)][idx]).to(env.DEVICE))
+            xbar += torch.from_numpy(self.bias["layer_" + str(layer + 1)][idx]).to(env.DEVICE)
+
+            # 根据神经元数量选择处理方式
+            if self.neuron[0] == 1:
+                yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx])
+                yy += xx
+                dy = unaggregated_dy_dx_s(yy - xx, self.matrix["layer_" + str(layer + 1)][idx], xbar, self.functype)
+                dy += torch.ones((1, 1), dtype=yy.dtype)
+                dy2 = unaggregated_dy2_dx_s(yy - xx, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, self.functype)
+            # ... 其他情况处理
+        else:
+            # 后续层处理
+            # ... 类似的计算逻辑
+
+    # 返回函数值、导数和二阶导数
+    vv = zz.detach().cpu().numpy().astype(self.data_type)
+    dd = dy.detach().cpu().numpy().astype(self.data_type)
+    d2 = dy2.detach().cpu().numpy().astype(self.data_type)
+    return vv, dd, d2
+```
+
+### 算子融合技术
+
+算子融合是压缩功能的重要优化技术，它通过以下方式提高性能：
+
+1. **矩阵乘法融合**: 将嵌入层的输出与环境矩阵的乘法合并到表格化过程中
+2. **内存访问优化**: 避免了嵌入矩阵在寄存器和内存之间的频繁传输
+3. **计算优化**: 减少了不必要的中间结果存储
+
+#### 融合前后的对比
+
+**传统方式**:
+
+```python
+# 1. 计算嵌入层输出 G
+G = embedding_net(env_matrix)
+# 2. 矩阵乘法 G^T @ R
+result = G.T @ env_matrix
+# 3. 存储中间结果G
+```
+
+**融合方式**:
+
+```python
+# 直接在表格化过程中完成乘法
+result = tabulate_fusion(G, R)  # G的计算和乘法在同一个内核中完成
+```
+
+### 精确邻域索引技术
+
+精确邻域索引通过以下方式优化性能：
+
+1. **动态邻域数量**: 根据实际邻域数量而不是最大邻域数量进行计算
+2. **减少无效计算**: 避免对填充的零值进行计算
+3. **内存效率**: 减少内存使用和带宽消耗
+
+### C++自定义算子实现
+
+PyTorch 后端使用 C++自定义算子来实现高性能的表格化推理：
+
+#### 主要函数接口
+
+```cpp
+template <typename FPTYPE>
+void TabulateFusionSeAForward(
+    const torch::Tensor& table_tensor,      // 表格数据
+    const torch::Tensor& table_info_tensor, // 表格信息
+    const torch::Tensor& em_x_tensor,       // 环境矩阵x分量
+    const torch::Tensor& em_tensor,         // 环境矩阵
+    const torch::Tensor& two_embed_tensor,  // 二体嵌入
+    int64_t last_layer_size,               // 最后一层大小
+    torch::Tensor& descriptor_tensor       // 输出描述符
+);
+```
+
+#### CPU/GPU 统一接口
+
+代码同时支持 CPU 和 GPU 计算：
+
+```cpp
+if (device == "GPU") {
+    deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
+                                     two_embed, nloc, nnei, last_layer_size);
+} else if (device == "CPU") {
+    deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
+                                     two_embed, nloc, nnei, last_layer_size);
+}
+```
+
+## TensorFlow 后端实现差异
+
+TensorFlow 后端的实现与 PyTorch 有所不同：
+
+### 主要差异
+
+1. **重新训练方式**: TensorFlow 后端通过重新训练模型来生成压缩版本
+2. **图操作**: 使用 TensorFlow 的图模式进行优化
+3. **静态图**: 生成的压缩模型是静态图格式
+
+### 实现流程
+
+```python
+def compress(
+    *,
+    input: str,
+    output: str,
+    extrapolate: int,
+    step: float,
+    frequency: str,
+    checkpoint_folder: str,
+    training_script: str,
+    # ... 其他参数
+) -> None:
+    # 1. 加载原始模型
+    graph, _ = load_graph_def(input)
+
+    # 2. 创建压缩配置文件
+    jdata["model"]["compress"] = {}
+    jdata["model"]["compress"]["model_file"] = input
+    jdata["model"]["compress"]["table_config"] = [
+        extrapolate, step, 10 * step, int(frequency)
+    ]
+
+    # 3. 重新训练模型
+    train(...)
+
+    # 4. 冻结模型
+    freeze(checkpoint_folder=checkpoint_folder, output=output, node_names=None)
+```
+
+## 性能优化效果
+
+### 加速效果
+
+- **推理速度**: 通常可以达到 10 倍以上的加速
+- **内存使用**: 可以减少 20 倍以上的内存使用
+- **GPU 利用率**: 更好的内存访问模式提高 GPU 利用率
+
+### 精度保持
+
+- **相对误差**: 通常在 1%以内
+- **力场一致性**: 保持物理量的正确性
+- **能量守恒**: 维持系统能量守恒特性
+
+## 使用建议
+
+### 参数选择
+
+1. **stride 参数**: 控制表格精度，越小精度越高但内存使用越大
+2. **extrapolate 参数**: 控制外推范围，需要根据应用场景选择
+3. **check_frequency**: 控制溢出检查频率，影响性能
+
+### 适用场景
+
+1. **生产环境**: 适合大规模 MD 模拟
+2. **实时应用**: 适合需要快速响应的场景
+3. **资源受限**: 适合内存和计算资源受限的环境
+
+### 注意事项
+
+1. **模型版本**: 确保使用支持压缩的模型版本
+2. **描述符类型**: 确认描述符类型支持压缩
+3. **训练数据**: 需要提供足够的训练数据来计算统计信息
+4. **验证测试**: 建议在压缩后进行充分的验证测试
+
+## 总结
+
+DeepMD-kit 的 compress 功能通过表格化推理、算子融合和精确邻域索引三种技术实现了模型推理性能的显著提升。在保持模型精度的同时，可以获得 10 倍以上的性能提升和 20 倍以上的内存节省。这些技术不仅优化了计算效率，还提高了内存访问的局部性，是深度学习在科学计算领域应用的重要优化案例。
+
+<!-- 备注：以下为生成本文档时使用的提示语，并非文档正文：请详细梳理这个程序的 compress 功能，尤其是针对 pytorch 后端的，整个实现的代码执行流程，以及具体的实现方法，然后整理到 doc/outisli/compress.md 这个 markdown 文件中，一定要准确，详细，充分完全，逻辑条理清晰 -->

From c787b3ec589899bb00718795fa09d679cb541e2c Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Mon, 15 Sep 2025 16:59:36 +0800
Subject: [PATCH 04/11] update CLAUDE.md

---
 .gitignore                |   6 +
 CLAUDE.md                 | 174 +++++++--
 doc/outisli/compress.md   | 769 ++++++++++++++++++++------------------
 examples/water/.gitignore |   1 +
 4 files changed, 555 insertions(+), 395 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6382ecedd2..231916a923 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,8 @@ uv.lock
 buildcxx/
 node_modules/
 *.bib.original
+.claude
+*.hdf5
 
 # Coverage files
 .coverage
@@ -71,3 +73,7 @@ frozen_model.*
 
 # Test system directories
 system/
+
+# clangd
+compile_commands.json
+source/.cache
diff --git a/CLAUDE.md b/CLAUDE.md
index ad1443d8b9..83140fe346 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -32,6 +32,12 @@ DeePMD-kit is a deep learning package for many-body potential energy representat
 - **JAX**: Enable with `DP_ENABLE_JAX=1` (requires Python >= 3.10)
 - **Paddle**: Enable with `DP_ENABLE_PADDLE=1`
 
+### Model Compression
+- **Compress models**: `dp --pt compress -i model.pth -o compressed.pth`
+- **Custom parameters**: `dp --pt compress -i model.pth -o compressed.pth -s 0.005 -e 10`
+- **Supported descriptors (PyTorch backend)**: SE_A, SE_R, SE_T, SE_Atten, DPA1, DPA2
+- **DPA3 not supported**: Compression explicitly disabled for DPA3 descriptors
+
 ## Architecture Overview
 
 ### Multi-Backend Design
@@ -55,63 +61,78 @@ Framework-agnostic model implementations:
 DPA3 (Deep Potential - Atomic Environment Representation with 3-body interactions) is an advanced descriptor that combines node, edge, and angle information for more accurate atomic environment representation.
 
 **Key Components**:
-- **Main Descriptor**: `DescrptDPA3` in `deepmd/pt/model/descriptor/dpa3.py`
-- **RepFlow Block**: `DescrptBlockRepflows` in `deepmd/pt/model/descriptor/repflows.py`
-- **RepFlow Layer**: `RepFlowLayer` in `deepmd/pt/model/descriptor/repflow_layer.py`
+- **Main Descriptor**: `DescrptDPA3` in `deepmd/pt/model/descriptor/dpa3.py:105-171`
+- **RepFlow Block**: `DescrptBlockRepflows` in `deepmd/pt/model/descriptor/repflows.py:77-200`
+- **RepFlow Layer**: `RepFlowLayer` in `deepmd/pt/model/descriptor/repflow_layer.py:38-200`
+
+**DPA3 Core Innovation**: The RepFlow architecture introduces a unified representation that iteratively refines node, edge, and angle information through multiple layers, enabling explicit 3-body interaction modeling while maintaining computational efficiency through message compression strategies.
 
 #### DPA3 Initialization and Forward Pass
 **Initialization** (`dpa3.py:105-171`):
-- Processes RepFlow parameters
-- Creates type embedding network (`TypeEmbedNetConsistent`)
-- Initializes RepFlow blocks with edge/angle embedding networks
-- Sets up multiple RepFlow layers for iterative refinement
+- Processes RepFlow parameters with `init_subclass_params(repflow, RepFlowArgs)`
+- Creates type embedding network (`TypeEmbedNetConsistent`) for consistent atomic type representations
+- Initializes RepFlow blocks with edge/angle embedding networks for distance and angular information
+- Sets up multiple RepFlow layers for iterative refinement with configurable residual connections
 
 **Forward Pass** (`dpa3.py:430-498`):
-1. **Type Embedding**: Computes atomic type embeddings
-2. **RepFlow Processing**: Multi-layer node/edge/angle information processing
-3. **Output**: Returns node descriptors, rotation matrices, edge embeddings, and switch functions
+1. **Type Embedding**: Computes atomic type embeddings using `TypeEmbedNetConsistent`
+2. **RepFlow Processing**: Multi-layer node/edge/angle information processing through iterative updates
+3. **Output Generation**: Returns comprehensive atomic environment representation with rotation matrices for SE(3) equivariance
 
 **DPA3 Output Variables**:
-- `node_ebd`: Node descriptors [nf, nloc, n_dim] - main atomic environment representation
-- `rot_mat`: Rotation matrices [nf, nloc, e_dim, 3] - for SE(3) equivariance
-- `edge_ebd`: Edge embeddings [nf, nloc, nnei, e_dim] - pairwise interactions
-- `h2`: Angle information [nf, nloc, nnei, 3] - 3-body angular data
-- `sw`: Switch functions [nf, nloc, nnei] - smooth cutoff boundaries
+- `node_ebd`: Node descriptors [nf, nloc, n_dim] - primary atomic environment representation for fitting networks
+- `rot_mat`: Rotation matrices [nf, nloc, e_dim, 3] - ensures SE(3) equivariance for coordinate transformations
+- `edge_ebd`: Edge embeddings [nf, nloc, nnei, e_dim] - pairwise interaction information
+- `h2`: Angle information [nf, nloc, nnei, 3] - 3-body angular data for explicit three-body interactions
+- `sw`: Switch functions [nf, nloc, nnei] - smooth cutoff boundaries to avoid discontinuities
 
 #### RepFlow Implementation
 **RepFlow Block** (`repflows.py:77-200`):
-- Edge embedding network for distance information
-- Angle embedding network for angular information
-- Multiple RepFlow layers for iterative updates
-- Support for message compression and multi-head attention
+- Edge embedding network (`MLPLayer`) for distance information encoding
+- Angle embedding network for angular relationship processing  
+- Multiple RepFlow layers (`RepFlowLayer`) for iterative node/edge/angle updates
+- Support for message compression (`a_compress_rate`) and attention mechanisms to reduce computational cost
+- Environment matrix computation via `prod_env_mat` for neighbor distance and direction calculation
 
 **Key Parameters**:
-- `e_rcut`/`e_rcut_smth`: Edge cutoff and smoothing radii
-- `a_rcut`/`a_rcut_smth`: Angle cutoff and smoothing radii  
-- `n_dim`/`e_dim`/`a_dim`: Node/edge/angle representation dimensions
-- `nlayers`: Number of RepFlow layers
-- `update_style`: Residual connection strategies (res_residual, res_update, etc.)
+- `e_rcut`/`e_rcut_smth`: Edge cutoff (6.0Å) and smoothing radii (0.5Å) for neighbor selection
+- `a_rcut`/`a_rcut_smth`: Angle cutoff (4.0Å) and smoothing radii for three-body interactions  
+- `n_dim`/`e_dim`/`a_dim`: Node (128), edge (64), angle (32) representation dimensions
+- `nlayers`: Number of RepFlow layers (6) for iterative refinement
+- `update_style`: Residual connection strategies (`res_residual`, `res_update`, `force_residual`) for gradient flow optimization
+- `a_compress_rate`: Angle compression factor (2) to reduce computational overhead while preserving angular information
 
 #### CLI Usage and Training Flow
-**Training Command**: `dp train input.json`
+**Training Command**: `dp --pt train input.json` (specify PyTorch backend explicitly)
 
 **Execution Flow**:
-1. **Entry Point**: `deepmd.pt.entrypoints.main.train()` (`main.py:237-248`)
-2. **Configuration Loading**: JSON parsing and multi-task handling
-3. **Neighbor Statistics**: Automatic selection parameter computation
-4. **Trainer Creation**: `get_trainer()` with model initialization
-5. **Model Building**: DPA3 descriptor creation via `get_model()`
+1. **Entry Point**: `deepmd.pt.entrypoints.main.train()` (`main.py:248-372`) - PyTorch-specific training entry
+2. **Configuration Loading**: JSON parsing via `j_loader()` with multi-task handling through `preprocess_shared_params()`
+3. **Neighbor Statistics**: Automatic selection parameter computation via `BaseModel.update_sel()` unless `--skip-neighbor-stat`
+4. **Trainer Creation**: `get_trainer()` with model initialization, supporting distributed training and mixed precision
+5. **Model Building**: DPA3 descriptor creation via `get_model()` with automatic device placement and JIT compilation options
+
+**Data Processing Pipeline**:
+1. **Raw Data Loading**: `DeepmdData` loads HDF5/.npy files from system directories
+2. **System DataLoaders**: Each system gets its own DataLoader (num_workers=0 to avoid thread explosion)  
+3. **Training DataLoader**: Master DataLoader with intelligent sampling (`WeightedRandomSampler` or uniform)
+4. **Batch Processing**: `collate_batch()` handles variable-sized systems with padding and tensor stacking
 
 #### Precision Control
-DPA3 supports two levels of precision control:
+DPA3 supports two levels of precision control that work independently:
 
-**Environment Variable Control**:
-```bash
-export DP_INTERFACE_PREC=high  # Default: float64 interface
-export DP_INTERFACE_PREC=low   # Lower memory: float32 interface
-```
+**Environment Variable Control (`DP_INTERFACE_PREC`)**:
+- **Scope**: Global interface precision affecting input/output data types across all DeePMD-kit operations
+- **High precision** (`export DP_INTERFACE_PREC=high`): `GLOBAL_NP_FLOAT_PRECISION = np.float64`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
+- **Low precision** (`export DP_INTERFACE_PREC=low`): `GLOBAL_NP_FLOAT_PRECISION = np.float32`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64` (energy precision remains high)
+- **Location**: `deepmd/env.py:33-48`
+
+**Model Parameter Control (`precision` in configuration)**:
+- **Scope**: Component-specific precision for neural network weights and calculations
+- **Options**: `"float64"`, `"float32"`, `"float16"`, `"default"`
+- **Granular Control**: Can be set individually for descriptor, fitting networks, and RepFlow components
+- **Example Configuration**:
 
-**Model Parameter Control**:
 ```json
 {
   "model": {
@@ -121,11 +142,20 @@ export DP_INTERFACE_PREC=low   # Lower memory: float32 interface
       "repflow": {
         "precision": "float32"
       }
+    },
+    "fitting_net": {
+      "precision": "float32"
     }
   }
 }
 ```
 
+**Precision Workflow** (`make_model.py:327-337`):
+1. **Input Type Detection**: `input_type_cast()` detects input data precision
+2. **Global Precision Conversion**: Converts to `GLOBAL_PT_FLOAT_PRECISION` for computation
+3. **Component Computation**: Uses component-specific precision settings
+4. **Output Conversion**: `output_type_cast()` converts back to original input precision
+
 #### Inference System
 **Main Classes**:
 - `DeepEval`: Universal inference interface (`deepmd/pt/infer/deep_eval.py:75`)
@@ -176,6 +206,73 @@ DPA3 implements a two-stage energy calculation:
 - Atomic energy: `deepmd/pt/model/task/fitting.py:473-614`
 - Energy summation: `deepmd/pt/model/model/transform_output.py:153-192`
 
+### Model Compression System
+
+#### Compression Overview
+DeePMD-kit supports model compression through tabulation of embedding networks, providing significant inference speedup by replacing neural network computations with polynomial interpolation lookups.
+
+**Core Concept**: 
+- Pre-compute embedding network outputs and store in lookup tables
+- Split each table into two segments with different stride sizes (fine, then coarse) to balance accuracy against memory
+- Replace runtime neural network evaluations with fast polynomial interpolation
+
+#### Compression Architecture
+**Entry Points**:
+- Command: `dp --pt compress -i model.pth -o compressed.pth`
+- Main entry: `deepmd/main.py` → `deepmd/pt/entrypoints/main.py:574-582`
+- Core function: `deepmd/pt/entrypoints/compress.py:32-84`
+
+**Execution Flow**:
+1. **Model Loading**: Load JIT model and reconstruct model instance
+2. **Min Distance Calculation**: Compute minimum neighbor distance from training data
+3. **Hierarchical Compression**: Model → Atomic Model → Descriptor compression
+4. **Table Building**: Create polynomial coefficient tables via `DPTabulate`
+5. **JIT Serialization**: Save compressed model as TorchScript
+
+#### Supported Descriptors
+**Fully Supported**:
+- `SE_A` (`se_a.py:257-302`): Smooth Edition Angular descriptor
+- `SE_R` (`se_r.py`, from line 359): Smooth Edition Radial descriptor
+- `SE_T` (`se_t.py:284-327`): Smooth Edition Three-body descriptor
+- `SE_Atten` (`se_atten.py:427-448`): Smooth Edition with Attention
+- `DPA1` (`dpa1.py:572-645`): Deep Potential Attention version 1
+- `DPA2` (`dpa2.py:893-973`): Deep Potential Attention version 2
+
+**Not Supported**:
+- `DPA3` (`dpa3.py:578-601`): Explicitly raises `NotImplementedError`
+- `Pairtab` models: No tabulation compression support
+
+#### Tabulation Implementation
+**Key Class**: `DPTabulate` (`deepmd/pt/utils/tabulate.py:30-100`)
+
+**Table Building Process**:
+1. **Range Calculation**: Compute environment matrix bounds from training data statistics
+2. **Grid Generation**: Create two-segment distance grids (fine + coarse stride)
+3. **Neural Network Evaluation**: Forward pass to get function values and derivatives
+4. **Polynomial Fitting**: Generate 5th-order Hermite interpolation coefficients
+
+**Data Storage Format**:
+- `compress_info`: [lower, upper, extrapolate_upper, stride1, stride2, check_freq]
+- `compress_data`: [nspline, 6 * last_layer_size] coefficient tables
+- Coefficients: [f(x), f'(x), f''(x)/2, c3, c4, c5] per neuron
+
+#### Performance Characteristics
+**Memory Optimization**:
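+- Two-segment table: fine stride (default 0.01) from `lower` to `upper`, then coarse stride (10× the fine stride, 0.1 by default) from `upper` to `extrapolate × upper`
+- Extrapolation region: the table extends to 5× the upper bound of the environment-matrix range by default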
+- Removes original network weights after compression
+
+**Computational Benefits**:
+- Eliminates matrix operations in embedding networks
+- Vectorized polynomial evaluation
+- Cache-friendly data layout for lookup tables
+
+#### Configuration Parameters
+- `-s, --step`: Fine stride size (default: 0.01) - affects accuracy vs memory
+- `-e, --extrapolate`: Extrapolation multiplier (default: 5)
+- `-f, --frequency`: Overflow check frequency (default: -1, disabled)  
+- `-t, --training-script`: Training script path for min distance calculation
+
 #### 2. Backend-Specific Implementations
 - `deepmd/tf/`: TensorFlow backend (original implementation)
 - `deepmd/pt/`: PyTorch backend 
@@ -291,11 +388,14 @@ Support for training multiple properties simultaneously:
 - C++ library handles neighbor lists and environment matrices
 - Custom operators optimized for GPU acceleration
 - Automatic mixed precision support where available
+- **Model compression**: Tabulation provides 2-10× inference speedup for supported descriptors
 
 ### Common Pitfalls
 - Backend-specific imports are banned at module level (use runtime imports)
 - Model compatibility requires careful version management
 - GPU builds require specific CUDA/ROCm versions
+- **Compression limitations**: DPA3 and some specialized models don't support compression
+- **Training data dependency**: If the model does not already store a minimum neighbor distance, compression needs the training script (`-t`) to compute it from the training data
 
 ## File Structure Conventions
 
diff --git a/doc/outisli/compress.md b/doc/outisli/compress.md
index 682f857611..d572402b83 100644
--- a/doc/outisli/compress.md
+++ b/doc/outisli/compress.md
@@ -1,138 +1,117 @@
-# DeepMD-kit 模型压缩功能详细分析
+# DeePMD-kit 压缩功能详细分析
 
 ## 概述
 
-DeepMD-kit 的 compress 功能是一种模型优化技术，通过**表格化推理**（tabulated inference）、**算子融合**（operator merging）和**精确邻域索引**（precise neighbor indexing）三种技术来提高模型的推理性能。这些技术共同作用，可以显著减少内存使用和计算开销，同时保持模型精度的损失在可接受范围内。
+DeePMD-kit 的 compress 功能通过将 embedding networks 进行 tabulation（查表法）来实现模型压缩，显著提升推理速度并减少内存占用。
 
-## 理论基础
+## 核心原理
 
-### 表格化推理
+### 基本思想
 
-压缩的核心思想是将神经网络推理替换为查表操作。对于输入维度为 1 的神经网络函数，可以使用分段多项式拟合来近似网络输出。
+1. **预计算查表**：将 embedding networks 的输出预先计算并存储在表格中
+2. **分段插值**：使用两个不同步长的表格来平衡精度与存储成本：
+   - 第一段表格：使用精细步长（stride0）
+   - 第二段表格：使用粗糙步长（stride1 = 10 × stride0）
+3. **多项式插值**：基于查表结果进行五次多项式插值
 
-#### 五次多项式拟合
+## PyTorch 后端实现
 
-对于每个区间 $[x_l, x_{l+1})$，使用五次多项式来近似神经网络输出：
+### 1. 命令行入口
 
-```math
-g^l_m(x) = a^l_m x^5 + b^l_m x^4 + c^l_m x^3 + d^l_m x^2 + e^l_m x + f^l_m
-```
-
-多项式系数通过以下公式计算：
+#### 主入口
+- **文件位置**: `deepmd/main.py`
+- **命令示例**: `dp --pt compress -i model.pth -o compressed_model.pth`
 
-```math
-a^l_m = \frac{1}{2\Delta x_l^5}[12h_{m,l}-6(y'_{m,l+1}+y'_{m,l})\Delta x_l + (y''_{m,l+1}-y''_{m,l})\Delta x_l^2]
+#### 参数配置
+```python
+parser_compress.add_argument("-s", "--step", default=0.01, type=float)      # stride0
+parser_compress.add_argument("-e", "--extrapolate", default=5, type=int)    # 外推倍数
+parser_compress.add_argument("-f", "--frequency", default=-1, type=int)     # 溢出检查频率
+parser_compress.add_argument("-t", "--training-script", type=str)           # 训练脚本
 ```
 
-```math
-b^l_m = \frac{1}{2\Delta x_l^4}[-30h_{m,l} +(14y'_{m,l+1}+16y'_{m,l})\Delta x_l + (-2y''_{m,l+1}+3y''_{m,l})\Delta x_l^2]
+#### 命令分发
+```python
+# deepmd/main.py:1013-1018
+elif args.command in ("compress", "train", "freeze", ...):
+    deepmd_main = BACKENDS[args.backend]().entry_point_hook
 ```
 
-```math
-c^l_m = \frac{1}{2\Delta x_l^3}[20h_{m,l}-(8y'_{m,l+1}+12y'_{m,l})\Delta x_l + (y''_{m,l+1}-3y''_{m,l})\Delta x_l^2]
-```
+### 2. PyTorch 后端处理
 
-```math
-d^l_m = \frac{1}{2}y''_{m,l}
-```
+#### 入口函数
+**文件位置**: `deepmd/pt/entrypoints/main.py:574-582`
 
-```math
-e^l_m = y'_{m,l}
-```
-
-```math
-f^l_m = y_{m,l}
+```python
+elif FLAGS.command == "compress":
+    FLAGS.input = str(Path(FLAGS.input).with_suffix(".pth"))
+    FLAGS.output = str(Path(FLAGS.output).with_suffix(".pth"))
+    enable_compression(
+        input_file=FLAGS.input,
+        output=FLAGS.output,
+        stride=FLAGS.step,
+        extrapolate=FLAGS.extrapolate,
+        check_frequency=FLAGS.frequency,
+        training_script=FLAGS.training_script,
+    )
 ```
 
-其中：
+#### 核心压缩函数
+**文件位置**: `deepmd/pt/entrypoints/compress.py:32-84`
 
-- $\Delta x_l = x_{l+1} - x_l$ 为区间长度
-- $h_{m,l} = y_{m,l+1} - y_{m,l}$
-- $y_{m,l} = y_m(x_l)$, $y'_{m,l} = y'_m(x_l)$, $y''_{m,l} = y''_m(x_l)$ 分别是函数值、一阶导数和二阶导数
+## 详细执行流程
 
-## PyTorch 后端实现流程
-
-### 总体架构
-
-PyTorch 后端的 compress 功能主要涉及以下组件：
-
-1. **入口函数**: `deepmd/pt/entrypoints/compress.py`
-2. **核心算法**: `deepmd/pt/utils/tabulate.py` 中的 `DPTabulate` 类
-3. **模型接口**: `deepmd/pt/model/model/make_model.py`
-4. **描述符接口**: `deepmd/pt/model/descriptor/se_a.py`
-5. **C++自定义 OP**: `source/op/pt/tabulate_multi_device.cc`
-
-### 详细实现流程
-
-#### 1. 入口函数执行
+### 步骤1：模型加载
 
 ```python
-def enable_compression(
-    input_file: str,
-    output: str,
-    stride: float = 0.01,
-    extrapolate: int = 5,
-    check_frequency: int = -1,
-    training_script: Optional[str] = None,
-) -> None:
-    # 1. 加载原始模型
+def enable_compression(input_file, output, stride=0.01, extrapolate=5, check_frequency=-1, training_script=None):
+    # 1. 加载JIT模型
     saved_model = torch.jit.load(input_file, map_location="cpu")
     model_def_script = json.loads(saved_model.model_def_script)
+    
+    # 2. 重建模型实例
     model = get_model(model_def_script)
     model.load_state_dict(saved_model.state_dict())
+```
 
-    # 2. 计算最小邻域距离（如果需要）
-    if model.get_min_nbor_dist() is None:
-        # 从训练数据计算最小邻域距离
-        # ... 计算逻辑
+### 步骤2：最小邻居距离计算
 
-    # 3. 启用压缩
-    model.enable_compression(
-        extrapolate,
-        stride,
-        stride * 10,
-        check_frequency,
-    )
-
-    # 4. 保存压缩模型
-    model = torch.jit.script(model)
-    torch.jit.save(model, output)
+```python
+# 3. 计算最小邻居距离
+if model.get_min_nbor_dist() is None:
+    # 从训练数据计算
+    jdata = j_loader(training_script)
+    jdata = update_deepmd_input(jdata)
+    train_data = get_data(jdata["training"]["training_data"], 0, type_map, None)
+    
+    update_sel = UpdateSel()
+    t_min_nbor_dist = update_sel.get_min_nbor_dist(train_data)
+    model.min_nbor_dist = torch.tensor(t_min_nbor_dist, dtype=env.GLOBAL_PT_FLOAT_PRECISION)
 ```
 
-#### 2. 模型压缩启用
+### 步骤3：模型压缩启用
 
-模型的 `enable_compression` 方法调用：
+#### 3.1 模型层压缩
+**文件位置**: `deepmd/pt/model/model/make_model.py:103-129`
 
 ```python
-def enable_compression(
-    self,
-    table_extrapolate: float = 5,
-    table_stride_1: float = 0.01,
-    table_stride_2: float = 0.1,
-    check_frequency: int = -1,
-) -> None:
+def enable_compression(self, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+    """模型层压缩入口"""
     self.atomic_model.enable_compression(
-        self.get_min_nbor_dist(),
+        self.get_min_nbor_dist(),  # 最小邻居距离
         table_extrapolate,
         table_stride_1,
-        table_stride_2,
+        table_stride_2, 
         check_frequency,
     )
 ```
 
-#### 3. 原子模型压缩启用
-
-原子模型调用描述符的压缩方法：
+#### 3.2 原子模型压缩
+**文件位置**: `deepmd/pt/model/atomic_model/dp_atomic_model.py:188-217`
 
 ```python
-def enable_compression(
-    self,
-    min_nbor_dist: float,
-    table_extrapolate: float = 5,
-    table_stride_1: float = 0.01,
-    table_stride_2: float = 0.1,
-    check_frequency: int = -1,
-) -> None:
+def enable_compression(self, min_nbor_dist, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+    """原子模型层压缩入口"""
     self.descriptor.enable_compression(
         min_nbor_dist,
         table_extrapolate,
@@ -142,352 +121,426 @@ def enable_compression(
     )
 ```
 
-#### 4. 描述符压缩实现
+### 步骤4：描述符层压缩实现
 
-SE_A 描述符的压缩实现：
+#### 4.1 SE_A 描述符压缩
+**文件位置**: `deepmd/pt/model/descriptor/se_a.py:257-302`
 
 ```python
-def enable_compression(
-    self,
-    min_nbor_dist: float,
-    table_extrapolate: float = 5,
-    table_stride_1: float = 0.01,
-    table_stride_2: float = 0.1,
-    check_frequency: int = -1,
-) -> None:
+def enable_compression(self, min_nbor_dist, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+    # 1. 检查是否已压缩
     if self.compress:
         raise ValueError("Compression is already enabled.")
-
+    
+    # 2. 创建查表器
     data = self.serialize()
     self.table = DPTabulate(
-        self,
-        data["neuron"],
-        data["type_one_side"],
-        data["exclude_types"],
-        ActivationFn(data["activation_function"]),
+        self,                                    # 描述符对象
+        data["neuron"],                          # 神经网络结构
+        data["type_one_side"],                   # 单侧类型
+        data["exclude_types"],                   # 排除类型对
+        ActivationFn(data["activation_function"]) # 激活函数
     )
+    
+    # 3. 存储查表配置
+    self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency]
+    
+    # 4. 构建查表数据
+    self.lower, self.upper = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2)
+    
+    # 5. 启用嵌入层压缩
+    self.sea.enable_compression(self.table.data, self.table_config, self.lower, self.upper)
+    
+    # 6. 设置压缩标志
+    self.compress = True
+```
 
-    self.table_config = [
-        table_extrapolate,
-        table_stride_1,
-        table_stride_2,
-        check_frequency,
-    ]
-
-    self.lower, self.upper = self.table.build(
-        min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2
-    )
+#### 4.2 DescrptSeA 压缩数据设置
+**文件位置**: `deepmd/pt/model/descriptor/se_a.py:699-733`
 
-    self.sea.enable_compression(
-        self.table.data, self.table_config, self.lower, self.upper
-    )
+```python
+def enable_compression(self, table_data, table_config, lower, upper):
+    """为每个嵌入网络设置压缩数据"""
+    for embedding_idx, ll in enumerate(self.filter_layers.networks):
+        if self.type_one_side:
+            net = f"filter_-1_net_{embedding_idx}"
+        else:
+            ii = embedding_idx // self.ntypes  # 中心原子类型
+            ti = embedding_idx % self.ntypes   # 邻居原子类型  
+            net = f"filter_{ii}_net_{ti}"
+            
+        # 压缩信息：[lower, upper, upper*extrapolate, stride1, stride2, check_freq]
+        info_ii = torch.as_tensor([
+            lower[net], upper[net], upper[net] * table_config[0],
+            table_config[1], table_config[2], table_config[3]
+        ], dtype=self.prec, device="cpu")
+        
+        # 压缩数据：多项式系数表
+        tensor_data_ii = table_data[net].to(device=env.DEVICE, dtype=self.prec)
+        
+        self.compress_data[embedding_idx] = tensor_data_ii
+        self.compress_info[embedding_idx] = info_ii
+    
+    self.compress = True
 ```
 
-### DPTabulate 类详解
+### 步骤5：查表器实现
 
-`DPTabulate` 类是压缩功能的核心，继承自 `BaseTabulate`。
-
-#### 初始化过程
+#### 5.1 查表器类
+**文件位置**: `deepmd/pt/utils/tabulate.py:52-118`
 
 ```python
-def __init__(
-    self,
-    descrpt: Any,
-    neuron: list[int],
-    type_one_side: bool = False,
-    exclude_types: list[list[int]] = [],
-    activation_fn: ActivationFn = ActivationFn("tanh"),
-) -> None:
-    super().__init__(
-        descrpt,
-        neuron,
-        type_one_side,
-        exclude_types,
-        True,  # is_pt=True
-    )
-    self.descrpt_type = self._get_descrpt_type()
-    # ... 初始化各种参数
+class DPTabulate(BaseTabulate):
+    def __init__(self, descrpt, neuron, type_one_side=False, exclude_types=[], activation_fn=ActivationFn("tanh")):
+        # 1. 基础初始化
+        super().__init__(descrpt, neuron, type_one_side, exclude_types, True)
+        
+        # 2. 描述符类型判断
+        self.descrpt_type = self._get_descrpt_type()  # "A", "Atten", "T", "R"
+        
+        # 3. 获取描述符参数
+        self.sel_a = self.descrpt.get_sel()
+        self.rcut = self.descrpt.get_rcut()
+        self.rcut_smth = self.descrpt.get_rcut_smth()
+        
+        # 4. 激活函数映射
+        activation_map = {"tanh": 1, "gelu": 2, "relu": 3, "relu6": 4, "softplus": 5, "sigmoid": 6}
+        self.functype = activation_map[activation_fn.activation]
+        
+        # 5. 获取统计参数
+        serialized = self.descrpt.serialize()
+        self.davg = serialized["@variables"]["davg"]  # 均值
+        self.dstd = serialized["@variables"]["dstd"]  # 标准差
+        self.embedding_net_nodes = serialized["embeddings"]["networks"]
+        
+        # 6. 提取权重和偏置
+        self.bias = self._get_bias()
+        self.matrix = self._get_matrix()
 ```
 
-#### 构建表格过程
-
-`build` 方法是表格化实现的核心：
+#### 5.2 查表构建过程
+**文件位置**: `deepmd/utils/tabulate.py:70-243`
 
 ```python
-def build(
-    self, min_nbor_dist: float, extrapolate: float, stride0: float, stride1: float
-) -> tuple[dict[str, int], dict[str, int]]:
+def build(self, min_nbor_dist, extrapolate, stride0, stride1):
     # 1. 计算环境矩阵范围
     lower, upper = self._get_env_mat_range(min_nbor_dist)
-
-    # 2. 根据描述符类型构建表格
-    if self.descrpt_type == "A":
-        # SE_A描述符的处理逻辑
+    
+    # 2. 根据描述符类型建表
+    if self.descrpt_type == "A":  # SE_A 描述符
         for ii in range(self.table_size):
-            # 计算表格范围
-            xx = np.arange(ll, uu, stride0, dtype=self.data_type)
-            xx = np.append(xx, np.arange(uu, extrapolate * uu, stride1, dtype=self.data_type))
-            xx = np.append(xx, np.array([extrapolate * uu], dtype=self.data_type))
-
-            # 构建子表格
-            self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
-
-    # 3. 转换数据格式
+            if self._should_build_table(ii):
+                # 构建距离网格
+                xx = self._build_distance_grid(lower, upper, stride0, stride1, extrapolate, ii)
+                
+                # 构建查表数据（计算并存储多项式系数）
+                self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
+    
+    # 3. 后处理转换
     self._convert_numpy_to_tensor()
+    self._convert_numpy_float_to_int()
+    
     return self.lower, self.upper
 ```
 
-#### \_build_lower 方法详解
+#### 5.3 环境矩阵范围计算
+**文件位置**: `deepmd/utils/tabulate.py:445-463`
+
+```python
+def _get_env_mat_range(self, min_nbor_dist):
+    """计算环境矩阵的范围"""
+    # 1. 计算切换函数值
+    sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut)
+    
+    # 2. 根据描述符类型计算范围
+    if self.descrpt_type in ("Atten", "A"):
+        # 标准化：(s(r) - davg) / dstd，其中 s(r) = sw / r；下界对应 s(r) = 0
+        lower = -self.davg[:, 0] / self.dstd[:, 0]
+        upper = ((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0]
+    
+    # 3. 向下和向上取整
+    return np.floor(lower), np.ceil(upper)
+```
 
-这个方法实现具体的表格构建算法：
+#### 5.4 多项式系数计算
+**文件位置**: `deepmd/utils/tabulate.py:245-347`
 
 ```python
-def _build_lower(
-    self,
-    net: int,
-    xx: np.ndarray,
-    idx: int,
-    upper: float,
-    lower: float,
-    stride0: int,
-    stride1: int,
-    extrapolate: bool,
-    nspline: int,
-) -> None:
-    # 1. 计算函数值、导数
-    vv, dd, d2 = self._make_data(xx, idx)
-
-    # 2. 初始化表格数据存储
+def _build_lower(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline):
+    # 1. 通过神经网络前向传播计算数据
+    vv, dd, d2 = self._make_data(xx, idx)  # 值、一阶导数、二阶导数
+    
+    # 2. 初始化多项式系数表（每个输出神经元 6 个系数）
     self.data[net] = np.zeros([nspline, 6 * self.last_layer_size], dtype=self.data_type)
-
-    # 3. 计算区间长度
+    
+    # 3. 各区间步长：前 (upper - lower) / stride0 个区间为 stride0，其余为 stride1
     tt = np.full((nspline, self.last_layer_size), stride1)
     tt[: int((upper - lower) / stride0), :] = stride0
-
-    # 4. 计算函数值差分
-    hh = vv[1 : nspline + 1, : self.last_layer_size] - vv[:nspline, : self.last_layer_size]
-
-    # 5. 计算五次多项式系数
-    # 常数项 f^l_m = y_{m,l}
-    self.data[net][:, : 6 * self.last_layer_size : 6] = vv[:nspline, : self.last_layer_size]
-
-    # 一次项系数 e^l_m = y'_{m,l}
-    self.data[net][:, 1 : 6 * self.last_layer_size : 6] = dd[:nspline, : self.last_layer_size]
-
-    # 二次项系数 d^l_m = 0.5 * y''_{m,l}
-    self.data[net][:, 2 : 6 * self.last_layer_size : 6] = 0.5 * d2[:nspline, : self.last_layer_size]
-
-    # 三次项系数 c^l_m
-    self.data[net][:, 3 : 6 * self.last_layer_size : 6] = (
-        1 / (2 * tt * tt * tt)
-    ) * (
-        20 * hh
-        - (8 * dd[1 : nspline + 1, : self.last_layer_size] + 12 * dd[:nspline, : self.last_layer_size]) * tt
-        - (3 * d2[:nspline, : self.last_layer_size] - d2[1 : nspline + 1, : self.last_layer_size]) * tt * tt
-    )
-
-    # 四次项系数 b^l_m
-    self.data[net][:, 4 : 6 * self.last_layer_size : 6] = (
-        1 / (2 * tt * tt * tt * tt)
-    ) * (
-        -30 * hh
-        + (14 * dd[1 : nspline + 1, : self.last_layer_size] + 16 * dd[:nspline, : self.last_layer_size]) * tt
-        + (3 * d2[:nspline, : self.last_layer_size] - 2 * d2[1 : nspline + 1, : self.last_layer_size]) * tt * tt
-    )
-
-    # 五次项系数 a^l_m
-    self.data[net][:, 5 : 6 * self.last_layer_size : 6] = (
-        1 / (2 * tt * tt * tt * tt * tt)
-    ) * (
-        12 * hh
-        - 6 * (dd[1 : nspline + 1, : self.last_layer_size] + dd[:nspline, : self.last_layer_size]) * tt
-        + (d2[1 : nspline + 1, : self.last_layer_size] - d2[:nspline, : self.last_layer_size]) * tt * tt
-    )
-
-    # 6. 记录边界信息
-    self.upper[net] = upper
-    self.lower[net] = lower
+    
+    # 4. 计算多项式高阶系数
+    hh = vv[1:nspline + 1, :self.last_layer_size] - vv[:nspline, :self.last_layer_size]
+    
+    # 系数0：函数值 f(x)
+    self.data[net][:, ::6] = vv[:nspline, :self.last_layer_size]
+    
+    # 系数1：一阶导数 f'(x)
+    self.data[net][:, 1::6] = dd[:nspline, :self.last_layer_size]
+    
+    # 系数2：二阶导数 f''(x)/2
+    self.data[net][:, 2::6] = 0.5 * d2[:nspline, :self.last_layer_size]
+    
+    # 系数3-5：高阶多项式系数（保证连续性）
+    self.data[net][:, 3::6] = (1 / (2 * tt**3)) * (20 * hh - ...)
+    self.data[net][:, 4::6] = (1 / (2 * tt**4)) * (-30 * hh + ...)
+    self.data[net][:, 5::6] = (1 / (2 * tt**5)) * (12 * hh - ...)
 ```
 
-#### \_make_data 方法详解
-
-这个方法通过前向传播计算函数值和导数：
+#### 5.5 神经网络前向传播
+**文件位置**: `deepmd/pt/utils/tabulate.py:119-250`
 
 ```python
-def _make_data(self, xx: np.ndarray, idx: int) -> Any:
+def _make_data(self, xx, idx):
+    """通过神经网络前向传播生成查表数据（函数值及一、二阶导数）"""
     xx = torch.from_numpy(xx).view(-1, 1).to(env.DEVICE)
-
+    
+    # 逐层计算
     for layer in range(self.layer_size):
         if layer == 0:
-            # 第一层特殊处理
-            xbar = torch.matmul(xx, torch.from_numpy(self.matrix["layer_" + str(layer + 1)][idx]).to(env.DEVICE))
-            xbar += torch.from_numpy(self.bias["layer_" + str(layer + 1)][idx]).to(env.DEVICE)
-
-            # 根据神经元数量选择处理方式
+            # 第一层：线性变换 + 激活函数
+            xbar = torch.matmul(xx, torch.from_numpy(self.matrix[f"layer_{layer + 1}"][idx])) + \
+                   torch.from_numpy(self.bias[f"layer_{layer + 1}"][idx])
+            
+            # 处理激活函数（含残差连接）
             if self.neuron[0] == 1:
-                yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx])
-                yy += xx
-                dy = unaggregated_dy_dx_s(yy - xx, self.matrix["layer_" + str(layer + 1)][idx], xbar, self.functype)
-                dy += torch.ones((1, 1), dtype=yy.dtype)
-                dy2 = unaggregated_dy2_dx_s(yy - xx, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, self.functype)
-            # ... 其他情况处理
+                yy = self._layer_0(...) + xx  # 残差连接
+            else:
+                yy = self._layer_0(...)
+            
+            # 计算一阶和二阶导数
+            dy = unaggregated_dy_dx_s(...)
+            dy2 = unaggregated_dy2_dx_s(...)
         else:
-            # 后续层处理
-            # ... 类似的计算逻辑
-
-    # 返回函数值、导数和二阶导数
-    vv = zz.detach().cpu().numpy().astype(self.data_type)
-    dd = dy.detach().cpu().numpy().astype(self.data_type)
-    d2 = dy2.detach().cpu().numpy().astype(self.data_type)
-    return vv, dd, d2
+            # 后续层...
+    
+    return vv.cpu().numpy(), dd.cpu().numpy(), d2.cpu().numpy()
 ```
 
-### 算子融合技术
+### 步骤6：模型保存
 
-算子融合是压缩功能的重要优化技术，它通过以下方式提高性能：
+```python
+# 4. 启用压缩
+model.enable_compression(extrapolate, stride, stride * 10, check_frequency)
 
-1. **矩阵乘法融合**: 将嵌入层的输出与环境矩阵的乘法合并到表格化过程中
-2. **内存访问优化**: 避免了嵌入矩阵在寄存器和内存之间的频繁传输
-3. **计算优化**: 减少了不必要的中间结果存储
+# 5. JIT脚本化保存
+model = torch.jit.script(model)
+torch.jit.save(model, output)
+```
 
-#### 融合前后的对比
+## 支持的描述符类型
 
-**传统方式**:
+### 已支持的描述符
 
-```python
-# 1. 计算嵌入层输出 G
-G = embedding_net(env_matrix)
-# 2. 矩阵乘法 G^T @ R
-result = G.T @ env_matrix
-# 3. 存储中间结果G
-```
+1. **SE_A (Smooth Edition Angular)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_a.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 支持角度信息的描述符
 
-**融合方式**:
+2. **SE_R (Smooth Edition Radial)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_r.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 仅使用径向距离信息的描述符
 
-```python
-# 直接在表格化过程中完成乘法
-result = tabulate_fusion(G, R)  # G的计算和乘法在同一个内核中完成
-```
+3. **SE_T (Smooth Edition Three-body)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_t.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 三体相互作用描述符
 
-### 精确邻域索引技术
+4. **SE_Atten (Smooth Edition with Attention)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_atten.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 带注意力机制的描述符
 
-精确邻域索引通过以下方式优化性能：
+5. **DPA1 (Deep Potential Attention 1)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa1.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 第一代注意力机制描述符
 
-1. **动态邻域数量**: 根据实际邻域数量而不是最大邻域数量进行计算
-2. **减少无效计算**: 避免对填充的零值进行计算
-3. **内存效率**: 减少内存使用和带宽消耗
+6. **DPA2 (Deep Potential Attention 2)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa2.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 第二代注意力机制描述符
 
-### C++自定义算子实现
+### 不支持的描述符
 
-PyTorch 后端使用 C++自定义算子来实现高性能的表格化推理：
+1. **DPA3 (Deep Potential Attention 3)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa3.py:578-601`
+   - **压缩方式**: 不支持
+   - **原因**: `enable_compression(self, ...)` 直接抛出异常：
+     ```python
+     raise NotImplementedError("Compression is unsupported for DPA3.")
+     ```
 
-#### 主要函数接口
+### 特殊模型类型
 
-```cpp
-template <typename FPTYPE>
-void TabulateFusionSeAForward(
-    const torch::Tensor& table_tensor,      // 表格数据
-    const torch::Tensor& table_info_tensor, // 表格信息
-    const torch::Tensor& em_x_tensor,       // 环境矩阵x分量
-    const torch::Tensor& em_tensor,         // 环境矩阵
-    const torch::Tensor& two_embed_tensor,  // 二体嵌入
-    int64_t last_layer_size,               // 最后一层大小
-    torch::Tensor& descriptor_tensor       // 输出描述符
-);
-```
+1. **Linear Atomic Model**
+   - **文件位置**: `deepmd/pt/model/atomic_model/linear_atomic_model.py:198-228`
+   - **压缩方式**: 多个子模型分别压缩
 
-#### CPU/GPU 统一接口
+2. **Pairtab Atomic Model**
+   - **文件位置**: `deepmd/pt/model/atomic_model/pairtab_atomic_model.py:505-514`
+   - **压缩方式**: 不支持查表压缩
 
-代码同时支持 CPU 和 GPU 计算：
+## 数据结构详解
 
-```cpp
-if (device == "GPU") {
-    deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
-                                     two_embed, nloc, nnei, last_layer_size);
-} else if (device == "CPU") {
-    deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
-                                     two_embed, nloc, nnei, last_layer_size);
-}
+### 压缩数据格式
+
+#### 1. 压缩信息 (compress_info)
+```python
+# 每个嵌入网络存储6个参数 [6]
+compress_info[embedding_idx] = torch.tensor([
+    lower[net],           # 下界
+    upper[net],           # 上界  
+    upper[net] * extrapolate,  # 外推上界
+    table_stride_1,       # 第一段步长
+    table_stride_2,       # 第二段步长  
+    check_frequency       # 溢出检查频率
+])
 ```
 
-## TensorFlow 后端实现差异
+#### 2. 压缩数据 (compress_data)
+```python
+# 每个嵌入网络存储系数表 [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
 
-TensorFlow 后端的实现与 PyTorch 有所不同：
+# 其中每6个连续的系数表示一个多项式的系数
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
+```
 
-### 主要差异
+### 查表数据构建
 
-1. **重新训练方式**: TensorFlow 后端通过重新训练模型来生成压缩版本
-2. **图操作**: 使用 TensorFlow 的图模式进行优化
-3. **静态图**: 生成的压缩模型是静态图格式
+#### 1. 距离网格生成
+```python
+# 第一段：精细数据区间网格
+xx1 = np.arange(lower, upper, stride0)
 
-### 实现流程
+# 第二段：外推区间网格  
+xx2 = np.arange(upper, extrapolate * upper, stride1)
+
+# 合并网格
+xx = np.concatenate([xx1, xx2, [extrapolate * upper]])
+```
 
+#### 2. 神经网络求值
 ```python
-def compress(
-    *,
-    input: str,
-    output: str,
-    extrapolate: int,
-    step: float,
-    frequency: str,
-    checkpoint_folder: str,
-    training_script: str,
-    # ... 其他参数
-) -> None:
-    # 1. 加载原始模型
-    graph, _ = load_graph_def(input)
-
-    # 2. 创建压缩配置文件
-    jdata["model"]["compress"] = {}
-    jdata["model"]["compress"]["model_file"] = input
-    jdata["model"]["compress"]["table_config"] = [
-        extrapolate, step, 10 * step, int(frequency)
-    ]
-
-    # 3. 重新训练模型
-    train(...)
-
-    # 4. 冻结模型
-    freeze(checkpoint_folder=checkpoint_folder, output=output, node_names=None)
+# 对每个网格点进行神经网络前向传播
+for x_point in xx:
+    output = forward_pass(x_point)      # 网络输出
+    grad1 = compute_gradient(x_point)   # 一阶导数
+    grad2 = compute_hessian(x_point)    # 二阶导数
 ```
 
-## 性能优化效果
+#### 3. 多项式构造
+采用五次Hermite插值，满足：
+- 函数值连续：f(x_i) = y_i
+- 一阶导数连续：f'(x_i) = y'_i  
+- 二阶导数连续：f''(x_i) = y''_i
+
+## 性能优化
+
+### 1. 内存管理
+- **数据精度**: 通过第一段（精细区间）步长调节查表精度，默认 0.01；步长越小精度越高、表越大
+- **分段优化**: 外推区使用 10 倍于第一段的粗步长（默认 0.1），以减小查表尺寸
+- **内存复用**: 删除原始网络权重，内存显著降低
+
+### 2. 计算优化
+- **预计算查表**: 压缩后嵌入网络不再需要矩阵运算
+- **向量化查表**: 每个原子类型对应一个优化的查表
+- **分支消除**: 消除类型判断的分支开销
+
+### 3. 缓存友好
+- **数据局部性**: 查表数据连续存储，提升cache命中率
+- **内存访问**: 内存访问模式优化，减少cache miss
+- **SIMD**: 多项式计算可向量化
+
+## 使用示例
+
+### 基础压缩命令
+```bash
+# 压缩PyTorch模型
+dp --pt compress -i frozen_model.pth -o compressed_model.pth
+
+# 自定义参数
+dp --pt compress \
+  -i frozen_model.pth \
+  -o compressed_model.pth \
+  -s 0.005 \
+  -e 10 \
+  -f 1000 \
+  -t input.json
+```
 
-### 加速效果
+### 参数说明
+- `-i, --input`: 输入的冻结模型（.pth）
+- `-o, --output`: 输出的压缩模型（.pth）
+- `-s, --step`: 第一段步长，影响精度与内存（默认0.01）
+- `-e, --extrapolate`: 外推倍数（默认5）
+- `-f, --frequency`: 溢出检查频率，-1表示不检查（默认-1）
+- `-t, --training-script`: 训练脚本（用于计算最小邻居距离）
 
-- **推理速度**: 通常可以达到 10 倍以上的加速
-- **内存使用**: 可以减少 20 倍以上的内存使用
-- **GPU 利用率**: 更好的内存访问模式提高 GPU 利用率
+## 局限性分析
 
-### 精度保持
+### 1. 描述符局限
+- DPA3 描述符不支持压缩
+- Pairtab 模型不支持查表压缩
+- 某些描述符变体可能不完全兼容
 
-- **相对误差**: 通常在 1%以内
-- **力场一致性**: 保持物理量的正确性
-- **能量守恒**: 维持系统能量守恒特性
+### 2. 精度权衡
+- 步长设置过大会影响精度
+- 外推区间精度相对较低
+- 激活函数近似可能带来误差
 
-## 使用建议
+### 3. 内存开销
+- 压缩后仍需存储多项式查表数据
+- 精度要求高时查表尺寸增大
+- 激活函数导数计算消耗额外内存
 
-### 参数选择
+### 4. 兼容性限制
+- 压缩后的模型仅适用于DeePMD-kit环境
+- JIT脚本化可能在某些场景下受限
+- LAMMPS等MD引擎需要特定的压缩模型格式
 
-1. **stride 参数**: 控制表格精度，越小精度越高但内存使用越大
-2. **extrapolate 参数**: 控制外推范围，需要根据应用场景选择
-3. **check_frequency**: 控制溢出检查频率，影响性能
+## 实现细节
 
-### 适用场景
+### 多项式插值公式
 
-1. **生产环境**: 适合大规模 MD 模拟
-2. **实时应用**: 适合需要快速响应的场景
-3. **资源受限**: 适合内存和计算资源受限的环境
+在区间 [x_i, x_{i+1}] 内，对于变量 x，多项式为：
 
-### 注意事项
+```
+f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
+```
 
-1. **模型版本**: 确保使用支持压缩的模型版本
-2. **描述符类型**: 确认描述符类型支持压缩
-3. **训练数据**: 需要提供足够的训练数据来计算统计信息
-4. **验证测试**: 建议在压缩后进行充分的验证测试
+其中：
+- `t = x - x_i`，即相对区间左端点的偏移量（0 ≤ t ≤ h，h 为步长）
+- `c₀ = f(x_i)`
+- `c₁ = f'(x_i)`
+- `c₂ = f''(x_i) / 2`
+- `c₃, c₄, c₅` 根据边界连续性确定
 
-## 总结
+### 切换函数
 
-DeepMD-kit 的 compress 功能通过表格化推理、算子融合和精确邻域索引三种技术实现了模型推理性能的显著提升。在保持模型精度的同时，可以获得 10 倍以上的性能提升和 20 倍以上的内存节省。这些技术不仅优化了计算效率，还提高了内存访问的局部性，是深度学习在科学计算领域应用的重要优化案例。
+用于平滑处理截断半径的切换函数：
+
+```python
+def spline5_switch(r, r_min, r_max):
+    if r < r_min:
+        return 1.0
+    elif r < r_max:
+        u = (r - r_min) / (r_max - r_min)
+        return u**3 * (-6 * u**2 + 15 * u - 10) + 1
+    else:
+        return 0.0
+```
+
+## 总结
 
-请详细梳理这个程序的 compress 功能，尤其是针对 pytorch 后端的，整个实现的代码执行流程，以及具体的实现方法，然后整理到 doc/outisli/compress.md 这个 markdown 文件中，一定要准确，详细，充分完全，逻辑条理清晰
+DeePMD-kit的compress功能通过将神经网络嵌入层用查表法和多项式插值替代，实现了显著的推理加速。PyTorch后端的实现采用了分层设计，由模型层、原子模型层、描述符层逐级传递压缩请求。查表器构建了精细和粗糙分段的插值表，平衡了精度与性能。该功能对大多数SE类和DPA1/DPA2描述符提供良好支持，是生产环境中提升MD模拟效率的重要工具。
\ No newline at end of file
diff --git a/examples/water/.gitignore b/examples/water/.gitignore
index 44cec7d508..e5704038b7 100644
--- a/examples/water/.gitignore
+++ b/examples/water/.gitignore
@@ -5,6 +5,7 @@ tab.xvg
 # for training dirs
 *.out
 *.pb
+*.hdf5
 out.json
 model.ckpt*
 checkpoint

From 6854f5f9e8e1000f0131cc896e14a94774789f38 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 18 Sep 2025 13:37:01 +0800
Subject: [PATCH 05/11] feat: add debug scripts for model inference and
 training

feat: add inference script
---
 .gitignore               |   1 -
 debug/compress_debug.py  |  85 ++++++++++++++++++
 debug/dptest_debug.py    | 107 ++++++++++++++++++++++
 debug/inference_debug.py | 188 +++++++++++++++++++++++++++++++++++++++
 debug/train_debug.py     |  97 ++++++++++++++++++++
 5 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 debug/compress_debug.py
 create mode 100644 debug/dptest_debug.py
 create mode 100644 debug/inference_debug.py
 create mode 100644 debug/train_debug.py

diff --git a/.gitignore b/.gitignore
index 231916a923..eef8d03a90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,7 +51,6 @@ buildcxx/
 node_modules/
 *.bib.original
 .claude
-*.hdf5
 
 # Coverage files
 .coverage
diff --git a/debug/compress_debug.py b/debug/compress_debug.py
new file mode 100644
index 0000000000..7de6d41253
--- /dev/null
+++ b/debug/compress_debug.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Debug script for model compression.
+
+Equivalent to: dp --pt compress -i no.pth -o yes.pth -t input_torch.json
+
+This script can be run directly in VSCode with debugging capabilities.
+"""
+
+import logging
+import os
+import sys
+from pathlib import Path
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def compress_model() -> None:
+    """Compress the model using the same parameters as the CLI command.
+
+    dp --pt compress -i no.pth -o yes.pth -t input_torch.json
+    """
+    # Import here to avoid module-level import restriction
+    from deepmd.pt.entrypoints.compress import enable_compression
+
+    # Setup logging
+    logging.basicConfig(level=logging.INFO)
+    log = logging.getLogger(__name__)
+
+    # Set working directory to examples/water/se_e3_tebd
+    work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd"
+    original_cwd = os.getcwd()
+
+    try:
+        os.chdir(work_dir)
+        log.info(f"Changed to working directory: {work_dir}")
+
+        # Model compression parameters
+        input_file = "no.pth"
+        output_file = "yes.pth"
+        training_script = "input_torch.json"
+        stride = 0.01  # default value
+        extrapolate = 5  # default value
+        check_frequency = -1  # default value (disabled)
+
+        # Check if input files exist
+        if not os.path.exists(input_file):
+            raise FileNotFoundError(f"Input model file '{input_file}' not found in {work_dir}")
+
+        if not os.path.exists(training_script):
+            raise FileNotFoundError(f"Training script '{training_script}' not found in {work_dir}")
+
+        log.info(f"Input model: {input_file}")
+        log.info(f"Output model: {output_file}")
+        log.info(f"Training script: {training_script}")
+        log.info(f"Stride: {stride}")
+        log.info(f"Extrapolate: {extrapolate}")
+        log.info(f"Check frequency: {check_frequency}")
+
+        log.info("Starting model compression...")
+
+        # Call the compression function
+        enable_compression(
+            input_file=input_file,
+            output=output_file,
+            stride=stride,
+            extrapolate=extrapolate,
+            check_frequency=check_frequency,
+            training_script=training_script,
+        )
+
+        log.info("Model compression completed successfully!")
+        log.info(f"Compressed model saved to: {output_file}")
+
+    except Exception as e:
+        log.error(f"Error during compression: {e}")
+        raise
+    finally:
+        # Restore original working directory
+        os.chdir(original_cwd)
+
+
+if __name__ == "__main__":
+    compress_model()
diff --git a/debug/dptest_debug.py b/debug/dptest_debug.py
new file mode 100644
index 0000000000..80a6f2b516
--- /dev/null
+++ b/debug/dptest_debug.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Debug script for model inference/testing.
+
+Equivalent to: dp --pt test -m no.pth -s ../data/data_3 -n 100 -f test_debug.txt
+
+This script can be run directly in VSCode with debugging capabilities.
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import (
+    Path,
+)
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def test_model() -> None:
+    """Test the model using the same parameters as the CLI command.
+
+    dp --pt test -m no.pth -s ../data/data_3 -n 100 -f test_debug.txt
+    """
+    # Import here to avoid module-level import restriction
+    from deepmd.entrypoints.test import (
+        test,
+    )
+
+    # Setup logging
+    logging.basicConfig(level=logging.INFO)
+    log = logging.getLogger(__name__)
+
+    # Set working directory to examples/water/se_e3_tebd
+    work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd"
+    original_cwd = os.getcwd()
+
+    try:
+        os.chdir(work_dir)
+        log.info(f"Changed to working directory: {work_dir}")
+
+        # Test parameters
+        model_file = "no.pth"  # Model file to test
+        system_dir = "../data/data_3"  # Directory contains test data
+        datafile = None  # Not using a datafile list
+        train_json = None  # Not using training data for testing
+        valid_json = None  # Not using validation data for testing
+        numb_test = 100  # Number of test frames (0 means all)
+        rand_seed = None  # No random seed
+        shuffle_test = False  # Don't shuffle test data
+        detail_file = "test_debug.txt"  # Output file for test details
+        atomic = False  # Don't compute per-atom quantities
+        head = None  # No specific task head for multi-task models
+
+        # Check if model file exists
+        if not os.path.exists(model_file):
+            raise FileNotFoundError(
+                f"Model file '{model_file}' not found in {work_dir}"
+            )
+
+        # Set environment variable to limit batch size for testing
+        os.environ["DP_INFER_BATCH_SIZE"] = "1024"
+
+        log.info(f"Model: {model_file}")
+        log.info(f"System directory: {system_dir}")
+        log.info(f"Number of test frames: {numb_test}")
+        log.info(f"Detail file: {detail_file}")
+        log.info(f"Atomic output: {atomic}")
+
+        log.info("Starting model testing...")
+
+        # Record time usage
+        start_time = time.time()
+        # Call the test function
+        test(
+            model=model_file,
+            system=system_dir,
+            datafile=datafile,
+            train_json=train_json,
+            valid_json=valid_json,
+            numb_test=numb_test,
+            rand_seed=rand_seed,
+            shuffle_test=shuffle_test,
+            detail_file=detail_file,
+            atomic=atomic,
+            head=head,
+        )
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+
+        log.info("Model testing completed successfully!")
+        log.info(f"Test results saved to: {detail_file}")
+        log.info(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+    except Exception as e:
+        log.error(f"Error during testing: {e}")
+        raise
+    finally:
+        # Restore original working directory
+        os.chdir(original_cwd)
+
+
+if __name__ == "__main__":
+    test_model()
diff --git a/debug/inference_debug.py b/debug/inference_debug.py
new file mode 100644
index 0000000000..1e5beb1f39
--- /dev/null
+++ b/debug/inference_debug.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Debug script for single configuration model inference.
+
+This script loads only one configuration from the dataset and performs inference.
+Perfect for profiling and debugging individual forward passes.
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import (
+    Path,
+)
+from typing import (
+    Any,
+)
+
+import numpy as np
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def load_single_configuration(data_dir: str, frame_idx: int = 0) -> dict[str, Any]:
+    """Load a single configuration from the dataset.
+
+    Parameters
+    ----------
+    data_dir : str
+        Path to the data directory containing set.000/
+    frame_idx : int, optional
+        Index of the frame to load (default: 0)
+
+    Returns
+    -------
+    dict
+        Dictionary containing coord, box, atom_types, and optional energy/force
+    """
+    set_dir = Path(data_dir) / "set.000"
+
+    # Load data
+    coord = np.load(set_dir / "coord.npy")[frame_idx : frame_idx + 1]  # Keep batch dim
+    box = np.load(set_dir / "box.npy")[frame_idx : frame_idx + 1]  # Keep batch dim
+
+    # Load atom types
+    type_map_file = Path(data_dir) / "type_map.raw"
+    type_file = Path(data_dir) / "type.raw"
+
+    if type_map_file.exists():
+        with open(type_map_file) as f:
+            type_map = [line.strip() for line in f]
+    else:
+        type_map = None
+
+    if type_file.exists():
+        with open(type_file) as f:
+            atom_types = [int(line.strip()) for line in f]
+    else:
+        raise FileNotFoundError(f"Atom type file not found: {type_file}")
+
+    # Assemble the required fields; optional reference data is added below
+    data = {
+        "coord": coord,
+        "box": box,
+        "atom_types": np.array(atom_types),
+        "type_map": type_map,
+    }
+
+    # Load energy and force if available (for comparison)
+    energy_file = set_dir / "energy.npy"
+    force_file = set_dir / "force.npy"
+
+    if energy_file.exists():
+        data["energy"] = np.load(energy_file)[frame_idx : frame_idx + 1]
+    if force_file.exists():
+        data["force"] = np.load(force_file)[frame_idx : frame_idx + 1]
+
+    return data
+
+
+def inference_single_config() -> None:
+    """Perform inference on a single configuration."""
+    # Import DeepPot for simplified inference
+    from deepmd.infer import (
+        DeepPot,
+    )
+
+    # Setup logging with timestamp
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    log = logging.getLogger(__name__)
+
+    # Set working directory to examples/water/se_e3_tebd
+    work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd"
+    original_cwd = os.getcwd()
+
+    try:
+        os.chdir(work_dir)
+        log.info(f"Changed to working directory: {work_dir}")
+
+        # Test parameters
+        model_file = "no.pth"  # Model file to test
+        data_dir = "../data/data_3"  # Directory contains test data
+        frame_idx = 0  # Use first frame
+
+        # Check if model file exists
+        if not os.path.exists(model_file):
+            raise FileNotFoundError(
+                f"Model file '{model_file}' not found in {work_dir}"
+            )
+
+        log.info(f"Loading model: {model_file}")
+
+        # Initialize model using DeepPot interface
+        dp = DeepPot(model_file, auto_batch_size=True)
+
+        log.info(f"Loading single configuration from: {data_dir}")
+
+        # Load single configuration
+        data = load_single_configuration(data_dir, frame_idx)
+
+        coord = data["coord"]
+        box = data["box"]
+        atom_types = data["atom_types"]
+
+        log.info("Configuration info:")
+        log.info(f"  Number of atoms: {len(atom_types)}")
+        log.info(f"  Coordinate shape: {coord.shape}")
+        log.info(f"  Box shape: {box.shape}")
+        log.info(f"  Atom types shape: {atom_types.shape}")
+        log.info(f"  Unique atom types: {np.unique(atom_types)}")
+
+        if data.get("type_map"):
+            log.info(f"  Type map: {data['type_map']}")
+
+        log.info("Starting single configuration inference...")
+
+        # Record time usage
+        start_time = time.time()
+
+        # Perform inference using DeepPot.eval()
+        e, f, v = dp.eval(coord, box, atom_types)
+
+        elapsed_time = time.time() - start_time
+
+        # Print results
+        log.info("\n=== Inference Results ===")
+        predicted_energy = e.reshape(-1)[0]
+        log.info(f"Predicted energy: {predicted_energy:.6f}")
+
+        if "energy" in data:
+            reference_energy = data["energy"][0]
+            energy_diff = abs(predicted_energy - reference_energy)
+            log.info(f"Reference energy: {reference_energy:.6f}")
+            log.info(f"Energy difference: {energy_diff:.6f}")
+
+        predicted_force = f
+        log.info(f"Predicted force shape: {predicted_force.shape}")
+        log.info(f"Force norm: {np.linalg.norm(predicted_force):.6f}")
+
+        if "force" in data:
+            reference_force = data["force"].reshape(predicted_force.shape)
+            force_diff = np.linalg.norm(predicted_force - reference_force)
+            log.info(f"Reference force norm: {np.linalg.norm(reference_force):.6f}")
+            log.info(f"Force RMSE: {force_diff / np.sqrt(predicted_force.size):.6f}")
+
+        predicted_virial = v.reshape(-1)
+        log.info(f"Predicted virial: {predicted_virial}")
+
+        log.info("Inference completed successfully!")
+        log.info(f"Elapsed time: {elapsed_time:.6f} seconds")
+
+    except Exception as e:
+        log.error(f"Error during inference: {e}")
+        raise
+    finally:
+        # Restore original working directory
+        os.chdir(original_cwd)
+
+
+if __name__ == "__main__":
+    inference_single_config()
diff --git a/debug/train_debug.py b/debug/train_debug.py
new file mode 100644
index 0000000000..c1f809c90c
--- /dev/null
+++ b/debug/train_debug.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Debug script for model training.
+
+Equivalent to: dp --pt train input_torch.json
+
+This script can be run directly in VSCode with debugging capabilities.
+"""
+
+import logging
+import os
+import sys
+from pathlib import (
+    Path,
+)
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def train_model() -> None:
+    """Train the model using the same parameters as the CLI command.
+
+    dp --pt train input_torch.json
+    """
+    # Import here to avoid module-level import restriction
+    from deepmd.pt.entrypoints.main import (
+        train,
+    )
+
+    # Setup logging
+    logging.basicConfig(level=logging.INFO)
+    log = logging.getLogger(__name__)
+
+    # Set working directory to examples/water/se_e3_tebd
+    work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd"
+    original_cwd = os.getcwd()
+
+    try:
+        os.chdir(work_dir)
+        log.info(f"Changed to working directory: {work_dir}")
+
+        # Training parameters
+        input_file = "input_torch.json"
+        init_model = None  # Start training from scratch
+        restart = None  # No restart
+        finetune = None  # No finetuning
+        init_frz_model = None  # No frozen model initialization
+        model_branch = "default"
+        skip_neighbor_stat = True  # Skip neighbor statistics calculation
+        use_pretrain_script = False  # Don't use pretrain script
+        force_load = False  # Don't force load incompatible models
+        compile_model = False  # Don't compile model (JIT will be used automatically)
+        output = "out.json"  # Output configuration file
+
+        # Check if input file exists
+        if not os.path.exists(input_file):
+            raise FileNotFoundError(
+                f"Training input file '{input_file}' not found in {work_dir}"
+            )
+
+        log.info(f"Input file: {input_file}")
+        log.info(f"Output config: {output}")
+        log.info(f"Skip neighbor stat: {skip_neighbor_stat}")
+        log.info(f"Compile model: {compile_model}")
+
+        log.info("Starting model training...")
+
+        # Call the training function
+        train(
+            input_file=input_file,
+            init_model=init_model,
+            restart=restart,
+            finetune=finetune,
+            init_frz_model=init_frz_model,
+            model_branch=model_branch,
+            skip_neighbor_stat=skip_neighbor_stat,
+            use_pretrain_script=use_pretrain_script,
+            force_load=force_load,
+            compile_model=compile_model,
+            output=output,
+        )
+
+        log.info("Model training completed successfully!")
+        log.info(f"Output configuration saved to: {output}")
+
+    except Exception as e:
+        log.error(f"Error during training: {e}")
+        raise
+    finally:
+        # Restore original working directory
+        os.chdir(original_cwd)
+
+
+if __name__ == "__main__":
+    train_model()

From 281fd23ad05a58b2e48b80b92a233bd9c6590736 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Sat, 27 Sep 2025 18:11:29 +0800
Subject: [PATCH 06/11] add compress doc

---
 doc/outisli/compress.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/outisli/compress.md b/doc/outisli/compress.md
index d572402b83..65475f6bc5 100644
--- a/doc/outisli/compress.md
+++ b/doc/outisli/compress.md
@@ -234,7 +234,7 @@ def build(self, min_nbor_dist, extrapolate, stride0, stride1):
                 xx = self._build_distance_grid(lower, upper, stride0, stride1, extrapolate, ii)
                 
                 # 查表数据
-                self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
+                self._generate_spline_table(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
     
     # 3. 后处理转换
     self._convert_numpy_to_tensor()
@@ -266,7 +266,7 @@ def _get_env_mat_range(self, min_nbor_dist):
 **文件位置**: `deepmd/utils/tabulate.py:245-347`
 
 ```python
-def _build_lower(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline):
+def _generate_spline_table(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline):
     # 1. 通过神经网络前向传播计算数据
     vv, dd, d2 = self._make_data(xx, idx)  # 值、一阶导数、二阶导数
     

From e81e41c940f6f204da6a109d28c462466ae1c4a7 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 9 Oct 2025 14:05:18 +0800
Subject: [PATCH 07/11] refactor: pre-commit

---
 CLAUDE.md                   |  91 ++++++-
 debug/compress_debug.py     |  17 +-
 deepmd/pt/train/training.py |   4 +-
 doc/outisli/DPA3.md         | 521 ++++++++++++++++++++++--------------
 doc/outisli/compress.md     | 306 ++++++++++++++-------
 doc/outisli/install.md      |   7 +-
 6 files changed, 628 insertions(+), 318 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 83140fe346..952286a423 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -9,12 +9,14 @@ DeePMD-kit is a deep learning package for many-body potential energy representat
 ## Development Commands
 
 ### Building and Installation
+
 - **Standard build**: `pip install .`
 - **With GPU support**: Set environment variables like `DP_ENABLE_PYTORCH=1`, `DP_ENABLE_TENSORFLOW=1`, etc.
 - **From source**: Uses scikit-build-core with CMake - see `source/CMakeLists.txt`
 - **C++ library**: Built automatically as part of the Python package
 
 ### Testing
+
 - **Run all tests**: `pytest source/tests`
 - **Run specific backend tests**: `pytest source/tests/tf/`, `pytest source/tests/pt/`, etc.
 - **GPU tests**: `tox -e gpu` or set `DP_VARIANT=cuda`
@@ -22,17 +24,20 @@ DeePMD-kit is a deep learning package for many-body potential energy representat
 - **With coverage**: `pytest --cov=deepmd`
 
 ### Code Quality
+
 - **Linting**: `ruff check .`
 - **Formatting**: `ruff format .`
 - **Type checking**: No specific type checker configured in the project
 
 ### Backend-Specific Commands
+
 - **TensorFlow**: Requires TF 2.19.0, automatically enabled with certain flags
 - **PyTorch**: Enable with `DP_ENABLE_PYTORCH=1`
 - **JAX**: Enable with `DP_ENABLE_JAX=1` (requires Python >= 3.10)
 - **Paddle**: Enable with `DP_ENABLE_PADDLE=1`
 
 ### Model Compression
+
 - **Compress models**: `dp --pt compress -i model.pth -o compressed.pth`
 - **Custom parameters**: `dp --pt compress -i model.pth -o compressed.pth -s 0.005 -e 10`
 - **PyTorch backend only**: Supports SE_A, SE_R, SE_T, SE_Atten, DPA1, DPA2 descriptors
@@ -41,7 +46,9 @@ DeePMD-kit is a deep learning package for many-body potential energy representat
 ## Architecture Overview
 
 ### Multi-Backend Design
+
 The codebase is organized around a modular backend system in `deepmd/backend/`:
+
 - `backend.py`: Core backend management logic
 - `tensorflow.py`, `pytorch.py`, `jax.py`, `paddle.py`: Backend-specific implementations
 - `suffix.py`: Model file suffix handling for different backends
@@ -49,7 +56,9 @@ The codebase is organized around a modular backend system in `deepmd/backend/`:
 ### Core Components
 
 #### 1. Model Architecture (`deepmd/dpmodel/`)
+
 Framework-agnostic model implementations:
+
 - `atomic_model/`: Atomic-level model components
 - `descriptor/`: Environment descriptors (se_a, se_atten, dpa1/2/3, etc.)
 - `fitting/`: Fitting networks for energy, forces, etc.
@@ -58,9 +67,11 @@ Framework-agnostic model implementations:
 ### DPA3 Descriptor Implementation
 
 #### DPA3 Architecture Overview
+
 DPA3 (Deep Potential - Atomic Environment Representation with 3-body interactions) is an advanced descriptor that combines node, edge, and angle information for more accurate atomic environment representation.
 
 **Key Components**:
+
 - **Main Descriptor**: `DescrptDPA3` in `deepmd/pt/model/descriptor/dpa3.py:105-171`
 - **RepFlow Block**: `DescrptBlockRepflows` in `deepmd/pt/model/descriptor/repflows.py:77-200`
 - **RepFlow Layer**: `RepFlowLayer` in `deepmd/pt/model/descriptor/repflow_layer.py:38-200`
@@ -68,18 +79,22 @@ DPA3 (Deep Potential - Atomic Environment Representation with 3-body interaction
 **DPA3 Core Innovation**: The RepFlow architecture introduces a unified representation that iteratively refines node, edge, and angle information through multiple layers, enabling explicit 3-body interaction modeling while maintaining computational efficiency through message compression strategies.
 
 #### DPA3 Initialization and Forward Pass
+
 **Initialization** (`dpa3.py:105-171`):
+
 - Processes RepFlow parameters with `init_subclass_params(repflow, RepFlowArgs)`
 - Creates type embedding network (`TypeEmbedNetConsistent`) for consistent atomic type representations
 - Initializes RepFlow blocks with edge/angle embedding networks for distance and angular information
 - Sets up multiple RepFlow layers for iterative refinement with configurable residual connections
 
 **Forward Pass** (`dpa3.py:430-498`):
+
 1. **Type Embedding**: Computes atomic type embeddings using `TypeEmbedNetConsistent`
 2. **RepFlow Processing**: Multi-layer node/edge/angle information processing through iterative updates
 3. **Output Generation**: Returns comprehensive atomic environment representation with rotation matrices for SE(3) equivariance
 
 **DPA3 Output Variables**:
+
 - `node_ebd`: Node descriptors [nf, nloc, n_dim] - primary atomic environment representation for fitting networks
 - `rot_mat`: Rotation matrices [nf, nloc, e_dim, 3] - ensures SE(3) equivariance for coordinate transformations
 - `edge_ebd`: Edge embeddings [nf, nloc, nnei, e_dim] - pairwise interaction information
@@ -87,25 +102,30 @@ DPA3 (Deep Potential - Atomic Environment Representation with 3-body interaction
 - `sw`: Switch functions [nf, nloc, nnei] - smooth cutoff boundaries to avoid discontinuities
 
 #### RepFlow Implementation
+
 **RepFlow Block** (`repflows.py:77-200`):
+
 - Edge embedding network (`MLPLayer`) for distance information encoding
-- Angle embedding network for angular relationship processing  
+- Angle embedding network for angular relationship processing
 - Multiple RepFlow layers (`RepFlowLayer`) for iterative node/edge/angle updates
 - Support for message compression (`a_compress_rate`) and attention mechanisms to reduce computational cost
 - Environment matrix computation via `prod_env_mat` for neighbor distance and direction calculation
 
 **Key Parameters**:
+
 - `e_rcut`/`e_rcut_smth`: Edge cutoff (6.0Å) and smoothing radii (0.5Å) for neighbor selection
-- `a_rcut`/`a_rcut_smth`: Angle cutoff (4.0Å) and smoothing radii for three-body interactions  
+- `a_rcut`/`a_rcut_smth`: Angle cutoff (4.0Å) and smoothing radii for three-body interactions
 - `n_dim`/`e_dim`/`a_dim`: Node (128), edge (64), angle (32) representation dimensions
 - `nlayers`: Number of RepFlow layers (6) for iterative refinement
 - `update_style`: Residual connection strategies (`res_residual`, `res_update`, `force_residual`) for gradient flow optimization
 - `a_compress_rate`: Angle compression factor (2) to reduce computational overhead while preserving angular information
 
 #### CLI Usage and Training Flow
+
 **Training Command**: `dp --pt train input.json` (specify PyTorch backend explicitly)
 
 **Execution Flow**:
+
 1. **Entry Point**: `deepmd.pt.entrypoints.main.train()` (`main.py:248-372`) - PyTorch-specific training entry
 2. **Configuration Loading**: JSON parsing via `j_loader()` with multi-task handling through `preprocess_shared_params()`
 3. **Neighbor Statistics**: Automatic selection parameter computation via `BaseModel.update_sel()` unless `--skip-neighbor-stat`
@@ -113,21 +133,25 @@ DPA3 (Deep Potential - Atomic Environment Representation with 3-body interaction
 5. **Model Building**: DPA3 descriptor creation via `get_model()` with automatic device placement and JIT compilation options
 
 **Data Processing Pipeline**:
+
 1. **Raw Data Loading**: `DeepmdData` loads HDF5/.npy files from system directories
-2. **System DataLoaders**: Each system gets its own DataLoader (num_workers=0 to avoid thread explosion)  
+2. **System DataLoaders**: Each system gets its own DataLoader (num_workers=0 to avoid thread explosion)
 3. **Training DataLoader**: Master DataLoader with intelligent sampling (`WeightedRandomSampler` or uniform)
 4. **Batch Processing**: `collate_batch()` handles variable-sized systems with padding and tensor stacking
 
 #### Precision Control
+
 DPA3 supports two levels of precision control that work independently:
 
 **Environment Variable Control (`DP_INTERFACE_PREC`)**:
+
 - **Scope**: Global interface precision affecting input/output data types across all DeePMD-kit operations
 - **High precision** (`export DP_INTERFACE_PREC=high`): `GLOBAL_NP_FLOAT_PRECISION = np.float64`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
 - **Low precision** (`export DP_INTERFACE_PREC=low`): `GLOBAL_NP_FLOAT_PRECISION = np.float32`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64` (energy precision remains high)
 - **Location**: `deepmd/env.py:33-48`
 
 **Model Parameter Control (`precision` in configuration)**:
+
 - **Scope**: Component-specific precision for neural network weights and calculations
 - **Options**: `"float64"`, `"float32"`, `"float16"`, `"default"`
 - **Granular Control**: Can be set individually for descriptor, fitting networks, and RepFlow components
@@ -151,29 +175,35 @@ DPA3 supports two levels of precision control that work independently:
 ```
 
 **Precision Workflow** (`make_model.py:327-337`):
+
 1. **Input Type Detection**: `input_type_cast()` detects input data precision
 2. **Global Precision Conversion**: Converts to `GLOBAL_PT_FLOAT_PRECISION` for computation
 3. **Component Computation**: Uses component-specific precision settings
 4. **Output Conversion**: `output_type_cast()` converts back to original input precision
 
 #### Inference System
+
 **Main Classes**:
+
 - `DeepEval`: Universal inference interface (`deepmd/pt/infer/deep_eval.py:75`)
 - `Tester`: Testing and inference utility (`deepmd/pt/infer/inference.py:25`)
 
 **Inference Flow**:
+
 1. **Model Loading**: State dict loading and multi-task handling
 2. **JIT Compilation**: Optional TorchScript optimization
 3. **Batch Processing**: Automatic batch sizing for memory optimization
 4. **Execution**: DPA3 descriptor computation in evaluation mode
 
 **Performance Optimizations**:
+
 - **JIT Compilation**: `torch.jit.script()` for graph optimization
 - **Auto-batching**: Dynamic batch size adjustment based on memory
 - **Multi-device**: CPU/GPU support with automatic device selection
 - **Model Freezing**: `dp freeze` for deployment-optimized models
 
 #### Configuration Example
+
 ```json
 {
   "model": {
@@ -198,31 +228,39 @@ DPA3 supports two levels of precision control that work independently:
 ```
 
 #### Energy Summation Mechanism
+
 DPA3 implements a two-stage energy calculation:
+
 1. **Atomic Energy**: Each atom's local environment energy computed in fitting networks
 2. **System Energy**: Atomic energies summed to get total system energy
 
 **Key Files**:
+
 - Atomic energy: `deepmd/pt/model/task/fitting.py:473-614`
 - Energy summation: `deepmd/pt/model/model/transform_output.py:153-192`
 
 ### Model Compression System
 
 #### Compression Overview
+
 DeePMD-kit supports model compression through tabulation of embedding networks, providing significant inference speedup by replacing neural network computations with polynomial interpolation lookups.
 
-**Core Concept**: 
+**Core Concept**:
+
 - Pre-compute embedding network outputs and store in lookup tables
 - Use two-stage interpolation with different stride sizes for accuracy-memory balance
 - Replace runtime neural network evaluations with fast polynomial interpolation
 
 #### Compression Architecture
+
 **Entry Points**:
+
 - Command: `dp --pt compress -i model.pth -o compressed.pth`
 - Main entry: `deepmd/main.py` → `deepmd/pt/entrypoints/main.py:574-582`
 - Core function: `deepmd/pt/entrypoints/compress.py:32-84`
 
 **Execution Flow**:
+
 1. **Model Loading**: Load JIT model and reconstruct model instance
 2. **Min Distance Calculation**: Compute minimum neighbor distance from training data
 3. **Hierarchical Compression**: Model → Atomic Model → Descriptor compression
@@ -230,79 +268,98 @@ DeePMD-kit supports model compression through tabulation of embedding networks,
 5. **JIT Serialization**: Save compressed model as TorchScript
 
 #### Supported Descriptors
+
 **Fully Supported**:
+
 - `SE_A` (`se_a.py:257-302`): Smooth Edition Angular descriptor
-- `SE_R` (`se_r.py:359-xxx`): Smooth Edition Radial descriptor  
+- `SE_R` (`se_r.py:359`): Smooth Edition Radial descriptor
 - `SE_T` (`se_t.py:284-327`): Smooth Edition Three-body descriptor
 - `SE_Atten` (`se_atten.py:427-448`): Smooth Edition with Attention
 - `DPA1` (`dpa1.py:572-645`): Deep Potential Attention version 1
 - `DPA2` (`dpa2.py:893-973`): Deep Potential Attention version 2
 
 **Not Supported**:
+
 - `DPA3` (`dpa3.py:578-601`): Explicitly raises `NotImplementedError`
 - `Pairtab` models: No tabulation compression support
 
 #### Tabulation Implementation
+
 **Key Class**: `DPTabulate` (`deepmd/pt/utils/tabulate.py:30-100`)
 
 **Table Building Process**:
+
 1. **Range Calculation**: Compute environment matrix bounds from training data statistics
 2. **Grid Generation**: Create two-segment distance grids (fine + coarse stride)
 3. **Neural Network Evaluation**: Forward pass to get function values and derivatives
 4. **Polynomial Fitting**: Generate 5th-order Hermite interpolation coefficients
 
 **Data Storage Format**:
+
 - `compress_info`: [lower, upper, extrapolate_upper, stride1, stride2, check_freq]
 - `compress_data`: [nspline, 6 * last_layer_size] coefficient tables
 - Coefficients: [f(x), f'(x), f''(x)/2, c3, c4, c5] per neuron
 
 #### Performance Characteristics
+
 **Memory Optimization**:
+
 - Two-stage interpolation: fine stride (0.01) + coarse stride (0.1)
 - Extrapolation region: 5× training data range by default
 - Removes original network weights after compression
 
 **Computational Benefits**:
+
 - Eliminates matrix operations in embedding networks
 - Vectorized polynomial evaluation
 - Cache-friendly data layout for lookup tables
 
 #### Configuration Parameters
+
 - `-s, --step`: Fine stride size (default: 0.01) - affects accuracy vs memory
 - `-e, --extrapolate`: Extrapolation multiplier (default: 5)
-- `-f, --frequency`: Overflow check frequency (default: -1, disabled)  
+- `-f, --frequency`: Overflow check frequency (default: -1, disabled)
 - `-t, --training-script`: Training script path for min distance calculation
 
 #### 2. Backend-Specific Implementations
+
 - `deepmd/tf/`: TensorFlow backend (original implementation)
-- `deepmd/pt/`: PyTorch backend 
+- `deepmd/pt/`: PyTorch backend
 - `deepmd/jax/`: JAX backend
 - `deepmd/pd/`: Paddle backend
 
 Each backend implements similar interfaces:
+
 - Descriptor variants optimized for the framework
 - Training and inference modules
 - Model serialization/loading
 
 #### 3. Inference (`deepmd/infer/`)
+
 High-level inference interfaces:
+
 - `deep_pot.py`: Main potential energy model interface
 - `deep_eval.py`: Generic evaluation interface
 - Backend-specific inference modules
 
 #### 4. Training (`deepmd/*/train/`)
+
 Backend-specific training implementations:
+
 - Training loops and optimization
 - Data loading and preprocessing
 - Checkpoint management
 
 #### 5. Entry Points (`deepmd/entrypoints/`)
+
 Command-line interface commands:
+
 - `main.py`: Main CLI dispatcher
 - Training, testing, conversion utilities
 - Model analysis and documentation tools
 
 #### 6. C++ Integration (`source/`)
+
 - `lib/`: Core computational library with CUDA/ROCm support
 - `api_cc/`: C++ API for external integration
 - `api_c/`: C API wrapper
@@ -312,18 +369,21 @@ Command-line interface commands:
 ### PyTorch Backend Data Processing
 
 #### Two-Level DataLoader Architecture
+
 The PyTorch backend uses a unique two-level DataLoader system for efficient multi-system data management:
 
 **System Level**: Each data system has its own DataLoader (num_workers=0 to avoid thread explosion)
 **Training Level**: Master DataLoader handles sampling and batching across systems (num_workers=NUM_WORKERS)
 
 **Key Components**:
+
 - `DeepmdData`: Raw data loading from HDF5/.npy files (`deepmd/utils/data.py`)
 - `DpLoaderSet`: System-level DataLoader collection (`deepmd/pt/utils/dataloader.py`)
 - `DeepmdDataSetForLoader`: PyTorch Dataset wrapper
 - `collate_batch`: Batch processing function for variable-sized systems
 
 **Data Flow**:
+
 ```
 Raw Data (HDF5/.npy) → DeepmdData → System DataLoaders → DpLoaderSet → Training DataLoader → Model Input
 ```
@@ -331,7 +391,8 @@ Raw Data (HDF5/.npy) → DeepmdData → System DataLoaders → DpLoaderSet → T
 ### DPAtomicModel Hierarchy
 
 #### Class Structure
-```python
+
+```text
 BaseAtomicModel (base_atomic_model.py:52)
     ↓
 DPAtomicModel (dp_atomic_model.py:34) - registered as "standard"
@@ -340,12 +401,14 @@ Specific Models (Energy, Dipole, Polar, DOS, Property)
 ```
 
 **Key Features**:
+
 - **Unified Interface**: Consistent API for different physical properties
 - **Atomic-Level Forward Pass**: `forward_atomic()` method handles descriptor computation and fitting
 - **Multi-Task Support**: Supports training multiple properties simultaneously
 - **Automatic Differentiation**: Force and virial computation through autograd
 
 **Key Files**:
+
 - Base class: `deepmd/pt/model/atomic_model/dp_atomic_model.py:34`
 - Energy model: `deepmd/pt/model/atomic_model/energy_atomic_model.py:13`
 - Dipole model: `deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`
@@ -353,19 +416,25 @@ Specific Models (Energy, Dipole, Polar, DOS, Property)
 ### Key Design Patterns
 
 #### Backend Abstraction
+
 The code uses a sophisticated backend system that allows:
+
 - Runtime backend selection
 - Model conversion between backends
 - Consistent APIs across frameworks
 
 #### Descriptor-Based Architecture
+
 Models are built from:
+
 1. **Descriptors**: Local atomic environment representations
 2. **Fitting Networks**: Map descriptors to physical quantities
 3. **Models**: Combine descriptors and fitting for complete potentials
 
 #### Multi-Task Learning
+
 Support for training multiple properties simultaneously:
+
 - Energy, forces, virial
 - Dipole moments, polarizability
 - DOS, electronic properties
@@ -374,23 +443,27 @@ Support for training multiple properties simultaneously:
 ## Working with the Code
 
 ### Adding New Features
+
 1. **Framework-agnostic**: Add to `deepmd/dpmodel/` first
 2. **Backend implementations**: Extend each backend in `deepmd/*/`
 3. **C++ optimization**: Add performance-critical code to `source/lib/`
 4. **Tests**: Add backend-specific tests in `source/tests/*/`
 
 ### Model Development
+
 - Use existing descriptors as templates in `deepmd/dpmodel/descriptor/`
 - Extend fitting networks in `deepmd/dpmodel/fitting/`
 - Model composition follows patterns in `deepmd/dpmodel/model/`
 
 ### Performance Considerations
+
 - C++ library handles neighbor lists and environment matrices
 - Custom operators optimized for GPU acceleration
 - Automatic mixed precision support where available
 - **Model compression**: Tabulation provides 2-10× inference speedup for supported descriptors
 
 ### Common Pitfalls
+
 - Backend-specific imports are banned at module level (use runtime imports)
 - Model compatibility requires careful version management
 - GPU builds require specific CUDA/ROCm versions
@@ -403,4 +476,4 @@ Support for training multiple properties simultaneously:
 - **Implementation details**: In subdirectories like `dpmodel/`, `utils/`
 - **Backend code**: Separated into `tf/`, `pt/`, `jax/`, `pd/` directories
 - **Tests**: Organized by backend in `source/tests/*/`
-- **Examples**: In `examples/` directory with input configurations
\ No newline at end of file
+- **Examples**: In `examples/` directory with input configurations
diff --git a/debug/compress_debug.py b/debug/compress_debug.py
index 7de6d41253..db57d78d83 100644
--- a/debug/compress_debug.py
+++ b/debug/compress_debug.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
 """Debug script for model compression.
 
 Equivalent to: dp --pt compress -i no.pth -o yes.pth -t input_torch.json
@@ -9,7 +10,9 @@
 import logging
 import os
 import sys
-from pathlib import Path
+from pathlib import (
+    Path,
+)
 
 # Add the deepmd-kit root to Python path
 deepmd_root = Path(__file__).parent.parent
@@ -22,7 +25,9 @@ def compress_model() -> None:
     dp --pt compress -i no.pth -o yes.pth -t input_torch.json
     """
     # Import here to avoid module-level import restriction
-    from deepmd.pt.entrypoints.compress import enable_compression
+    from deepmd.pt.entrypoints.compress import (
+        enable_compression,
+    )
 
     # Setup logging
     logging.basicConfig(level=logging.INFO)
@@ -46,10 +51,14 @@ def compress_model() -> None:
 
         # Check if input files exist
         if not os.path.exists(input_file):
-            raise FileNotFoundError(f"Input model file '{input_file}' not found in {work_dir}")
+            raise FileNotFoundError(
+                f"Input model file '{input_file}' not found in {work_dir}"
+            )
 
         if not os.path.exists(training_script):
-            raise FileNotFoundError(f"Training script '{training_script}' not found in {work_dir}")
+            raise FileNotFoundError(
+                f"Training script '{training_script}' not found in {work_dir}"
+            )
 
         log.info(f"Input model: {input_file}")
         log.info(f"Output model: {output_file}")
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index c12c12b28b..556d6d236e 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -20,9 +20,6 @@
 import numpy as np
 import torch
 
-from deepmd.pt.utils import (
-    env,
-)
 from deepmd.common import (
     symlink_prefix_files,
 )
@@ -54,6 +51,7 @@
 )
 from deepmd.pt.utils import (
     dp_random,
+    env,
 )
 from deepmd.pt.utils.dataloader import (
     DpLoaderSet,
diff --git a/doc/outisli/DPA3.md b/doc/outisli/DPA3.md
index 68055772b5..d0747e97b5 100644
--- a/doc/outisli/DPA3.md
+++ b/doc/outisli/DPA3.md
@@ -1,4 +1,4 @@
-# DeepMD 源码导读与 DPA3 PyTorch 实现技术文档
+# DeePMD 源码导读与 DPA-3 PyTorch 实现技术文档
 
 ## 概述
 
@@ -55,8 +55,14 @@ if not skip_neighbor_stat:
 
 # 4. 训练器创建
 trainer = get_trainer(
-    config, init_model, restart, finetune, force_load, init_frz_model,
-    shared_links=shared_links, finetune_links=finetune_links
+    config,
+    init_model,
+    restart,
+    finetune,
+    force_load,
+    init_frz_model,
+    shared_links=shared_links,
+    finetune_links=finetune_links,
 )
 ```
 
@@ -283,10 +289,12 @@ DPA3 描述符层
 class DescrptDPA3(BaseDescriptor, torch.nn.Module):
     """DPA3 描述符实现"""
 
+
 @DescriptorBlock.register("se_repflow")
 class DescrptBlockRepflows(DescriptorBlock):
     """RepFlow 描述符块"""
 
+
 class RepFlowLayer(torch.nn.Module):
     """单个 RepFlow 层"""
 ```
@@ -335,25 +343,26 @@ DeePMD-kit 采用分层次、模块化的设计，从底层的原子级计算到
 
 **最基础的计算单元** - 负责原子级别的物理量计算：
 
-```python
+```text
 # 抽象基类层
 ABC + PluginVariant + make_plugin_registry("atomic model")
     ↓
 BaseAtomicModel_ (由 make_base_atomic_model() 动态生成)
-    ↓  
+    ↓
 BaseAtomicModel (deepmd/dpmodel/atomic_model/base_atomic_model.py:42)
     ↓
 DPAtomicModel (deepmd/dpmodel/atomic_model/dp_atomic_model.py:29) - 注册为 "standard"
     ↓
 具体的物理属性原子模型:
 ├── DPEnergyAtomicModel (能量模型)
-├── DPDipoleAtomicModel (偶极子模型) 
+├── DPDipoleAtomicModel (偶极子模型)
 ├── DPPolarAtomicModel (极化率模型)
 ├── DPDOSAtomicModel (态密度模型)
 └── DPPropertyAtomicModel (通用属性模型)
 ```
 
 **作用和用途**:
+
 - **核心计算单元**: 包含描述器(Descriptor) + 拟合网络(Fitting)
 - **原子级预测**: 负责单个原子的能量/力等物理量预测
 - **不直接用于训练**: 作为组件被更高层模型调用
@@ -363,7 +372,7 @@ DPAtomicModel (deepmd/dpmodel/atomic_model/dp_atomic_model.py:29) - 注册为 "s
 
 **真正用于训练和推理的完整模型**：
 
-```python
+```text
 # 抽象基类层
 ABC + PluginVariant + make_plugin_registry("model")
     ↓
@@ -376,14 +385,15 @@ DPModelCommon (提供公共方法如 update_sel 等)
 通过 make_model(T_AtomicModel) 动态生成的模型类
     ↓
 具体的完整模型实现:
-├── EnergyModel (deepmd/pt/model/model/ener_model.py:30) - 注册为 "ener" 
+├── EnergyModel (deepmd/pt/model/model/ener_model.py:30) - 注册为 "ener"
 ├── DipoleModel - 注册为 "dipole"
-├── PolarModel - 注册为 "polar" 
+├── PolarModel - 注册为 "polar"
 ├── DOSModel - 注册为 "dos"
 └── PropertyModel - 注册为 "property"
 ```
 
 **作用和用途**:
+
 - **训练和推理接口**: `dp train input.json` 时创建的就是这个模型
 - **系统级功能**: 封装原子模型，添加邻居列表构建、坐标变换、批处理等
 - **梯度计算**: 自动计算力和应力
@@ -393,7 +403,7 @@ DPModelCommon (提供公共方法如 update_sel 等)
 
 **线性组合和特殊模型**：
 
-```python
+```text
 BaseAtomicModel
     ↓
 LinearEnergyAtomicModel (deepmd/dpmodel/atomic_model/linear_atomic_model.py:42) - 注册为 "linear"
@@ -406,6 +416,7 @@ DPZBLModel (deepmd/dpmodel/model/dp_zbl_model.py:28) - 注册为 "zbl"
 ```
 
 **作用和用途**:
+
 - **模型组合**: 线性组合多个原子模型
 - **物理修正**: DPZBLModel 结合深度势能和 ZBL 势函数
 - **特殊应用**: 处理短程排斥等特殊物理场景
@@ -414,7 +425,7 @@ DPZBLModel (deepmd/dpmodel/model/dp_zbl_model.py:28) - 注册为 "zbl"
 
 ##### 2.4.3.1 训练时的模型创建流程
 
-```python
+```text
 # 1. 用户配置
 "model": {"type": "ener"}  # input.json 中
 
@@ -423,7 +434,7 @@ dp train input.json
   ↓
 # 3. 模型工厂创建 (deepmd/pt/entrypoints/main.py:248)
 model = get_model(model_params)  # 返回 EnergyModel 实例
-  ↓  
+  ↓
 # 4. EnergyModel 初始化流程
 # 4a. 创建 DPEnergyAtomicModel 实例（原子级计算核心）
 # 4b. 通过 make_model() 包装成完整模型（添加系统级功能）
@@ -438,13 +449,13 @@ loss = model.forward(coord, atype, box, ...)  # EnergyModel.forward()
 
 ##### 2.4.3.2 推理时的模型加载流程
 
-```python
+```text
 # 1. 模型加载
 model = torch.jit.load("frozen_model.pth")  # 实际是 EnergyModel 的实例
   ↓
 # 2. 推理调用
 output = model(coord, atype, box)  # EnergyModel.forward()
-  ↓  
+  ↓
 # 3. 返回标准格式
 {"energy": ..., "force": ..., "virial": ...}
 ```
@@ -454,18 +465,21 @@ output = model(coord, atype, box)  # EnergyModel.forward()
 ##### 2.4.4.1 核心设计模式
 
 **1. 工厂模式**
+
 - `make_base_atomic_model()`: 动态生成原子模型基类
-- `make_base_model()`: 动态生成最终模型基类  
+- `make_base_model()`: 动态生成最终模型基类
 - `make_model(T_AtomicModel)`: 将原子模型包装成完整模型
 
 **2. 注册机制**
+
 - 使用 `@BaseAtomicModel.register()` 和 `@BaseModel.register()` 注册不同类型的模型
 - 支持通过字符串名称动态创建模型实例
 
 **3. 组合模式**
+
 - **DPAtomicModel**: 由描述器(Descriptor) + 拟合网络(Fitting) 组成
 - **LinearEnergyAtomicModel**: 线性组合多个原子模型
-- **DPZBLLinearEnergyAtomicModel**: 特殊的线性组合，结合DP模型和ZBL势函数
+- **DPZBLLinearEnergyAtomicModel**: 特殊的线性组合，结合 DP 模型和 ZBL 势函数
 
 **4. 多后端支持**
 每个后端(PyTorch/TensorFlow/JAX/Paddle)都有相应的实现，遵循相同的接口但针对特定框架优化。
@@ -473,37 +487,43 @@ output = model(coord, atype, box)  # EnergyModel.forward()
 ##### 2.4.4.2 架构优势
 
 **模块化**:
+
 - 描述器和拟合网络可以独立开发和组合
 - 不同物理量的预测可以共享相同的框架
 
-**可扩展性**: 
+**可扩展性**:
+
 - 容易添加新的物理属性或模型类型
 - 支持自定义描述器和拟合网络
 
-**多后端支持**: 
+**多后端支持**:
+
 - 同一套接口支持不同的深度学习框架
 - 代码复用和维护效率高
 
-**类型安全**: 
+**类型安全**:
+
 - 通过注册机制确保模型类型的正确性
 - 编译时类型检查和运行时验证
 
 #### 2.4.7 模型压缩功能 (enable_compression)
 
-模型压缩是DeePMD-kit中一个重要的性能优化功能，通过表格化(tabulation)的方式来加速模型推理，特别适用于生产环境的部署。
+模型压缩是 DeePMD-kit 中一个重要的性能优化功能，通过表格化(tabulation)的方式来加速模型推理，特别适用于生产环境的部署。
 
 ##### 2.4.7.1 压缩功能调用链
 
 **压缩入口点** (`deepmd/pt/entrypoints/compress.py:75`):
+
 ```python
 model.enable_compression(
-    extrapolate,    # 外推尺度
-    stride,         # 步长1
-    stride * 10,    # 步长2
+    extrapolate,  # 外推尺度
+    stride,  # 步长1
+    stride * 10,  # 步长2
 )
 ```
 
 **压缩方法层次**:
+
 ```
 顶层模型压缩 (make_model.py:246-266)
     ↓
@@ -513,7 +533,7 @@ self.atomic_model.enable_compression(
     self.get_min_nbor_dist(),  # 获取最小邻居距离
     table_extrapolate,
     table_stride_1,
-    table_stride_2, 
+    table_stride_2,
     check_frequency,
 )
     ↓
@@ -523,6 +543,7 @@ self.atomic_model.enable_compression(
 ##### 2.4.7.2 压缩参数说明
 
 **关键参数**:
+
 - `table_extrapolate`: 模型外推的尺度参数，控制表格的外推范围
 - `table_stride_1`: 第一个表格的均匀步长，影响近程精度
 - `table_stride_2`: 第二个表格的均匀步长，影响远程精度
@@ -532,12 +553,14 @@ self.atomic_model.enable_compression(
 ##### 2.4.7.3 压缩机制的实现原理
 
 **表格化加速**:
+
 1. **距离离散化**: 将连续的原子间距离离散化为表格索引
 2. **预计算存储**: 预先计算并存储常用距离范围内的描述符值
 3. **插值查表**: 推理时通过插值查表替代复杂的神经网络计算
 4. **内存换时间**: 牺牲一定内存空间换取显著的计算速度提升
 
 **多级表格策略**:
+
 - **近程高精度**: `table_stride_1` 控制近程的高精度表格
 - **远程适中精度**: `table_stride_2` 控制远程的适中精度表格
 - **平滑过渡**: 两个表格之间实现平滑过渡，避免不连续性
@@ -545,11 +568,13 @@ self.atomic_model.enable_compression(
 ##### 2.4.7.4 压缩的应用场景和优势
 
 **适用场景**:
-- **生产环境部署**: MD模拟中需要高频调用模型推理
+
+- **生产环境部署**: MD 模拟中需要高频调用模型推理
 - **大规模系统**: 原子数量庞大，计算资源有限
 - **实时仿真**: 对推理速度有严格要求的应用
 
 **性能优势**:
+
 - **推理加速**: 可实现数倍到数十倍的推理速度提升
 - **内存可控**: 表格大小可通过步长参数灵活控制
 - **精度平衡**: 在速度和精度之间找到最优平衡点
@@ -557,23 +582,25 @@ self.atomic_model.enable_compression(
 ##### 2.4.7.5 压缩功能的使用建议
 
 **参数调优策略**:
+
 ```python
 # 高精度场景 - 较小的步长，更高的精度
 model.enable_compression(
     extrapolate=5.0,
-    stride_1=0.005,    # 更小的近程步长
-    stride_2=0.05,     # 更小的远程步长
+    stride_1=0.005,  # 更小的近程步长
+    stride_2=0.05,  # 更小的远程步长
 )
 
-# 高性能场景 - 较大的步长，更快的速度  
+# 高性能场景 - 较大的步长，更快的速度
 model.enable_compression(
     extrapolate=3.0,
-    stride_1=0.02,     # 较大的近程步长
-    stride_2=0.2,      # 较大的远程步长
+    stride_1=0.02,  # 较大的近程步长
+    stride_2=0.2,  # 较大的远程步长
 )
 ```
 
 **最佳实践**:
+
 1. **测试验证**: 压缩后务必验证模型精度是否满足要求
 2. **参数调优**: 根据具体应用场景调整步长参数
 3. **内存监控**: 关注压缩后的内存使用情况
@@ -582,11 +609,13 @@ model.enable_compression(
 #### 2.4.8 在实际使用中的角色分工
 
 **对用户而言**:
+
 - **只需关心最终模型**: EnergyModel、DipoleModel 等
 - **配置简单**: 通过 JSON 配置文件指定模型类型
 - **接口统一**: 所有模型都使用相同的训练和推理接口
 
 **对开发者而言**:
+
 - **清晰的层次**: 每一层都有明确的职责
 - **易于扩展**: 在正确的层级添加新功能
 - **代码复用**: 通过工厂模式避免重复代码
@@ -594,11 +623,13 @@ model.enable_compression(
 #### 2.4.9 模型架构总结
 
 **对用户而言**:
+
 - **只需关心最终模型**: EnergyModel、DipoleModel 等
 - **配置简单**: 通过 JSON 配置文件指定模型类型
 - **接口统一**: 所有模型都使用相同的训练和推理接口
 
 **对开发者而言**:
+
 - **清晰的层次**: 每一层都有明确的职责
 - **易于扩展**: 在正确的层级添加新功能
 - **代码复用**: 通过工厂模式避免重复代码
@@ -609,7 +640,7 @@ model.enable_compression(
 @BaseAtomicModel.register("standard")
 class DPAtomicModel(BaseAtomicModel):
     """Model give atomic prediction of some physical property.
-    
+
     Parameters
     ----------
     descriptor
@@ -628,69 +659,86 @@ DeePMD-kit 中存在多个不同的 forward 方法，每个都有特定的用途
 ##### 2.4.6.1 Forward 方法层级结构
 
 **1. 用户接口层** - `forward()`
-```python
+
+```text
 # deepmd/pt/model/model/ener_model.py:94
 def forward(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
 ```
-**用途**: 
+
+**用途**:
+
 - **最高级的用户接口**，训练和推理时直接调用的方法
 - 接收原始的坐标、原子类型、盒子信息
 - 返回标准的物理量格式 `{"energy": ..., "force": ..., "virial": ...}`
 
 **什么时候使用**:
+
 - 训练时的损失函数计算
 - 推理时的预测
-- LAMMPS等MD引擎调用的接口
+- LAMMPS 等 MD 引擎调用的接口
 
 **2. 坐标处理层** - `forward_common()`
-```python
+
+```text
 # deepmd/pt/model/model/make_model.py:152
 def forward_common(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
 ```
+
 **用途**:
+
 - **处理坐标变换和邻居列表构建**
-- 将原始坐标转换为扩展坐标(包含ghost原子)
+- 将原始坐标转换为扩展坐标(包含 ghost 原子)
 - 构建邻居列表
 - 调用底层的`forward_common_lower()`
 
 **内部工作流程**:
+
 ```python
 # 1. 坐标标准化和扩展
 extended_coord, extended_atype, mapping = extend_coord_with_ghosts(...)
-# 2. 构建邻居列表  
+# 2. 构建邻居列表
 nlist = build_neighbor_list(...)
 # 3. 调用底层计算
 model_ret = self.forward_common_lower(extended_coord, extended_atype, nlist, ...)
 ```
 
 **3. 底层计算层** - `forward_common_lower()`
-```python
+
+```text
 # deepmd/pt/model/model/make_model.py:278
 def forward_common_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
 ```
+
 **用途**:
+
 - **真正的模型计算逻辑**
 - 接收已处理好的扩展坐标和邻居列表
 - 调用原子模型进行实际计算
-- 处理输出的格式转换和reduction操作
+- 处理输出的格式转换和 reduction 操作
 
 **4. 外部接口层** - `forward_lower()`
-```python  
+
+```text
 # deepmd/pt/model/model/ener_model.py:135
 def forward_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
 ```
+
 **用途**:
-- **提供给外部程序的底层接口** (如LAMMPS插件)
-- 外部程序已经准备好了邻居列表，不需要DeePMD重新构建
+
+- **提供给外部程序的底层接口** (如 LAMMPS 插件)
+- 外部程序已经准备好了邻居列表，不需要 DeePMD 重新构建
 - 直接调用`forward_common_lower()`
-- 返回扩展区域的结果(不做reduction)
+- 返回扩展区域的结果(不做 reduction)
 
 **5. 原子级计算层** - `forward_atomic()`
-```python
-# deepmd/pt/model/atomic_model/dp_atomic_model.py:273  
+
+```text
+# deepmd/pt/model/atomic_model/dp_atomic_model.py:273
 def forward_atomic(self, extended_coord, extended_atype, nlist, mapping=None, ...)
 ```
+
 **用途**:
+
 - **最底层的原子级计算**
 - 描述器(Descriptor)计算原子环境表示
 - 拟合网络(Fitting)预测原子能量/力等
@@ -699,33 +747,35 @@ def forward_atomic(self, extended_coord, extended_atype, nlist, mapping=None, ..
 ##### 2.4.6.2 Forward 方法调用关系链
 
 **训练/推理时的完整调用链:**
-```python
+
+```text
 # 用户调用
 model.forward(coord, atype, box)
   ↓
-# 坐标处理 
+# 坐标处理
 model.forward_common(coord, atype, box)
-  ↓  
+  ↓
 # 坐标扩展 + 邻居列表构建
 extended_coord, nlist = preprocess(...)
   ↓
 # 底层计算
 model.forward_common_lower(extended_coord, extended_atype, nlist)
   ↓
-# 原子模型计算  
+# 原子模型计算
 atomic_ret = self.atomic_model.forward_atomic(extended_coord, extended_atype, nlist)
   ↓
 # 输出转换和reduction
 return transform_output(atomic_ret)
 ```
 
-**LAMMPS等外部程序调用:**
-```python
+**LAMMPS 等外部程序调用:**
+
+```text
 # 外部程序已经有邻居列表
 model.forward_lower(extended_coord, extended_atype, nlist, mapping)
   ↓
 # 直接底层计算
-model.forward_common_lower(extended_coord, extended_atype, nlist, mapping)  
+model.forward_common_lower(extended_coord, extended_atype, nlist, mapping)
   ↓
 # 原子模型计算
 atomic_ret = self.atomic_model.forward_atomic(...)
@@ -734,32 +784,39 @@ atomic_ret = self.atomic_model.forward_atomic(...)
 ##### 2.4.6.3 设计多层次 Forward 的原因
 
 **1. 性能优化**
+
 - `forward_lower()`: 外部程序可以复用邻居列表，避免重复计算
 - `forward_common_lower()`: 批处理时可以直接使用预构建的数据
 
-**2. 接口灵活性** 
+**2. 接口灵活性**
+
 - `forward()`: 简单易用的高级接口
 - `forward_lower()`: 高性能的底层接口
 
 **3. 代码复用**
+
 - `forward_common()`: 坐标处理逻辑可以被多种模型复用
 - `forward_atomic()`: 原子级计算与系统级处理分离
 
 **4. 调试和测试**
+
 - 可以单独测试每个层级的功能
 - 便于定位性能瓶颈
 
 ##### 2.4.6.4 实际使用建议
 
 **对于普通用户**:
+
 - **只需关心 `forward()`**: 训练和推理的标准接口
-- **偶尔使用 `forward_lower()`**: 如果你要写MD插件或需要高性能推理
+- **偶尔使用 `forward_lower()`**: 如果你要写 MD 插件或需要高性能推理
 
 **对于开发者**:
+
 - **`forward_common` 系列**: 理解内部实现和优化的关键
 - **`forward_atomic()`**: 自定义原子模型时需要实现的核心方法
 
 **性能优化场景**:
+
 - **外部邻居列表**: 使用 `forward_lower()` 避免重复计算
 - **批处理优化**: 直接调用 `forward_common_lower()` 处理预处理好的数据
 - **调试分析**: 单独调用 `forward_atomic()` 分析原子级计算
@@ -767,53 +824,67 @@ atomic_ret = self.atomic_model.forward_atomic(...)
 #### 2.4.2 具体派生模型
 
 **能量模型** (`deepmd/pt/model/atomic_model/energy_atomic_model.py:13`):
+
 ```python
 class DPEnergyAtomicModel(DPAtomicModel):
     def __init__(self, descriptor, fitting, type_map, **kwargs):
-        if not (isinstance(fitting, EnergyFittingNet) or 
-                isinstance(fitting, EnergyFittingNetDirect) or 
-                isinstance(fitting, InvarFitting)):
-            raise TypeError("fitting must be an instance of EnergyFittingNet, "
-                          "EnergyFittingNetDirect or InvarFitting for DPEnergyAtomicModel")
+        if not (
+            isinstance(fitting, EnergyFittingNet)
+            or isinstance(fitting, EnergyFittingNetDirect)
+            or isinstance(fitting, InvarFitting)
+        ):
+            raise TypeError(
+                "fitting must be an instance of EnergyFittingNet, "
+                "EnergyFittingNetDirect or InvarFitting for DPEnergyAtomicModel"
+            )
         super().__init__(descriptor, fitting, type_map, **kwargs)
 ```
 
 **偶极矩模型** (`deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`):
+
 ```python
 class DPDipoleAtomicModel(DPAtomicModel):
     def __init__(self, descriptor, fitting, type_map, **kwargs):
         if not isinstance(fitting, DipoleFittingNet):
-            raise TypeError("fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel")
+            raise TypeError(
+                "fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel"
+            )
         super().__init__(descriptor, fitting, type_map, **kwargs)
-    
+
     def apply_out_stat(self, ret: dict[str, torch.Tensor], atype: torch.Tensor):
         # dipole not applying bias
         return ret
 ```
 
 **极化率模型** (`deepmd/pt/model/atomic_model/polar_atomic_model.py:14`):
+
 ```python
 class DPPolarAtomicModel(DPAtomicModel):
     def __init__(self, descriptor, fitting, type_map, **kwargs):
         if not isinstance(fitting, PolarFittingNet):
-            raise TypeError("fitting must be an instance of PolarFittingNet for DPPolarAtomicModel")
+            raise TypeError(
+                "fitting must be an instance of PolarFittingNet for DPPolarAtomicModel"
+            )
         super().__init__(descriptor, fitting, type_map, **kwargs)
 ```
 
 #### 2.4.3 DPAtomicModel 核心功能
 
 **原子级前向传播** (`dp_atomic_model.py:205-265`):
+
 ```python
-def forward_atomic(self,
-                  extended_coord,
-                  extended_atype,
-                  nlist,
-                  mapping: Optional[torch.Tensor] = None,
-                  fparam: Optional[torch.Tensor] = None,
-                  aparam: Optional[torch.Tensor] = None,
-                  comm_dict: Optional[dict[str, torch.Tensor]] = None) -> dict[str, torch.Tensor]:
+def forward_atomic(
+    self,
+    extended_coord,
+    extended_atype,
+    nlist,
+    mapping: Optional[torch.Tensor] = None,
+    fparam: Optional[torch.Tensor] = None,
+    aparam: Optional[torch.Tensor] = None,
+    comm_dict: Optional[dict[str, torch.Tensor]] = None,
+) -> dict[str, torch.Tensor]:
     """Return atomic prediction.
-    
+
     Parameters
     ----------
     extended_coord
@@ -828,7 +899,7 @@ def forward_atomic(self,
             frame parameter. nf x ndf
     aparam
             atomic parameter. nf x nloc x nda
-    
+
     Returns
     -------
     result_dict
@@ -839,21 +910,22 @@ def forward_atomic(self,
     atype = extended_atype[:, :nloc]
     if self.do_grad_r() or self.do_grad_c():
         extended_coord.requires_grad_(True)
-    
+
     # 2. 描述符计算
     descriptor, rot_mat, g2, h2, sw = self.descriptor(
-        extended_coord, extended_atype, nlist,
-        mapping=mapping, comm_dict=comm_dict)
-    
+        extended_coord, extended_atype, nlist, mapping=mapping, comm_dict=comm_dict
+    )
+
     # 3. 拟合网络计算
     fit_ret = self.fitting_net(
-        descriptor, atype, gr=rot_mat, g2=g2, h2=h2,
-        fparam=fparam, aparam=aparam)
-    
+        descriptor, atype, gr=rot_mat, g2=g2, h2=h2, fparam=fparam, aparam=aparam
+    )
+
     return fit_ret
 ```
 
 **模型工厂集成** (`deepmd/pt/model/model/__init__.py`):
+
 ```python
 def get_model(model_params):
     model_type = model_params.get("type", "standard")
@@ -886,7 +958,7 @@ DPAtomicModel 通过统一的接口和灵活的设计，为 DPA3 描述符与各
 
 **文件位置**: `deepmd/pt/model/descriptor/dpa3.py:105-171`
 
-```python
+```text
 def __init__(self,
              ntypes: int,
              repflow: Union[RepFlowArgs, dict],
@@ -920,7 +992,7 @@ def __init__(self,
        precision=precision,
        seed=child_seed(seed, 0),
        use_econf_tebd=use_econf_tebd,
-       type_map=type_map
+       type_map=type_map,
    )
    ```
 
@@ -956,8 +1028,7 @@ def __init__(self,
 **处理流程**:
 
 ```python
-def forward(self, extended_coord, extended_atype, nlist,
-            mapping=None, comm_dict=None):
+def forward(self, extended_coord, extended_atype, nlist, mapping=None, comm_dict=None):
     # 1. 数据类型转换
     extended_coord = extended_coord.to(dtype=self.prec)
     nframes, nloc, nnei = nlist.shape
@@ -972,8 +1043,12 @@ def forward(self, extended_coord, extended_atype, nlist,
 
     # 3. RepFlow 计算
     node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows(
-        nlist, extended_coord, extended_atype, node_ebd_ext,
-        mapping, comm_dict=comm_dict
+        nlist,
+        extended_coord,
+        extended_atype,
+        node_ebd_ext,
+        mapping,
+        comm_dict=comm_dict,
     )
 
     # 4. 输出拼接处理
@@ -997,7 +1072,7 @@ def forward(self, extended_coord, extended_atype, nlist,
 
 **文件位置**: `deepmd/pt/model/descriptor/repflows.py:77-200`
 
-```python
+```text
 class DescrptBlockRepflows(DescriptorBlock):
     def __init__(self,
                  n_dim: int = 128,
@@ -1020,8 +1095,11 @@ class DescrptBlockRepflows(DescriptorBlock):
 
    ```python
    self.edge_embd = MLPLayer(
-       1, e_dim, activation=activation_function,
-       precision=precision, seed=child_seed(seed, 1)
+       1,
+       e_dim,
+       activation=activation_function,
+       precision=precision,
+       seed=child_seed(seed, 1),
    )
    ```
 
@@ -1029,8 +1107,11 @@ class DescrptBlockRepflows(DescriptorBlock):
 
    ```python
    self.angle_embd = MLPLayer(
-       1, a_dim, activation=activation_function,
-       precision=precision, seed=child_seed(seed, 2)
+       1,
+       a_dim,
+       activation=activation_function,
+       precision=precision,
+       seed=child_seed(seed, 2),
    )
    ```
 
@@ -1039,8 +1120,19 @@ class DescrptBlockRepflows(DescriptorBlock):
    self.layers = torch.nn.ModuleList()
    for ii in range(nlayers):
        self.layers.append(
-           RepFlowLayer(e_rcut, e_rcut_smth, e_sel, a_rcut, a_rcut_smth, a_sel,
-                       ntypes, n_dim, e_dim, a_dim, ...)
+           RepFlowLayer(
+               e_rcut,
+               e_rcut_smth,
+               e_sel,
+               a_rcut,
+               a_rcut_smth,
+               a_sel,
+               ntypes,
+               n_dim,
+               e_dim,
+               a_dim,
+               ...,
+           )
        )
    ```
 
@@ -1049,12 +1141,22 @@ class DescrptBlockRepflows(DescriptorBlock):
 **文件位置**: `deepmd/pt/model/descriptor/repflows.py:429-647`
 
 ```python
-def forward(self, nlist, extended_coord, extended_atype,
-            extended_atype_embd=None, mapping=None, comm_dict=None):
+def forward(
+    self,
+    nlist,
+    extended_coord,
+    extended_atype,
+    extended_atype_embd=None,
+    mapping=None,
+    comm_dict=None,
+):
     # 1. 环境矩阵计算
     dmatrix, diff, sw = prod_env_mat(
-        extended_coord, nlist, self.e_rcut, self.e_rcut_smth,
-        protection=self.env_protection
+        extended_coord,
+        nlist,
+        self.e_rcut,
+        self.e_rcut_smth,
+        protection=self.env_protection,
     )
 
     # 2. 边和角度邻居列表处理
@@ -1071,8 +1173,7 @@ def forward(self, nlist, extended_coord, extended_atype,
     # 5. RepFlow 层迭代
     for idx, ll in enumerate(self.layers):
         node_ebd, edge_ebd, angle_ebd = ll.forward(
-            node_ebd, edge_ebd, angle_ebd,
-            nlist, extended_coord, extended_atype, ...
+            node_ebd, edge_ebd, angle_ebd, nlist, extended_coord, extended_atype, ...
         )
 
     return node_ebd, edge_ebd, h2, rot_mat, sw
@@ -1084,7 +1185,7 @@ def forward(self, nlist, extended_coord, extended_atype,
 
 **文件位置**: `deepmd/pt/model/descriptor/repflow_layer.py:38-200`
 
-```python
+```text
 class RepFlowLayer(torch.nn.Module):
     def __init__(self,
                  e_rcut: float,
@@ -1113,7 +1214,6 @@ class RepFlowLayer(torch.nn.Module):
 #### 3.4.1 网络组件
 
 - **MLP 网络**: `deepmd/pt/model/network/mlp.py`
-
   - `MLPLayer`: 多层感知机实现
   - `TypeEmbedNet`: 类型嵌入网络
   - `TypeEmbedNetConsistent`: 一致性类型嵌入网络
@@ -1126,12 +1226,10 @@ class RepFlowLayer(torch.nn.Module):
 #### 3.4.2 工具函数
 
 - **环境矩阵**: `deepmd/pt/model/descriptor/env_mat.py`
-
   - `prod_env_mat`: 环境矩阵计算
   - 距离和角度计算
 
 - **邻居列表**: `deepmd/pt/utils/nlist.py`
-
   - 邻居列表生成和处理
   - 排除掩码处理
 
@@ -1143,7 +1241,6 @@ class RepFlowLayer(torch.nn.Module):
 #### 3.4.3 统计和预处理
 
 - **环境矩阵统计**: `deepmd/pt/utils/env_mat_stat.py`
-
   - 邻居统计
   - 数据预处理
 
@@ -1158,9 +1255,11 @@ class RepFlowLayer(torch.nn.Module):
 根据深度势能的基本原理，系统的总能量等于系统中每个原子局部环境能量的总和。这一原理在 PyTorch 后端中通过**分离的两阶段计算**得到精确实现，确保了模型的物理正确性和能量守恒。
 
 **核心公式**:
+
 ```
 E_total = Σ E_i
 ```
+
 其中 E_i 是第 i 个原子的局部环境能量。
 
 #### 3.5.2 原子级能量计算阶段
@@ -1169,14 +1268,14 @@ E_total = Σ E_i
 
 在拟合网络的 `_forward_common` 方法中，每个原子的能量被独立计算：
 
-```python
+```text
 def _forward_common(self, descriptor, atype, ...):
     # descriptor shape: [nf, nloc, nd] - 原子环境描述符
     nf, nloc, nd = xx.shape
-    
+
     # 初始化输出张量
     outs = torch.zeros((nf, nloc, net_dim_out), dtype=self.prec, device=descriptor.device)
-    
+
     if self.mixed_types:
         # 混合类型模式：统一网络处理所有原子类型
         atom_property = self.filter_layers.networks[0](xx)  # 神经网络计算
@@ -1190,17 +1289,18 @@ def _forward_common(self, descriptor, atype, ...):
             atom_property = atom_property + self.bias_atom_e[type_i].to(self.prec)
             atom_property = torch.where(mask, atom_property, 0.0)
             outs = outs + atom_property
-    
+
     # 应用排除掩码
     mask = self.emask(atype).to(torch.bool)
     outs = torch.where(mask[:, :, None], outs, 0.0)
-    
+
     # 返回原子级能量，shape: [nf, nloc, net_dim_out]
     results.update({self.var_name: outs})
     return results
 ```
 
 **关键特征**:
+
 - **原子级输出**: 网络输出为 `[nf, nloc, net_dim_out]`，每个原子都有独立的能量贡献
 - **类型特定处理**: 支持混合类型和非混合类型两种计算模式
 - **局部环境原理**: 每个原子的能量只依赖于其局部环境描述符，符合深度势能的核心思想
@@ -1212,16 +1312,16 @@ def _forward_common(self, descriptor, atype, ...):
 
 **重要发现**: 原子级能量到系统能量的转换是在 `fit_output_to_model_output` 函数中完成的，而不是在拟合网络中！
 
-```python
+```text
 def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
     model_ret = dict(fit_ret.items())
-    
+
     for kk, vv in fit_ret.items():
         vdef = fit_output_def[kk]
         shap = vdef.shape  # 对于能量，shap = [1]
         atom_axis = -(len(shap) + 1)  # atom_axis = -2 (原子维度)
-        
+
         if vdef.reducible:
             kk_redu = get_reduce_name(kk)  # "energy" -> "energy_redu"
             if vdef.intensive:
@@ -1230,7 +1330,7 @@ def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
             else:
                 # 广延性质：计算总和
                 model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis)
-            
+
             # 力和维里的自动微分计算
             if vdef.r_differentiable:
                 kk_derv_r, kk_derv_c = get_deriv_name(kk)
@@ -1239,11 +1339,12 @@ def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
                 if vdef.c_differentiable:
                     model_ret[kk_derv_c] = dc
                     model_ret[kk_derv_c + "_redu"] = torch.sum(model_ret[kk_derv_c].to(redu_prec), dim=1)
-    
+
     return model_ret
 ```
 
 **能量求和详解**:
+
 - **输入**: `vv` shape `[nf, nloc, 1]` - 原子级能量
 - **求和操作**: `torch.mean(vv, dim=-2)` 对原子维度求平均
 - **输出**: `energy_redu` shape `[nf, 1]` - 系统能量
@@ -1256,21 +1357,22 @@ def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
 
 在训练过程中，能量损失按原子数量归一化：
 
-```python
+```text
 def forward(self, model_pred, label, natoms, ...):
     # 系统能量预测值
     energy_pred = model_pred["energy"]  # shape: [nf, 1]
     energy_label = label["energy"]      # shape: [nf, 1]
-    
+
     # 计算能量损失
     l2_ener_loss = torch.mean(torch.square(energy_pred - energy_label))
-    
+
     # 按原子数量归一化 (per atom loss)
     atom_norm = 1.0 / natoms
     loss += atom_norm * (pref_e * l2_ener_loss)
 ```
 
 **归一化策略**:
+
 - **原子级归一化**: `atom_norm = 1.0 / natoms` 确保损失是 per atom 的
 - **训练稳定性**: 防止大系统主导训练过程
 - **物理一致性**: 保持能量与原子数量的线性关系
@@ -1300,38 +1402,41 @@ Per Atom 归一化损失 [scalar]
 #### 3.5.6 关键设计特点
 
 **分离式计算架构**:
+
 1. **原子能量计算**: 在 `_forward_common` 中计算每个原子的局部环境能量
 2. **系统能量聚合**: 在 `fit_output_to_model_output` 中将原子能量聚合成系统能量
 3. **自动微分支持**: 力的计算通过自动微分实现，保持梯度传递
 
 **灵活的求和策略**:
+
 - **求平均**: `torch.mean()` 用于训练时的能量损失计算
 - **求总和**: `torch.sum()` 用于某些需要总量的场景
 - **精度控制**: 使用 `redu_prec` 确保数值稳定性
 
 **物理正确性保证**:
+
 - **局部性原理**: 每个原子的能量只依赖于其局部环境
 - **可加性**: 系统能量严格等于原子能量之和
 - **不变性**: 保持旋转和平移不变性
 
 **计算效率优化**:
+
 - **并行计算**: 原子级能量计算可以完全并行化
 - **批处理**: 支持多帧同时处理
 - **内存效率**: 分离的计算阶段减少内存占用
 
-### 3.6 DPA3描述符输出变量详解
+### 3.6 DPA3 描述符输出变量详解
 
-在DPA3描述符的forward方法中，输出的变量包含了原子环境表示的完整信息。这些变量对于理解描述符的工作原理和调试模型行为非常重要。
+在 DPA3 描述符的 forward 方法中，输出的变量包含了原子环境表示的完整信息。这些变量对于理解描述符的工作原理和调试模型行为非常重要。
 
 #### 3.6.1 输出变量概述
 
 **文件位置**: `deepmd/pt/model/descriptor/dpa3.py:430-498`
 
-DPA3描述符的forward方法返回五个核心变量：
+DPA3 描述符的 forward 方法返回五个核心变量：
 
 ```python
-def forward(self, extended_coord, extended_atype, nlist,
-            mapping=None, comm_dict=None):
+def forward(self, extended_coord, extended_atype, nlist, mapping=None, comm_dict=None):
     # ... 计算过程 ...
     return node_ebd, rot_mat, edge_ebd, h2, sw
 ```
@@ -1339,42 +1444,47 @@ def forward(self, extended_coord, extended_atype, nlist,
 #### 3.6.2 变量详细说明
 
 **node_ebd: 节点描述符**
+
 - **形状**: `[nf, nloc, n_dim]`
 - **含义**: 主要的原子环境描述符，包含每个原子的环境信息
 - **作用**: 直接输入拟合网络计算原子级能量
 
 **rot_mat: 旋转矩阵**
+
 - **形状**: `[nf, nloc, e_dim, 3]`
 - **含义**: 旋转矩阵用于坐标变换，保持旋转不变性
-- **作用**: 
+- **作用**:
   - 将局部坐标转换到全局坐标系
   - 确保描述符在分子旋转时的不变性
-  - 支持SE(3)等变变换
+  - 支持 SE(3) 等变变换
 
 **edge_ebd: 边嵌入**
+
 - **形状**: `[nf, nloc, nnei, e_dim]`
 - **含义**: 原子间边的嵌入表示
 - **作用**: 描述原子间的成键信息和相互作用
 
 **h2: 角度信息**
+
 - **形状**: `[nf, nloc, nnei, 3]`
 - **含义**: 三体角度相关信息
-- **作用**: 描述原子间的角度关系，支持3-body相互作用建模
+- **作用**: 描述原子间的角度关系，支持 3-body 相互作用建模
 
 **sw: 平滑开关函数**
+
 - **形状**: `[nf, nloc, nnei]`
 - **含义**: 用于平滑截止边界的开关函数
-- **作用**: 在cutoff半径处平滑过渡到零，避免能量和力的不连续跳跃
+- **作用**: 在 cutoff 半径处平滑过渡到零，避免能量和力的不连续跳跃
 
 #### 3.6.3 变量在模型中的应用
 
 **在拟合网络中的使用** (`deepmd/pt/model/task/fitting.py:473-614`):
 
-```python
+```text
 def _forward_common(self, descriptor, atype, ...):
     # descriptor是node_ebd [nf, nloc, nd]
     nf, nloc, nd = descriptor.shape
-    
+
     # 计算原子级能量
     atom_property = self.filter_layers.networks[0](descriptor)
     # ...
@@ -1403,7 +1513,7 @@ node_ebd, rot_mat ← 最终描述符输出
 
 ### 3.7 代码修改和功能增强历史
 
-#### 3.7.1 process_systems函数增强
+#### 3.7.1 process_systems 函数增强
 
 **修改位置**: `deepmd/utils/data_system.py`
 
@@ -1463,13 +1573,15 @@ system_path/
 
 ```python
 class DeepmdData:
-    def __init__(self,
-                 systems: Union[str, List[str]],
-                 batch_size: int = 1,
-                 test_size: int = 0,
-                 shuffle_test: bool = True,
-                 type_map: Optional[List[str]] = None,
-                 modifier=None):
+    def __init__(
+        self,
+        systems: Union[str, List[str]],
+        batch_size: int = 1,
+        test_size: int = 0,
+        shuffle_test: bool = True,
+        type_map: Optional[List[str]] = None,
+        modifier=None,
+    ):
         """
         初始化数据系统
 
@@ -1565,12 +1677,14 @@ def reformat_data_torch(self, data_dict: dict) -> dict:
 
 ```python
 class DpLoaderSet:
-    def __init__(self,
-                 systems: List[str],
-                 batch_size: Union[int, str, List[int]],
-                 type_map: List[str],
-                 shuffle: bool = True,
-                 dist: bool = False):
+    def __init__(
+        self,
+        systems: List[str],
+        batch_size: Union[int, str, List[int]],
+        type_map: List[str],
+        shuffle: bool = True,
+        dist: bool = False,
+    ):
         """
         初始化系统级 DataLoader 集合
 
@@ -1590,7 +1704,7 @@ class DpLoaderSet:
             system_data = DeepmdData(
                 system_path,
                 batch_size=1,  # 系统级批处理在 DataLoader 中处理
-                type_map=type_map
+                type_map=type_map,
             )
 
             # 转换为 PyTorch 数据集
@@ -1629,7 +1743,7 @@ def _create_system_dataloader(self, system, batch_size, shuffle, dist):
             system,
             num_replicas=dist.get_world_size(),
             rank=dist.get_rank(),
-            shuffle=shuffle
+            shuffle=shuffle,
         )
     else:
         system_sampler = None
@@ -1668,8 +1782,8 @@ def _calculate_auto_batch_size(self, system_data: DeepmdData) -> int:
 
     # 2. 计算内存需求
     memory_per_frame = natoms * 3 * 4  # 坐标内存 (float32)
-    memory_per_frame += natoms * 4     # 原子类型内存 (int32)
-    memory_per_frame += 9 * 4         # 盒子内存 (float32)
+    memory_per_frame += natoms * 4  # 原子类型内存 (int32)
+    memory_per_frame += 9 * 4  # 盒子内存 (float32)
 
     # 3. 基于可用内存计算批处理大小
     available_memory = self._get_available_memory()
@@ -1746,9 +1860,7 @@ def collate_batch(batch: List[dict]) -> dict:
             continue
         else:
             # 其他键进行张量批处理
-            result[key] = collate_tensor_fn(
-                [torch.as_tensor(d[key]) for d in batch]
-            )
+            result[key] = collate_tensor_fn([torch.as_tensor(d[key]) for d in batch])
 
     return result
 ```
@@ -1773,8 +1885,10 @@ def collate_tensor_fn(tensors: List[torch.Tensor]) -> torch.Tensor:
         padded_tensors = []
 
         for tensor in tensors:
-            padding = [(0, max_dim - curr_dim)
-                      for max_dim, curr_dim in zip(max_shape, tensor.shape)]
+            padding = [
+                (0, max_dim - curr_dim)
+                for max_dim, curr_dim in zip(max_shape, tensor.shape)
+            ]
             padded_tensor = torch.nn.functional.pad(tensor, padding)
             padded_tensors.append(padded_tensor)
 
@@ -1809,13 +1923,13 @@ def get_data_loader(_training_data, _validation_data, _training_params):
 
         # 2. 创建训练级 DataLoader
         _dataloader = DataLoader(
-            _data,                              # DpLoaderSet 实例
-            sampler=_sampler,                   # 采样器
-            batch_size=None,                   # 单系统批处理
+            _data,  # DpLoaderSet 实例
+            sampler=_sampler,  # 采样器
+            batch_size=None,  # 单系统批处理
             num_workers=NUM_WORKERS if dist.is_available() else 0,
-            drop_last=False,                   # 不丢弃最后一个不完整批次
-            collate_fn=lambda batch: batch,     # 防止额外转换
-            pin_memory=True,                    # 锁页内存优化
+            drop_last=False,  # 不丢弃最后一个不完整批次
+            collate_fn=lambda batch: batch,  # 防止额外转换
+            pin_memory=True,  # 锁页内存优化
         )
 
         # 3. 创建无限循环迭代器
@@ -1831,7 +1945,12 @@ def get_data_loader(_training_data, _validation_data, _training_params):
         _validation_data, _training_params["validation_data"]
     )
 
-    return training_dataloader, training_data_iter, validation_dataloader, validation_data_iter
+    return (
+        training_dataloader,
+        training_data_iter,
+        validation_dataloader,
+        validation_data_iter,
+    )
 ```
 
 #### 4.5.2 采样器配置
@@ -1856,9 +1975,7 @@ def get_sampler_from_params(_data, _params):
     # 2. 创建采样器
     if prob is not None:
         sampler = WeightedRandomSampler(
-            weights=prob,
-            num_samples=len(prob),
-            replacement=True
+            weights=prob, num_samples=len(prob), replacement=True
         )
     else:
         sampler = None
@@ -1929,9 +2046,7 @@ def get_data(self, is_train=True, task_key="Default"):
     for key in batch_data.keys():
         if key not in ["sid", "fid", "box", "find_*"]:
             # 移动到目标设备
-            batch_data[key] = batch_data[key].to(
-                env.DEVICE, non_blocking=True
-            )
+            batch_data[key] = batch_data[key].to(env.DEVICE, non_blocking=True)
 
     # 4. 分离输入和标签
     input_dict, label_dict, log_dict = self._separate_inputs_labels(batch_data)
@@ -1979,28 +2094,21 @@ def step(self, task_key="Default", **kwargs):
     """执行单个训练步骤"""
 
     # 1. 获取数据
-    input_dict, label_dict, log_dict = self.get_data(
-        is_train=True, task_key=task_key
-    )
+    input_dict, label_dict, log_dict = self.get_data(is_train=True, task_key=task_key)
 
     # 2. 前向传播
     with torch.cuda.amp.autocast(enabled=self.mixed_precision):
         model_pred, loss, more_loss = self.wrapper(
-            **input_dict,
-            cur_lr=self.get_cur_lr(),
-            label=label_dict,
-            task_key=task_key
+            **input_dict, cur_lr=self.get_cur_lr(), label=label_dict, task_key=task_key
         )
 
-    # 3. 反向传播
+        # 3. 反向传播
         self.optimizer.zero_grad()
         loss.backward()
 
         # 4. 梯度裁剪
         if self.grad_clip > 0:
-            torch.nn.utils.clip_grad_norm_(
-                self.wrapper.parameters(), self.grad_clip
-            )
+            torch.nn.utils.clip_grad_norm_(self.wrapper.parameters(), self.grad_clip)
 
         # 5. 参数更新
         self.optimizer.step()
@@ -2029,7 +2137,9 @@ NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus)))
 
 # 多进程方法检查
 if multiprocessing.get_start_method() != "fork":
-    log.warning("NUM_WORKERS > 0 is not supported with spawn or forkserver start method. Setting NUM_WORKERS to 0.")
+    log.warning(
+        "NUM_WORKERS > 0 is not supported with spawn or forkserver start method. Setting NUM_WORKERS to 0."
+    )
     NUM_WORKERS = 0
 ```
 
@@ -2135,10 +2245,10 @@ evaluator = DeepEval("dpa3_model.pt", output_def)
 
 # 执行推理
 result = evaluator.eval(
-    coords=coordinates,      # [nframes x natoms x 3]
-    cells=cell_parameters,    # [nframes x 9] (可选)
-    atom_types=atom_types,    # [natoms] 或 [nframes x natoms]
-    atomic=False             # 是否计算原子级贡献
+    coords=coordinates,  # [nframes x natoms x 3]
+    cells=cell_parameters,  # [nframes x 9] (可选)
+    atom_types=atom_types,  # [natoms] 或 [nframes x natoms]
+    atomic=False,  # 是否计算原子级贡献
 )
 ```
 
@@ -2163,11 +2273,15 @@ dp freeze -m dpa3_model.pt -o frozen_model.pth
 **文件位置**: `deepmd/pt/infer/deep_eval.py:96-161`
 
 ```python
-def __init__(self, model_file: str, output_def: ModelOutputDef,
-             auto_batch_size: Union[bool, int, AutoBatchSize] = True,
-             neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None,
-             head: Optional[Union[str, int]] = None,
-             no_jit: bool = False):
+def __init__(
+    self,
+    model_file: str,
+    output_def: ModelOutputDef,
+    auto_batch_size: Union[bool, int, AutoBatchSize] = True,
+    neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None,
+    head: Optional[Union[str, int]] = None,
+    no_jit: bool = False,
+):
 
     # 1. 加载模型检查点
     state_dict = torch.load(model_file, map_location=env.DEVICE, weights_only=True)
@@ -2210,13 +2324,23 @@ evaluator = DeepEval("multi_task_model.pt", output_def, head="task_name")
 ```python
 def _eval_model(self, coords, cells, atom_types, fparam, aparam, request_defs):
     # 1. 数据预处理
-    coord_input = torch.tensor(coords.reshape([nframes, natoms, 3]),
-                               dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
+    coord_input = torch.tensor(
+        coords.reshape([nframes, natoms, 3]),
+        dtype=GLOBAL_PT_FLOAT_PRECISION,
+        device=DEVICE,
+    )
     type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE)
 
     # 2. 可选参数处理
-    box_input = torch.tensor(cells.reshape([nframes, 3, 3]),
-                             dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) if cells is not None else None
+    box_input = (
+        torch.tensor(
+            cells.reshape([nframes, 3, 3]),
+            dtype=GLOBAL_PT_FLOAT_PRECISION,
+            device=DEVICE,
+        )
+        if cells is not None
+        else None
+    )
 
     # 3. 执行模型推理
     batch_output = model(
@@ -2225,7 +2349,7 @@ def _eval_model(self, coords, cells, atom_types, fparam, aparam, request_defs):
         box=box_input,
         do_atomic_virial=do_atomic_virial,
         fparam=fparam_input,
-        aparam=aparam_input
+        aparam=aparam_input,
     )
 
     # 4. 后处理和返回结果
@@ -2250,8 +2374,12 @@ def _eval_model(self, coords, cells, atom_types, fparam, aparam, request_defs):
 ```python
 def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable:
     if self.auto_batch_size is not None:
+
         def eval_func(*args, **kwargs):
-            return self.auto_batch_size.execute_all(inner_func, numb_test, natoms, *args, **kwargs)
+            return self.auto_batch_size.execute_all(
+                inner_func, numb_test, natoms, *args, **kwargs
+            )
+
     else:
         eval_func = inner_func
     return eval_func
@@ -2302,7 +2430,6 @@ model = model.to(device)
 **支持的模型格式**:
 
 1. **.pt 文件**: PyTorch 标准检查点格式
-
    - 包含完整的模型权重和配置信息
    - 支持多任务模型和元数据
 
@@ -2382,9 +2509,9 @@ def eval_fitting_last_layer(self, coords, cells, atom_types, fparam=None, aparam
 def get_model_size(self) -> dict:
     """获取模型参数统计"""
     return {
-        "descriptor": sum_param_des,      # 描述符参数数量
-        "fitting-net": sum_param_fit,     # 拟合网络参数数量
-        "total": sum_param_des + sum_param_fit  # 总参数数量
+        "descriptor": sum_param_des,  # 描述符参数数量
+        "fitting-net": sum_param_fit,  # 拟合网络参数数量
+        "total": sum_param_des + sum_param_fit,  # 总参数数量
     }
 ```
 
@@ -2424,12 +2551,10 @@ def get_model_size(self) -> dict:
 
 ```python
 # 内存敏感环境
-evaluator = DeepEval("model.pt", output_def,
-                    auto_batch_size=False)  # 禁用自动批处理
+evaluator = DeepEval("model.pt", output_def, auto_batch_size=False)  # 禁用自动批处理
 
 # 性能优化配置
-evaluator = DeepEval("model.pt", output_def,
-                    auto_batch_size=1024)  # 设置固定批处理大小
+evaluator = DeepEval("model.pt", output_def, auto_batch_size=1024)  # 设置固定批处理大小
 ```
 
 #### 5.9.3 错误处理和调试
diff --git a/doc/outisli/compress.md b/doc/outisli/compress.md
index 65475f6bc5..187616c8c4 100644
--- a/doc/outisli/compress.md
+++ b/doc/outisli/compress.md
@@ -19,19 +19,22 @@ DeePMD-kit 的 compress 功能通过将 embedding networks 进行 tabulation（
 ### 1. 命令行入口
 
 #### 主入口
+
 - **文件位置**: `deepmd/main.py`
 - **命令示例**: `dp --pt compress -i model.pth -o compressed_model.pth`
 
 #### 参数配置
+
 ```python
-parser_compress.add_argument("-s", "--step", default=0.01, type=float)      # stride0
-parser_compress.add_argument("-e", "--extrapolate", default=5, type=int)    # 外推倍数
-parser_compress.add_argument("-f", "--frequency", default=-1, type=int)     # 溢出检查频率
-parser_compress.add_argument("-t", "--training-script", type=str)           # 训练脚本
+parser_compress.add_argument("-s", "--step", default=0.01, type=float)  # stride0
+parser_compress.add_argument("-e", "--extrapolate", default=5, type=int)  # 外推倍数
+parser_compress.add_argument("-f", "--frequency", default=-1, type=int)  # 溢出检查频率
+parser_compress.add_argument("-t", "--training-script", type=str)  # 训练脚本
 ```
 
 #### 命令分发
-```python
+
+```text
 # deepmd/main.py:1013-1018
 elif args.command in ("compress", "train", "freeze", ...):
     deepmd_main = BACKENDS[args.backend]().entry_point_hook
@@ -40,9 +43,10 @@ elif args.command in ("compress", "train", "freeze", ...):
 ### 2. PyTorch 后端处理
 
 #### 入口函数
+
 **文件位置**: `deepmd/pt/entrypoints/main.py:574-582`
 
-```python
+```text
 elif FLAGS.command == "compress":
     FLAGS.input = str(Path(FLAGS.input).with_suffix(".pth"))
     FLAGS.output = str(Path(FLAGS.output).with_suffix(".pth"))
@@ -57,24 +61,32 @@ elif FLAGS.command == "compress":
 ```
 
 #### 核心压缩函数
+
 **文件位置**: `deepmd/pt/entrypoints/compress.py:32-84`
 
 ## 详细执行流程
 
-### 步骤1：模型加载
+### 步骤 1：模型加载
 
 ```python
-def enable_compression(input_file, output, stride=0.01, extrapolate=5, check_frequency=-1, training_script=None):
+def enable_compression(
+    input_file,
+    output,
+    stride=0.01,
+    extrapolate=5,
+    check_frequency=-1,
+    training_script=None,
+):
     # 1. 加载JIT模型
     saved_model = torch.jit.load(input_file, map_location="cpu")
     model_def_script = json.loads(saved_model.model_def_script)
-    
+
     # 2. 重建模型实例
     model = get_model(model_def_script)
     model.load_state_dict(saved_model.state_dict())
 ```
 
-### 步骤2：最小邻居距离计算
+### 步骤 2：最小邻居距离计算
 
 ```python
 # 3. 计算最小邻居距离
@@ -83,34 +95,51 @@ if model.get_min_nbor_dist() is None:
     jdata = j_loader(training_script)
     jdata = update_deepmd_input(jdata)
     train_data = get_data(jdata["training"]["training_data"], 0, type_map, None)
-    
+
     update_sel = UpdateSel()
     t_min_nbor_dist = update_sel.get_min_nbor_dist(train_data)
-    model.min_nbor_dist = torch.tensor(t_min_nbor_dist, dtype=env.GLOBAL_PT_FLOAT_PRECISION)
+    model.min_nbor_dist = torch.tensor(
+        t_min_nbor_dist, dtype=env.GLOBAL_PT_FLOAT_PRECISION
+    )
 ```
 
-### 步骤3：模型压缩启用
+### 步骤 3：模型压缩启用
 
 #### 3.1 模型层压缩
+
 **文件位置**: `deepmd/pt/model/model/make_model.py:103-129`
 
 ```python
-def enable_compression(self, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+def enable_compression(
+    self,
+    table_extrapolate=5,
+    table_stride_1=0.01,
+    table_stride_2=0.1,
+    check_frequency=-1,
+):
     """模型层压缩入口"""
     self.atomic_model.enable_compression(
         self.get_min_nbor_dist(),  # 最小邻居距离
         table_extrapolate,
         table_stride_1,
-        table_stride_2, 
+        table_stride_2,
         check_frequency,
     )
 ```
 
 #### 3.2 原子模型压缩
+
 **文件位置**: `deepmd/pt/model/atomic_model/dp_atomic_model.py:188-217`
 
 ```python
-def enable_compression(self, min_nbor_dist, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+def enable_compression(
+    self,
+    min_nbor_dist,
+    table_extrapolate=5,
+    table_stride_1=0.01,
+    table_stride_2=0.1,
+    check_frequency=-1,
+):
     """原子模型层压缩入口"""
     self.descriptor.enable_compression(
         min_nbor_dist,
@@ -121,41 +150,59 @@ def enable_compression(self, min_nbor_dist, table_extrapolate=5, table_stride_1=
     )
 ```
 
-### 步骤4：描述符层压缩实现
+### 步骤 4：描述符层压缩实现
 
 #### 4.1 SE_A 描述符压缩
+
 **文件位置**: `deepmd/pt/model/descriptor/se_a.py:257-302`
 
 ```python
-def enable_compression(self, min_nbor_dist, table_extrapolate=5, table_stride_1=0.01, table_stride_2=0.1, check_frequency=-1):
+def enable_compression(
+    self,
+    min_nbor_dist,
+    table_extrapolate=5,
+    table_stride_1=0.01,
+    table_stride_2=0.1,
+    check_frequency=-1,
+):
     # 1. 检查是否已压缩
     if self.compress:
         raise ValueError("Compression is already enabled.")
-    
+
     # 2. 创建查表器
     data = self.serialize()
     self.table = DPTabulate(
-        self,                                    # 描述符对象
-        data["neuron"],                          # 神经网络结构
-        data["type_one_side"],                   # 单侧类型
-        data["exclude_types"],                   # 排除类型对
-        ActivationFn(data["activation_function"]) # 激活函数
+        self,  # 描述符对象
+        data["neuron"],  # 神经网络结构
+        data["type_one_side"],  # 单侧类型
+        data["exclude_types"],  # 排除类型对
+        ActivationFn(data["activation_function"]),  # 激活函数
     )
-    
+
     # 3. 存储查表配置
-    self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency]
-    
+    self.table_config = [
+        table_extrapolate,
+        table_stride_1,
+        table_stride_2,
+        check_frequency,
+    ]
+
     # 4. 构建查表数据
-    self.lower, self.upper = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2)
-    
+    self.lower, self.upper = self.table.build(
+        min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2
+    )
+
     # 5. 启用嵌入层压缩
-    self.sea.enable_compression(self.table.data, self.table_config, self.lower, self.upper)
-    
+    self.sea.enable_compression(
+        self.table.data, self.table_config, self.lower, self.upper
+    )
+
     # 6. 设置压缩标志
     self.compress = True
 ```
 
 #### 4.2 DescrptSeA 压缩数据设置
+
 **文件位置**: `deepmd/pt/model/descriptor/se_a.py:699-733`
 
 ```python
@@ -166,84 +213,113 @@ def enable_compression(self, table_data, table_config, lower, upper):
             net = f"filter_-1_net_{embedding_idx}"
         else:
             ii = embedding_idx // self.ntypes  # 中心原子类型
-            ti = embedding_idx % self.ntypes   # 邻居原子类型  
+            ti = embedding_idx % self.ntypes  # 邻居原子类型
             net = f"filter_{ii}_net_{ti}"
-            
+
         # 压缩信息：[lower, upper, upper*extrapolate, stride1, stride2, check_freq]
-        info_ii = torch.as_tensor([
-            lower[net], upper[net], upper[net] * table_config[0],
-            table_config[1], table_config[2], table_config[3]
-        ], dtype=self.prec, device="cpu")
-        
+        info_ii = torch.as_tensor(
+            [
+                lower[net],
+                upper[net],
+                upper[net] * table_config[0],
+                table_config[1],
+                table_config[2],
+                table_config[3],
+            ],
+            dtype=self.prec,
+            device="cpu",
+        )
+
         # 压缩数据：多项式系数表
         tensor_data_ii = table_data[net].to(device=env.DEVICE, dtype=self.prec)
-        
+
         self.compress_data[embedding_idx] = tensor_data_ii
         self.compress_info[embedding_idx] = info_ii
-    
+
     self.compress = True
 ```
 
-### 步骤5：查表器实现
+### 步骤 5：查表器实现
 
 #### 5.1 查表器类
+
 **文件位置**: `deepmd/pt/utils/tabulate.py:52-118`
 
 ```python
 class DPTabulate(BaseTabulate):
-    def __init__(self, descrpt, neuron, type_one_side=False, exclude_types=[], activation_fn=ActivationFn("tanh")):
+    def __init__(
+        self,
+        descrpt,
+        neuron,
+        type_one_side=False,
+        exclude_types=[],
+        activation_fn=ActivationFn("tanh"),
+    ):
         # 1. 基础初始化
         super().__init__(descrpt, neuron, type_one_side, exclude_types, True)
-        
+
         # 2. 描述符类型判断
         self.descrpt_type = self._get_descrpt_type()  # "A", "Atten", "T", "R"
-        
+
         # 3. 获取描述符参数
         self.sel_a = self.descrpt.get_sel()
         self.rcut = self.descrpt.get_rcut()
         self.rcut_smth = self.descrpt.get_rcut_smth()
-        
+
         # 4. 激活函数映射
-        activation_map = {"tanh": 1, "gelu": 2, "relu": 3, "relu6": 4, "softplus": 5, "sigmoid": 6}
+        activation_map = {
+            "tanh": 1,
+            "gelu": 2,
+            "relu": 3,
+            "relu6": 4,
+            "softplus": 5,
+            "sigmoid": 6,
+        }
         self.functype = activation_map[activation_fn.activation]
-        
+
         # 5. 获取统计参数
         serialized = self.descrpt.serialize()
         self.davg = serialized["@variables"]["davg"]  # 均值
         self.dstd = serialized["@variables"]["dstd"]  # 标准差
         self.embedding_net_nodes = serialized["embeddings"]["networks"]
-        
+
         # 6. 提取权重和偏置
         self.bias = self._get_bias()
         self.matrix = self._get_matrix()
 ```
 
 #### 5.2 查表构建过程
+
 **文件位置**: `deepmd/utils/tabulate.py:70-243`
 
 ```python
 def build(self, min_nbor_dist, extrapolate, stride0, stride1):
     # 1. 计算环境矩阵范围
     lower, upper = self._get_env_mat_range(min_nbor_dist)
-    
+
     # 2. 根据描述符类型建表
     if self.descrpt_type == "A":  # SE_A 描述符
         for ii in range(self.table_size):
             if self._should_build_table(ii):
                 # 构建距离网格
-                xx = self._build_distance_grid(lower, upper, stride0, stride1, extrapolate, ii)
-                
+                xx = self._build_distance_grid(
+                    lower, upper, stride0, stride1, extrapolate, ii
+                )
+
                 # 查表数据
-                self._generate_spline_table(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
-    
+                self._generate_spline_table(
+                    net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline
+                )
+
     # 3. 后处理转换
     self._convert_numpy_to_tensor()
     self._convert_numpy_float_to_int()
-    
+
     return self.lower, self.upper
 ```
 
 #### 5.3 环境矩阵范围计算
+
 **文件位置**: `deepmd/utils/tabulate.py:445-463`
 
 ```python
@@ -251,44 +327,50 @@ def _get_env_mat_range(self, min_nbor_dist):
     """计算环境矩阵的范围"""
     # 1. 计算切换函数值
     sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut)
-    
+
     # 2. 根据描述符类型计算范围
     if self.descrpt_type in ("Atten", "A"):
         # 标准化：(r_ij - davg) / dstd
         lower = -self.davg[:, 0] / self.dstd[:, 0]
         upper = ((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0]
-    
+
     # 3. 向下和向上取整
     return np.floor(lower), np.ceil(upper)
 ```
 
 #### 5.4 多项式系数计算
+
 **文件位置**: `deepmd/utils/tabulate.py:245-347`
 
 ```python
-def _generate_spline_table(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline):
+def _generate_spline_table(
+    self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline
+):
     # 1. 通过神经网络前向传播计算数据
     vv, dd, d2 = self._make_data(xx, idx)  # 值、一阶导数、二阶导数
-    
+
     # 2. 多项式系数表
     self.data[net] = np.zeros([nspline, 6 * self.last_layer_size], dtype=self.data_type)
-    
+
     # 3. 步长处理
     tt = np.full((nspline, self.last_layer_size), stride1)
     tt[: int((upper - lower) / stride0), :] = stride0
-    
+
     # 4. 计算多项式高阶系数
-    hh = vv[1:nspline + 1, :self.last_layer_size] - vv[:nspline, :self.last_layer_size]
-    
+    hh = (
+        vv[1 : nspline + 1, : self.last_layer_size]
+        - vv[:nspline, : self.last_layer_size]
+    )
+
     # 系数0：函数值 f(x)
-    self.data[net][:, ::6] = vv[:nspline, :self.last_layer_size]
-    
+    self.data[net][:, ::6] = vv[:nspline, : self.last_layer_size]
+
     # 系数1：一阶导数 f'(x)
-    self.data[net][:, 1::6] = dd[:nspline, :self.last_layer_size]
-    
+    self.data[net][:, 1::6] = dd[:nspline, : self.last_layer_size]
+
     # 系数2：二阶导数 f''(x)/2
-    self.data[net][:, 2::6] = 0.5 * d2[:nspline, :self.last_layer_size]
-    
+    self.data[net][:, 2::6] = 0.5 * d2[:nspline, : self.last_layer_size]
+
     # 系数3-5：高阶多项式系数（保证连续性）
     self.data[net][:, 3::6] = (1 / (2 * tt**3)) * (20 * hh - ...)
     self.data[net][:, 4::6] = (1 / (2 * tt**4)) * (-30 * hh + ...)
@@ -296,36 +378,37 @@ def _generate_spline_table(self, net, xx, idx, upper, lower, stride0, stride1, e
 ```
 
 #### 5.5 神经网络前向传播
+
 **文件位置**: `deepmd/pt/utils/tabulate.py:119-250`
 
-```python
+```text
 def _make_data(self, xx, idx):
     """通过神经网络前向传播查表数据"""
     xx = torch.from_numpy(xx).view(-1, 1).to(env.DEVICE)
-    
+
     # 逐层计算
     for layer in range(self.layer_size):
         if layer == 0:
             # 第一层：线性变换 + 激活函数
             xbar = torch.matmul(xx, torch.from_numpy(self.matrix[f"layer_{layer + 1}"][idx])) + \
                    torch.from_numpy(self.bias[f"layer_{layer + 1}"][idx])
-            
+
             # 处理激活函数（含残差连接）
             if self.neuron[0] == 1:
                 yy = self._layer_0(...) + xx  # 残差连接
             else:
                 yy = self._layer_0(...)
-            
+
             # 计算一阶和二阶导数
             dy = unaggregated_dy_dx_s(...)
             dy2 = unaggregated_dy2_dx_s(...)
         else:
             # 后续层...
-    
+
     return vv.cpu().numpy(), dd.cpu().numpy(), d2.cpu().numpy()
 ```
 
-### 步骤6：模型保存
+### 步骤 6：模型保存
 
 ```python
 # 4. 启用压缩
@@ -377,7 +460,10 @@ torch.jit.save(model, output)
    - **压缩方式**: 不支持
    - **原因**: ```python
      def enable_compression(self, ...):
-         raise NotImplementedError("Compression is unsupported for DPA3.")
+         raise NotImplementedError("Compression is unsupported for DPA3.")
+
+     ```
+
      ```
 
 ### 特殊模型类型
@@ -395,19 +481,23 @@ torch.jit.save(model, output)
 ### 压缩数据格式
 
 #### 1. 压缩信息 (compress_info)
+
 ```python
 # 每个嵌入网络存储6个参数 [6]
-compress_info[embedding_idx] = torch.tensor([
-    lower[net],           # 下界
-    upper[net],           # 上界  
-    upper[net] * extrapolate,  # 外推上界
-    table_stride_1,       # 第一段步长
-    table_stride_2,       # 第二段步长  
-    check_frequency       # 溢出检查频率
-])
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # 下界
+        upper[net],  # 上界
+        upper[net] * extrapolate,  # 外推上界
+        table_stride_1,  # 第一段步长
+        table_stride_2,  # 第二段步长
+        check_frequency,  # 溢出检查频率
+    ]
+)
 ```
 
 #### 2. 压缩数据 (compress_data)
+
 ```python
 # 每个嵌入网络存储系数表 [nspline, 6 * last_layer_size]
 compress_data[embedding_idx] = table_data[net]
@@ -419,11 +509,12 @@ compress_data[embedding_idx] = table_data[net]
 ### 查表数据构建
 
 #### 1. 距离网格生成
+
 ```python
 # 第一段：精细数据区间网格
 xx1 = np.arange(lower, upper, stride0)
 
-# 第二段：外推区间网格  
+# 第二段：外推区间网格
 xx2 = np.arange(upper, extrapolate * upper, stride1)
 
 # 合并网格
@@ -431,40 +522,47 @@ xx = np.concatenate([xx1, xx2, [extrapolate * upper]])
 ```
 
 #### 2. 神经网络求值
+
 ```python
 # 对每个网格点进行神经网络前向传播
 for x_point in xx:
-    output = forward_pass(x_point)      # 网络输出
-    grad1 = compute_gradient(x_point)   # 一阶导数
-    grad2 = compute_hessian(x_point)    # 二阶导数
+    output = forward_pass(x_point)  # 网络输出
+    grad1 = compute_gradient(x_point)  # 一阶导数
+    grad2 = compute_hessian(x_point)  # 二阶导数
 ```
 
 #### 3. 多项式构造
-采用五次Hermite插值，满足：
+
+采用五次 Hermite 插值，满足：
+
 - 函数值连续：f(x_i) = y_i
-- 一阶导数连续：f'(x_i) = y'_i  
-- 二阶导数连续：f''(x_i) = y''_i
+- 一阶导数连续：f'(x_i) = y'\_i
+- 二阶导数连续：f''(x_i) = y''\_i
 
 ## 性能优化
 
 ### 1. 内存管理
+
 - **数据精度**: 支持数据精度调整（0.01）
 - **分段优化**: 粗糙步长在外推区（0.1）
 - **内存复用**: 删除原始网络权重，内存显著降低
 
 ### 2. 计算优化
+
 - **预计算查表**: 压缩后嵌入网络不再需要矩阵运算
 - **向量化查表**: 每个原子类型对应一个优化的查表
 - **分支消除**: 消除类型判断的分支开销
 
 ### 3. 缓存友好
-- **数据局部性**: 查表数据连续存储，提升cache命中率
-- **内存访问**: 内存访问模式优化，减少cache miss
+
+- **数据局部性**: 查表数据连续存储，提升 cache 命中率
+- **内存访问**: 内存访问模式优化，减少 cache miss
 - **SIMD**: 多项式计算可向量化
 
 ## 使用示例
 
 ### 基础压缩命令
+
 ```bash
 # 压缩PyTorch模型
 dp --pt compress -i frozen_model.pth -o compressed_model.pth
@@ -480,34 +578,39 @@ dp --pt compress \
 ```
 
 ### 参数说明
+
 - `-i, --input`: 输入的冻结模型（.pth）
 - `-o, --output`: 输出的压缩模型（.pth）
-- `-s, --step`: 第一段步长，影响精度与内存（默认0.01）
-- `-e, --extrapolate`: 外推倍数（默认5）
-- `-f, --frequency`: 溢出检查频率，-1表示不检查（默认-1）
+- `-s, --step`: 第一段步长，影响精度与内存（默认 0.01）
+- `-e, --extrapolate`: 外推倍数（默认 5）
+- `-f, --frequency`: 溢出检查频率，-1 表示不检查（默认 -1）
 - `-t, --training-script`: 训练脚本（用于计算最小邻居距离）
 
 ## 局限性分析
 
 ### 1. 描述符局限
+
 - DPA3 描述符不支持压缩
 - Pairtab 模型不支持查表压缩
 - 某些描述符变体可能不完全兼容
 
 ### 2. 精度权衡
+
 - 步长设置过大会影响精度
 - 外推区间精度相对较低
 - 激活函数近似可能带来误差
 
 ### 3. 内存开销
+
 - 压缩后仍需存储多项式查表数据
 - 精度要求高时查表尺寸增大
 - 激活函数导数计算消耗额外内存
 
 ### 4. 兼容性限制
-- 压缩后的模型仅适用于DeePMD-kit环境
-- JIT脚本化可能在某些场景下受限
-- LAMMPS等MD引擎需要特定的压缩模型格式
+
+- 压缩后的模型仅适用于 DeePMD-kit 环境
+- JIT 脚本化可能在某些场景下受限
+- LAMMPS 等 MD 引擎需要特定的压缩模型格式
 
 ## 实现细节
 
@@ -520,6 +623,7 @@ f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
 ```
 
 其中：
+
 - `t = (x - x_i) / h`，h 为步长
 - `c₀ = f(x_i)`
 - `c₁ = f'(x_i) × h`
@@ -530,17 +634,17 @@ f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
 
 用于平滑处理截断半径的切换函数：
 
-```python
+```text
 def spline5_switch(r, r_min, r_max):
     if r < r_min:
         return 1.0
     elif r < r_max:
         u = (r - r_min) / (r_max - r_min)
-        return u³(-6u² + 15u - 10) + 1
+        return u**3 * (-6 * u**2 + 15 * u - 10) + 1
     else:
         return 0.0
 ```
 
 ## 总结
 
-DeePMD-kit的compress功能通过将神经网络嵌入层用查表法和多项式插值替代，实现了显著的推理加速。PyTorch后端的实现采用了分层设计，由模型层、原子模型层、描述符层逐级传递压缩请求。查表器构建了精细和粗糙分段的插值表，平衡了精度与性能。该功能对大多数SE类和DPA1/DPA2描述符提供良好支持，是生产环境中提升MD模拟效率的重要工具。
\ No newline at end of file
+DeePMD-kit 的 compress 功能通过将神经网络嵌入层用查表法和多项式插值替代，实现了显著的推理加速。PyTorch 后端的实现采用了分层设计，由模型层、原子模型层、描述符层逐级传递压缩请求。查表器构建了精细和粗糙分段的插值表，平衡了精度与性能。该功能对大多数 SE 类和 DPA1/DPA2 描述符提供良好支持，是生产环境中提升 MD 模拟效率的重要工具。
diff --git a/doc/outisli/install.md b/doc/outisli/install.md
index 20869e5446..91d526a3b7 100644
--- a/doc/outisli/install.md
+++ b/doc/outisli/install.md
@@ -1,8 +1,8 @@
-鉴于大家可能觉得从源码安装`DeepMD-kit`门槛较高，而极少使用。然而从源码安装的灵活性最高，为进一步推广，并减少可能的坑，笔者在此根据自己的安装流程结合官方文档给出一个适用性较广的安装教程，各位可自行尝试。
+鉴于大家可能觉得从源码安装`DeePMD-kit`门槛较高，而极少使用。然而从源码安装的灵活性最高，为进一步推广，并减少可能的坑，笔者在此根据自己的安装流程结合官方文档给出一个适用性较广的安装教程，各位可自行尝试。
 
 本教程适用于 Linux(with NVIDIA GPU) 及 Mac(with Apple Silicon)
 
-Since some users may find installing `DeepMD-kit` from source to be challenging and rarely attempt it, this guide aims to make the process more accessible. Installing from source offers the highest flexibility. To promote this method and reduce potential pitfalls, I have compiled a broadly applicable installation tutorial based on my own experience and the official documentation. You are encouraged to try it out.
+Since some users may find installing `DeePMD-kit` from source to be challenging and rarely attempt it, this guide aims to make the process more accessible. Installing from source offers the highest flexibility. To promote this method and reduce potential pitfalls, I have compiled a broadly applicable installation tutorial based on my own experience and the official documentation. You are encouraged to try it out.
 
 This tutorial is applicable to Linux (with NVIDIA GPU) and Mac (with Apple Silicon).
 
@@ -23,6 +23,7 @@ This tutorial is applicable to Linux (with NVIDIA GPU) and Mac (with Apple Silic
 > 5. This tutorial assumes some basic knowledge of computer (Linux) operations. If you encounter any issues, feel free to comment or ask AI for help.
 
 # 0. Preparation (Optional)
+
 ## 0.1 CUDA Toolkit
 
 ```shell
@@ -155,7 +156,7 @@ make lammps && rm -rf $software/lammps
 # Or conda install
 mamba install jpeg libpng zlib -c conda-forge -y
 
-# 2. Download Lammps
+# 2. Download LAMMPS
 cd $software && mkdir -p lammps && cd lammps && wget https://gh-proxy.com/github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz && tar xzf stable_22Jul2025.tar.gz && cd lammps-stable_22Jul2025 && mkdir -p build && cd build
 # wget https://github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz
 

From 429f638faa90d72975b22f7c88436b34d5d2a383 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 9 Oct 2025 14:23:13 +0800
Subject: [PATCH 08/11] feat: enable generation of compile_commands.json for
 cmake

---
 source/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index d356d4cba6..6aa8469c4a 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -2,6 +2,9 @@
 cmake_minimum_required(VERSION 3.25.2)
 project(DeePMD)
 
+# generate compile_commands.json
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 option(ENABLE_TENSORFLOW "Enable TensorFlow interface" OFF)
 option(ENABLE_PYTORCH "Enable PyTorch interface" OFF)
 option(ENABLE_JAX "Enable JAX interface" OFF)

From 5da2747f414010516bf13688ff7ed95ff8fdfbd4 Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Mon, 13 Oct 2025 11:18:55 +0800
Subject: [PATCH 09/11] fix: use monotonic time for debug scripts

feat: add multiple runs for infer scripts

feat: add pytorch profiler for infer debug

ignore profiler files in .gitignore
---
 .gitignore               |   6 +
 .pre-commit-config.yaml  |  26 +-
 AGENTS.md                | 496 +++++++++++++++++++--------
 CLAUDE.md                | 702 ++++++++++++++++++---------------------
 README.md                |   2 +
 debug/compress_debug.py  |  41 ++-
 debug/dptest_debug.py    |  66 +++-
 debug/inference_debug.py | 199 +++++++++--
 debug/train_debug.py     |  37 ++-
 doc/outisli/install.md   | 175 ++++++++--
 10 files changed, 1126 insertions(+), 624 deletions(-)

diff --git a/.gitignore b/.gitignore
index eef8d03a90..521f387697 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,8 @@ buildcxx/
 node_modules/
 *.bib.original
 .claude
+.spec-workflow
+.serena
 
 # Coverage files
 .coverage
@@ -76,3 +78,7 @@ system/
 # clangd
 compile_commands.json
 source/.cache
+
+# PyTorch profiler
+*.tfevents.*
+*.pt.trace.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6ec5c0e8a1..c813077783 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
+  - repo: https://gh-proxy.com/github.com/pre-commit/pre-commit-hooks
     rev: v6.0.0
     hooks:
       - id: trailing-whitespace
@@ -21,13 +21,13 @@ repos:
       - id: check-symlinks
       - id: check-toml
   # Python
-  - repo: https://github.com/PyCQA/isort
+  - repo: https://gh-proxy.com/github.com/PyCQA/isort
     rev: 7.0.0
     hooks:
       - id: isort
         files: \.py$
         exclude: ^source/3rdparty
-  - repo: https://github.com/astral-sh/ruff-pre-commit
+  - repo: https://gh-proxy.com/github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.14.11
     hooks:
@@ -38,7 +38,7 @@ repos:
       - id: ruff-format
         exclude: ^source/3rdparty
         types_or: [python, pyi, jupyter]
-  - repo: https://github.com/pycqa/flake8
+  - repo: https://gh-proxy.com/github.com/pycqa/flake8
     # flake8 cannot autofix
     rev: "7.3.0"
     hooks:
@@ -47,25 +47,25 @@ repos:
           - torchfix==0.6.0
           - flake8-pyproject==1.2.3
   # numpydoc
-  - repo: https://github.com/Carreau/velin
+  - repo: https://gh-proxy.com/github.com/Carreau/velin
     rev: 0.0.12
     hooks:
       - id: velin
         args: ["--write"]
         exclude: ^source/3rdparty
   # Python inside docs
-  - repo: https://github.com/asottile/blacken-docs
+  - repo: https://gh-proxy.com/github.com/asottile/blacken-docs
     rev: 1.20.0
     hooks:
       - id: blacken-docs
   # C++
-  - repo: https://github.com/pre-commit/mirrors-clang-format
+  - repo: https://gh-proxy.com/github.com/pre-commit/mirrors-clang-format
     rev: v21.1.8
     hooks:
       - id: clang-format
         exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$|.+\.json$)
   # markdown, yaml, CSS, javascript
-  - repo: https://github.com/pre-commit/mirrors-prettier
+  - repo: https://gh-proxy.com/github.com/pre-commit/mirrors-prettier
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
@@ -73,17 +73,17 @@ repos:
         # workflow files cannot be modified by pre-commit.ci
         exclude: ^(source/3rdparty|\.github/workflows|\.clang-format)
   # Shell
-  - repo: https://github.com/scop/pre-commit-shfmt
+  - repo: https://gh-proxy.com/github.com/scop/pre-commit-shfmt
     rev: v3.12.0-2
     hooks:
       - id: shfmt
   # CMake
-  - repo: https://github.com/cheshirekow/cmake-format-precommit
+  - repo: https://gh-proxy.com/github.com/cheshirekow/cmake-format-precommit
     rev: v0.6.13
     hooks:
       - id: cmake-format
       #- id: cmake-lint
-  - repo: https://github.com/njzjz/mirrors-bibtex-tidy
+  - repo: https://gh-proxy.com/github.com/njzjz/mirrors-bibtex-tidy
     rev: v1.14.0
     hooks:
       - id: bibtex-tidy
@@ -103,7 +103,7 @@ repos:
           - --remove-empty-fields
           - --wrap=80
   # license header
-  - repo: https://github.com/Lucas-C/pre-commit-hooks
+  - repo: https://gh-proxy.com/github.com/Lucas-C/pre-commit-hooks
     rev: v1.5.5
     hooks:
       # C++, js
@@ -153,7 +153,7 @@ repos:
         # unclear why PairDeepMD is used instead of PairDeePMD
         exclude: .pre-commit-config.yaml|source/lmp
   # customized pylint rules
-  - repo: https://github.com/pylint-dev/pylint/
+  - repo: https://gh-proxy.com/github.com/pylint-dev/pylint/
     rev: v4.0.4
     hooks:
       - id: pylint
diff --git a/AGENTS.md b/AGENTS.md
index c629a08def..9d268607a4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,191 +1,415 @@
-# DeePMD-kit
+# CLAUDE.md
 
-DeePMD-kit is a deep learning package for many-body potential energy representation and molecular dynamics. It supports multiple backends (TensorFlow, PyTorch, JAX, Paddle) and integrates with MD packages like LAMMPS, GROMACS, and i-PI.
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
 
-**Always reference these instructions first and fallback to search or bash commands only when you encounter unexpected information that does not match the info here.**
+## Project Overview
 
-## Working Effectively
+DeePMD-kit is a deep-learning software package for modeling interatomic potentials and running molecular dynamics. It supports four deep learning backends (TensorFlow, PyTorch, JAX, and Paddle) and integrates with multiple MD packages, including LAMMPS, i-PI, AMBER, CP2K, and GROMACS.
 
-### Bootstrap and Build Repository
+## Common Development Commands
 
-- Create virtual environment: `uv venv venv && source venv/bin/activate`
-- Install base dependencies: `uv pip install tensorflow-cpu` (takes ~8 seconds)
-- Install PyTorch: `uv pip install torch --index-url https://download.pytorch.org/whl/cpu` (takes ~5 seconds)
-- Build Python package: `uv pip install -e .[cpu,test]` -- takes 67 seconds. **NEVER CANCEL. Set timeout to 120+ seconds.**
-- Build C++ components: `export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')` then `export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')` then `./source/install/build_cc.sh` -- takes 164 seconds. **NEVER CANCEL. Set timeout to 300+ seconds.**
+Use this python if needed: /home/outisli/miniforge3/envs/dpmd/bin/python
 
-### Test Repository
+### Code Check and Format
 
-- Run single test: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- takes 8-13 seconds
-- Run test subset: `pytest source/tests/tf/test_dp_test.py -v` -- takes 15 seconds. **NEVER CANCEL. Set timeout to 60+ seconds.**
-- **Recommended: Use single test cases for validation instead of full test suite** -- full suite has 314 test files and takes 60+ minutes
+```bash
+ruff check .      # Check code style
+ruff format .     # Format code
+isort .           # Sort imports
+```
+
+### Test Commands
+
+```bash
+# Verify installation
+dp --version
+python -c "import deepmd; import deepmd.tf; print('Interfaces working')"
+
+# IMPORTANT: set OMP_NUM_THREADS, DP_INTER_OP_PARALLELISM_THREADS, and DP_INTRA_OP_PARALLELISM_THREADS to 0 before running tests
+
+# Single test (recommended for development)
+pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v
+
+# Specific test suite
+pytest source/tests/tf/test_dp_test.py -v
+
+# Training test
+cd examples/water/se_e2_a
+dp train input.json --skip-neighbor-stat  # TensorFlow
+dp --pt train input_torch.json --skip-neighbor-stat  # PyTorch
+```
+
+### Model Compression (Reference: doc/outisli/compress.md)
+
+#### Compression Principle
+
+- **Tabulation**: Pre-compute and store embedding network outputs
+- **Piecewise Interpolation**: Use quintic Hermite interpolation for continuity
+- **Performance**: Significantly reduces memory usage and improves inference speed
+
+#### Supported Descriptors
 
-### Lint and Format Code
+- ✅ SE_A, SE_R, SE_T, SE_Atten
+- ✅ DPA1, DPA2
+- ❌ DPA3 (compression not supported)
+
+## Code Architecture and Core Modules
+
+### 1. Deep Learning Model Layer (deepmd/dpmodel/)
 
-- Install linter: `uv pip install ruff`
-- Run linting: `ruff check .` -- takes <1 second
-- Format code: `ruff format .` -- takes <1 second
-- **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.**
+This is the core model definition layer of DeePMD-kit, containing all mathematical abstractions of models:
 
-### Training and Validation
+- **descriptor/**: Descriptor modules (embedding networks, environment information extraction)
+  - `se_a.py`: Embedded Atom Descriptor
+  - `se_r.py`: Simplified embedding descriptor
+  - `se_a_tpe.py`: Descriptor with type embedding
+  - `hybrid.py`: Hybrid descriptor
+- **fitting/**: Fitting network modules
+  - `ener.py`: Energy fitting network
+  - `dipole.py`: Dipole fitting
+  - `polar.py`: Polarizability fitting
+- **model/**: Model definitions
+  - `model.py`: Base model class
+  - `ener_model.py`: Energy model
+  - `dos_model.py`: Density of states model
 
-- Test TensorFlow training: `cd examples/water/se_e2_a && dp train input.json --skip-neighbor-stat` -- training proceeds but is slow on CPU
-- Test PyTorch training: `cd examples/water/se_e2_a && dp --pt train input_torch.json --skip-neighbor-stat` -- training proceeds but is slow on CPU
-- **Training examples are for validation only. Real training takes hours/days. Timeout training tests after 60 seconds for validation.**
+### 2. Backend Implementation Layer
 
-## Validation Scenarios
+Each backend implements the same interface to ensure consistency:
 
-**ALWAYS manually validate any new code through at least one complete scenario:**
+#### TensorFlow Backend (deepmd/tf/)
 
-### Basic Functionality Validation
+- **entrypoints/**: Command line entry points
+  - `main.py`: Main CLI entry
+  - `train.py`: Training script
+  - `freeze.py`: Model freezing
+  - `test.py`: Model testing
+- **network/**: Network definitions
+  - `network.py`: Main network class
+  - `embedding_net.py`: Embedding network
+  - `fitting_net.py`: Fitting network
+- **model/**: Model implementations
+  - `model.py`: Model definition
+  - `model_stat.py`: Model statistics
+- **infer/**: Inference interface
+  - `deep_eval.py`: Deep evaluation
+  - `deep_pot.py`: Deep potential
 
-1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation
-2. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"`
-3. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h`
+#### PyTorch Backend (deepmd/pt/)
 
-### Training Workflow Validation
+Similar structure to TensorFlow backend but with PyTorch-specific optimizations:
 
-1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss
-2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss
-3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values
+- **model/**: PyTorch model implementations
+  - `model.py`: Base model class
+  - `nn.py`: Neural network modules
+- **utils/**: PyTorch utilities
+  - `env_mat.py`: Environment matrix construction
+  - `region.py`: Periodic boundary condition handling
+- **train/**: Training related
+  - `training.py`: Training loop
+  - `optimizer.py`: Optimizer configuration
 
-### Test-Based Validation
+### 3. C++ Core Engine (source/)
 
-1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds
-2. **Multi-backend**: Test both TensorFlow and PyTorch components work
+Core implementation for high-performance computing:
 
-## Common Commands and Timing
+#### Core Library (source/lib/)
 
-### Repository Structure
+- **include/**: Header file definitions
+  - `deepmd.hpp`: Main API declarations
+  - `common.hpp`: Common definitions
+  - `neighbor_list.hpp`: Neighbor list algorithm
+- **src/**: Source code implementation
+  - `deepmd.cpp`: Core C++ implementation
+  - `region.cpp`: Region processing
+  - `neighbor_list.cpp`: High-performance neighbor list
+  - `prod_env_mat_a.cpp`: Environment matrix production
 
+#### Operator Implementation (source/op/)
+
+Framework-specific operators for each deep learning framework:
+
+- **tf/**: TensorFlow custom operators
+  - `prod_env_mat_a.cc`: Environment matrix operator
+  - `prod_force_se_a.cc`: Force calculation operator
+  - `tabulate.cc`: Lookup table operator
+- **torch/**: PyTorch C++ extensions
+  - `prod_env_mat_a.cpp`: PyTorch version of environment matrix operator
+
+### 4. Data Processing Layer (deepmd/utils/)
+
+- `data.py`: Data loading and preprocessing
+- `data_system.py`: Data system management
+- `shuffle.py`: Data shuffling
+- `neighbor_stat.py`: Neighbor statistics
+- `type_embed.py`: Type embedding
+- `args.py`: Argument parsing
+- `path.py`: Path handling
+- `compat.py`: Version compatibility handling
+
+### 5. Input/Output Layer (deepmd/infer/)
+
+- **deep_pot.py**: High-level inference interface
+- **deep_dipole.py**: Dipole inference
+- **deep_dos.py**: Density of states inference
+- **deep_wfc.py**: Wave function inference
+
+## Key Data Flow
+
+1. **Training Flow**:
+
+   ```
+   Atomic coordinates → neighbor_list → env_matrix → descriptor → fitting_net → loss
+   ```
+
+2. **Inference Flow**:
+
+   ```
+   Input structure → Descriptor calculation → Fitting network → Energy/Force/Stress
+   ```
+
+3. **Multi-backend Unified Interface**:
+   - Python layer provides unified API through `deepmd.infer`
+   - C++ layer provides unified interface through `source/api_cc/`
+   - Each backend implements the same model specification
+
+### Select Backend
+
+```bash
+# Command line flags
+dp --pt train input.json
+dp --tf train input.json
+
+# Environment variable
+export DP_BACKEND=pytorch
+dp train input.json
 ```
-ls -la [repo-root]
-.github/               # GitHub workflows and templates
-CONTRIBUTING.md        # Contributing guide
-README.md             # Project overview
-deepmd/               # Python package source
-doc/                  # Documentation
-examples/             # Training examples and configurations
-pyproject.toml        # Python build configuration
-source/               # C++ source code and tests
+
+## Core Algorithms and Data Structures
+
+### 1. Descriptor Implementation
+
+Descriptors are the core innovation of DeePMD-kit, used to convert local atomic environments into vector representations:
+
+#### Embedded Atom Descriptor (SE_A)
+
+- **Location**: `deepmd/dpmodel/descriptor/se_a.py`
+- **Core functions**:
+  - `build()`: Build descriptor network
+  - `call()`: Calculate descriptor values
+- **Mathematical principle**:
+  - Radial basis function expansion: $g(r) = \sum_{i} \exp[-\gamma (r - r_{s,i})^2]$
+  - Angular basis function: Angular dependency through 1D filters
+
+#### Environment Matrix (Env Mat)
+
+- **C++ implementation**: `source/lib/src/prod_env_mat_a.cpp`
+- **Function**: Efficiently calculate environment matrix between atom pairs
+- **Optimization**: Use parallelization and SIMD instructions for acceleration
+
+### 2. Fitting Network
+
+Maps descriptors to physical quantities:
+
+#### Energy Fitting
+
+- **Location**: `deepmd/dpmodel/fitting/ener.py`
+- **Output**: Atomic energy, system total energy obtained by summation
+- **Force calculation**: Through automatic differentiation or analytical gradient
+
+#### Fitting Network Structure
+
+```python
+# Typical fitting network architecture
+FittingNet(
+    layers=[embedding_dim, 240, 240, 240, 1],  # Network layer sizes
+    activation_function="tanh",  # Activation function
+    precision="float64",  # Numerical precision
+)
 ```
 
-### Key Directories and Files
+### 3. Training Strategy
 
-- `deepmd/` - Main Python package with backend implementations
-- `source/lib/` - Core C++ library
-- `source/op/` - Backend-specific operators (TF, PyTorch, etc.)
-- `source/api_cc/` - C++ API
-- `source/api_c/` - C API
-- `source/tests/` - Test suite (314 test files)
-- `examples/water/se_e2_a/` - Basic water training example
-- `examples/` - Various model examples for different scenarios
+#### Loss Function
 
-### Common CLI Commands
+```python
+# Location: deepmd/loss.py or backend implementations
+Loss = lr_e * energy_loss + lr_f * force_loss + lr_v * virial_loss
+```
 
-- `dp --version` - Show version information
-- `dp -h` - Show help and available commands
-- `dp train input.json` - Train a model (TensorFlow backend)
-- `dp --pt train input.json` - Train with PyTorch backend
-- `dp --jax train input.json` - Train with JAX backend
-- `dp --pd train input.json` - Train with Paddle backend
-- `dp test -m model.pb -s system/` - Test a trained model
-- `dp freeze -o model.pb` - Freeze/save a model
+#### Data Preprocessing
 
-### Build Dependencies and Setup
+- **Data shuffling**: `deepmd/utils/shuffle.py`
+- **Batching**: Auto-fill to ensure consistent batch size
+- **Data augmentation**: Increase data diversity through rotation and translation
 
-- **Python 3.10+** required
-- **Virtual environment** strongly recommended: `uv venv venv && source venv/bin/activate`
-- **Backend dependencies**: TensorFlow, PyTorch, JAX, or Paddle (install before building)
-- **Build tools**: CMake, C++ compiler, scikit-build-core
-- **C++ build requires**: Both TensorFlow and PyTorch installed, set TENSORFLOW_ROOT and PYTORCH_ROOT environment variables
+### 4. Model Saving and Loading
 
-### Key Configuration Files
+#### Checkpoint Formats
 
-- `pyproject.toml` - Python build configuration and dependencies
-- `source/CMakeLists.txt` - C++ build configuration
-- `examples/water/se_e2_a/input.json` - Basic TensorFlow training config
-- `examples/water/se_e2_a/input_torch.json` - Basic PyTorch training config
+- **TensorFlow**: .pb format (frozen graph)
+- **PyTorch**: .pth format
+- **Universal format**: .dp format (framework-agnostic)
 
-## Frequent Patterns and Time Expectations
+#### Model Conversion
 
-### Installation and Build Times
+```python
+# TensorFlow to PyTorch conversion
+from deepmd.pt import model as pt_model
 
-- **Virtual environment setup**: ~5 seconds
-- **TensorFlow CPU install**: ~8 seconds
-- **PyTorch CPU install**: ~5 seconds
-- **Python package build**: ~67 seconds. **NEVER CANCEL.**
-- **C++ components build**: ~164 seconds. **NEVER CANCEL.**
-- **Full fresh setup**: ~3-4 minutes total
+pt_model.load_tf_graph(tf_checkpoint_path)
+```
 
-### Testing Times
+## Common Development Patterns
+
+### 1. Adding New Descriptors
+
+1. Create new descriptor class in `deepmd/dpmodel/descriptor/`
+2. Inherit from `BaseDescriptor` and implement necessary methods
+3. Add corresponding implementations in each backend (tf/pt/jax/pd)
+4. Add unit tests
+
+### 2. Debugging Tips
+
+- Use small systems for quick testing
+- Check energy conservation and symmetry
+- Compare results consistency across different backends
+- Use `dp test --rand-init` to verify model
+
+## Development Standards
+
+### Naming Conventions
+
+- Always use correct capitalization: DeePMD-kit, PyTorch, TensorFlow, NumPy, GitHub, LAMMPS
+
+### License Requirements
+
+All source files must include header license:
+`SPDX-License-Identifier: LGPL-3.0-or-later`
+
+## Test Strategy
+
+### Test Locations
+
+- **source/tests/**: C++ and Python tests
+- **tests/** directories in each submodule
+
+### Test Principles
+
+- During development, only run single or few related tests; full test suite takes 60+ minutes
+- Training tests use `--skip-neighbor-stat` to skip statistics for speed
+- Use `timeout` to limit training test time
+
+## Configuration File Structure
+
+### Typical Training Configuration (input.json)
+
+```json
+{
+  "model": {
+    "type_map": ["O", "H"],
+    "descriptor": {
+      "type": "se_a",
+      "sel": [46, 92],
+      "rcut_smth": 5.8,
+      "rcut": 6.0,
+      "neuron": [25, 50, 100],
+      "axis_neuron": 12
+    },
+    "fitting_net": {
+      "type": "ener",
+      "neuron": [240, 240, 240],
+      "resnet_dt": true
+    }
+  },
+  "learning_rate": {
+    "type": "exp",
+    "start_lr": 0.001,
+    "decay_steps": 5000
+  },
+  "loss": {
+    "start_pref_e": 0.02,
+    "start_pref_f": 1000,
+    "start_pref_v": 0.0
+  },
+  "training": {
+    "training_data": {
+      "systems": ["system1/", "system2/"],
+      "batch_size": 8
+    },
+    "numb_steps": 1000000
+  }
+}
+```
 
-- **Single test**: 8-13 seconds
-- **Test file (~5 tests)**: ~15 seconds
-- **Backend-specific test subset**: 15-30 minutes. **Use sparingly.**
-- **Full test suite (314 files)**: 60+ minutes. **Avoid in development - use single tests instead.**
+## Special Features
 
-### Linting and Formatting
+### 1. Type Embedding
 
-- **Ruff check**: <1 second
-- **Ruff format**: <1 second
-- **Pre-commit hooks**: May have network issues, use individual tools
+- Support unified training for multi-element systems
+- Location: `deepmd/utils/type_embed.py`
+- Dynamic type embedding can handle unseen element combinations
 
-### Commit Messages and PR Titles
+### 2. Adaptive Selection (UpdateSel)
 
-**All commit messages and PR titles must follow [conventional commit specification](https://www.conventionalcommits.org/):**
+- Automatically update neighbor list selection parameters
+- Avoid neighbor loss due to atomic migration
+- Location: `deepmd/utils/update_sel.py`
 
-- **Format**: `type(scope): description`
-- **Common types**: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore`, `ci`
-- **Examples**:
-  - `feat(core): add new descriptor type`
-  - `fix(tf): resolve memory leak in training`
-  - `docs: update installation guide`
-  - `ci: add workflow for testing`
+### 3. Multi-task Learning
 
-### Training and Model Operations
+- Simultaneously fit energy, force, stress, dipole, etc.
+- Loss function can configure weights for each task
+- Support physical constraints and regularization
 
-- **Training initialization**: 10-30 seconds
-- **Training per batch**: 0.1-1 second (CPU), much faster on GPU
-- **Model freezing**: 5-15 seconds
-- **Model testing**: 10-30 seconds
+## Model Compression Details (Advanced)
 
-## Backend-Specific Notes
+### Compression Data Structure
 
-### TensorFlow Backend
+#### 1. Compression Information (compress_info)
 
-- **Default backend** when no flag specified
-- **Configuration**: Use `input.json` format
-- **Training**: `dp train input.json`
-- **Requirements**: `tensorflow` or `tensorflow-cpu` package
+```python
+# Store 6 parameters for each embedding network [6]
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # Lower bound
+        upper[net],  # Upper bound
+        upper[net] * extrapolate,  # Extrapolation upper bound
+        table_stride_1,  # First segment stride
+        table_stride_2,  # Second segment stride
+        check_frequency,  # Overflow check frequency
+    ]
+)
+```
 
-### PyTorch Backend
+#### 2. Compression Data (compress_data)
 
-- **Activation**: Use `--pt` flag or `export DP_BACKEND=pytorch`
-- **Configuration**: Use `input_torch.json` format typically
-- **Training**: `dp --pt train input_torch.json`
-- **Requirements**: `torch` package
+```python
+# Store coefficient table for each embedding network [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
 
-### JAX Backend
+# Each 6 consecutive coefficients represent polynomial coefficients
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
+```
 
-- **Activation**: Use `--jax` flag
-- **Training**: `dp --jax train input.json`
-- **Requirements**: `jax` and related packages
-- **Note**: Experimental backend, may have limitations
+### Tabulation Implementation
 
-### Paddle Backend
+- **Table Builder**: `deepmd/pt/utils/tabulate.py` (PyTorch)
+- **Common Utilities**: `deepmd/utils/tabulate.py`
+- **Supported Activations**: tanh, gelu, relu, relu6, softplus, sigmoid
 
-- **Activation**: Use `--pd` flag
-- **Training**: `dp --pd train input.json`
-- **Requirements**: `paddlepaddle` package
-- **Note**: Less commonly used
+### Polynomial Interpolation Formula
+
+In interval [x_i, x_{i+1}], for variable x, the polynomial is:
+
+```
+f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
+```
 
-## Critical Warnings
+Where:
 
-- **NEVER CANCEL BUILD OPERATIONS**: Python build takes 67 seconds, C++ build takes 164 seconds
-- **USE SINGLE TESTS FOR VALIDATION**: Run individual tests instead of full test suite for faster feedback
-- **ALWAYS activate virtual environment**: Build and runtime failures occur without proper environment
-- **ALWAYS install backend dependencies first**: TensorFlow/PyTorch required before building C++ components
-- **ALWAYS run linting before commits**: `ruff check . && ruff format .` or CI will fail
-- **ALWAYS test both Python and C++ components**: Some features require both to be built
-- **ALWAYS follow conventional commit format**: All commit messages and PR titles must use conventional commit specification (`type(scope): description`)
+- `t = (x - x_i) / h`, h is step size
+- `c₀ = f(x_i)`
+- `c₁ = f'(x_i) × h`
+- `c₂ = f''(x_i) × h² / 2`
+- `c₃, c₄, c₅` determined by boundary continuity
diff --git a/CLAUDE.md b/CLAUDE.md
index 952286a423..9d268607a4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,476 +4,412 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## Project Overview
 
-DeePMD-kit is a deep learning package for many-body potential energy representation and molecular dynamics. It supports multiple backends (TensorFlow, PyTorch, JAX, Paddle) and interfaces with various MD packages (LAMMPS, i-PI, AMBER, GROMACS, etc.).
+DeePMD-kit is a deep-learning-based software package for modeling interatomic potentials and running molecular dynamics. It supports four deep learning backends (TensorFlow, PyTorch, JAX, and Paddle) and integrates with multiple MD packages, including LAMMPS, i-PI, AMBER, CP2K, and GROMACS.
 
-## Development Commands
+## Common Development Commands
 
-### Building and Installation
+Use this python if needed: /home/outisli/miniforge3/envs/dpmd/bin/python
 
-- **Standard build**: `pip install .`
-- **With GPU support**: Set environment variables like `DP_ENABLE_PYTORCH=1`, `DP_ENABLE_TENSORFLOW=1`, etc.
-- **From source**: Uses scikit-build-core with CMake - see `source/CMakeLists.txt`
-- **C++ library**: Built automatically as part of the Python package
+### Code Check and Format
 
-### Testing
+```bash
+ruff check .      # Check code style
+ruff format .     # Format code
+isort .           # Sort imports
+```
 
-- **Run all tests**: `pytest source/tests`
-- **Run specific backend tests**: `pytest source/tests/tf/`, `pytest source/tests/pt/`, etc.
-- **GPU tests**: `tox -e gpu` or set `DP_VARIANT=cuda`
-- **Individual test**: `pytest source/tests/path/to/test_file.py::test_name`
-- **With coverage**: `pytest --cov=deepmd`
+### Test Commands
 
-### Code Quality
+```bash
+# Verify installation
+dp --version
+python -c "import deepmd; import deepmd.tf; print('Interfaces working')"
 
-- **Linting**: `ruff check .`
-- **Formatting**: `ruff format .`
-- **Type checking**: No specific type checker configured in the project
+# IMPORTANT: set OMP_NUM_THREADS, DP_INTER_OP_PARALLELISM_THREADS, and DP_INTRA_OP_PARALLELISM_THREADS to 0 before running tests
 
-### Backend-Specific Commands
+# Single test (recommended for development)
+pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v
 
-- **TensorFlow**: Requires TF 2.19.0, automatically enabled with certain flags
-- **PyTorch**: Enable with `DP_ENABLE_PYTORCH=1`
-- **JAX**: Enable with `DP_ENABLE_JAX=1` (requires Python >= 3.10)
-- **Paddle**: Enable with `DP_ENABLE_PADDLE=1`
+# Specific test suite
+pytest source/tests/tf/test_dp_test.py -v
 
-### Model Compression
+# Training test
+cd examples/water/se_e2_a
+dp train input.json --skip-neighbor-stat  # TensorFlow
+dp --pt train input_torch.json --skip-neighbor-stat  # PyTorch
+```
 
-- **Compress models**: `dp --pt compress -i model.pth -o compressed.pth`
-- **Custom parameters**: `dp --pt compress -i model.pth -o compressed.pth -s 0.005 -e 10`
-- **PyTorch backend only**: Supports SE_A, SE_R, SE_T, SE_Atten, DPA1, DPA2 descriptors
-- **DPA3 not supported**: Compression explicitly disabled for DPA3 descriptors
+### Model Compression (Reference: doc/outisli/compress.md)
 
-## Architecture Overview
+#### Compression Principle
 
-### Multi-Backend Design
+- **Tabulation**: Pre-compute and store embedding network outputs
+- **Piecewise Interpolation**: Use quintic Hermite interpolation for continuity
+- **Performance**: Significantly reduces memory usage and improves inference speed
 
-The codebase is organized around a modular backend system in `deepmd/backend/`:
+#### Supported Descriptors
 
-- `backend.py`: Core backend management logic
-- `tensorflow.py`, `pytorch.py`, `jax.py`, `paddle.py`: Backend-specific implementations
-- `suffix.py`: Model file suffix handling for different backends
+- ✅ SE_A, SE_R, SE_T, SE_Atten
+- ✅ DPA1, DPA2
+- ❌ DPA3 (compression not supported)
+
+## Code Architecture and Core Modules
+
+### 1. Deep Learning Model Layer (deepmd/dpmodel/)
+
+This is the core model definition layer of DeePMD-kit, containing all mathematical abstractions of models:
+
+- **descriptor/**: Descriptor modules (embedding networks, environment information extraction)
+  - `se_a.py`: Embedded Atom Descriptor
+  - `se_r.py`: Simplified embedding descriptor
+  - `se_a_tpe.py`: Descriptor with type embedding
+  - `hybrid.py`: Hybrid descriptor
+- **fitting/**: Fitting network modules
+  - `ener.py`: Energy fitting network
+  - `dipole.py`: Dipole fitting
+  - `polar.py`: Polarizability fitting
+- **model/**: Model definitions
+  - `model.py`: Base model class
+  - `ener_model.py`: Energy model
+  - `dos_model.py`: Density of states model
+
+### 2. Backend Implementation Layer
 
-### Core Components
+Each backend implements the same interface to ensure consistency:
+
+#### TensorFlow Backend (deepmd/tf/)
+
+- **entrypoints/**: Command line entry points
+  - `main.py`: Main CLI entry
+  - `train.py`: Training script
+  - `freeze.py`: Model freezing
+  - `test.py`: Model testing
+- **network/**: Network definitions
+  - `network.py`: Main network class
+  - `embedding_net.py`: Embedding network
+  - `fitting_net.py`: Fitting network
+- **model/**: Model implementations
+  - `model.py`: Model definition
+  - `model_stat.py`: Model statistics
+- **infer/**: Inference interface
+  - `deep_eval.py`: Deep evaluation
+  - `deep_pot.py`: Deep potential
+
+#### PyTorch Backend (deepmd/pt/)
+
+Similar structure to TensorFlow backend but with PyTorch-specific optimizations:
+
+- **model/**: PyTorch model implementations
+  - `model.py`: Base model class
+  - `nn.py`: Neural network modules
+- **utils/**: PyTorch utilities
+  - `env_mat.py`: Environment matrix construction
+  - `region.py`: Periodic boundary condition handling
+- **train/**: Training related
+  - `training.py`: Training loop
+  - `optimizer.py`: Optimizer configuration
+
+### 3. C++ Core Engine (source/)
+
+Core implementation for high-performance computing:
+
+#### Core Library (source/lib/)
+
+- **include/**: Header file definitions
+  - `deepmd.hpp`: Main API declarations
+  - `common.hpp`: Common definitions
+  - `neighbor_list.hpp`: Neighbor list algorithm
+- **src/**: Source code implementation
+  - `deepmd.cpp`: Core C++ implementation
+  - `region.cpp`: Region processing
+  - `neighbor_list.cpp`: High-performance neighbor list
+  - `prod_env_mat_a.cpp`: Environment matrix production
+
+#### Operator Implementation (source/op/)
+
+Framework-specific operators for each deep learning framework:
+
+- **tf/**: TensorFlow custom operators
+  - `prod_env_mat_a.cc`: Environment matrix operator
+  - `prod_force_se_a.cc`: Force calculation operator
+  - `tabulate.cc`: Lookup table operator
+- **torch/**: PyTorch C++ extensions
+  - `prod_env_mat_a.cpp`: PyTorch version of environment matrix operator
+
+### 4. Data Processing Layer (deepmd/utils/)
+
+- `data.py`: Data loading and preprocessing
+- `data_system.py`: Data system management
+- `shuffle.py`: Data shuffling
+- `neighbor_stat.py`: Neighbor statistics
+- `type_embed.py`: Type embedding
+- `args.py`: Argument parsing
+- `path.py`: Path handling
+- `compat.py`: Version compatibility handling
+
+### 5. Input/Output Layer (deepmd/infer/)
+
+- **deep_pot.py**: High-level inference interface
+- **deep_dipole.py**: Dipole inference
+- **deep_dos.py**: Density of states inference
+- **deep_wfc.py**: Wave function inference
+
+## Key Data Flow
+
+1. **Training Flow**:
+
+   ```
+   Atomic coordinates → neighbor_list → env_matrix → descriptor → fitting_net → loss
+   ```
+
+2. **Inference Flow**:
+
+   ```
+   Input structure → Descriptor calculation → Fitting network → Energy/Force/Stress
+   ```
+
+3. **Multi-backend Unified Interface**:
+   - Python layer provides unified API through `deepmd.infer`
+   - C++ layer provides unified interface through `source/api_cc/`
+   - Each backend implements the same model specification
 
-#### 1. Model Architecture (`deepmd/dpmodel/`)
+### Select Backend
+
+```bash
+# Command line flags
+dp --pt train input.json
+dp --tf train input.json
+
+# Environment variable
+export DP_BACKEND=pytorch
+dp train input.json
+```
 
-Framework-agnostic model implementations:
+## Core Algorithms and Data Structures
 
-- `atomic_model/`: Atomic-level model components
-- `descriptor/`: Environment descriptors (se_a, se_atten, dpa1/2/3, etc.)
-- `fitting/`: Fitting networks for energy, forces, etc.
-- `model/`: Complete model definitions
+### 1. Descriptor Implementation
 
-### DPA3 Descriptor Implementation
+Descriptors are the core innovation of DeePMD-kit, used to convert local atomic environments into vector representations:
 
-#### DPA3 Architecture Overview
+#### Embedded Atom Descriptor (SE_A)
 
-DPA3 (Deep Potential - Atomic Environment Representation with 3-body interactions) is an advanced descriptor that combines node, edge, and angle information for more accurate atomic environment representation.
+- **Location**: `deepmd/dpmodel/descriptor/se_a.py`
+- **Core functions**:
+  - `build()`: Build descriptor network
+  - `call()`: Calculate descriptor values
+- **Mathematical principle**:
+  - Radial basis function expansion: $g(r) = \sum_{i} \exp[-\gamma (r-r_i)^2]$
+  - Angular basis function: Angular dependency through 1D filters
 
-**Key Components**:
+#### Environment Matrix (Env Mat)
 
-- **Main Descriptor**: `DescrptDPA3` in `deepmd/pt/model/descriptor/dpa3.py:105-171`
-- **RepFlow Block**: `DescrptBlockRepflows` in `deepmd/pt/model/descriptor/repflows.py:77-200`
-- **RepFlow Layer**: `RepFlowLayer` in `deepmd/pt/model/descriptor/repflow_layer.py:38-200`
+- **C++ implementation**: `source/lib/src/prod_env_mat_a.cpp`
+- **Function**: Efficiently calculate environment matrix between atom pairs
+- **Optimization**: Use parallelization and SIMD instructions for acceleration
 
-**DPA3 Core Innovation**: The RepFlow architecture introduces a unified representation that iteratively refines node, edge, and angle information through multiple layers, enabling explicit 3-body interaction modeling while maintaining computational efficiency through message compression strategies.
+### 2. Fitting Network
 
-#### DPA3 Initialization and Forward Pass
+Maps descriptors to physical quantities:
 
-**Initialization** (`dpa3.py:105-171`):
+#### Energy Fitting
 
-- Processes RepFlow parameters with `init_subclass_params(repflow, RepFlowArgs)`
-- Creates type embedding network (`TypeEmbedNetConsistent`) for consistent atomic type representations
-- Initializes RepFlow blocks with edge/angle embedding networks for distance and angular information
-- Sets up multiple RepFlow layers for iterative refinement with configurable residual connections
+- **Location**: `deepmd/dpmodel/fitting/ener.py`
+- **Output**: Atomic energy, system total energy obtained by summation
+- **Force calculation**: Through automatic differentiation or analytical gradient
 
-**Forward Pass** (`dpa3.py:430-498`):
+#### Fitting Network Structure
 
-1. **Type Embedding**: Computes atomic type embeddings using `TypeEmbedNetConsistent`
-2. **RepFlow Processing**: Multi-layer node/edge/angle information processing through iterative updates
-3. **Output Generation**: Returns comprehensive atomic environment representation with rotation matrices for SE(3) equivariance
+```python
+# Typical fitting network architecture
+FittingNet(
+    layers=[embedding_dim, 240, 240, 240, 1],  # Network layer sizes
+    activation_function="tanh",  # Activation function
+    precision="float64",  # Numerical precision
+)
+```
 
-**DPA3 Output Variables**:
+### 3. Training Strategy
 
-- `node_ebd`: Node descriptors [nf, nloc, n_dim] - primary atomic environment representation for fitting networks
-- `rot_mat`: Rotation matrices [nf, nloc, e_dim, 3] - ensures SE(3) equivariance for coordinate transformations
-- `edge_ebd`: Edge embeddings [nf, nloc, nnei, e_dim] - pairwise interaction information
-- `h2`: Angle information [nf, nloc, nnei, 3] - 3-body angular data for explicit three-body interactions
-- `sw`: Switch functions [nf, nloc, nnei] - smooth cutoff boundaries to avoid discontinuities
+#### Loss Function
 
-#### RepFlow Implementation
+```python
+# Location: deepmd/loss.py or backend implementations
+Loss = lr_e * energy_loss + lr_f * force_loss + lr_v * virial_loss
+```
 
-**RepFlow Block** (`repflows.py:77-200`):
+#### Data Preprocessing
 
-- Edge embedding network (`MLPLayer`) for distance information encoding
-- Angle embedding network for angular relationship processing
-- Multiple RepFlow layers (`RepFlowLayer`) for iterative node/edge/angle updates
-- Support for message compression (`a_compress_rate`) and attention mechanisms to reduce computational cost
-- Environment matrix computation via `prod_env_mat` for neighbor distance and direction calculation
+- **Data shuffling**: `deepmd/utils/shuffle.py`
+- **Batching**: Auto-fill to ensure consistent batch size
+- **Data augmentation**: Increase data diversity through rotation and translation
 
-**Key Parameters**:
+### 4. Model Saving and Loading
 
-- `e_rcut`/`e_rcut_smth`: Edge cutoff (6.0Å) and smoothing radii (0.5Å) for neighbor selection
-- `a_rcut`/`a_rcut_smth`: Angle cutoff (4.0Å) and smoothing radii for three-body interactions
-- `n_dim`/`e_dim`/`a_dim`: Node (128), edge (64), angle (32) representation dimensions
-- `nlayers`: Number of RepFlow layers (6) for iterative refinement
-- `update_style`: Residual connection strategies (`res_residual`, `res_update`, `force_residual`) for gradient flow optimization
-- `a_compress_rate`: Angle compression factor (2) to reduce computational overhead while preserving angular information
+#### Checkpoint Formats
 
-#### CLI Usage and Training Flow
+- **TensorFlow**: .pb format (frozen graph)
+- **PyTorch**: .pth format
+- **Universal format**: .dp format (framework-agnostic)
 
-**Training Command**: `dp --pt train input.json` (specify PyTorch backend explicitly)
+#### Model Conversion
 
-**Execution Flow**:
+```python
+# TensorFlow to PyTorch conversion
+from deepmd.pt import model as pt_model
 
-1. **Entry Point**: `deepmd.pt.entrypoints.main.train()` (`main.py:248-372`) - PyTorch-specific training entry
-2. **Configuration Loading**: JSON parsing via `j_loader()` with multi-task handling through `preprocess_shared_params()`
-3. **Neighbor Statistics**: Automatic selection parameter computation via `BaseModel.update_sel()` unless `--skip-neighbor-stat`
-4. **Trainer Creation**: `get_trainer()` with model initialization, supporting distributed training and mixed precision
-5. **Model Building**: DPA3 descriptor creation via `get_model()` with automatic device placement and JIT compilation options
+pt_model.load_tf_graph(tf_checkpoint_path)
+```
 
-**Data Processing Pipeline**:
+## Common Development Patterns
 
-1. **Raw Data Loading**: `DeepmdData` loads HDF5/.npy files from system directories
-2. **System DataLoaders**: Each system gets its own DataLoader (num_workers=0 to avoid thread explosion)
-3. **Training DataLoader**: Master DataLoader with intelligent sampling (`WeightedRandomSampler` or uniform)
-4. **Batch Processing**: `collate_batch()` handles variable-sized systems with padding and tensor stacking
+### 1. Adding New Descriptors
 
-#### Precision Control
+1. Create new descriptor class in `deepmd/dpmodel/descriptor/`
+2. Inherit from `BaseDescriptor` and implement necessary methods
+3. Add corresponding implementations in each backend (tf/pt/jax/pd)
+4. Add unit tests
 
-DPA3 supports two levels of precision control that work independently:
+### 2. Debugging Tips
 
-**Environment Variable Control (`DP_INTERFACE_PREC`)**:
+- Use small systems for quick testing
+- Check energy conservation and symmetry
+- Check that results are consistent across different backends
+- Use `dp test --rand-init` to verify model
 
-- **Scope**: Global interface precision affecting input/output data types across all DeePMD-kit operations
-- **High precision** (`export DP_INTERFACE_PREC=high`): `GLOBAL_NP_FLOAT_PRECISION = np.float64`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
-- **Low precision** (`export DP_INTERFACE_PREC=low`): `GLOBAL_NP_FLOAT_PRECISION = np.float32`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64` (energy precision remains high)
-- **Location**: `deepmd/env.py:33-48`
+## Development Standards
 
-**Model Parameter Control (`precision` in configuration)**:
+### Naming Conventions
 
-- **Scope**: Component-specific precision for neural network weights and calculations
-- **Options**: `"float64"`, `"float32"`, `"float16"`, `"default"`
-- **Granular Control**: Can be set individually for descriptor, fitting networks, and RepFlow components
-- **Example Configuration**:
+- Always use correct capitalization: DeePMD-kit, PyTorch, TensorFlow, NumPy, GitHub, LAMMPS
 
-```json
-{
-  "model": {
-    "descriptor": {
-      "type": "dpa3",
-      "precision": "float32",
-      "repflow": {
-        "precision": "float32"
-      }
-    },
-    "fitting_net": {
-      "precision": "float32"
-    }
-  }
-}
-```
-
-**Precision Workflow** (`make_model.py:327-337`):
+### License Requirements
 
-1. **Input Type Detection**: `input_type_cast()` detects input data precision
-2. **Global Precision Conversion**: Converts to `GLOBAL_PT_FLOAT_PRECISION` for computation
-3. **Component Computation**: Uses component-specific precision settings
-4. **Output Conversion**: `output_type_cast()` converts back to original input precision
+All source files must include the following license header:
+`SPDX-License-Identifier: LGPL-3.0-or-later`
 
-#### Inference System
+## Test Strategy
 
-**Main Classes**:
+### Test Locations
 
-- `DeepEval`: Universal inference interface (`deepmd/pt/infer/deep_eval.py:75`)
-- `Tester`: Testing and inference utility (`deepmd/pt/infer/inference.py:25`)
+- **source/tests/**: C++ and Python tests
+- **tests/** directories in each submodule
 
-**Inference Flow**:
+### Test Principles
 
-1. **Model Loading**: State dict loading and multi-task handling
-2. **JIT Compilation**: Optional TorchScript optimization
-3. **Batch Processing**: Automatic batch sizing for memory optimization
-4. **Execution**: DPA3 descriptor computation in evaluation mode
+- During development, run only a single test or a few related tests; the full test suite takes 60+ minutes
+- Training tests use `--skip-neighbor-stat` to skip statistics for speed
+- Use `timeout` to limit training test time
 
-**Performance Optimizations**:
+## Configuration File Structure
 
-- **JIT Compilation**: `torch.jit.script()` for graph optimization
-- **Auto-batching**: Dynamic batch size adjustment based on memory
-- **Multi-device**: CPU/GPU support with automatic device selection
-- **Model Freezing**: `dp freeze` for deployment-optimized models
-
-#### Configuration Example
+### Typical Training Configuration (input.json)
 
 ```json
 {
   "model": {
+    "type_map": ["O", "H"],
     "descriptor": {
-      "type": "dpa3",
-      "repflow": {
-        "e_rcut": 6.0,
-        "e_sel": 120,
-        "a_rcut": 4.0,
-        "a_sel": 40,
-        "n_dim": 128,
-        "e_dim": 64,
-        "a_dim": 32,
-        "nlayers": 3,
-        "update_style": "res_residual"
-      },
-      "concat_output_tebd": true,
-      "precision": "float32"
+      "type": "se_a",
+      "sel": [46, 92],
+      "rcut_smth": 5.8,
+      "rcut": 6.0,
+      "neuron": [25, 50, 100],
+      "axis_neuron": 12
+    },
+    "fitting_net": {
+      "type": "ener",
+      "neuron": [240, 240, 240],
+      "resnet_dt": true
     }
+  },
+  "learning_rate": {
+    "type": "exp",
+    "start_lr": 0.001,
+    "decay_steps": 5000
+  },
+  "loss": {
+    "start_pref_e": 0.02,
+    "start_pref_f": 1000,
+    "start_pref_v": 0.0
+  },
+  "training": {
+    "training_data": {
+      "systems": ["system1/", "system2/"],
+      "batch_size": 8
+    },
+    "numb_steps": 1000000
   }
 }
 ```
 
-#### Energy Summation Mechanism
-
-DPA3 implements a two-stage energy calculation:
-
-1. **Atomic Energy**: Each atom's local environment energy computed in fitting networks
-2. **System Energy**: Atomic energies summed to get total system energy
-
-**Key Files**:
-
-- Atomic energy: `deepmd/pt/model/task/fitting.py:473-614`
-- Energy summation: `deepmd/pt/model/model/transform_output.py:153-192`
-
-### Model Compression System
-
-#### Compression Overview
-
-DeePMD-kit supports model compression through tabulation of embedding networks, providing significant inference speedup by replacing neural network computations with polynomial interpolation lookups.
-
-**Core Concept**:
-
-- Pre-compute embedding network outputs and store in lookup tables
-- Use two-stage interpolation with different stride sizes for accuracy-memory balance
-- Replace runtime neural network evaluations with fast polynomial interpolation
-
-#### Compression Architecture
-
-**Entry Points**:
-
-- Command: `dp --pt compress -i model.pth -o compressed.pth`
-- Main entry: `deepmd/main.py` → `deepmd/pt/entrypoints/main.py:574-582`
-- Core function: `deepmd/pt/entrypoints/compress.py:32-84`
-
-**Execution Flow**:
-
-1. **Model Loading**: Load JIT model and reconstruct model instance
-2. **Min Distance Calculation**: Compute minimum neighbor distance from training data
-3. **Hierarchical Compression**: Model → Atomic Model → Descriptor compression
-4. **Table Building**: Create polynomial coefficient tables via `DPTabulate`
-5. **JIT Serialization**: Save compressed model as TorchScript
-
-#### Supported Descriptors
-
-**Fully Supported**:
-
-- `SE_A` (`se_a.py:257-302`): Smooth Edition Angular descriptor
-- `SE_R` (`se_r.py:359-xxx`): Smooth Edition Radial descriptor
-- `SE_T` (`se_t.py:284-327`): Smooth Edition Three-body descriptor
-- `SE_Atten` (`se_atten.py:427-448`): Smooth Edition with Attention
-- `DPA1` (`dpa1.py:572-645`): Deep Potential Attention version 1
-- `DPA2` (`dpa2.py:893-973`): Deep Potential Attention version 2
-
-**Not Supported**:
-
-- `DPA3` (`dpa3.py:578-601`): Explicitly raises `NotImplementedError`
-- `Pairtab` models: No tabulation compression support
-
-#### Tabulation Implementation
-
-**Key Class**: `DPTabulate` (`deepmd/pt/utils/tabulate.py:30-100`)
-
-**Table Building Process**:
+## Special Features
 
-1. **Range Calculation**: Compute environment matrix bounds from training data statistics
-2. **Grid Generation**: Create two-segment distance grids (fine + coarse stride)
-3. **Neural Network Evaluation**: Forward pass to get function values and derivatives
-4. **Polynomial Fitting**: Generate 5th-order Hermite interpolation coefficients
+### 1. Type Embedding
 
-**Data Storage Format**:
+- Support unified training for multi-element systems
+- Location: `deepmd/utils/type_embed.py`
+- Dynamic type embedding can handle unseen element combinations
 
-- `compress_info`: [lower, upper, extrapolate_upper, stride1, stride2, check_freq]
-- `compress_data`: [nspline, 6 * last_layer_size] coefficient tables
-- Coefficients: [f(x), f'(x), f''(x)/2, c3, c4, c5] per neuron
+### 2. Adaptive Selection (UpdateSel)
 
-#### Performance Characteristics
+- Automatically update neighbor list selection parameters
+- Avoid neighbor loss due to atomic migration
+- Location: `deepmd/utils/update_sel.py`
 
-**Memory Optimization**:
+### 3. Multi-task Learning
 
-- Two-stage interpolation: fine stride (0.01) + coarse stride (0.1)
-- Extrapolation region: 5× training data range by default
-- Removes original network weights after compression
+- Simultaneously fit energy, force, stress, dipole, etc.
+- Loss function can configure weights for each task
+- Support physical constraints and regularization
 
-**Computational Benefits**:
+## Model Compression Details (Advanced)
 
-- Eliminates matrix operations in embedding networks
-- Vectorized polynomial evaluation
-- Cache-friendly data layout for lookup tables
+### Compression Data Structure
 
-#### Configuration Parameters
+#### 1. Compression Information (compress_info)
 
-- `-s, --step`: Fine stride size (default: 0.01) - affects accuracy vs memory
-- `-e, --extrapolate`: Extrapolation multiplier (default: 5)
-- `-f, --frequency`: Overflow check frequency (default: -1, disabled)
-- `-t, --training-script`: Training script path for min distance calculation
-
-#### 2. Backend-Specific Implementations
-
-- `deepmd/tf/`: TensorFlow backend (original implementation)
-- `deepmd/pt/`: PyTorch backend
-- `deepmd/jax/`: JAX backend
-- `deepmd/pd/`: Paddle backend
-
-Each backend implements similar interfaces:
-
-- Descriptor variants optimized for the framework
-- Training and inference modules
-- Model serialization/loading
-
-#### 3. Inference (`deepmd/infer/`)
-
-High-level inference interfaces:
-
-- `deep_pot.py`: Main potential energy model interface
-- `deep_eval.py`: Generic evaluation interface
-- Backend-specific inference modules
-
-#### 4. Training (`deepmd/*/train/`)
-
-Backend-specific training implementations:
-
-- Training loops and optimization
-- Data loading and preprocessing
-- Checkpoint management
-
-#### 5. Entry Points (`deepmd/entrypoints/`)
-
-Command-line interface commands:
-
-- `main.py`: Main CLI dispatcher
-- Training, testing, conversion utilities
-- Model analysis and documentation tools
-
-#### 6. C++ Integration (`source/`)
-
-- `lib/`: Core computational library with CUDA/ROCm support
-- `api_cc/`: C++ API for external integration
-- `api_c/`: C API wrapper
-- `lmp/`: LAMMPS plugin integration
-- `op/`: Custom operators for different frameworks
-
-### PyTorch Backend Data Processing
-
-#### Two-Level DataLoader Architecture
-
-The PyTorch backend uses a unique two-level DataLoader system for efficient multi-system data management:
-
-**System Level**: Each data system has its own DataLoader (num_workers=0 to avoid thread explosion)
-**Training Level**: Master DataLoader handles sampling and batching across systems (num_workers=NUM_WORKERS)
-
-**Key Components**:
-
-- `DeepmdData`: Raw data loading from HDF5/.npy files (`deepmd/utils/data.py`)
-- `DpLoaderSet`: System-level DataLoader collection (`deepmd/pt/utils/dataloader.py`)
-- `DeepmdDataSetForLoader`: PyTorch Dataset wrapper
-- `collate_batch`: Batch processing function for variable-sized systems
-
-**Data Flow**:
-
-```
-Raw Data (HDF5/.npy) → DeepmdData → System DataLoaders → DpLoaderSet → Training DataLoader → Model Input
+```python
+# Store 6 parameters for each embedding network [6]
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # Lower bound
+        upper[net],  # Upper bound
+        upper[net] * extrapolate,  # Extrapolation upper bound
+        table_stride_1,  # First segment stride
+        table_stride_2,  # Second segment stride
+        check_frequency,  # Overflow check frequency
+    ]
+)
 ```
 
-### DPAtomicModel Hierarchy
+#### 2. Compression Data (compress_data)
 
-#### Class Structure
+```python
+# Store coefficient table for each embedding network [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
 
-```text
-BaseAtomicModel (base_atomic_model.py:52)
-    ↓
-DPAtomicModel (dp_atomic_model.py:34) - registered as "standard"
-    ↓
-Specific Models (Energy, Dipole, Polar, DOS, Property)
+# Each 6 consecutive coefficients represent polynomial coefficients
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
 ```
 
-**Key Features**:
-
-- **Unified Interface**: Consistent API for different physical properties
-- **Atomic-Level Forward Pass**: `forward_atomic()` method handles descriptor computation and fitting
-- **Multi-Task Support**: Supports training multiple properties simultaneously
-- **Automatic Differentiation**: Force and virial computation through autograd
-
-**Key Files**:
-
-- Base class: `deepmd/pt/model/atomic_model/dp_atomic_model.py:34`
-- Energy model: `deepmd/pt/model/atomic_model/energy_atomic_model.py:13`
-- Dipole model: `deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`
-
-### Key Design Patterns
-
-#### Backend Abstraction
-
-The code uses a sophisticated backend system that allows:
-
-- Runtime backend selection
-- Model conversion between backends
-- Consistent APIs across frameworks
-
-#### Descriptor-Based Architecture
-
-Models are built from:
+### Tabulation Implementation
 
-1. **Descriptors**: Local atomic environment representations
-2. **Fitting Networks**: Map descriptors to physical quantities
-3. **Models**: Combine descriptors and fitting for complete potentials
+- **Table Builder**: `deepmd/pt/utils/tabulate.py` (PyTorch)
+- **Common Utilities**: `deepmd/utils/tabulate.py`
+- **Supported Activations**: tanh, gelu, relu, relu6, softplus, sigmoid
 
-#### Multi-Task Learning
+### Polynomial Interpolation Formula
 
-Support for training multiple properties simultaneously:
+In the interval [x_i, x_{i+1}], the interpolating polynomial for the variable x is:
 
-- Energy, forces, virial
-- Dipole moments, polarizability
-- DOS, electronic properties
-- Spin systems
-
-## Working with the Code
-
-### Adding New Features
-
-1. **Framework-agnostic**: Add to `deepmd/dpmodel/` first
-2. **Backend implementations**: Extend each backend in `deepmd/*/`
-3. **C++ optimization**: Add performance-critical code to `source/lib/`
-4. **Tests**: Add backend-specific tests in `source/tests/*/`
-
-### Model Development
-
-- Use existing descriptors as templates in `deepmd/dpmodel/descriptor/`
-- Extend fitting networks in `deepmd/dpmodel/fitting/`
-- Model composition follows patterns in `deepmd/dpmodel/model/`
-
-### Performance Considerations
-
-- C++ library handles neighbor lists and environment matrices
-- Custom operators optimized for GPU acceleration
-- Automatic mixed precision support where available
-- **Model compression**: Tabulation provides 2-10× inference speedup for supported descriptors
-
-### Common Pitfalls
-
-- Backend-specific imports are banned at module level (use runtime imports)
-- Model compatibility requires careful version management
-- GPU builds require specific CUDA/ROCm versions
-- **Compression limitations**: DPA3 and some specialized models don't support compression
-- **Training data dependency**: Compression requires training script for optimal table range calculation
+```
+f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
+```
 
-## File Structure Conventions
+Where:
 
-- **Public APIs**: In `deepmd/` top-level modules
-- **Implementation details**: In subdirectories like `dpmodel/`, `utils/`
-- **Backend code**: Separated into `tf/`, `pt/`, `jax/`, `pd/` directories
-- **Tests**: Organized by backend in `source/tests/*/`
-- **Examples**: In `examples/` directory with input configurations
+- `t = (x - x_i) / h`, h is step size
+- `c₀ = f(x_i)`
+- `c₁ = f'(x_i) × h`
+- `c₂ = f''(x_i) × h² / 2`
+- `c₃, c₄, c₅` are determined by the continuity conditions at the interval boundary x_{i+1}
diff --git a/README.md b/README.md
index 0444469779..94ee6c788a 100644
--- a/README.md
+++ b/README.md
@@ -108,3 +108,5 @@ See [DeePMD-kit Contributing Guide](CONTRIBUTING.md) to become a contributor! 
 [2]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001
 [3]: https://arxiv.org/abs/1805.09003
 [4]: https://aip.scitation.org/doi/full/10.1063/1.5027645
+
+Use this command to generate the JSON schema (written to `deepmd_json_schema.json` in the current directory): `python -c "from deepmd.utils.argcheck import gen_json_schema; import json; json.dump(json.loads(gen_json_schema(multi_task=True)), open('deepmd_json_schema.json', 'w'), indent=2)"`
diff --git a/debug/compress_debug.py b/debug/compress_debug.py
index db57d78d83..419a1d1e7e 100644
--- a/debug/compress_debug.py
+++ b/debug/compress_debug.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import sys
+import time
 from pathlib import (
     Path,
 )
@@ -19,18 +20,27 @@
 sys.path.insert(0, str(deepmd_root))
 
 
-def compress_model() -> None:
+def compress_model() -> float:
     """Compress the model using the same parameters as the CLI command.
 
     dp --pt compress -i no.pth -o yes.pth -t input_torch.json
+
+    Returns
+    -------
+    float
+        Elapsed time for the compression in seconds.
     """
     # Import here to avoid module-level import restriction
     from deepmd.pt.entrypoints.compress import (
         enable_compression,
     )
 
-    # Setup logging
-    logging.basicConfig(level=logging.INFO)
+    # Setup logging with timestamp
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
     log = logging.getLogger(__name__)
 
     # Set working directory to examples/water/se_e3_tebd
@@ -39,7 +49,7 @@ def compress_model() -> None:
 
     try:
         os.chdir(work_dir)
-        log.info(f"Changed to working directory: {work_dir}")
+        log.debug(f"Changed to working directory: {work_dir}")
 
         # Model compression parameters
         input_file = "no.pth"
@@ -60,14 +70,17 @@ def compress_model() -> None:
                 f"Training script '{training_script}' not found in {work_dir}"
             )
 
-        log.info(f"Input model: {input_file}")
-        log.info(f"Output model: {output_file}")
-        log.info(f"Training script: {training_script}")
-        log.info(f"Stride: {stride}")
-        log.info(f"Extrapolate: {extrapolate}")
-        log.info(f"Check frequency: {check_frequency}")
+        log.debug(f"Input model: {input_file}")
+        log.debug(f"Output model: {output_file}")
+        log.debug(f"Training script: {training_script}")
+        log.debug(f"Stride: {stride}")
+        log.debug(f"Extrapolate: {extrapolate}")
+        log.debug(f"Check frequency: {check_frequency}")
+
+        log.debug("Starting model compression...")
 
-        log.info("Starting model compression...")
+        # Record time usage
+        start_time = time.monotonic()
 
         # Call the compression function
         enable_compression(
@@ -79,8 +92,14 @@ def compress_model() -> None:
             training_script=training_script,
         )
 
+        elapsed_time = time.monotonic() - start_time
+
+        # Print results (keep these as info level - these are the main results)
         log.info("Model compression completed successfully!")
         log.info(f"Compressed model saved to: {output_file}")
+        log.info(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+        return elapsed_time
 
     except Exception as e:
         log.error(f"Error during compression: {e}")
diff --git a/debug/dptest_debug.py b/debug/dptest_debug.py
index 80a6f2b516..ec7e5dcea8 100644
--- a/debug/dptest_debug.py
+++ b/debug/dptest_debug.py
@@ -15,23 +15,34 @@
     Path,
 )
 
+import numpy as np
+
 # Add the deepmd-kit root to Python path
 deepmd_root = Path(__file__).parent.parent
 sys.path.insert(0, str(deepmd_root))
 
 
-def test_model() -> None:
+def test_model() -> float:
     """Test the model using the same parameters as the CLI command.
 
     dp --pt test -m model.ckpt.pt -s . -n 100 -f test_debug.txt
+
+    Returns
+    -------
+    float
+        Elapsed time for the testing in seconds.
     """
     # Import here to avoid module-level import restriction
     from deepmd.entrypoints.test import (
         test,
     )
 
-    # Setup logging
-    logging.basicConfig(level=logging.INFO)
+    # Setup logging with timestamp
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
     log = logging.getLogger(__name__)
 
     # Set working directory to examples/water/se_e3_tebd
@@ -40,7 +51,7 @@ def test_model() -> None:
 
     try:
         os.chdir(work_dir)
-        log.info(f"Changed to working directory: {work_dir}")
+        log.debug(f"Changed to working directory: {work_dir}")
 
         # Test parameters
         model_file = "no.pth"  # Model file to test
@@ -64,16 +75,16 @@ def test_model() -> None:
         # Set environment variable to limit batch size for testing
         os.environ["DP_INFER_BATCH_SIZE"] = "1024"
 
-        log.info(f"Model: {model_file}")
-        log.info(f"System directory: {system_dir}")
-        log.info(f"Number of test frames: {numb_test}")
-        log.info(f"Detail file: {detail_file}")
-        log.info(f"Atomic output: {atomic}")
+        log.debug(f"Model: {model_file}")
+        log.debug(f"System directory: {system_dir}")
+        log.debug(f"Number of test frames: {numb_test}")
+        log.debug(f"Detail file: {detail_file}")
+        log.debug(f"Atomic output: {atomic}")
 
-        log.info("Starting model testing...")
+        log.debug("Starting model testing...")
 
         # Record time usage
-        start_time = time.time()
+        start_time = time.monotonic()
         # Call the test function
         test(
             model=model_file,
@@ -88,13 +99,16 @@ def test_model() -> None:
             atomic=atomic,
             head=head,
         )
-        end_time = time.time()
+        end_time = time.monotonic()
         elapsed_time = end_time - start_time
 
+        # Print results (keep these as info level - these are the main results)
         log.info("Model testing completed successfully!")
         log.info(f"Test results saved to: {detail_file}")
         log.info(f"Elapsed time: {elapsed_time:.2f} seconds")
 
+        return elapsed_time
+
     except Exception as e:
         log.error(f"Error during testing: {e}")
         raise
@@ -104,4 +118,30 @@ def test_model() -> None:
 
 
 if __name__ == "__main__":
-    test_model()
+    # Run testing 10 times and calculate average timing
+    num_runs = 10
+    times = []
+
+    print(f"Running model testing {num_runs} times...")  # noqa: T201
+    print("=" * 50)  # noqa: T201
+
+    for i in range(num_runs):
+        print(f"\nRun {i + 1}/{num_runs}")  # noqa: T201
+        print("-" * 20)  # noqa: T201
+        elapsed_time = test_model()
+        times.append(elapsed_time)
+
+    # Calculate and display statistics
+    print("\n" + "=" * 50)  # noqa: T201
+    print("Timing Summary:")  # noqa: T201
+    print("=" * 50)  # noqa: T201
+
+    avg_time = sum(times) / len(times)
+    min_time = min(times)
+    max_time = max(times)
+
+    print(f"Average time: {avg_time:.2f} seconds")  # noqa: T201
+    print(f"Min time: {min_time:.2f} seconds")  # noqa: T201
+    print(f"Max time: {max_time:.2f} seconds")  # noqa: T201
+    print(f"Std deviation: {np.std(times):.2f} seconds")  # noqa: T201
+    print(f"All times: {[f'{t:.2f}' for t in times]}")  # noqa: T201
diff --git a/debug/inference_debug.py b/debug/inference_debug.py
index 1e5beb1f39..3593e5e655 100644
--- a/debug/inference_debug.py
+++ b/debug/inference_debug.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Debug script for single configuration model inference.
+"""Inference performance profiling script with TensorBoard visualization.
 
-This script loads only one configuration from the dataset and performs inference.
-Perfect for profiling and debugging individual forward passes.
+This script focuses on identifying performance hotspots in DeePMD-kit inference
+by breaking down the computation into detailed components and visualizing results.
 """
 
 import logging
@@ -18,6 +18,9 @@
 )
 
 import numpy as np
+import torch  # noqa: TID253
+from torch.profiler import record_function  # noqa: TID253
+from torch.utils.tensorboard import SummaryWriter  # noqa: TID253
 
 # Add the deepmd-kit root to Python path
 deepmd_root = Path(__file__).parent.parent
@@ -81,8 +84,24 @@ def load_single_configuration(data_dir: str, frame_idx: int = 0) -> dict[str, An
     return data
 
 
-def inference_single_config() -> None:
-    """Perform inference on a single configuration."""
+def inference_single_config(
+    model_file: str,
+    enable_profiling: bool = False,
+) -> float:
+    """Perform inference on a single configuration with comprehensive TensorBoard logging.
+
+    Parameters
+    ----------
+    model_file : str
+        Path to the model checkpoint file.
+    enable_profiling : bool, optional
+        Whether to enable PyTorch profiling, by default False
+
+    Returns
+    -------
+    float
+        Elapsed time for the inference in seconds.
+    """
     # Import DeepPot for simplified inference
     from deepmd.infer import (
         DeepPot,
@@ -96,16 +115,19 @@ def inference_single_config() -> None:
     )
     log = logging.getLogger(__name__)
 
-    # Set working directory to examples/water/se_e3_tebd
+    # Setting working directory
     work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd"
     original_cwd = os.getcwd()
 
     try:
         os.chdir(work_dir)
-        log.info(f"Changed to working directory: {work_dir}")
+        log.debug(f"Changed to working directory: {work_dir}")
+
+        log_dir = "./profile_logs"
+        os.makedirs(log_dir, exist_ok=True)
+        writer = SummaryWriter(log_dir)
 
         # Test parameters
-        model_file = "no.pth"  # Model file to test
         data_dir = "../data/data_3"  # Directory contains test data
         frame_idx = 0  # Use first frame
 
@@ -115,41 +137,105 @@ def inference_single_config() -> None:
                 f"Model file '{model_file}' not found in {work_dir}"
             )
 
-        log.info(f"Loading model: {model_file}")
+        log.debug(f"Loading model: {model_file}")
 
-        # Initialize model using DeepPot interface
-        dp = DeepPot(model_file, auto_batch_size=True)
+        # Initialize model using DeepPot interface (outside profiling for cleaner results)
+        dp = DeepPot(model_file, auto_batch_size=1024)
 
-        log.info(f"Loading single configuration from: {data_dir}")
+        log.debug(f"Loading single configuration from: {data_dir}")
 
-        # Load single configuration
+        # Load single configuration (outside profiling)
         data = load_single_configuration(data_dir, frame_idx)
-
         coord = data["coord"]
         box = data["box"]
         atom_types = data["atom_types"]
 
-        log.info("Configuration info:")
-        log.info(f"  Number of atoms: {len(atom_types)}")
-        log.info(f"  Coordinate shape: {coord.shape}")
-        log.info(f"  Box shape: {box.shape}")
-        log.info(f"  Atom types shape: {atom_types.shape}")
-        log.info(f"  Unique atom types: {np.unique(atom_types)}")
+        log.debug("Configuration info:")
+        log.debug(f"  Number of atoms: {len(atom_types)}")
+        log.debug(f"  Coordinate shape: {coord.shape}")
+        log.debug(f"  Box shape: {box.shape}")
+        log.debug(f"  Atom types shape: {atom_types.shape}")
+        log.debug(f"  Unique atom types: {np.unique(atom_types)}")
 
         if data.get("type_map"):
-            log.info(f"  Type map: {data['type_map']}")
-
-        log.info("Starting single configuration inference...")
-
-        # Record time usage
-        start_time = time.time()
-
-        # Perform inference using DeepPot.eval()
-        e, f, v = dp.eval(coord, box, atom_types)
-
-        elapsed_time = time.time() - start_time
-
-        # Print results
+            log.debug(f"  Type map: {data['type_map']}")
+
+        log.debug("Starting single configuration inference...")
+
+        # Use profiler if enabled
+        if enable_profiling:
+            log.info("PyTorch profiling enabled...")
+
+            with torch.profiler.profile(
+                schedule=torch.profiler.schedule(wait=3, warmup=3, active=3, repeat=1),
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    writer.get_logdir()
+                ),
+                record_shapes=True,
+                profile_memory=True,
+                with_stack=True,
+                with_flops=True,
+                with_modules=True,
+            ) as prof:
+                # Warmup and active phases for profiling
+                for phase in range(9):  # 3 wait + 3 warmup + 3 active
+                    if phase == 6:  # Start active profiling
+                        log.debug("Starting profiling phase...")
+
+                    # Record time usage
+                    start_time = time.monotonic()
+
+                    # 3: Use record_function to label the core inference step
+                    with record_function("Inference (DeepPot.eval)"):
+                        # Perform inference using DeepPot.eval()
+                        e, f, v = dp.eval(coord, box, atom_types)
+
+                    elapsed_time = time.monotonic() - start_time
+
+                    if phase == 6:  # End active profiling
+                        log.debug("Ending profiling phase...")
+
+                    # Mark profiler step
+                    prof.step()
+
+                # Save profiling summaries to a log file instead of showing on screen
+                profiling_output_path = "profile_summary.log"
+                with open(profiling_output_path, "w") as pf:
+                    pf.write("=== PyTorch Profiling Summary ===\n")
+                    pf.write("Top 10 CPU operations by total time:\n")
+                    cpu_summary = prof.key_averages().table(
+                        sort_by="cpu_time_total", row_limit=10
+                    )
+                    pf.write(f"{cpu_summary}\n\n")
+
+                    pf.write("Top 10 CUDA operations by total time:\n")
+                    cuda_summary = prof.key_averages().table(
+                        sort_by="cuda_time_total", row_limit=10
+                    )
+                    pf.write(f"{cuda_summary}\n\n")
+
+                    pf.write("Top 10 memory allocations:\n")
+                    memory_summary = prof.key_averages().table(
+                        sort_by="cpu_memory_usage", row_limit=10
+                    )
+                    pf.write(f"{memory_summary}\n")
+
+                log.info("Profile logs saved to ./profile_logs/")
+                log.info(
+                    "To view detailed results, run: tensorboard --logdir=./profile_logs"
+                )
+            writer.close()
+        else:
+            # Regular inference without profiling
+            # Record time usage
+            start_time = time.monotonic()
+
+            # Perform inference using DeepPot.eval()
+            e, f, v = dp.eval(coord, box, atom_types)
+
+            elapsed_time = time.monotonic() - start_time
+
+        # Print results (keep these as info level - these are the main results)
         log.info("\n=== Inference Results ===")
         predicted_energy = e.reshape(-1)[0]
         log.info(f"Predicted energy: {predicted_energy:.6f}")
@@ -161,7 +247,6 @@ def inference_single_config() -> None:
             log.info(f"Energy difference: {energy_diff:.6f}")
 
         predicted_force = f
-        log.info(f"Predicted force shape: {predicted_force.shape}")
         log.info(f"Force norm: {np.linalg.norm(predicted_force):.6f}")
 
         if "force" in data:
@@ -176,6 +261,8 @@ def inference_single_config() -> None:
         log.info("Inference completed successfully!")
         log.info(f"Elapsed time: {elapsed_time:.6f} seconds")
 
+        return elapsed_time
+
     except Exception as e:
         log.error(f"Error during inference: {e}")
         raise
@@ -185,4 +272,46 @@ def inference_single_config() -> None:
 
 
 if __name__ == "__main__":
-    inference_single_config()
+    # Set this to True to enable PyTorch profiling
+    ENABLE_PROFILING = True
+
+    # Run inference and calculate average timing
+    # If profiling is enabled, force single run
+    num_runs = 1 if ENABLE_PROFILING else 10
+    times = []
+
+    model_name = "no"
+
+    print(f"Running inference {num_runs} times...")  # noqa: T201
+    if ENABLE_PROFILING:
+        print("PyTorch profiling ENABLED (single run forced)")  # noqa: T201
+    print("=" * 50)  # noqa: T201
+
+    for i in range(num_runs):
+        print(f"\nRun {i + 1}/{num_runs}")  # noqa: T201
+        print("-" * 20)  # noqa: T201
+
+        # Enable profiling if requested (will only run once anyway)
+        elapsed_time = inference_single_config(
+            model_file=f"{model_name}.pth", enable_profiling=ENABLE_PROFILING
+        )
+        times.append(elapsed_time)
+
+    # Calculate and display statistics
+    print("\n" + "=" * 50)  # noqa: T201
+    print("Timing Summary:")  # noqa: T201
+    print("=" * 50)  # noqa: T201
+
+    # Drop the first run to avoid cold start bias
+    if len(times) > 1:
+        times = times[1:]
+
+    avg_time = sum(times) / len(times)
+    min_time = min(times)
+    max_time = max(times)
+
+    print(f"Average time: {avg_time:.6f} seconds")  # noqa: T201
+    print(f"Min time: {min_time:.6f} seconds")  # noqa: T201
+    print(f"Max time: {max_time:.6f} seconds")  # noqa: T201
+    print(f"Std deviation: {np.std(times):.6f} seconds")  # noqa: T201
+    print(f"All times: {[f'{t:.6f}' for t in times]}")  # noqa: T201
diff --git a/debug/train_debug.py b/debug/train_debug.py
index c1f809c90c..5c47a762dc 100644
--- a/debug/train_debug.py
+++ b/debug/train_debug.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import sys
+import time
 from pathlib import (
     Path,
 )
@@ -19,18 +20,27 @@
 sys.path.insert(0, str(deepmd_root))
 
 
-def train_model() -> None:
+def train_model() -> float:
     """Train the model using the same parameters as the CLI command.
 
     dp --pt train input_torch.json
+
+    Returns
+    -------
+    float
+        Elapsed time for the training in seconds.
     """
     # Import here to avoid module-level import restriction
     from deepmd.pt.entrypoints.main import (
         train,
     )
 
-    # Setup logging
-    logging.basicConfig(level=logging.INFO)
+    # Setup logging with timestamp
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
     log = logging.getLogger(__name__)
 
     # Set working directory to examples/water/se_e3_tebd
@@ -39,7 +49,7 @@ def train_model() -> None:
 
     try:
         os.chdir(work_dir)
-        log.info(f"Changed to working directory: {work_dir}")
+        log.debug(f"Changed to working directory: {work_dir}")
 
         # Training parameters
         input_file = "input_torch.json"
@@ -60,12 +70,15 @@ def train_model() -> None:
                 f"Training input file '{input_file}' not found in {work_dir}"
             )
 
-        log.info(f"Input file: {input_file}")
-        log.info(f"Output config: {output}")
-        log.info(f"Skip neighbor stat: {skip_neighbor_stat}")
-        log.info(f"Compile model: {compile_model}")
+        log.debug(f"Input file: {input_file}")
+        log.debug(f"Output config: {output}")
+        log.debug(f"Skip neighbor stat: {skip_neighbor_stat}")
+        log.debug(f"Compile model: {compile_model}")
+
+        log.debug("Starting model training...")
 
-        log.info("Starting model training...")
+        # Record time usage
+        start_time = time.monotonic()
 
         # Call the training function
         train(
@@ -82,8 +95,14 @@ def train_model() -> None:
             output=output,
         )
 
+        elapsed_time = time.monotonic() - start_time
+
+        # Print results (keep these as info level - these are the main results)
         log.info("Model training completed successfully!")
         log.info(f"Output configuration saved to: {output}")
+        log.info(f"Elapsed time: {elapsed_time:.2f} seconds")
+
+        return elapsed_time
 
     except Exception as e:
         log.error(f"Error during training: {e}")
diff --git a/doc/outisli/install.md b/doc/outisli/install.md
index 91d526a3b7..220d21ba3f 100644
--- a/doc/outisli/install.md
+++ b/doc/outisli/install.md
@@ -74,27 +74,46 @@ sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
 git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli
 
 # 1.2 Create virtual environment
-# optional if you installed miniforge: alias conda="mamba"
-conda update -n base -c conda-forge conda -y ; conda update -n base -c conda-forge mamba -y
-conda deactivate && conda env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; conda create -n dpmd gcc=13 gxx=13 cmake python=3.13 -c conda-forge -y && conda activate dpmd && pip install --upgrade pip && pip install uv
+# optional if you did not install miniforge (i.e. mamba is unavailable): alias mamba="conda"
+# CUDA 13.0 supports gcc-15
+mamba update -n base -c conda-forge conda -y ; mamba update -n base -c conda-forge mamba -y
+mamba deactivate && mamba env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; mamba create -n dpmd gcc=15 gxx=15 cmake python=3.13 -c conda-forge -y && mamba activate dpmd && pip install --upgrade pip && pip install uv
 
 # 1.3 (Optional) install openmpi if you do not have mpi
 conda install openmpi -c conda-forge
 
 # 2.1 Install pytorch
-uv pip install -U torch --index-url https://download.pytorch.org/whl/cu129
+uv pip install -U torch --index-url https://download.pytorch.org/whl/cu130
 
 # 2.2 (Optional) Install tensorflow
 uv pip install -U tensorflow
 
 # 2.3 (Optional) Install jax
-uv pip install -U jax-ai-stack "jax[cuda]"
+uv pip install -U "tensorflow[and-cuda]" "jax[cuda13]" jax-ai-stack equinox
 
 # 3. Install deepmd-kit
-export DP_VARIANT="cuda" DP_ENABLE_PYTORCH=1 DP_ENABLE_TENSORFLOW=1 DP_ENABLE_PADDLE=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 CUDAToolkit_ROOT="/usr/local/cuda" CUDA_HOME="/usr/local/cuda" CUDA_PATH="/usr/local/cuda" CUDA_VERSION=12.9 && pip install -e . -v
+export CUDA_VERSION=13.1 CUDA_HOME="/usr/local/cuda" && export CUDAToolkit_ROOT=$CUDA_HOME CUDA_PATH=$CUDA_HOME && export DP_VARIANT="cuda" DP_ENABLE_PYTORCH=1 DP_ENABLE_TENSORFLOW=1 DP_ENABLE_PADDLE=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && pip install -e . -v
 
-# 4. Install other useful packages
-uv pip install -U dpdata pymatgen freud-analysis tensorboard torch-tb-profiler seaborn ipykernel nglview git+https://gitlab.com/1041176461/ase-abacus.git
+# 4.1 Install other useful packages
+uv pip install -U dpdata pymatgen freud-analysis seaborn ipykernel nglview "git+https://gitlab.com/1041176461/ase-abacus.git"
+# 4.2 For developers
+uv pip install -U pytest pre-commit tensorboard torch-tb-profiler tensorboard-plugin-profile
+```
+
+### 1.1+ Check GPU Installation
+
+```shell
+# pytorch
+python -c "import torch; print('PyTorch devices:', [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else 'CPU')"
+
+# tensorflow
+python -c "import tensorflow as tf; print('TF devices:', tf.config.list_physical_devices('GPU'))"
+
+# JAX
+python -c "import jax; print('JAX devices:', jax.devices())"
+
+# All in One
+python -c "import torch, tensorflow as tf, jax; print('PyTorch: ', [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else 'CPU'); print('TF:      ', tf.config.list_physical_devices('GPU')); print('JAX:     ', jax.devices())"
 ```
 
 ## 1.2 For Mac
@@ -114,18 +133,19 @@ uv pip install -U torch
 # 3. Install deepmd-kit
 export DP_ENABLE_PYTORCH=1 DP_ENABLE_PADDLE=0 DP_ENABLE_TENSORFLOW=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && uv pip install -e . -v
 
-# 4. Install other useful packages
-uv pip install -U dpdata pymatgen freud-analysis tensorboard torch-tb-profiler seaborn ipykernel nglview git+https://gitlab.com/1041176461/ase-abacus.git
+# 4.1 Install other useful packages
+uv pip install -U dpdata pymatgen freud-analysis seaborn ipykernel nglview "git+https://gitlab.com/1041176461/ase-abacus.git"
+# 4.2 For developers
+uv pip install -U pytest pre-commit tensorboard torch-tb-profiler tensorboard-plugin-profile
 ```
 
 # 2. Install the C++ interface
 
-If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python interface installed in the previous section does everything and he/she can safely skip this section.
+> If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python interface installed in the previous section does everything and he/she can safely skip this section.
 
 ```shell
 # 0. (Optional) for reinstall
-export software="$HOME/Software"
-rm -rfv $software/deepmd-kit_cpp $software/deepmd-kit/source/build
+export software="$HOME/Software" && rm -rfv $software/deepmd-kit_cpp $software/deepmd-kit/source/build
 
 # 1. Environment Variables
 export deepmd_source_dir=$(pwd) && mkdir -p ../deepmd-kit_cpp && cd ../deepmd-kit_cpp && export deepmd_root=$(pwd) && cd ../deepmd-kit && cd source && mkdir -p build && cd build
@@ -133,36 +153,143 @@ export deepmd_source_dir=$(pwd) && mkdir -p ../deepmd-kit_cpp && cd ../deepmd-ki
 # export deepmd_root="$software/deepmd-kit_cpp"
 
 # 2. CMake (Choice either one)
-# 2.1 Option 1: use pytorch only
-cmake -DENABLE_PYTORCH=ON -DUSE_PT_PYTHON_LIBS=ON -DUSE_CUDA_TOOLKIT=ON -DENABLE_NATIVE_OPTIMIZATION=ON -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
 
-# 2.2 Option 2: use pytorch & tensorflow & jax
-cmake -DENABLE_TENSORFLOW=ON -DUSE_TF_PYTHON_LIBS=ON -DENABLE_PYTORCH=ON -DUSE_PT_PYTHON_LIBS=ON -DENABLE_JAX=ON -DUSE_CUDA_TOOLKIT=ON -DENABLE_NATIVE_OPTIMIZATION=ON -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
+# 2.1 Option 1: use pytorch & tensorflow & jax (from python env)
+cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+      -DENABLE_NATIVE_OPTIMIZATION=ON \
+      -DUSE_CUDA_TOOLKIT=ON \
+      -DENABLE_PYTORCH=ON \
+      -DUSE_PT_PYTHON_LIBS=ON \
+      -DENABLE_TENSORFLOW=ON \
+      -DUSE_TF_PYTHON_LIBS=ON \
+      -DENABLE_JAX=ON \
+      -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
+
+# 2.2 Option 2: use pytorch only (from python env)
+cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+      -DENABLE_NATIVE_OPTIMIZATION=ON \
+      -DUSE_CUDA_TOOLKIT=ON \
+      -DENABLE_PYTORCH=ON \
+      -DUSE_PT_PYTHON_LIBS=ON \
+      -DENABLE_TENSORFLOW=OFF \
+      -DUSE_TF_PYTHON_LIBS=OFF \
+      -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..
+
+# 2.3 Option 3: use libtorch (standalone)
+wget https://download.pytorch.org/libtorch/cu128/libtorch-cxx11-abi-shared-with-deps-latest.zip
+unzip libtorch-cxx11-abi-shared-with-deps-latest.zip
+# Note: $software/libtorch is the unzipped dir, CMAKE_INSTALL_PREFIX is set to a local dir
+cmake -DCMAKE_INSTALL_PREFIX="../../install" \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+      -DENABLE_NATIVE_OPTIMIZATION=ON \
+      -DUSE_CUDA_TOOLKIT=ON \
+      -DENABLE_PYTORCH=ON \
+      -DUSE_PT_PYTHON_LIBS=OFF \
+      -DCMAKE_PREFIX_PATH=$software/libtorch ..
 
 # 3. Install
 make -j && make install
+
+# 4. (Optional) Symlink compile_commands.json into the repository root
+rm $software/deepmd-kit/compile_commands.json ; ln -s "$(pwd)/compile_commands.json" $software/deepmd-kit
 ```
 
 # 3. Install LAMMPS’s DeePMD-kit module (built-in mode)
 
 _Before following this section, [DeePMD-kit C++ interface](https://docs.deepmodeling.com/projects/deepmd/en/master/install/install-from-source.html) should have be installed_ (see 3.3)
 
+Note on GPU Architecture: You must specify your GPU architecture via `-DGPU_ARCH=sm_XX`.
+
+Check yours with `nvidia-smi -q | grep Architecture`, or look up the exact value for your card model in the list below.
+
+> Common values:
+> - Pascal (GTX 1080, Titan X): sm_61
+> - Volta (V100): sm_70
+> - Turing (RTX 20xx, T4): sm_75
+> - Ampere: sm_80 (A100) or sm_86 (RTX 30xx)
+> - Lovelace (RTX 40xx): sm_89
+> - Hopper (H100): sm_90
+> - Blackwell: sm_100 (B200) or sm_103 (B300) or sm_120 (RTX 50xx, RTX PRO 6000)
+
 ```shell
 # 0.
-export software="$HOME/Software"
-make lammps && rm -rf $software/lammps
+export software="$HOME/Software" && export deepmd_source_dir="$software/deepmd-kit" && export deepmd_root="$software/deepmd-kit_cpp"
+cd "${deepmd_source_dir}/source/build" && make lammps && rm -rf $software/lammps
 
 # 1. Install requirements
 # Or conda install
-mamba install jpeg libpng zlib -c conda-forge -y
+# (jpeg, libpng: dependencies for dump image command)
+# (zlib: dependency for COMPRESS package, for .gz trajectory output)
+# (fftw: dependency for KSPACE package)
+# (voro: dependency for VORONOI package, for defect analysis)
+mamba install jpeg libpng zlib fftw voro -c conda-forge -y
 
-# 2. Download LAMMPS
-cd $software && mkdir -p lammps && cd lammps && wget https://gh-proxy.com/github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz && tar xzf stable_22Jul2025.tar.gz && cd lammps-stable_22Jul2025 && mkdir -p build && cd build
-# wget https://github.com/lammps/lammps/archive/stable_22Jul2025.tar.gz
+# 2. Download lammps
+cd $software && mkdir -p lammps && cd lammps && export version="stable_22Jul2025_update2" && wget "https://gh-proxy.com/github.com/lammps/lammps/archive/${version}.tar.gz" && tar xzf "${version}.tar.gz" && cd "lammps-${version}" && mkdir -p build && cd build
+# wget https://github.com/lammps/lammps/archive/stable_22Jul2025_update2.tar.gz
 
 # 3. Compile
+# !!! CHANGE THIS TO MATCH YOUR GPU !!!
+# Example: sm_80 for A100, sm_86 for RTX 30xx, sm_89 for RTX 40xx, sm_120 for 50xx
+export CUDA_VERSION=13.1 && export CUDA_HOME="/usr/local/cuda-${CUDA_VERSION}" && export CUDA_PATH="/usr/local/cuda-${CUDA_VERSION}" && export LAMMPS_GPU_ARCH="sm_89"
+# WM: export LAMMPS_GPU_ARCH="sm_80" && export CUDA_PATH="/lustre/software/cuda/12.6.0"
+
+# 3.1 Option 1: use pytorch & tensorflow & jax
 echo "include($deepmd_source_dir/source/lmp/builtin.cmake)" >> ../cmake/CMakeLists.txt && export TORCH_CMAKE_DIR=$(python -c "import torch; print(torch.utils.cmake_prefix_path)") && export TF_LIB_PATH=$(find $CONDA_PREFIX -name "libtensorflow_framework.so.2" | xargs dirname)
-cmake -DLAMMPS_INSTALL_RPATH=ON -DBUILD_SHARED_LIBS=yes -DCMAKE_INSTALL_PREFIX=$deepmd_root -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH" ../cmake
+
+# for gcc13
+cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_SHARED_LIBS=yes \
+      -DLAMMPS_INSTALL_RPATH=ON \
+      -DPKG_KSPACE=ON \
+      -DPKG_VORONOI=ON \
+      -DPKG_PYTHON=ON \
+      -DPKG_COMPRESS=ON \
+      -DPKG_OPENMP=ON \
+      -DPKG_GPU=ON \
+      -DGPU_API=cuda \
+      -DGPU_ARCH=$LAMMPS_GPU_ARCH \
+      -DBIN2C=$CUDA_PATH/bin/bin2c \
+      -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" \
+      -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH" ../cmake
+
+# for gcc15/CUDA13+ (the gcc13 command above does not work for some reason)
+cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_SHARED_LIBS=yes \
+      -DLAMMPS_INSTALL_RPATH=ON \
+      -DPKG_KSPACE=ON \
+      -DPKG_VORONOI=ON \
+      -DPKG_PYTHON=ON \
+      -DPKG_COMPRESS=ON \
+      -DPKG_OPENMP=ON \
+      -DPKG_GPU=ON \
+      -DGPU_API=cuda \
+      -DGPU_ARCH=$LAMMPS_GPU_ARCH \
+      -DBIN2C=$CUDA_PATH/bin/bin2c \
+      -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" \
+      -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH -Wl,-rpath-link,/usr/lib/x86_64-linux-gnu -lm" ../cmake
+
+# 3.2 Option 2: use pytorch only
+echo "include($deepmd_source_dir/source/lmp/builtin.cmake)" >> ../cmake/CMakeLists.txt && export TORCH_CMAKE_DIR=$(python -c "import torch; print(torch.utils.cmake_prefix_path)")
+cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_SHARED_LIBS=yes \
+      -DLAMMPS_INSTALL_RPATH=ON \
+      -DPKG_KSPACE=ON \
+      -DPKG_VORONOI=ON \
+      -DPKG_PYTHON=ON \
+      -DPKG_COMPRESS=ON \
+      -DPKG_OPENMP=ON \
+      -DPKG_GPU=ON \
+      -DGPU_API=cuda \
+      -DGPU_ARCH=$LAMMPS_GPU_ARCH \
+      -DBIN2C=$CUDA_PATH/bin/bin2c \
+      -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" ../cmake
+
 make -j && make install
 
 # test

From 0622935e3e4c08b45e68f51d0177e6a400285f7b Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 1 Jan 2026 11:44:12 +0800
Subject: [PATCH 10/11] add grad debug script

---
 debug/train_debug_gradient.py | 191 ++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 debug/train_debug_gradient.py

diff --git a/debug/train_debug_gradient.py b/debug/train_debug_gradient.py
new file mode 100644
index 0000000000..94998452dd
--- /dev/null
+++ b/debug/train_debug_gradient.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Debug script for locating gradient explosion in SeZM-Net + ZBL training.
+
+This script uses torch.autograd.set_detect_anomaly and gradient hooks to
+pinpoint the exact location of NaN/Inf gradients.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import logging
+import os
+import pdb
+import sys
+from pathlib import (
+    Path,
+)
+
+import torch  # noqa: TID253
+
+# Put the repository root (the parent of debug/) first on sys.path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+# Enable anomaly detection globally; deepmd is imported later, inside train_with_debug()
+torch.autograd.set_detect_anomaly(True)
+
+
+def register_gradient_hooks(
+    model: torch.nn.Module, log: logging.Logger, large_grad: float = 1e6
+) -> None:
+    """Attach a NaN/Inf/large-value gradient hook to each trainable parameter.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The model to monitor; only ``requires_grad`` parameters get a hook.
+    log : logging.Logger
+        Logger for output.
+    large_grad : float
+        Warn when the largest absolute gradient entry exceeds this value.
+    """
+
+    def check(name: str, grad: torch.Tensor) -> None:
+        if grad is None:
+            return
+        finite = grad[torch.isfinite(grad)]
+        if finite.numel() < grad.numel():
+            # NaN is reported in preference to Inf, as in the original checks.
+            kind = "NaN" if torch.isnan(grad).any() else "Inf"
+            log.error(f"{kind} gradient detected in: {name}")
+            log.error(f"  Gradient shape: {grad.shape}")
+            # min()/max() over a tensor holding NaN are NaN: use finite entries.
+            if finite.numel() > 0:
+                log.error(f"  Finite stats: min={finite.min()}, max={finite.max()}")
+            pdb.set_trace()
+        elif grad.numel() > 0 and grad.abs().max() > large_grad:
+            log.warning(f"Large gradient detected in: {name}")
+            log.warning(f"  Gradient max abs: {grad.abs().max()}")
+
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            param.register_hook(lambda grad, name=name: check(name, grad))
+
+
+def register_tensor_hooks(model: torch.nn.Module, log: logging.Logger) -> list:
+    """Register forward hooks that stop in ``pdb`` on NaN/Inf activations.
+
+    Every module of ``model`` (including ``model`` itself) gets one hook that
+    checks tensor inputs and tensor outputs; outputs may be a single tensor
+    or a tuple/list of tensors. Non-tensor values are ignored.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The model to monitor.
+    log : logging.Logger
+        Logger for output.
+
+    Returns
+    -------
+    list
+        List of hook handles for cleanup (call ``remove()`` on each).
+    """
+
+    def scan(value: object, label: str) -> None:
+        # Report the first kind of non-finite value found and break into pdb.
+        if not isinstance(value, torch.Tensor):
+            return
+        if torch.isnan(value).any():
+            kind = "NaN"
+        elif torch.isinf(value).any():
+            kind = "Inf"
+        else:
+            return
+        log.error(f"{kind} in {label}")
+        log.error(f"  Tensor shape: {value.shape}")
+        pdb.set_trace()
+
+    def forward_hook(name: str, inputs: tuple, output: object) -> None:
+        # Check inputs
+        for i, inp in enumerate(inputs):
+            scan(inp, f"input[{i}] of {name}")
+        # Check outputs
+        if isinstance(output, (tuple, list)):
+            for j, out in enumerate(output):
+                scan(out, f"output[{j}] of {name}")
+        else:
+            scan(output, f"output of {name}")
+
+    handles = []
+    for name, module in model.named_modules():
+        # Bind ``name`` per module through the default argument; PyTorch calls
+        # the hook positionally as hook(module, args, output).
+        handles.append(
+            module.register_forward_hook(
+                lambda _mod, args, out, name=name: forward_hook(name, args, out)
+            )
+        )
+
+    return handles
+
+
+def train_with_debug() -> None:
+    """Run deepmd.pt training with gradient hooks attached to the Trainer.
+
+    The run directory is ``sys.argv[1]`` when given, else the original default.
+    ``Trainer.__init__`` is monkey-patched for the run and restored afterwards.
+    """
+    from deepmd.pt.entrypoints.main import (
+        train,
+    )
+    from deepmd.pt.train.training import (
+        Trainer,
+    )
+
+    original_init = Trainer.__init__
+
+    def patched_init(self: Trainer, *args: object, **kwargs: object) -> None:
+        original_init(self, *args, **kwargs)
+        log = logging.getLogger("GradientDebug")
+        log.info("Registering gradient hooks...")
+        register_gradient_hooks(self.wrapper, log)
+        # Forward-pass debugging (slow): register_tensor_hooks(self.wrapper, log)
+
+    Trainer.__init__ = patched_init  # every Trainer built below gets the hooks
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    log = logging.getLogger(__name__)
+
+    # The hard-coded path stays as the fallback so existing usage is unchanged.
+    default_dir = "/home/outisli/Research/dp_train/se_zm/pair/l_2"
+    work_dir = Path(sys.argv[1] if len(sys.argv) > 1 else default_dir)
+    original_cwd = os.getcwd()
+    try:
+        os.chdir(work_dir)
+        log.info(f"Working directory: {work_dir}")
+        log.info("Anomaly detection enabled - will show traceback on NaN/Inf")
+
+        train(
+            input_file="input.json",
+            init_model=None,
+            restart=None,
+            finetune=None,
+            init_frz_model=None,
+            model_branch="default",
+            skip_neighbor_stat=True,
+            use_pretrain_script=False,
+            force_load=False,
+            compile_model=False,
+            output="out.json",
+        )
+    except RuntimeError as e:
+        if "nan" in str(e).lower() or "inf" in str(e).lower():
+            log.error(f"Gradient anomaly detected: {e}")
+            log.error("The traceback above shows where the NaN/Inf was introduced.")
+            pdb.post_mortem()
+        else:
+            raise
+    finally:
+        os.chdir(original_cwd)
+        Trainer.__init__ = original_init
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    train_with_debug()

From 573a7e390abe45f274628d4747c03281baf87e6b Mon Sep 17 00:00:00 2001
From: OutisLi <LTC201806070316@gmail.com>
Date: Thu, 15 Jan 2026 12:25:11 +0800
Subject: [PATCH 11/11] refactor: unify learning rate schedulers with array API

- Refactor BaseLR in dpmodel to use array_api_compat for backend-agnostic implementation
- Consolidate learning rate logic from TF/PT/PD backends into unified dpmodel layer
- Use array API operations (xp.where, xp.clip, etc.) for JIT compatibility across backends
- Add warmup support (warmup_steps, warmup_ratio, warmup_start_factor) during refactoring
- Add stop_ratio parameter as alternative to stop_lr for flexible configuration
- Implement mutual exclusion validation for stop_lr/stop_ratio and warmup_steps/warmup_ratio
- Update all backends to use unified BaseLR implementation
- Add comprehensive consistency tests across NumPy/PyTorch/JAX/array_api_strict backends
---
 deepmd/dpmodel/utils/learning_rate.py         | 313 +++++++++++++++---
 deepmd/pd/train/training.py                   |  20 +-
 deepmd/pd/utils/utils.py                      |   6 +-
 deepmd/pt/train/training.py                   |  41 +--
 deepmd/pt/utils/utils.py                      |   6 +-
 deepmd/tf/fit/dipole.py                       |   2 +-
 deepmd/tf/fit/dos.py                          |   2 +-
 deepmd/tf/fit/ener.py                         |   2 +-
 deepmd/tf/fit/fitting.py                      |   2 +-
 deepmd/tf/fit/polar.py                        |   2 +-
 deepmd/tf/train/trainer.py                    |  22 +-
 deepmd/tf/utils/__init__.py                   |   4 +-
 deepmd/tf/utils/learning_rate.py              | 180 +++++-----
 deepmd/utils/argcheck.py                      | 225 ++++++++++---
 source/tests/consistent/test_learning_rate.py |  22 +-
 source/tests/pd/model/test_model.py           |  11 +-
 source/tests/pd/test_lr.py                    |  51 ++-
 source/tests/pt/model/test_model.py           |  11 +-
 source/tests/pt/test_lr.py                    |  57 ++--
 source/tests/tf/test_lr.py                    | 114 +++++++
 .../dpmodel/utils/test_learning_rate.py       | 240 ++++++++++++++
 21 files changed, 1033 insertions(+), 300 deletions(-)
 create mode 100644 source/tests/tf/test_lr.py
 create mode 100644 source/tests/universal/dpmodel/utils/test_learning_rate.py

diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py
index 7ea50583e2..c370ad6f58 100644
--- a/deepmd/dpmodel/utils/learning_rate.py
+++ b/deepmd/dpmodel/utils/learning_rate.py
@@ -29,77 +29,243 @@ def __new__(cls: type, *args: Any, **kwargs: Any) -> Any:
         return super().__new__(cls)
 
     def __init__(
-        self, start_lr: float, stop_lr: float, stop_steps: int, **kwargs: Any
+        self,
+        start_lr: float,
+        stop_lr: float | None = None,
+        stop_ratio: float | None = None,
+        stop_steps: int = 100000,
+        warmup_steps: int = 0,
+        warmup_ratio: float | None = None,
+        warmup_start_factor: float = 0.0,
+        **kwargs: Any,
     ) -> None:
         """
-        Base class for learning rate schedules.
+        Base class for learning rate schedules with warmup support.
 
         Parameters
         ----------
-        start_lr
-            The initial learning rate.
-        stop_lr
-            The final learning rate.
-        stop_steps
-            The total training steps for learning rate scheduler.
+        start_lr : float
+            The learning rate at the start of the training (after warmup).
+        stop_lr : float, optional
+            The final learning rate at the end of the training.
+            Mutually exclusive with stop_ratio.
+        stop_ratio : float, optional
+            The ratio of stop_lr to start_lr. stop_lr = start_lr * stop_ratio.
+            Mutually exclusive with stop_lr.
+            One of stop_lr or stop_ratio must be provided.
+        stop_steps : int
+            The total training steps (including warmup).
+        warmup_steps : int, optional
+            The number of steps for learning rate warmup.
+            Mutually exclusive with warmup_ratio. Default is 0 (no warmup).
+        warmup_ratio : float, optional
+            The ratio of warmup steps to total training steps.
+            warmup_steps = int(warmup_ratio * stop_steps).
+            Mutually exclusive with warmup_steps.
+        warmup_start_factor : float, optional
+            The factor of start_lr for the initial warmup learning rate.
+            The warmup learning rate starts from warmup_start_factor * start_lr.
+            Default is 0.0.
         """
+        # === Step 1. Compute stop_lr from stop_ratio if needed ===
+        # Mutual exclusion validated in argcheck.py
+        if stop_ratio is not None:
+            self.stop_lr = start_lr * stop_ratio
+        else:
+            self.stop_lr = stop_lr  # type: ignore[assignment]
+
+        # === Step 2. Compute warmup_steps from warmup_ratio if needed ===
+        # Mutual exclusion validated in argcheck.py
+        if warmup_ratio is not None:
+            self.warmup_steps = int(warmup_ratio * stop_steps)
+        else:
+            self.warmup_steps = warmup_steps
+
+        # === Step 3. Validate step ranges (runtime check) ===
+        if stop_steps <= 0:
+            raise ValueError("stop_steps must be positive")
+        if self.warmup_steps < 0:
+            raise ValueError("warmup_steps must be non-negative")
+        if self.warmup_steps >= stop_steps:
+            raise ValueError("warmup_steps must be smaller than stop_steps")
+
+        # === Step 4. Compute warmup_start_lr ===
+        self.warmup_start_lr = warmup_start_factor * start_lr
+
+        # === Step 5. Store core parameters ===
         self.start_lr = start_lr
-        self.stop_lr = stop_lr
         self.stop_steps = stop_steps
+        # Decay phase covers (stop_steps - warmup_steps) steps
+        self.decay_stop_steps = stop_steps - self.warmup_steps
 
     @abstractmethod
-    def value(self, step: int | Array) -> Array:
-        """Get the learning rate at the given step."""
-        # in optax, step will be a jnp.ndarray passed in JIT mode
+    def _decay_value(self, step: int | Array) -> Array:
+        """
+        Get the decayed learning rate at the given step (after warmup).
+
+        This method should implement the actual decay logic (exp, cosine, etc.)
+        without considering warmup.
+
+        Parameters
+        ----------
+        step : int or Array
+            The step index relative to the end of warmup.
+            For example, if warmup_steps=100 and total_step=150, this method
+            will be called with step=50.
+
+        Returns
+        -------
+        Array
+            The decayed learning rate (absolute value, not factor).
+        """
         pass
 
+    def value(self, step: int | Array) -> Array | float:
+        """
+        Get the learning rate at the given step, including warmup.
+
+        Parameters
+        ----------
+        step : int or Array
+            The absolute step index from the start of training.
+
+        Returns
+        -------
+        Array
+            The learning rate at the given step.
+        """
+        is_scalar = isinstance(step, (int, float))
+        if not array_api_compat.is_array_api_obj(step):
+            step = np.asarray(step)
+        xp = array_api_compat.array_namespace(step)
+
+        # === Step 1. Handle no-warmup case directly ===
+        if self.warmup_steps == 0:
+            lr = self._decay_value(xp.astype(step, xp.float64))
+        else:
+            # === Step 2. Warmup phase ===
+            # Linear warmup from warmup_start_lr to start_lr
+            warmup_progress = xp.astype(step, xp.float64) / self.warmup_steps
+            warmup_lr = (
+                self.warmup_start_lr
+                + (self.start_lr - self.warmup_start_lr) * warmup_progress
+            )
+
+            # === Step 3. Decay phase ===
+            # Call subclass decay logic for steps after warmup
+            decay_step = xp.maximum(
+                xp.astype(step, xp.float64) - self.warmup_steps, 0.0
+            )
+            decay_lr = self._decay_value(decay_step)
+
+            # === Step 4. Select warmup or decay based on step ===
+            lr = xp.where(step < self.warmup_steps, warmup_lr, decay_lr)
+
+        if is_scalar:
+            return float(lr)
+        return lr
+
 
 @BaseLR.register("exp")
 class LearningRateExp(BaseLR):
     def __init__(
         self,
         start_lr: float,
-        stop_lr: float,
-        decay_steps: int,
-        stop_steps: int,
+        stop_lr: float | None = None,
+        stop_ratio: float | None = None,
+        decay_steps: int = 5000,
+        stop_steps: int = 100000,
         decay_rate: float | None = None,
+        warmup_steps: int = 0,
+        warmup_ratio: float | None = None,
+        warmup_start_factor: float = 0.0,
         **kwargs: Any,
     ) -> None:
         """
-        Construct an exponential-decayed learning rate.
+        Construct an exponential-decayed learning rate with optional warmup.
 
         Parameters
         ----------
-        start_lr
-            The learning rate at the start of the training.
-        stop_lr
+        start_lr : float
+            The learning rate at the start of the training (after warmup).
+        stop_lr : float, optional
             The desired learning rate at the end of the training.
             When decay_rate is explicitly set, this value will serve as
-            the minimum learning rate during training. In other words,
-            if the learning rate decays below stop_lr, stop_lr will be applied instead.
-        decay_steps
+            the minimum learning rate during training.
+            Mutually exclusive with stop_ratio.
+        stop_ratio : float, optional
+            The ratio of stop_lr to start_lr.
+            Mutually exclusive with stop_lr.
+        decay_steps : int
             The learning rate is decaying every this number of training steps.
-        stop_steps
-            The total training steps for learning rate scheduler.
-        decay_rate
+            Default is 5000.
+        stop_steps : int
+            The total training steps (including warmup).
+        decay_rate : float, optional
             The decay rate for the learning rate.
             If provided, the decay rate will be set instead of
             calculating it through interpolation between start_lr and stop_lr.
+        warmup_steps : int, optional
+            The number of steps for learning rate warmup.
+            Mutually exclusive with warmup_ratio. Default is 0.
+        warmup_ratio : float, optional
+            The ratio of warmup steps to total training steps.
+            Mutually exclusive with warmup_steps.
+        warmup_start_factor : float, optional
+            The factor of start_lr for the initial warmup learning rate.
+            Default is 0.0.
+
+        Raises
+        ------
+        ValueError
+            If stop_steps <= 0, or warmup_steps < 0 or >= stop_steps.
+            (stop_lr/stop_ratio and warmup exclusivity is checked in argcheck.py.)
+            If decay_steps is larger than the decay phase total steps.
         """
-        super().__init__(start_lr, stop_lr, stop_steps, **kwargs)
-        default_ds = 100 if stop_steps // 10 > 100 else stop_steps // 100 + 1
+        super().__init__(
+            start_lr=start_lr,
+            stop_lr=stop_lr,
+            stop_ratio=stop_ratio,
+            stop_steps=stop_steps,
+            warmup_steps=warmup_steps,
+            warmup_ratio=warmup_ratio,
+            warmup_start_factor=warmup_start_factor,
+            **kwargs,
+        )
+        # === Step 5. Compute decay_rate for exp scheduler ===
+        # Use decay_stop_steps (stop_steps - warmup_steps) for decay calculation
+        decay_total = self.decay_stop_steps
         self.decay_steps = decay_steps
-        if self.decay_steps >= stop_steps:
-            self.decay_steps = default_ds
+
+        if self.decay_steps > decay_total:
+            raise ValueError(
+                f"decay_steps ({self.decay_steps}) must not exceed decay phase steps ({decay_total})."
+            )
+
+        # Avoid log(0) issues by clamping stop_lr for computation
+        clamped_stop_lr = max(self.stop_lr, 1e-10)
+        self.min_lr = self.stop_lr
+
         self.decay_rate = np.exp(
-            np.log(stop_lr / self.start_lr) / (stop_steps / self.decay_steps)
+            np.log(clamped_stop_lr / self.start_lr) / (decay_total / self.decay_steps)
         ).item()
         if decay_rate is not None:
             self.decay_rate = decay_rate
-        self.min_lr = self.stop_lr
 
-    def value(self, step: int | Array) -> Array:
-        """Get the learning rate at the given step."""
+    def _decay_value(self, step: int | Array) -> Array:
+        """
+        Get the exponential-decayed learning rate at the given step.
+
+        Parameters
+        ----------
+        step : int or Array
+            The step index relative to the end of warmup.
+
+        Returns
+        -------
+        Array
+            The decayed learning rate (absolute value).
+        """
         if not array_api_compat.is_array_api_obj(step):
             step = np.asarray(step)
         xp = array_api_compat.array_namespace(step)
@@ -107,8 +273,7 @@ def value(self, step: int | Array) -> Array:
             xp.asarray(self.decay_rate, device=array_api_compat.device(step)),
             xp.astype(step // self.decay_steps, xp.float64),
         )
-        # the original implementation `if step_lr < self.min_lr:`
-        # will cause a dynamic graph which is unsupported in JAX JIT
+        # Floor at min_lr via clip; a Python `if` on step_lr would break JAX JIT
         step_lr = xp.clip(step_lr, self.min_lr, None)
         return step_lr
 
@@ -118,29 +283,74 @@ class LearningRateCosine(BaseLR):
     def __init__(
         self,
         start_lr: float,
-        stop_lr: float,
-        stop_steps: int,
+        stop_lr: float | None = None,
+        stop_ratio: float | None = None,
+        stop_steps: int = 100000,
+        warmup_steps: int = 0,
+        warmup_ratio: float | None = None,
+        warmup_start_factor: float = 0.0,
         **kwargs: Any,
     ) -> None:
         """
-        Defines a cosine annealing learning rate schedule.
-        The learning rate starts at `start_lr` and gradually decreases to `stop_lr`
-        following a cosine curve over the training steps.
+        Defines a cosine annealing learning rate schedule with optional warmup.
+
+        The learning rate starts at `start_lr` (after warmup) and gradually
+        decreases to `stop_lr` following a cosine curve over the training steps.
 
         Parameters
         ----------
-        start_lr
-            The initial learning rate at the beginning of training.
-        stop_lr
+        start_lr : float
+            The learning rate at the start of the training (after warmup).
+        stop_lr : float, optional
             The final learning rate at the end of training.
-        stop_steps
-            The total number of training steps over which the learning rate
-            will be annealed from start_lr to stop_lr.
+            Mutually exclusive with stop_ratio.
+        stop_ratio : float, optional
+            The ratio of stop_lr to start_lr.
+            Mutually exclusive with stop_lr.
+        stop_steps : int
+            The total training steps (including warmup).
+        warmup_steps : int, optional
+            The number of steps for learning rate warmup.
+            Mutually exclusive with warmup_ratio. Default is 0.
+        warmup_ratio : float, optional
+            The ratio of warmup steps to total training steps.
+            Mutually exclusive with warmup_steps.
+        warmup_start_factor : float, optional
+            The factor of start_lr for the initial warmup learning rate.
+            Default is 0.0.
+
+        Raises
+        ------
+        ValueError
+            If stop_steps <= 0, or warmup_steps < 0 or >= stop_steps.
+            (stop_lr/stop_ratio and warmup exclusivity is checked in argcheck.py.)
         """
-        super().__init__(start_lr, stop_lr, stop_steps, **kwargs)
-        self.lr_min_factor = stop_lr / start_lr
+        super().__init__(
+            start_lr=start_lr,
+            stop_lr=stop_lr,
+            stop_ratio=stop_ratio,
+            stop_steps=stop_steps,
+            warmup_steps=warmup_steps,
+            warmup_ratio=warmup_ratio,
+            warmup_start_factor=warmup_start_factor,
+            **kwargs,
+        )
+        self.lr_min_factor = self.stop_lr / self.start_lr
 
-    def value(self, step: int | Array) -> Array:
+    def _decay_value(self, step: int | Array) -> Array:
+        """
+        Get the cosine-annealed learning rate at the given step.
+
+        Parameters
+        ----------
+        step : int or Array
+            The step index relative to the end of warmup.
+
+        Returns
+        -------
+        Array
+            The annealed learning rate (absolute value).
+        """
         if not array_api_compat.is_array_api_obj(step):
             step = np.asarray(step)
         xp = array_api_compat.array_namespace(step)
@@ -153,11 +363,12 @@ def value(self, step: int | Array) -> Array:
                 1
                 + xp.cos(
                     xp.asarray(
-                        xp.pi * (xp.astype(step, xp.float64) / self.stop_steps),
+                        xp.pi * (xp.astype(step, xp.float64) / self.decay_stop_steps),
                         device=array_api_compat.device(step),
                     )
                 )
             )
         )
-        step_lr = xp.where(step >= self.stop_steps, min_lr, step_lr)
+        # Clip to min_lr for steps beyond decay_stop_steps
+        step_lr = xp.where(step >= self.decay_stop_steps, min_lr, step_lr)
         return step_lr
diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
index dd0fbdc94b..df38623132 100644
--- a/deepmd/pd/train/training.py
+++ b/deepmd/pd/train/training.py
@@ -239,7 +239,7 @@ def get_sample():
             return get_sample
 
         def get_lr(lr_params: dict[str, Any]) -> BaseLR:
-            lr_params["stop_steps"] = self.num_steps - self.warmup_steps
+            lr_params["stop_steps"] = self.num_steps
             lr_schedule = BaseLR(**lr_params)
             return lr_schedule
 
@@ -387,11 +387,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
                     )
 
         # Learning rate
-        self.warmup_steps = training_params.get("warmup_steps", 0)
         self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
-        assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
-            "Warm up steps must be less than total training steps!"
-        )
         if self.multi_task and config.get("learning_rate_dict", None) is not None:
             self.lr_exp = {}
             for model_key in self.model_keys:
@@ -580,18 +576,13 @@ def single_model_finetune(
 
         # TODO add lr warmups for multitask
         # author: iProzd
-        def warm_up_linear(step, warmup_steps):
-            if step < warmup_steps:
-                return step / warmup_steps
-            else:
-                return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr
-
         # TODO add optimizers for multitask
         # author: iProzd
         if self.opt_type == "Adam":
             self.scheduler = paddle.optimizer.lr.LambdaDecay(
                 learning_rate=self.lr_exp.start_lr,
-                lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
+                lr_lambda=lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
             self.optimizer = paddle.optimizer.Adam(
                 learning_rate=self.scheduler, parameters=self.wrapper.parameters()
@@ -755,10 +746,7 @@ def step(_step_id, task_key="Default") -> None:
                 fout1.flush()
             if self.opt_type == "Adam":
                 cur_lr = self.scheduler.get_lr()
-                if _step_id < self.warmup_steps:
-                    pref_lr = _lr.start_lr
-                else:
-                    pref_lr = cur_lr
+                pref_lr = cur_lr
 
                 # disable synchronization in forward-backward manually
                 # as derivatives exist in model forward
diff --git a/deepmd/pd/utils/utils.py b/deepmd/pd/utils/utils.py
index 7224547805..e939f84cb3 100644
--- a/deepmd/pd/utils/utils.py
+++ b/deepmd/pd/utils/utils.py
@@ -239,7 +239,11 @@ def to_numpy_array(
 ):
     if xx is None:
         return None
+    if isinstance(xx, (float, int)):
+        return np.array(xx)
     assert xx is not None
+    if isinstance(xx, np.ndarray):
+        return xx
     # Create a reverse mapping of PD_PRECISION_DICT
     reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
     # Use the reverse mapping to find keys with the desired value
@@ -247,8 +251,6 @@ def to_numpy_array(
     prec = NP_PRECISION_DICT.get(prec, np.float64)
     if prec is None:
         raise ValueError(f"unknown precision {xx.dtype}")
-    if isinstance(xx, np.ndarray):
-        return xx.astype(prec)
     if xx.dtype == paddle.bfloat16:
         xx = xx.astype(paddle.get_default_dtype())
     return xx.numpy().astype(prec)
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 556d6d236e..b4aa119cb8 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -274,7 +274,7 @@ def get_sample() -> Any:
             return get_sample
 
         def get_lr(lr_params: dict[str, Any]) -> BaseLR:
-            lr_params["stop_steps"] = self.num_steps - self.warmup_steps
+            lr_params["stop_steps"] = self.num_steps
             lr_schedule = BaseLR(**lr_params)
             return lr_schedule
 
@@ -432,27 +432,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
                     )
 
         # Learning rate
-        warmup_steps = training_params.get("warmup_steps", None)
-        warmup_ratio = training_params.get("warmup_ratio", None)
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            if not 0 <= warmup_ratio < 1:
-                raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}")
-            self.warmup_steps = int(warmup_ratio * self.num_steps)
-            if self.warmup_steps == 0 and warmup_ratio > 0:
-                log.warning(
-                    f"warmup_ratio {warmup_ratio} results in 0 warmup steps "
-                    f"due to truncation. Consider using a larger ratio or "
-                    f"specify warmup_steps directly."
-                )
-        else:
-            self.warmup_steps = 0
-        self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0)
         self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
-        assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
-            "Warm up steps must be less than total training steps!"
-        )
         if self.multi_task and config.get("learning_rate_dict", None) is not None:
             self.lr_exp = {}
             for model_key in self.model_keys:
@@ -698,14 +678,6 @@ def single_model_finetune(
 
         # TODO add lr warmups for multitask
         # author: iProzd
-        def warm_up_linear(step: int, warmup_steps: int) -> float:
-            if step < warmup_steps:
-                return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * (
-                    step / warmup_steps
-                )
-            else:
-                return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr
-
         # TODO add optimizers for multitask
         # author: iProzd
         if self.opt_type in ["Adam", "AdamW"]:
@@ -726,7 +698,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
                 self.optimizer.load_state_dict(optimizer_state_dict)
             self.scheduler = torch.optim.lr_scheduler.LambdaLR(
                 self.optimizer,
-                lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
+                lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
         elif self.opt_type == "LKF":
             self.optimizer = LKFOptimizer(
@@ -749,7 +722,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
                 self.optimizer.load_state_dict(optimizer_state_dict)
             self.scheduler = torch.optim.lr_scheduler.LambdaLR(
                 self.optimizer,
-                lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
+                lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
         else:
             raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
@@ -823,10 +797,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
                 fout1.flush()
             if self.opt_type in ["Adam", "AdamW", "AdaMuon"]:
                 cur_lr = self.scheduler.get_last_lr()[0]
-                if _step_id < self.warmup_steps:
-                    pref_lr = _lr.start_lr
-                else:
-                    pref_lr = cur_lr
+                pref_lr = cur_lr
                 model_pred, loss, more_loss = self.wrapper(
                     **input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
                 )
diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py
index ab066bdf93..10a4354e8b 100644
--- a/deepmd/pt/utils/utils.py
+++ b/deepmd/pt/utils/utils.py
@@ -227,10 +227,14 @@ def to_numpy_array(xx: None) -> None: ...
 
 
 def to_numpy_array(
-    xx: torch.Tensor | None,
+    xx: torch.Tensor | np.ndarray | float | None,
 ) -> np.ndarray | None:
     if xx is None:
         return None
+    if isinstance(xx, (float, int)):
+        return np.array(xx)
+    if isinstance(xx, np.ndarray):
+        return xx
     assert xx is not None
     # Create a reverse mapping of PT_PRECISION_DICT
     reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()}
diff --git a/deepmd/tf/fit/dipole.py b/deepmd/tf/fit/dipole.py
index 961198b8e7..ebeec270e0 100644
--- a/deepmd/tf/fit/dipole.py
+++ b/deepmd/tf/fit/dipole.py
@@ -388,7 +388,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate
 
         Returns
diff --git a/deepmd/tf/fit/dos.py b/deepmd/tf/fit/dos.py
index 250d803d8f..bec8814d18 100644
--- a/deepmd/tf/fit/dos.py
+++ b/deepmd/tf/fit/dos.py
@@ -655,7 +655,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate
 
         Returns
diff --git a/deepmd/tf/fit/ener.py b/deepmd/tf/fit/ener.py
index 2b8b1b906e..6a027b2ec2 100644
--- a/deepmd/tf/fit/ener.py
+++ b/deepmd/tf/fit/ener.py
@@ -856,7 +856,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             The loss function parameters.
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             The learning rate.
 
         Returns
diff --git a/deepmd/tf/fit/fitting.py b/deepmd/tf/fit/fitting.py
index b33559f12f..f7e5d959ef 100644
--- a/deepmd/tf/fit/fitting.py
+++ b/deepmd/tf/fit/fitting.py
@@ -73,7 +73,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate
 
         Returns
diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py
index 1e48a5fa59..137695d9b8 100644
--- a/deepmd/tf/fit/polar.py
+++ b/deepmd/tf/fit/polar.py
@@ -863,7 +863,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate
 
         Returns
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
index 4af59fd290..f9c67591d3 100644
--- a/deepmd/tf/train/trainer.py
+++ b/deepmd/tf/train/trainer.py
@@ -4,6 +4,9 @@
 import os
 import shutil
 import time
+from typing import (
+    Any,
+)
 
 import google.protobuf.message
 import numpy as np
@@ -52,7 +55,7 @@
     load_graph_def,
 )
 from deepmd.tf.utils.learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 from deepmd.tf.utils.sess import (
     run_sess,
@@ -100,7 +103,9 @@ def _init_param(self, jdata) -> None:
         self.model = Model(**model_param)
         self.fitting = self.model.get_fitting()
 
-        def get_lr_and_coef(lr_param):
+        def get_lr_and_coef(
+            lr_param: dict[str, Any],
+        ) -> tuple[LearningRateSchedule, float]:
             scale_by_worker = lr_param.get("scale_by_worker", "linear")
             if scale_by_worker == "linear":
                 scale_lr_coef = float(self.run_opt.world_size)
@@ -108,13 +113,8 @@ def get_lr_and_coef(lr_param):
                 scale_lr_coef = np.sqrt(self.run_opt.world_size).real
             else:
                 scale_lr_coef = 1.0
-            lr_type = lr_param.get("type", "exp")
-            if lr_type == "exp":
-                lr = LearningRateExp(
-                    lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"]
-                )
-            else:
-                raise RuntimeError("unknown learning_rate type " + lr_type)
+            lr_params = {k: v for k, v in lr_param.items() if k != "scale_by_worker"}
+            lr = LearningRateSchedule(lr_params)
             return lr, scale_lr_coef
 
         # learning rate
@@ -427,11 +427,9 @@ def train(self, train_data=None, valid_data=None) -> None:
         is_first_step = True
         self.cur_batch = cur_batch
         log.info(
-            "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e",
+            "start training at lr %.2e (== %.2e), final lr will be %.2e",
             run_sess(self.sess, self.learning_rate),
             self.lr.value(cur_batch),
-            self.lr.decay_steps_,
-            self.lr.decay_rate_,
             self.lr.value(stop_batch),
         )
 
diff --git a/deepmd/tf/utils/__init__.py b/deepmd/tf/utils/__init__.py
index 7d1e7e67d0..b88c13d445 100644
--- a/deepmd/tf/utils/__init__.py
+++ b/deepmd/tf/utils/__init__.py
@@ -7,7 +7,7 @@
     DeepmdDataSystem,
 )
 from .learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 from .pair_tab import (
     PairTab,
@@ -20,7 +20,7 @@
 __all__ = [
     "DeepmdData",
     "DeepmdDataSystem",
-    "LearningRateExp",
+    "LearningRateSchedule",
     "PairTab",
     "Plugin",
     "PluginVariant",
diff --git a/deepmd/tf/utils/learning_rate.py b/deepmd/tf/utils/learning_rate.py
index 64427e185d..9867e453f9 100644
--- a/deepmd/tf/utils/learning_rate.py
+++ b/deepmd/tf/utils/learning_rate.py
@@ -1,102 +1,128 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    Any,
+)
 
 import numpy as np
 
+from deepmd.dpmodel.utils.learning_rate import (
+    BaseLR,
+)
 from deepmd.tf.env import (
     tf,
 )
 
 
-class LearningRateExp:
-    r"""The exponentially decaying learning rate.
+class LearningRateSchedule:
+    """
+    TensorFlow wrapper for BaseLR.
+
+    Parameters
+    ----------
+    params : dict[str, Any]
+        Learning rate configuration dictionary.
+    """
 
-    The learning rate at step :math:`t` is given by
+    def __init__(self, params: dict[str, Any]) -> None:
+        # === Step 1. Store configuration ===
+        self._params = dict(params)
+        if "start_lr" not in self._params:
+            raise ValueError("start_lr must be provided")
+        self._start_lr = float(self._params["start_lr"])
+        self._base_lr: BaseLR | None = None
 
-    .. math::
+    def start_lr(self) -> float:
+        """
+        Get the starting learning rate.
 
-        \alpha(t) = \alpha_0 \lambda ^ { t / \tau }
+        Returns
+        -------
+        float
+            The starting learning rate.
+        """
+        return self._start_lr
 
-    where :math:`\alpha` is the learning rate, :math:`\alpha_0` is the starting learning rate,
-    :math:`\lambda` is the decay rate, and :math:`\tau` is the decay steps.
+    @property
+    def base_lr(self) -> BaseLR:
+        """
+        Get the built BaseLR instance.
 
-    Parameters
-    ----------
-    start_lr
-            Starting learning rate :math:`\alpha_0`
-    stop_lr
-            Stop learning rate :math:`\alpha_1`
-    decay_steps
-            Learning rate decay every this number of steps :math:`\tau`
-    decay_rate
-            The decay rate :math:`\lambda`.
-            If `stop_step` is provided in `build`, then it will be determined automatically and overwritten.
-    """
+        Returns
+        -------
+        BaseLR
+            The built learning rate schedule.
 
-    def __init__(
-        self,
-        start_lr: float,
-        stop_lr: float = 5e-8,
-        decay_steps: int = 5000,
-        decay_rate: float = 0.95,
-    ) -> None:
-        """Constructor."""
-        self.cd = {}
-        self.cd["start_lr"] = start_lr
-        self.cd["stop_lr"] = stop_lr
-        self.cd["decay_steps"] = decay_steps
-        self.cd["decay_rate"] = decay_rate
-        self.start_lr_ = self.cd["start_lr"]
-
-    def build(self, global_step: tf.Tensor, stop_step: int | None = None) -> tf.Tensor:
-        """Build the learning rate.
+        Raises
+        ------
+        RuntimeError
+            If the schedule has not been built.
+        """
+        if self._base_lr is None:
+            raise RuntimeError("Learning rate schedule is not built yet.")
+        return self._base_lr
+
+    def build(self, global_step: tf.Tensor, stop_steps: int) -> tf.Tensor:
+        """
+        Build a TensorFlow learning rate tensor.
 
         Parameters
         ----------
-        global_step
-            The tf Tensor providing the global training step
-        stop_step
-            The stop step. If provided, the decay_rate will be determined automatically and overwritten.
+        global_step : tf.Tensor
+            The global training step tensor.
+        stop_steps : int
+            The total training steps, forwarded to BaseLR as `stop_steps`.
 
         Returns
         -------
-        learning_rate
-            The learning rate
+        tf.Tensor
+            The learning rate tensor, cast to float32.
         """
-        if stop_step is None:
-            self.decay_steps_ = (
-                self.cd["decay_steps"] if self.cd["decay_steps"] is not None else 5000
-            )
-            self.decay_rate_ = (
-                self.cd["decay_rate"] if self.cd["decay_rate"] is not None else 0.95
-            )
-        else:
-            self.stop_lr_ = (
-                self.cd["stop_lr"] if self.cd["stop_lr"] is not None else 5e-8
-            )
-            default_ds = 100 if stop_step // 10 > 100 else stop_step // 100 + 1
-            self.decay_steps_ = (
-                self.cd["decay_steps"]
-                if self.cd["decay_steps"] is not None
-                else default_ds
-            )
-            if self.decay_steps_ >= stop_step:
-                self.decay_steps_ = default_ds
-            self.decay_rate_ = np.exp(
-                np.log(self.stop_lr_ / self.start_lr_) / (stop_step / self.decay_steps_)
-            )
-
-        return tf.train.exponential_decay(
-            self.start_lr_,
-            global_step,
-            self.decay_steps_,
-            self.decay_rate_,
-            staircase=True,
+        # === Step 1. Instantiate backend-agnostic schedule ===
+        params = dict(self._params)
+        params["stop_steps"] = stop_steps
+        # Default to 'exp' type if not specified
+        params.setdefault("type", "exp")
+        # Local reference keeps the closure bound to this schedule across re-builds.
+        base_lr = self._base_lr = BaseLR(**params)
+
+        # === Step 2. Bind a numpy_function for runtime evaluation ===
+        def _lr_value(step: np.ndarray) -> np.ndarray:
+            return np.asarray(base_lr.value(step), dtype=np.float64)
+
+        lr = tf.numpy_function(
+            _lr_value, [global_step], Tout=tf.float64, name="lr_schedule"
         )
-
-    def start_lr(self) -> float:
-        """Get the start lr."""
-        return self.start_lr_
+        lr.set_shape(global_step.get_shape())
+        return tf.cast(lr, tf.float32)
 
     def value(self, step: int) -> float:
-        """Get the lr at a certain step."""
-        return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_))
+        """
+        Get the learning rate at the given step.
+
+        Parameters
+        ----------
+        step : int
+            The step index.
+
+        Returns
+        -------
+        float
+            The learning rate value.
+
+        Raises
+        ------
+        RuntimeError
+            If the schedule has not been built.
+        """
+        if self._base_lr is None:
+            raise RuntimeError("Learning rate schedule is not built yet.")
+        return float(np.asarray(self._base_lr.value(step)))
+
+
+__all__ = [
+    "LearningRateSchedule",
+]
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 935762cdc7..80da8a0aa8 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -2480,14 +2480,159 @@ def linear_ener_model_args() -> Argument:
 lr_args_plugin = ArgsPlugin()
 
 
+def _check_lr_stop_args(data: dict[str, Any]) -> bool:
+    """
+    Validate that exactly one of stop_lr and stop_ratio is configured.
+
+    Parameters
+    ----------
+    data : dict[str, Any]
+        The learning rate configuration dictionary.
+
+    Returns
+    -------
+    bool
+        Always True; invalid configurations raise instead of returning False.
+
+    Raises
+    ------
+    ValueError
+        If stop_lr and stop_ratio are both set, or if both are missing or None.
+    """
+    stop_lr = data.get("stop_lr")
+    stop_ratio = data.get("stop_ratio")
+
+    if stop_lr is None and stop_ratio is None:
+        raise ValueError(
+            "Either stop_lr or stop_ratio must be provided. "
+            "Got stop_lr=None, stop_ratio=None"
+        )
+    if stop_lr is not None and stop_ratio is not None:
+        raise ValueError(
+            "stop_lr and stop_ratio are mutually exclusive. "
+            f"Got stop_lr={stop_lr}, stop_ratio={stop_ratio}"
+        )
+    return True
+
+
+def _check_warmup_args(data: dict[str, Any]) -> bool:
+    """
+    Check that warmup_steps and warmup_ratio are mutually exclusive.
+
+    Parameters
+    ----------
+    data : dict[str, Any]
+        The learning rate configuration dictionary.
+
+    Returns
+    -------
+    bool
+        True if validation passes.
+
+    Raises
+    ------
+    ValueError
+        If both warmup_steps (non-zero) and warmup_ratio are provided.
+    """
+    # warmup_steps defaults to 0; a missing, None or zero value counts as "not set"
+    has_warmup_steps = data.get("warmup_steps") not in (None, 0)
+    has_warmup_ratio = data.get("warmup_ratio") is not None
+
+    if has_warmup_steps and has_warmup_ratio:
+        raise ValueError(
+            "warmup_steps and warmup_ratio are mutually exclusive. "
+            f"Got warmup_steps={data['warmup_steps']}, warmup_ratio={data['warmup_ratio']}"
+        )
+    return True
+
+
+def _learning_rate_common_args(
+    doc_stop_lr: str,
+    extra_args: list[Argument] | None = None,
+) -> list[Argument]:
+    """
+    Build the arguments shared by every learning rate schedule type.
+
+    The stop and warmup arguments are defined here once for all schedules.
+
+    Parameters
+    ----------
+    doc_stop_lr : str
+        The documentation string attached to the `stop_lr` argument.
+    extra_args : list[Argument] | None
+        Schedule-specific arguments placed between the stop and warmup arguments.
+
+    Returns
+    -------
+    list[Argument]
+        start_lr/stop_lr/stop_ratio, then `extra_args`, then the warmup arguments.
+    """
+    doc_start_lr = "The learning rate at the start of the training (after warmup)."
+    doc_stop_ratio = (
+        "The ratio of stop_lr to start_lr. stop_lr = start_lr * stop_ratio. "
+        "Mutually exclusive with stop_lr."
+    )
+    doc_warmup_steps = (
+        "The number of steps for learning rate warmup. "
+        "During warmup, the learning rate increases linearly from "
+        "warmup_start_factor * start_lr to start_lr. "
+        "Mutually exclusive with warmup_ratio. Default is 0 (no warmup)."
+    )
+    doc_warmup_ratio = (
+        "The ratio of warmup steps to total training steps. "
+        "The actual number of warmup steps is int(warmup_ratio * stop_steps). "
+        "Mutually exclusive with warmup_steps."
+    )
+    doc_warmup_start_factor = (
+        "The factor of start_lr for the initial warmup learning rate. "
+        "The warmup learning rate starts from warmup_start_factor * start_lr. "
+        "Default is 0.0, meaning the learning rate starts from zero."
+    )
+
+    # === Step 1. Start and stop learning rate arguments ===
+    stop_args = [
+        Argument("start_lr", float, optional=False, doc=doc_start_lr),
+        Argument("stop_lr", float, optional=True, default=None, doc=doc_stop_lr),
+        Argument(
+            "stop_ratio", float, optional=True, default=None, doc=doc_stop_ratio
+        ),
+    ]
+
+    # === Step 2. Warmup arguments ===
+    warmup_args = [
+        Argument(
+            "warmup_steps", int, optional=True, default=0, doc=doc_warmup_steps
+        ),
+        Argument(
+            "warmup_ratio", float, optional=True, default=None, doc=doc_warmup_ratio
+        ),
+        Argument(
+            "warmup_start_factor",
+            float,
+            optional=True,
+            default=0.0,
+            doc=doc_warmup_start_factor,
+        ),
+    ]
+
+    # === Step 3. Schedule-specific arguments sit between the two groups ===
+    return [*stop_args, *(extra_args or []), *warmup_args]
+
+
 @lr_args_plugin.register("exp")
 def learning_rate_exp() -> list[Argument]:
-    doc_start_lr = "The learning rate at the start of the training."
+    """
+    Defines an exponential-decayed learning rate schedule with optional warmup.
+
+    The learning rate starts at `start_lr` (after warmup) and decays exponentially
+    to `stop_lr` over the training steps.
+    """
     doc_stop_lr = (
         "The desired learning rate at the end of the training. "
-        f"When decay_rate {doc_only_pt_supported}is explicitly set, "
+        "When decay_rate is explicitly set, "
         "this value will serve as the minimum learning rate during training. "
-        "In other words, if the learning rate decays below stop_lr, stop_lr will be applied instead."
+        "In other words, if the learning rate decays below stop_lr, stop_lr will be applied instead. "
+        "Mutually exclusive with stop_ratio."
     )
     doc_decay_steps = (
         "The learning rate is decaying every this number of training steps."
@@ -2498,37 +2643,32 @@ def learning_rate_exp() -> list[Argument]:
         "instead of calculating it through interpolation between start_lr and stop_lr."
     )
 
-    args = [
-        Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr),
-        Argument("stop_lr", float, optional=True, default=1e-8, doc=doc_stop_lr),
+    extra_args = [
         Argument("decay_steps", int, optional=True, default=5000, doc=doc_decay_steps),
         Argument(
             "decay_rate",
             float,
             optional=True,
             default=None,
-            doc=doc_only_pt_supported + doc_decay_rate,
+            doc=doc_decay_rate,
         ),
     ]
-    return args
+    return _learning_rate_common_args(doc_stop_lr, extra_args=extra_args)
 
 
-@lr_args_plugin.register("cosine", doc=doc_only_pt_supported)
+@lr_args_plugin.register("cosine")
 def learning_rate_cosine() -> list[Argument]:
     """
-    Defines a cosine annealing learning rate schedule.
+    Defines a cosine annealing learning rate schedule with optional warmup.
 
-    The learning rate starts at `start_lr` and gradually decreases to `stop_lr`
-    following a cosine curve over the training steps.
+    The learning rate starts at `start_lr` (after warmup) and gradually
+    decreases to `stop_lr` following a cosine curve over the training steps.
     """
-    doc_start_lr = "The learning rate at the start of the training."
-    doc_stop_lr = "The desired learning rate at the end of the training. "
-
-    args = [
-        Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr),
-        Argument("stop_lr", float, optional=True, default=1e-5, doc=doc_stop_lr),
-    ]
-    return args
+    doc_stop_lr = (
+        "The desired learning rate at the end of training. "
+        "Mutually exclusive with stop_ratio."
+    )
+    return _learning_rate_common_args(doc_stop_lr)
 
 
 def learning_rate_variant_type_args() -> Variant:
@@ -2546,6 +2686,15 @@ def learning_rate_variant_type_args() -> Variant:
 def learning_rate_args(fold_subdoc: bool = False) -> Argument:
     doc_scale_by_worker = "When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`."
     doc_lr = "The definition of learning rate"
+
+    def _check_lr_args(data: dict[str, Any]) -> bool:
+        """Check learning rate argument constraints."""
+        # Check stop_lr and stop_ratio
+        _check_lr_stop_args(data)
+        # Check warmup_steps and warmup_ratio
+        _check_warmup_args(data)
+        return True
+
     return Argument(
         "learning_rate",
         dict,
@@ -2562,6 +2711,7 @@ def learning_rate_args(fold_subdoc: bool = False) -> Argument:
         optional=True,
         doc=doc_lr,
         fold_subdoc=fold_subdoc,
+        extra_check=_check_lr_args,
     )
 
 
@@ -3240,22 +3390,6 @@ def training_args(
     doc_tensorboard = "Enable tensorboard"
     doc_tensorboard_log_dir = "The log directory of tensorboard outputs"
     doc_tensorboard_freq = "The frequency of writing tensorboard events."
-    doc_warmup_steps = (
-        "The number of steps for learning rate warmup. During warmup, "
-        "the learning rate begins at zero and progressively increases linearly to `start_lr`, "
-        "rather than starting directly from `start_lr`"
-    )
-    doc_warmup_ratio = (
-        "The ratio of warmup steps to total training steps. "
-        "The actual number of warmup steps is calculated as `warmup_ratio * numb_steps`. "
-        "Valid values are in the range [0.0, 1.0). "
-        "If `warmup_steps` is set, this option will be ignored."
-    )
-    doc_warmup_start_factor = (
-        "The factor of start learning rate to the target learning rate during warmup. "
-        "The warmup learning rate will linearly increase from `warmup_start_factor * start_lr` to `start_lr`. "
-        "Default is 0.0, meaning the learning rate starts from zero."
-    )
     doc_gradient_max_norm = (
         "Clips the gradient norm to a maximum value. "
         "If the gradient norm exceeds this value, it will be clipped to this limit. "
@@ -3363,25 +3497,6 @@ def training_args(
         Argument(
             "tensorboard_freq", int, optional=True, default=1, doc=doc_tensorboard_freq
         ),
-        Argument(
-            "warmup_steps",
-            int,
-            optional=True,
-            doc=doc_only_pt_supported + doc_warmup_steps,
-        ),
-        Argument(
-            "warmup_ratio",
-            float,
-            optional=True,
-            doc=doc_only_pt_supported + doc_warmup_ratio,
-        ),
-        Argument(
-            "warmup_start_factor",
-            float,
-            optional=True,
-            default=0.0,
-            doc=doc_only_pt_supported + doc_warmup_start_factor,
-        ),
         Argument(
             "gradient_max_norm",
             float,
diff --git a/source/tests/consistent/test_learning_rate.py b/source/tests/consistent/test_learning_rate.py
index 5767f3165e..59ad6741af 100644
--- a/source/tests/consistent/test_learning_rate.py
+++ b/source/tests/consistent/test_learning_rate.py
@@ -42,33 +42,49 @@
             "stop_lr": 1e-8,
             "decay_steps": 1000,
             "stop_steps": 1000000,
+            "warmup_steps": 10000,
         },
         {
             "type": "cosine",
             "start_lr": 1e-3,
             "stop_lr": 1e-8,
-            "decay_steps": 1000,
             "stop_steps": 1000000,
+            "warmup_steps": 10000,
         },
     ),
 )
 class TestLearningRateConsistent(unittest.TestCase):
+    """Test learning rate consistency across different array backends."""
+
     def setUp(self) -> None:
         (lr_param,) = self.param
         self.lr = BaseLR(**lr_param)
         self.step = 500000
         self.ref = self.lr.value(self.step)
+        self.warmup_step = None
+        self.warmup_ref = None
+        if self.lr.warmup_steps > 0:
+            self.warmup_step = self.lr.warmup_steps // 2
+            self.warmup_ref = self.lr.value(self.warmup_step)
 
     def compare_test_with_ref(self, step: Array) -> None:
         test = self.lr.value(step)
         np.testing.assert_allclose(self.ref, to_numpy_array(test), atol=1e-10)
 
+    def compare_test_with_warmup_ref(self, step: Array) -> None:
+        if self.warmup_ref is None:
+            self.skipTest("warmup not enabled")
+        test = self.lr.value(step)
+        np.testing.assert_allclose(self.warmup_ref, to_numpy_array(test), atol=1e-10)
+
     def compare_numpy_with_ref(self, step: Array) -> None:
         self.compare_test_with_ref(np.asarray(step))
 
     @unittest.skipUnless(INSTALLED_PT, "PyTorch is not installed")
     def test_pt_consistent_with_ref(self) -> None:
         self.compare_test_with_ref(to_torch_tensor(self.step))
+        if self.warmup_step is not None:
+            self.compare_test_with_warmup_ref(to_torch_tensor(self.warmup_step))
 
     @unittest.skipUnless(
         INSTALLED_ARRAY_API_STRICT, "array_api_strict is not installed"
@@ -78,7 +94,11 @@ def test_pt_consistent_with_ref(self) -> None:
     )
     def test_array_api_strict(self) -> None:
         self.compare_test_with_ref(xp.asarray(self.step))
+        if self.warmup_step is not None:
+            self.compare_test_with_warmup_ref(xp.asarray(self.warmup_step))
 
     @unittest.skipUnless(INSTALLED_JAX, "JAX is not installed")
     def test_jax_consistent_with_ref(self) -> None:
         self.compare_test_with_ref(jnp.array(self.step))
+        if self.warmup_step is not None:
+            self.compare_test_with_warmup_ref(jnp.array(self.warmup_step))
diff --git a/source/tests/pd/model/test_model.py b/source/tests/pd/model/test_model.py
index e619171e44..848f2dfa47 100644
--- a/source/tests/pd/model/test_model.py
+++ b/source/tests/pd/model/test_model.py
@@ -49,7 +49,7 @@
     DeepmdDataSystem,
 )
 from deepmd.tf.utils.learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 
 from ..test_finetune import (
@@ -226,8 +226,13 @@ def _get_dp_loss(self):
         )
 
     def _get_dp_lr(self):
-        return LearningRateExp(
-            start_lr=self.start_lr, stop_lr=self.stop_lr, decay_steps=self.decay_steps
+        return LearningRateSchedule(
+            {
+                "type": "exp",
+                "start_lr": self.start_lr,
+                "stop_lr": self.stop_lr,
+                "decay_steps": self.decay_steps,
+            }
         )
 
     def _get_dp_placeholders(self, dataset):
diff --git a/source/tests/pd/test_lr.py b/source/tests/pd/test_lr.py
index 9607f982fd..bd4c1a4ea1 100644
--- a/source/tests/pd/test_lr.py
+++ b/source/tests/pd/test_lr.py
@@ -9,8 +9,8 @@
 from deepmd.dpmodel.utils.learning_rate import (
     LearningRateExp,
 )
-from deepmd.tf.utils import (
-    learning_rate,
+from deepmd.tf.utils.learning_rate import (
+    LearningRateSchedule,
 )
 
 
@@ -18,7 +18,8 @@ class TestLearningRate(unittest.TestCase):
     def setUp(self):
         self.start_lr = 0.001
         self.stop_lr = 3.51e-8
-        self.decay_steps = np.arange(400, 601, 100)
+        # decay_steps must not exceed stop_steps
+        self.decay_steps = np.arange(400, 501, 100)
         self.stop_steps = np.arange(500, 1600, 500)
 
     def test_consistency(self):
@@ -30,8 +31,13 @@ def test_consistency(self):
                 self.decay_rate_pd()
 
     def judge_it(self):
-        base_lr = learning_rate.LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step
+        base_lr = LearningRateSchedule(
+            {
+                "type": "exp",
+                "start_lr": self.start_lr,
+                "stop_lr": self.stop_lr,
+                "decay_steps": self.decay_step,
+            }
         )
         g = tf.Graph()
         with g.as_default():
@@ -39,7 +45,10 @@ def judge_it(self):
             t_lr = base_lr.build(global_step, self.stop_step)
 
         my_lr = LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step, self.stop_step
+            start_lr=self.start_lr,
+            stop_lr=self.stop_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
         )
         with tf.Session(graph=g) as sess:
             base_vals = [
@@ -57,28 +66,34 @@ def judge_it(self):
 
     def decay_rate_pd(self):
         my_lr = LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step, self.stop_step
+            start_lr=self.start_lr,
+            stop_lr=self.stop_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
         )
 
         default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1
-        if self.decay_step >= self.stop_step:
-            self.decay_step = default_ds
+        # Use local variable to avoid modifying instance state
+        decay_step_for_rate = self.decay_step
+        if decay_step_for_rate >= self.stop_step:
+            decay_step_for_rate = default_ds
         decay_rate = np.exp(
-            np.log(self.stop_lr / self.start_lr) / (self.stop_step / self.decay_step)
+            np.log(self.stop_lr / self.start_lr)
+            / (self.stop_step / decay_step_for_rate)
         )
         my_lr_decay = LearningRateExp(
-            self.start_lr,
-            1e-10,
-            self.decay_step,
-            self.stop_step,
+            start_lr=self.start_lr,
+            stop_lr=1e-10,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         min_lr = 1e-5
         my_lr_decay_trunc = LearningRateExp(
-            self.start_lr,
-            min_lr,
-            self.decay_step,
-            self.stop_step,
+            start_lr=self.start_lr,
+            stop_lr=min_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         my_vals = [
diff --git a/source/tests/pt/model/test_model.py b/source/tests/pt/model/test_model.py
index eee0e9beef..501d607cc3 100644
--- a/source/tests/pt/model/test_model.py
+++ b/source/tests/pt/model/test_model.py
@@ -49,7 +49,7 @@
     DeepmdDataSystem,
 )
 from deepmd.tf.utils.learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 
 from ..test_finetune import (
@@ -226,8 +226,13 @@ def _get_dp_loss(self):
         )
 
     def _get_dp_lr(self):
-        return LearningRateExp(
-            start_lr=self.start_lr, stop_lr=self.stop_lr, decay_steps=self.decay_steps
+        return LearningRateSchedule(
+            {
+                "type": "exp",
+                "start_lr": self.start_lr,
+                "stop_lr": self.stop_lr,
+                "decay_steps": self.decay_steps,
+            }
         )
 
     def _get_dp_placeholders(self, dataset):
diff --git a/source/tests/pt/test_lr.py b/source/tests/pt/test_lr.py
index 75f663f041..4e226d54ba 100644
--- a/source/tests/pt/test_lr.py
+++ b/source/tests/pt/test_lr.py
@@ -10,8 +10,8 @@
     LearningRateCosine,
     LearningRateExp,
 )
-from deepmd.tf.utils import (
-    learning_rate,
+from deepmd.tf.utils.learning_rate import (
+    LearningRateSchedule,
 )
 
 
@@ -19,7 +19,8 @@ class TestLearningRate(unittest.TestCase):
     def setUp(self) -> None:
         self.start_lr = 0.001
         self.stop_lr = 3.51e-8
-        self.decay_steps = np.arange(400, 601, 100)
+        # decay_steps must not exceed stop_steps
+        self.decay_steps = np.arange(400, 501, 100)
         self.stop_steps = np.arange(500, 1600, 500)
 
     def test_consistency(self) -> None:
@@ -31,8 +32,13 @@ def test_consistency(self) -> None:
                 self.decay_rate_pt()
 
     def judge_it(self) -> None:
-        base_lr = learning_rate.LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step
+        base_lr = LearningRateSchedule(
+            {
+                "type": "exp",
+                "start_lr": self.start_lr,
+                "stop_lr": self.stop_lr,
+                "decay_steps": self.decay_step,
+            }
         )
         g = tf.Graph()
         with g.as_default():
@@ -40,7 +46,10 @@ def judge_it(self) -> None:
             t_lr = base_lr.build(global_step, self.stop_step)
 
         my_lr = LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step, self.stop_step
+            start_lr=self.start_lr,
+            stop_lr=self.stop_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
         )
         with tf.Session(graph=g) as sess:
             base_vals = [
@@ -58,28 +67,34 @@ def judge_it(self) -> None:
 
     def decay_rate_pt(self) -> None:
         my_lr = LearningRateExp(
-            self.start_lr, self.stop_lr, self.decay_step, self.stop_step
+            start_lr=self.start_lr,
+            stop_lr=self.stop_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
         )
 
         default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1
-        if self.decay_step >= self.stop_step:
-            self.decay_step = default_ds
+        # Keep self.decay_step intact; NOTE(review): LearningRateExp below still gets self.decay_step
+        decay_step_for_rate = self.decay_step
+        if decay_step_for_rate >= self.stop_step:
+            decay_step_for_rate = default_ds
         decay_rate = np.exp(
-            np.log(self.stop_lr / self.start_lr) / (self.stop_step / self.decay_step)
+            np.log(self.stop_lr / self.start_lr)
+            / (self.stop_step / decay_step_for_rate)
         )
         my_lr_decay = LearningRateExp(
-            self.start_lr,
-            1e-10,
-            self.decay_step,
-            self.stop_step,
+            start_lr=self.start_lr,
+            stop_lr=1e-10,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         min_lr = 1e-5
         my_lr_decay_trunc = LearningRateExp(
-            self.start_lr,
-            min_lr,
-            self.decay_step,
-            self.stop_step,
+            start_lr=self.start_lr,
+            stop_lr=min_lr,
+            decay_steps=self.decay_step,
+            stop_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         my_vals = [
@@ -108,7 +123,11 @@ def test_basic_curve(self) -> None:
         start_lr = 1.0
         stop_lr = 0.1
         stop_steps = 10
-        lr = LearningRateCosine(start_lr, stop_lr, stop_steps)
+        lr = LearningRateCosine(
+            start_lr=start_lr,
+            stop_lr=stop_lr,
+            stop_steps=stop_steps,
+        )
 
         self.assertTrue(np.allclose(lr.value(0), start_lr))
         self.assertTrue(np.allclose(lr.value(stop_steps), stop_lr))
diff --git a/source/tests/tf/test_lr.py b/source/tests/tf/test_lr.py
new file mode 100644
index 0000000000..44e3eb749c
--- /dev/null
+++ b/source/tests/tf/test_lr.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Tests for TensorFlow learning rate schedule wrapper.
+
+This module tests the TF-specific wrapper logic only.
+Core learning rate algorithms are tested in dpmodel tests.
+"""
+
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.utils.learning_rate import (
+    LearningRateExp,
+)
+from deepmd.tf.env import (
+    tf,
+)
+from deepmd.tf.utils.learning_rate import (
+    LearningRateSchedule,
+)
+
+
+class TestLearningRateScheduleValidation(unittest.TestCase):
+    """Test error handling of the TF LearningRateSchedule wrapper."""
+
+    def test_missing_start_lr(self) -> None:
+        """Test that a config without start_lr raises a ValueError naming it."""
+        with self.assertRaises(ValueError) as cm:
+            LearningRateSchedule({"type": "exp", "stop_lr": 1e-5})
+        self.assertIn("start_lr", str(cm.exception))
+
+    def test_value_before_build(self) -> None:
+        """Test that value() before build() raises a 'not built' RuntimeError."""
+        lr_schedule = LearningRateSchedule({"start_lr": 1e-3})
+        with self.assertRaises(RuntimeError) as cm:
+            lr_schedule.value(100)
+        self.assertIn("not built", str(cm.exception))
+
+    def test_base_lr_before_build(self) -> None:
+        """Test that base_lr before build() raises a 'not built' RuntimeError."""
+        lr_schedule = LearningRateSchedule({"start_lr": 1e-3})
+        with self.assertRaises(RuntimeError) as cm:
+            _ = lr_schedule.base_lr
+        self.assertIn("not built", str(cm.exception))
+
+
+class TestLearningRateScheduleBuild(unittest.TestCase):
+    """Test building the schedule and the accessors available afterwards."""
+
+    def test_build_returns_tensor(self) -> None:
+        """Test that build() returns a float32 TF tensor."""
+        lr_schedule = LearningRateSchedule({"start_lr": 1e-3, "stop_lr": 1e-5})
+        global_step = tf.constant(0, dtype=tf.int64)
+        lr_tensor = lr_schedule.build(global_step, stop_steps=10000)
+
+        self.assertIsInstance(lr_tensor, tf.Tensor)
+        self.assertEqual(lr_tensor.dtype, tf.float32)
+
+    def test_default_type_exp(self) -> None:
+        """Test that base_lr is a LearningRateExp when 'type' is omitted."""
+        lr_schedule = LearningRateSchedule({"start_lr": 1e-3, "stop_lr": 1e-5})
+        global_step = tf.constant(0, dtype=tf.int64)
+        lr_schedule.build(global_step, stop_steps=10000)
+
+        self.assertIsInstance(lr_schedule.base_lr, LearningRateExp)
+
+    def test_tensor_value_matches_base_lr(self) -> None:
+        """Test that value() agrees with base_lr.value() at the built step."""
+        lr_schedule = LearningRateSchedule(
+            {
+                "start_lr": 1e-3,
+                "stop_lr": 1e-5,
+                "type": "exp",
+                "decay_steps": 1000,
+            }
+        )
+        test_step = 5000
+        global_step = tf.constant(test_step, dtype=tf.int64)
+        lr_schedule.build(global_step, stop_steps=10000)
+
+        # Compare the wrapper's value() with the underlying schedule's value().
+        # NOTE(review): the tensor returned by build() is never evaluated here.
+        tensor_value = lr_schedule.value(test_step)
+        base_lr_value = lr_schedule.base_lr.value(test_step)
+
+        np.testing.assert_allclose(tensor_value, base_lr_value, rtol=1e-10)
+
+    def test_start_lr_accessor(self) -> None:
+        """Test that start_lr() returns the configured value without build()."""
+        lr_schedule = LearningRateSchedule({"start_lr": 1e-3})
+        self.assertEqual(lr_schedule.start_lr(), 1e-3)
+
+    def test_value_after_build(self) -> None:
+        """Test that value() after build() equals base_lr.value()."""
+        lr_schedule = LearningRateSchedule(
+            {
+                "start_lr": 1e-3,
+                "stop_lr": 1e-5,
+                "type": "exp",
+                "decay_steps": 1000,
+            }
+        )
+        global_step = tf.constant(0, dtype=tf.int64)
+        lr_schedule.build(global_step, stop_steps=10000)
+
+        # Query a step different from the one the graph was built with
+        lr_value = lr_schedule.value(5000)
+        expected = lr_schedule.base_lr.value(5000)
+
+        np.testing.assert_allclose(lr_value, expected, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/source/tests/universal/dpmodel/utils/test_learning_rate.py b/source/tests/universal/dpmodel/utils/test_learning_rate.py
new file mode 100644
index 0000000000..408300696a
--- /dev/null
+++ b/source/tests/universal/dpmodel/utils/test_learning_rate.py
@@ -0,0 +1,240 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.common import (
+    to_numpy_array,
+)
+from deepmd.dpmodel.utils.learning_rate import (
+    LearningRateCosine,
+    LearningRateExp,
+)
+
+
+class TestLearningRateExpBasic(unittest.TestCase):
+    """Test basic behavior of the exponential decay schedule."""
+
+    def test_basic_decay(self) -> None:
+        """Test that decay without warmup starts at start_lr and ends at stop_lr."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=5000,
+        )
+        np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-5)
+
+    def test_stop_ratio(self) -> None:
+        """Test that stop_ratio=0.01 with start_lr=1e-3 yields stop_lr=1e-5."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_ratio=0.01,
+            stop_steps=10000,
+            decay_steps=5000,
+        )
+        np.testing.assert_allclose(lr.stop_lr, 1e-5, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-5)
+
+    def test_decay_rate_override(self) -> None:
+        """Test that an explicit decay_rate is kept and applied after decay_steps."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=1000,
+            decay_rate=0.9,
+        )
+        self.assertEqual(lr.decay_rate, 0.9)
+        np.testing.assert_allclose(lr.value(1000), 1e-3 * 0.9, rtol=1e-10)
+
+
+class TestLearningRateCosineBasic(unittest.TestCase):
+    """Test basic behavior of the cosine annealing schedule."""
+
+    def test_basic_cosine(self) -> None:
+        """Test the endpoints and the midpoint of cosine annealing without warmup."""
+        lr = LearningRateCosine(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+        )
+        np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(5000), (1e-3 + 1e-5) / 2, rtol=1e-5)
+
+    def test_stop_ratio(self) -> None:
+        """Test that stop_ratio=0.01 with start_lr=1e-3 yields stop_lr=1e-5."""
+        lr = LearningRateCosine(
+            start_lr=1e-3,
+            stop_ratio=0.01,
+            stop_steps=10000,
+        )
+        np.testing.assert_allclose(lr.stop_lr, 1e-5, rtol=1e-10)
+
+
+class TestLearningRateWarmup(unittest.TestCase):
+    """Test warmup handling of the exponential and cosine schedules."""
+
+    def test_warmup_steps_exp(self) -> None:
+        """Test a ramp from 0 to start_lr over warmup_steps, then exp decay."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=1000,
+            warmup_steps=1000,
+        )
+        self.assertEqual(lr.decay_stop_steps, 9000)
+        np.testing.assert_allclose(lr.value(0), 0.0, atol=1e-12)  # rtol is void at 0
+        np.testing.assert_allclose(lr.value(500), 0.5e-3, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10)
+        self.assertLess(to_numpy_array(lr.value(2000)), 1e-3)
+
+    def test_warmup_steps_cosine(self) -> None:
+        """Test warmup to start_lr, then cosine annealing down to stop_lr."""
+        lr = LearningRateCosine(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            warmup_steps=1000,
+        )
+        self.assertEqual(lr.decay_stop_steps, 9000)
+        np.testing.assert_allclose(lr.value(0), 0.0, atol=1e-12)  # rtol is void at 0
+        np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-10)
+
+    def test_warmup_ratio(self) -> None:
+        """Test that warmup_ratio=0.1 of 10000 steps gives warmup_steps=1000."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=1000,
+            warmup_ratio=0.1,
+        )
+        self.assertEqual(lr.warmup_steps, 1000)
+        self.assertEqual(lr.decay_stop_steps, 9000)
+
+    def test_warmup_start_factor(self) -> None:
+        """Test that warmup starts at warmup_start_factor * start_lr."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=1000,
+            warmup_steps=1000,
+            warmup_start_factor=0.1,
+        )
+        np.testing.assert_allclose(lr.value(0), 0.1e-3, rtol=1e-10)
+        np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10)
+
+    def test_no_warmup(self) -> None:
+        """Test that warmup_steps=0 leaves the whole run to the decay phase."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=5000,
+            warmup_steps=0,
+        )
+        self.assertEqual(lr.warmup_steps, 0)
+        self.assertEqual(lr.decay_stop_steps, 10000)
+        np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10)
+
+
+class TestLearningRateArrayInput(unittest.TestCase):
+    """Test that value() accepts an array of steps and returns one lr per step."""
+
+    def test_array_input_exp(self) -> None:
+        """Test exponential decay with warmup on an array of steps."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=5000,
+            warmup_steps=1000,
+        )
+        steps = np.array([0, 500, 1000, 5000, 10000])
+        lrs = lr.value(steps)
+        self.assertEqual(lrs.shape, (5,))
+        np.testing.assert_allclose(lrs[0], 0.0, atol=1e-12)  # rtol is void at 0
+        np.testing.assert_allclose(lrs[2], 1e-3, rtol=1e-10)
+
+    def test_array_input_cosine(self) -> None:
+        """Test cosine annealing with warmup on an array of steps."""
+        lr = LearningRateCosine(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            warmup_steps=1000,
+        )
+        steps = np.array([0, 1000, 5500, 10000])
+        lrs = lr.value(steps)
+        self.assertEqual(lrs.shape, (4,))
+        np.testing.assert_allclose(lrs[0], 0.0, atol=1e-12)  # rtol is void at 0
+        np.testing.assert_allclose(lrs[1], 1e-3, rtol=1e-10)
+        np.testing.assert_allclose(lrs[3], 1e-5, rtol=1e-10)
+
+
+class TestLearningRateBeyondStopSteps(unittest.TestCase):
+    """Test the learning rate returned for steps past stop_steps."""
+
+    def test_exp_beyond_stop_steps(self) -> None:
+        """Test that exponential decay yields stop_lr at twice stop_steps."""
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+            decay_steps=1000,
+        )
+        np.testing.assert_allclose(lr.value(20000), 1e-5, rtol=1e-10)
+
+    def test_cosine_beyond_stop_steps(self) -> None:
+        """Test that cosine annealing yields stop_lr at twice stop_steps."""
+        lr = LearningRateCosine(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=10000,
+        )
+        np.testing.assert_allclose(lr.value(20000), 1e-5, rtol=1e-10)
+
+
+class TestLearningRateValidation(unittest.TestCase):
+    """Test validation of decay_steps against the steps left for decay."""
+
+    def test_decay_steps_exceeds_decay_total_without_warmup(self) -> None:
+        """Test that decay_steps > stop_steps raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            LearningRateExp(
+                start_lr=1e-3,
+                stop_lr=1e-5,
+                stop_steps=500,
+                decay_steps=600,
+            )
+        self.assertIn("decay_steps", str(cm.exception))
+        self.assertIn("exceed", str(cm.exception))
+
+    def test_decay_steps_exceeds_decay_total_with_warmup(self) -> None:
+        """Test that decay_steps > (stop_steps - warmup_steps) raises ValueError."""
+        with self.assertRaises(ValueError) as cm:
+            LearningRateExp(
+                start_lr=1e-3,
+                stop_lr=1e-5,
+                stop_steps=1000,
+                decay_steps=900,
+                warmup_steps=200,  # decay_total = 1000 - 200 = 800 < 900
+            )
+        self.assertIn("decay_steps", str(cm.exception))
+
+    def test_decay_steps_equals_decay_total_allowed(self) -> None:
+        """Test that decay_steps == stop_steps (no warmup) is accepted as-is."""
+        # Boundary case: construction must not raise
+        lr = LearningRateExp(
+            start_lr=1e-3,
+            stop_lr=1e-5,
+            stop_steps=500,
+            decay_steps=500,
+        )
+        self.assertEqual(lr.decay_steps, 500)