diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 0000000000..f985fe0fe5
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,13 @@
+Python:
+- deepmd/**/*
+- source/tests/**/*
+Docs: doc/**/*
+Examples: examples/**/*
+Core: source/lib/**/*
+CUDA: source/lib/src/cuda/**/*
+ROCM: source/lib/src/rocm/**/*
+OP: source/op/**/*
+C++: source/api_cc/**/*
+LAMMPS: source/lmp/**/*
+Gromacs: source/gmx/**/*
+i-Pi: source/ipi/**/*
diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index 36bb0201e1..b6547e2c09 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -12,6 +12,8 @@ jobs:
         - variant: cpu
         - variant: cuda
     steps:
+    - name: work around permission issue
+      run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
     - uses: actions/checkout@master
       with:
         submodules: true
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 936aa6b65f..f0134b175c 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -13,6 +13,8 @@ jobs:
         os: [ubuntu-18.04]  #, windows-latest, macos-latest]
 
     steps:
+      - name: work around permission issue
+        run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
         name: Install Python
@@ -26,7 +28,7 @@ jobs:
       - name: Build wheels
         env:
           CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-* cp310-*"
-          CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2010_x86_64_tensorflow
+          CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2014_x86_64_tensorflow
           CIBW_BEFORE_BUILD: pip install tensorflow
           CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux*"
         run: |
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000000..51dced0ed7
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,14 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target
+
+jobs:
+  triage:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v4
+      with:
+        repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.github/workflows/lint_python.yml b/.github/workflows/lint_python.yml
index 92a609edc1..6b6dd695d3 100644
--- a/.github/workflows/lint_python.yml
+++ b/.github/workflows/lint_python.yml
@@ -11,6 +11,8 @@ jobs:
         python-version: [3.8]
 
     steps:
+    - name: work around permission issue
+      run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
     - uses: actions/checkout@v1
     - uses: actions/setup-python@v2
       with:
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index 5192eda6e9..b000c20ea4 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -6,6 +6,12 @@ jobs:
   testpython:
     name: Test C++
     runs-on: ubuntu-latest
+    container: ghcr.io/deepmodeling/deepmd-kit-test-cc:latest
     steps:
+    - name: work around permission issue
+      run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
     - uses: actions/checkout@master
-    - run: source/install/test_cc.sh
+    - run: source/install/test_cc_local.sh
+      env:
+        tensorflow_root: /usr/local
+    - run: source/install/codecov.sh
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index dd8bded91d..f66665c116 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -51,6 +51,8 @@ jobs:
 
     container: ghcr.io/deepmodeling/deepmd-kit-test-environment:py${{ matrix.python }}-gcc${{ matrix.gcc }}-tf${{ matrix.tf }}
     steps:
+    - name: work around permission issue
+      run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
     - uses: actions/checkout@master
     - name: pip cache
       uses: actions/cache@v2
diff --git a/README.md b/README.md
index 995ac6e914..8afb65e2c7 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,8 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [Install GROMACS](doc/install/install-gromacs.md)
     - [Building conda packages](doc/install/build-conda.md)
 - [Data](doc/data/index.md)
-    - [Data conversion](doc/data/data-conv.md)
+    - [System](doc/data/system.md)
+    - [Formats of a system](doc/data/data-conv.md)
     - [Prepare data with dpdata](doc/data/dpdata.md)
 - [Model](doc/model/index.md)
     - [Overall](doc/model/overall.md)
@@ -99,6 +100,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [Fit `tensor` like `Dipole` and `Polarizability`](doc/model/train-fitting-tensor.md)
     - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md)
     - [Deep potential long-range](doc/model/dplr.md)
+    - [Deep Potential - Range Correction (DPRc)](doc/model/dprc.md)
 - [Training](doc/train/index.md)
     - [Training a model](doc/train/training.md)
     - [Advanced options](doc/train/training-advanced.md)
@@ -121,37 +123,31 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [LAMMPS commands](doc/third-party/lammps-command.md)
     - [Run path-integral MD with i-PI](doc/third-party/ipi.md)
     - [Run MD with GROMACS](doc/third-party/gromacs.md)
+    - [Interfaces out of DeePMD-kit](doc/third-party/out-of-deepmd-kit.md)
 
 # Code structure
+
 The code is organized as follows:
 
 * `data/raw`: tools manipulating the raw data files.
-
 * `examples`: examples.
-
 * `deepmd`: DeePMD-kit python modules.
-
 * `source/api_cc`: source code of DeePMD-kit C++ API.
-
 * `source/ipi`: source code of i-PI client.
-
 * `source/lib`: source code of DeePMD-kit library.
-
 * `source/lmp`: source code of Lammps module.
-
 * `source/gmx`: source code of Gromacs plugin.
-
 * `source/op`: tensorflow op implementation. working with library.
 
 
 # Troubleshooting
 
-- [Model compatibility](doc/troubleshooting/model-compatability.md)
+- [Model compatibility](doc/troubleshooting/model_compatability.md)
 - [Installation](doc/troubleshooting/installation.md)
-- [The temperature undulates violently during early stages of MD](doc/troubleshooting/md-energy-undulation.md)
-- [MD: cannot run LAMMPS after installing a new version of DeePMD-kit](doc/troubleshooting/md-version-compatibility.md)
-- [Do we need to set rcut < half boxsize?](doc/troubleshooting/howtoset-rcut.md)
-- [How to set sel?](doc/troubleshooting/howtoset-sel.md)
+- [The temperature undulates violently during early stages of MD](doc/troubleshooting/md_energy_undulation.md)
+- [MD: cannot run LAMMPS after installing a new version of DeePMD-kit](doc/troubleshooting/md_version_compatibility.md)
+- [Do we need to set rcut < half boxsize?](doc/troubleshooting/howtoset_rcut.md)
+- [How to set sel?](doc/troubleshooting/howtoset_sel.md)
 - [How to control the number of nodes used by a job?](doc/troubleshooting/howtoset_num_nodes.md)
 - [How to tune Fitting/embedding-net size?](doc/troubleshooting/howtoset_netsize.md)
 
diff --git a/deepmd/calculator.py b/deepmd/calculator.py
index 25dc7fd5ee..60b22b868a 100644
--- a/deepmd/calculator.py
+++ b/deepmd/calculator.py
@@ -54,7 +54,7 @@ class DP(Calculator):
     """
 
     name = "DP"
-    implemented_properties = ["energy", "forces", "virial", "stress"]
+    implemented_properties = ["energy", "free_energy", "forces", "virial", "stress"]
 
     def __init__(
         self,
@@ -102,6 +102,8 @@ def calculate(
         atype = [self.type_dict[k] for k in symbols]
         e, f, v = self.dp.eval(coords=coord, cells=cell, atom_types=atype)
         self.results['energy'] = e[0][0]
+        # report the same value as 'free_energy'; see https://gitlab.com/ase/ase/-/merge_requests/2485
+        self.results['free_energy'] = e[0][0]
         self.results['forces'] = f[0]
         self.results['virial'] = v[0].reshape(3, 3)
 
diff --git a/deepmd/common.py b/deepmd/common.py
index d32422f0db..6a18cda677 100644
--- a/deepmd/common.py
+++ b/deepmd/common.py
@@ -95,6 +95,7 @@ def add_data_requirement(
     high_prec: bool = False,
     type_sel: bool = None,
     repeat: int = 1,
+    default: float = 0.,
 ):
     """Specify data requirements for training.
 
@@ -116,6 +117,8 @@ def add_data_requirement(
         select only certain type of atoms, by default None
     repeat : int, optional
         if specify repaeat data `repeat` times, by default 1
+    default : float, optional
+        default value of the data, recorded under the "default" key, by default 0.
     """
     data_requirement[key] = {
         "ndof": ndof,
@@ -124,6 +127,7 @@ def add_data_requirement(
         "high_prec": high_prec,
         "type_sel": type_sel,
         "repeat": repeat,
+        "default": default,
     }
 
 
@@ -444,28 +448,6 @@ def expand_sys_str(root_dir: Union[str, Path]) -> List[str]:
     return matches
 
 
-def docstring_parameter(*sub: Tuple[str, ...]):
-    """Add parameters to object docstring.
-
-    Parameters
-    ----------
-    sub: Tuple[str, ...]
-        list of strings that will be inserted into prepared locations in docstring.
-
-    Note
-    ----
-    Can be used on both object and classes.
-    """
-
-    @wraps
-    def dec(obj: "_OBJ") -> "_OBJ":
-        if obj.__doc__ is not None:
-            obj.__doc__ = obj.__doc__.format(*sub)
-        return obj
-
-    return dec
-
-
 def get_np_precision(precision: "_PRECISION") -> np.dtype:
     """Get numpy precision constant from string.
 
diff --git a/deepmd/descriptor/hybrid.py b/deepmd/descriptor/hybrid.py
index cfee332b78..d1c188affc 100644
--- a/deepmd/descriptor/hybrid.py
+++ b/deepmd/descriptor/hybrid.py
@@ -189,7 +189,7 @@ def build (self,
             dout = tf.reshape(dout, [-1, ii.get_dim_out()])
             all_dout.append(dout)
         dout = tf.concat(all_dout, axis = 1)
-        dout = tf.reshape(dout, [-1, natoms[0] * self.get_dim_out()])
+        dout = tf.reshape(dout, [-1, natoms[0], self.get_dim_out()])
         return dout
         
 
diff --git a/deepmd/descriptor/loc_frame.py b/deepmd/descriptor/loc_frame.py
index d8063505bd..f956d5ef36 100644
--- a/deepmd/descriptor/loc_frame.py
+++ b/deepmd/descriptor/loc_frame.py
@@ -305,7 +305,7 @@ def prod_force_virial(self,
         """
         [net_deriv] = tf.gradients (atom_ener, self.descrpt)
         tf.summary.histogram('net_derivative', net_deriv)
-        net_deriv_reshape = tf.reshape (net_deriv, [-1, natoms[0] * self.ndescrpt])
+        net_deriv_reshape = tf.reshape (net_deriv, [np.cast['int64'](-1), natoms[0] * np.cast['int64'](self.ndescrpt)])
         force = op_module.prod_force (net_deriv_reshape,
                                       self.descrpt_deriv,
                                       self.nlist,
diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index bdc2b37d7b..ff7549b124 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -3,8 +3,7 @@
 from typing import Tuple, List, Dict, Any
 
 from deepmd.env import tf
-from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, cast_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import get_activation_func, get_precision, cast_precision
 from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
 from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
 from deepmd.env import op_module
@@ -88,9 +87,9 @@ class DescrptSeA (DescrptSe):
     set_davg_zero
             Set the shift of embedding net input to zero.
     activation_function
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     
@@ -101,7 +100,6 @@ class DescrptSeA (DescrptSe):
        systems. In Proceedings of the 32nd International Conference on Neural Information Processing
        Systems (NIPS'18). Curran Associates Inc., Red Hook, NY, USA, 4441–4451.
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   rcut: float,
                   rcut_smth: float,
@@ -517,7 +515,7 @@ def prod_force_virial(self,
         """
         [net_deriv] = tf.gradients (atom_ener, self.descrpt_reshape)
         tf.summary.histogram('net_derivative', net_deriv)
-        net_deriv_reshape = tf.reshape (net_deriv, [-1, natoms[0] * self.ndescrpt])        
+        net_deriv_reshape = tf.reshape (net_deriv, [np.cast['int64'](-1), natoms[0] * np.cast['int64'](self.ndescrpt)])        
         force \
             = op_module.prod_force_se_a (net_deriv_reshape,
                                           self.descrpt_deriv,
@@ -553,14 +551,14 @@ def _pass_filter(self,
         else:
             type_embedding = None
         start_index = 0
-        inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]])
+        inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt])
         output = []
         output_qmat = []
         if not (self.type_one_side and len(self.exclude_types) == 0) and type_embedding is None:
             for type_i in range(self.ntypes):
                 inputs_i = tf.slice (inputs,
-                                     [ 0, start_index*      self.ndescrpt],
-                                     [-1, natoms[2+type_i]* self.ndescrpt] )
+                                     [ 0, start_index, 0],
+                                     [-1, natoms[2+type_i], -1] )
                 inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
                 if self.type_one_side:
                     # reuse NN parameters for all types to support type_one_side along with exclude_types
@@ -569,8 +567,8 @@ def _pass_filter(self,
                 else:
                     filter_name = 'filter_type_'+str(type_i)+suffix
                 layer, qmat = self._filter(inputs_i, type_i, name=filter_name, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn)
-                layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()])
-                qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3])
+                layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i], self.get_dim_out()])
+                qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[2+type_i], self.get_dim_rot_mat_1() * 3])
                 output.append(layer)
                 output_qmat.append(qmat)
                 start_index += natoms[2+type_i]
@@ -579,8 +577,8 @@ def _pass_filter(self,
             inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
             type_i = -1
             layer, qmat = self._filter(inputs_i, type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn, type_embedding=type_embedding)
-            layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()])
-            qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3])
+            layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
+            qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3])
             output.append(layer)
             output_qmat.append(qmat)
         output = tf.concat(output, axis = 1)
@@ -635,7 +633,7 @@ def _compute_dstats_sys_smth (self,
 
     def _compute_std (self,sumv2, sumv, sumn) :
         if sumn == 0:
-            return 1e-2
+            return 1. / self.rcut_r
         val = np.sqrt(sumv2/sumn - np.multiply(sumv/sumn, sumv/sumn))
         if np.abs(val) < 1e-2:
             val = 1e-2
@@ -720,12 +718,12 @@ def _filter_lower(
                 raise RuntimeError('compression of type embedded descriptor is not supported at the moment')
         # natom x 4 x outputs_size
         if self.compress and (not is_exclude):
-          info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
-          if self.type_one_side:
-            net = 'filter_-1_net_' + str(type_i)
-          else:
-            net = 'filter_' + str(type_input) + '_net_' + str(type_i)
-          return op_module.tabulate_fusion_se_a(tf.cast(self.table.data[net], self.filter_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])  
+            if self.type_one_side:
+                net = 'filter_-1_net_' + str(type_i)
+            else:
+                net = 'filter_' + str(type_input) + '_net_' + str(type_i)
+            info = [self.lower[net], self.upper[net], self.upper[net] * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
+            return op_module.tabulate_fusion_se_a(tf.cast(self.table.data[net], self.filter_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])  
         else:
           if (not is_exclude):
               # with (natom x nei_type_i) x out_size
diff --git a/deepmd/descriptor/se_a_ebd.py b/deepmd/descriptor/se_a_ebd.py
index 7a1f640153..9a6c2b206e 100644
--- a/deepmd/descriptor/se_a_ebd.py
+++ b/deepmd/descriptor/se_a_ebd.py
@@ -433,8 +433,8 @@ def _pass_filter(self,
                                        seed = self.seed, 
                                        trainable = trainable, 
                                        activation_fn = self.filter_activation_fn)
-        output      = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()])
-        output_qmat = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3])
+        output      = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
+        output_qmat = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3])
         return output, output_qmat
 
 
diff --git a/deepmd/descriptor/se_a_ef.py b/deepmd/descriptor/se_a_ef.py
index b037475722..cfe9c25d46 100644
--- a/deepmd/descriptor/se_a_ef.py
+++ b/deepmd/descriptor/se_a_ef.py
@@ -2,8 +2,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import add_data_requirement,get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import add_data_requirement
 from deepmd.utils.sess import run_sess
 from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
 from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
@@ -43,13 +42,12 @@ class DescrptSeAEf (Descriptor):
     set_davg_zero
             Set the shift of embedding net input to zero.
     activation_function
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__(self,
                  rcut: float,
                  rcut_smth: float,
@@ -230,7 +228,7 @@ def build (self,
         self.dout_vert = tf.reshape(self.dout_vert, [nframes * natoms[0], self.descrpt_vert.get_dim_out()])
         self.dout_para = tf.reshape(self.dout_para, [nframes * natoms[0], self.descrpt_para.get_dim_out()])
         self.dout = tf.concat([self.dout_vert, self.dout_para], axis = 1)
-        self.dout = tf.reshape(self.dout, [nframes, natoms[0] * self.get_dim_out()])
+        self.dout = tf.reshape(self.dout, [nframes, natoms[0], self.get_dim_out()])
         self.qmat = self.descrpt_vert.qmat + self.descrpt_para.qmat
 
         tf.summary.histogram('embedding_net_output', self.dout)
diff --git a/deepmd/descriptor/se_r.py b/deepmd/descriptor/se_r.py
index b9e5d5aabd..929007bcf5 100644
--- a/deepmd/descriptor/se_r.py
+++ b/deepmd/descriptor/se_r.py
@@ -2,8 +2,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, cast_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import get_activation_func, get_precision, cast_precision
 from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
 from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
 from deepmd.env import op_module
@@ -46,13 +45,12 @@ class DescrptSeR (DescrptSe):
             The excluded pairs of types which have no interaction with each other.
             For example, `[[0, 1]]` means no interaction between type 0 and type 1.
     activation_function
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   rcut: float,
                   rcut_smth: float,
@@ -415,7 +413,7 @@ def prod_force_virial(self,
         """
         [net_deriv] = tf.gradients (atom_ener, self.descrpt_reshape)
         tf.summary.histogram('net_derivative', net_deriv)
-        net_deriv_reshape = tf.reshape (net_deriv, [-1, natoms[0] * self.ndescrpt])        
+        net_deriv_reshape = tf.reshape (net_deriv, [np.cast['int64'](-1), natoms[0] * np.cast['int64'](self.ndescrpt)])        
         force \
             = op_module.prod_force_se_r (net_deriv_reshape,
                                          self.descrpt_deriv,
@@ -441,13 +439,13 @@ def _pass_filter(self,
                      suffix = '', 
                      trainable = True) :
         start_index = 0
-        inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]])
+        inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt])
         output = []
         if not (self.type_one_side and len(self.exclude_types) == 0):
             for type_i in range(self.ntypes):
                 inputs_i = tf.slice (inputs,
-                                     [ 0, start_index*      self.ndescrpt],
-                                     [-1, natoms[2+type_i]* self.ndescrpt] )
+                                     [ 0, start_index, 0],
+                                     [-1, natoms[2+type_i], -1] )
                 inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
                 if self.type_one_side:
                     # reuse NN parameters for all types to support type_one_side along with exclude_types
@@ -456,7 +454,7 @@ def _pass_filter(self,
                 else:
                     filter_name = 'filter_type_'+str(type_i)+suffix
                 layer = self._filter_r(inputs_i, type_i, name=filter_name, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn)
-                layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()])
+                layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i], self.get_dim_out()])
                 output.append(layer)
                 start_index += natoms[2+type_i]
         else :
@@ -464,7 +462,7 @@ def _pass_filter(self,
             inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
             type_i = -1
             layer = self._filter_r(inputs_i, type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn)
-            layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()])
+            layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
             output.append(layer)
         output = tf.concat(output, axis = 1)
         return output
@@ -540,8 +538,8 @@ def _filter_r(self,
                 # with (natom x nei_type_i) x 1
                 xyz_scatter = tf.reshape(inputs_i, [-1, 1])
                 if self.compress and ((type_input, type_i) not in self.exclude_types):
-                    info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
                     net = 'filter_' + str(type_input) + '_net_' + str(type_i)
+                    info = [self.lower[net], self.upper[net], self.upper[net] * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
                     xyz_scatter = op_module.tabulate_fusion_se_r(tf.cast(self.table.data[net], self.filter_precision), info, inputs_i, last_layer_size = outputs_size[-1]) 
                 elif (type_input, type_i) not in self.exclude_types:
                     xyz_scatter = embedding_net(xyz_scatter, 
diff --git a/deepmd/descriptor/se_t.py b/deepmd/descriptor/se_t.py
index 1735757dcb..f9453f17c3 100644
--- a/deepmd/descriptor/se_t.py
+++ b/deepmd/descriptor/se_t.py
@@ -2,8 +2,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, cast_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import get_activation_func, get_precision, cast_precision
 from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
 from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
 from deepmd.env import op_module
@@ -44,13 +43,12 @@ class DescrptSeT (DescrptSe):
     set_davg_zero
             Set the shift of embedding net input to zero.
     activation_function
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   rcut: float,
                   rcut_smth: float,
@@ -414,7 +412,7 @@ def prod_force_virial(self,
                 The atomic virial
         """
         [net_deriv] = tf.gradients (atom_ener, self.descrpt_reshape)
-        net_deriv_reshape = tf.reshape (net_deriv, [-1, natoms[0] * self.ndescrpt])        
+        net_deriv_reshape = tf.reshape (net_deriv, [np.cast['int64'](-1), natoms[0] * np.cast['int64'](self.ndescrpt)])        
         force \
             = op_module.prod_force_se_a (net_deriv_reshape,
                                           self.descrpt_deriv,
@@ -442,14 +440,14 @@ def _pass_filter(self,
                      suffix = '', 
                      trainable = True) :
         start_index = 0
-        inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]])
+        inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt])
         output = []
         output_qmat = []
         inputs_i = inputs
         inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
         type_i = -1
         layer, qmat = self._filter(inputs_i, type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, trainable = trainable, activation_fn = self.filter_activation_fn)
-        layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()])
+        layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
         # qmat  = tf.reshape(qmat,  [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3])
         output.append(layer)
         # output_qmat.append(qmat)
@@ -559,8 +557,8 @@ def _filter(self,
                     # with (natom x nei_type_i x nei_type_j)
                     ebd_env_ij = tf.reshape(env_ij, [-1, 1])
                     if self.compress:
-                        info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
                         net = 'filter_' + str(type_i) + '_net_' + str(type_j)
+                        info = [self.lower[net], self.upper[net], self.upper[net] * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]]
                         res_ij = op_module.tabulate_fusion_se_t(tf.cast(self.table.data[net], self.filter_precision), info, ebd_env_ij, env_ij, last_layer_size = outputs_size[-1]) 
                     else:
                         # with (natom x nei_type_i x nei_type_j) x out_size
diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py
index 742c10ed98..d8f8e3e933 100644
--- a/deepmd/entrypoints/compress.py
+++ b/deepmd/entrypoints/compress.py
@@ -102,7 +102,7 @@ def compress(
         10 * step,
         int(frequency),
     ]
-    jdata["training"]["save_ckpt"] = "model-compression/model.ckpt"
+    jdata["training"]["save_ckpt"] = os.path.join("model-compression", "model.ckpt")
     jdata = update_deepmd_input(jdata)
     jdata = normalize(jdata)
 
@@ -135,6 +135,9 @@ def compress(
             "increase the step size." % step
         ) from e
 
+    # reset the graph, otherwise the size limitation will be only 2 GB / 2 = 1 GB
+    tf.reset_default_graph()
+
     # stage 2: freeze the model
     log.info("\n\n")
     log.info("stage 2: freeze the model")
diff --git a/deepmd/entrypoints/convert.py b/deepmd/entrypoints/convert.py
index aa602dbed4..cbe00e9e5d 100644
--- a/deepmd/entrypoints/convert.py
+++ b/deepmd/entrypoints/convert.py
@@ -1,4 +1,4 @@
-from deepmd.utils.convert import convert_10_to_21, convert_20_to_21, convert_13_to_21, convert_12_to_21 
+from deepmd.utils.convert import convert_012_to_21, convert_10_to_21, convert_20_to_21, convert_13_to_21, convert_12_to_21 
 
 def convert(
     *,
@@ -7,7 +7,9 @@ def convert(
     output_model: str,
     **kwargs,
 ):
-    if FROM == '1.0':
+    if FROM == '0.12':
+        convert_012_to_21(input_model, output_model)
+    elif FROM == '1.0':
         convert_10_to_21(input_model, output_model)
     elif FROM in ['1.1', '1.2']:
         # no difference between 1.1 and 1.2
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
index 46bdad05de..949797ea8b 100644
--- a/deepmd/entrypoints/main.py
+++ b/deepmd/entrypoints/main.py
@@ -20,7 +20,7 @@
 )
 from deepmd.loggers import set_log_handles
 
-__all__ = ["main", "parse_args", "get_ll"]
+__all__ = ["main", "parse_args", "get_ll", "main_parser"]
 
 
 def get_ll(log_level: str) -> int:
@@ -44,14 +44,13 @@ def get_ll(log_level: str) -> int:
     return int_level
 
 
-def parse_args(args: Optional[List[str]] = None):
+def main_parser() -> argparse.ArgumentParser:
     """DeePMD-Kit commandline options argument parser.
 
-    Parameters
-    ----------
-    args: List[str]
-        list of command line arguments, main purpose is testing default option None
-        takes arguments from sys.argv
+    Returns
+    -------
+    argparse.ArgumentParser
+        main parser of DeePMD-kit
     """
     parser = argparse.ArgumentParser(
         description="DeePMD-kit: A deep learning package for many-body potential energy"
@@ -383,7 +382,6 @@ def parse_args(args: Optional[List[str]] = None):
     )
 
     # * convert models
-    # supported: 1.2->2.0, 1.3->2.0
     parser_transform = subparsers.add_parser(
         'convert-from',
         parents=[parser_log],
@@ -392,7 +390,7 @@ def parse_args(args: Optional[List[str]] = None):
     parser_transform.add_argument(
         'FROM',
         type = str,
-        choices = ['1.0', '1.1', '1.2', '1.3', '2.0'],
+        choices = ['0.12', '1.0', '1.1', '1.2', '1.3', '2.0'],
         help="The original model compatibility",
     )
     parser_transform.add_argument(
@@ -441,7 +439,24 @@ def parse_args(args: Optional[List[str]] = None):
         
     # --version
     parser.add_argument('--version', action='version', version='DeePMD-kit v%s' % __version__)
+    return parser
+
+
+def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
+    """Parse arguments and convert argument strings to objects.
 
+    Parameters
+    ----------
+    args: List[str]
+        list of command line arguments, mainly intended for testing; the default
+        (None) takes the arguments from sys.argv
+
+    Returns
+    -------
+    argparse.Namespace
+        the populated namespace
+    """
+    parser = main_parser()
     parsed_args = parser.parse_args(args=args)
     if parsed_args.command is None:
         parser.print_help()
diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py
index 42e21d2ba4..4a67eee9d4 100755
--- a/deepmd/entrypoints/train.py
+++ b/deepmd/entrypoints/train.py
@@ -154,7 +154,7 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = Fal
         train_data = get_data(jdata["training"]["training_data"], rcut, ipt_type_map, modifier)
         train_data.print_summary("training")
         if jdata["training"].get("validation_data", None) is not None:
-            valid_data = get_data(jdata["training"]["validation_data"], rcut, ipt_type_map, modifier)
+            valid_data = get_data(jdata["training"]["validation_data"], rcut, train_data.type_map, modifier)
             valid_data.print_summary("validation")
 
     # get training info
diff --git a/deepmd/fit/dipole.py b/deepmd/fit/dipole.py
index 80ea7178da..383ea17f1f 100644
--- a/deepmd/fit/dipole.py
+++ b/deepmd/fit/dipole.py
@@ -3,8 +3,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import add_data_requirement, get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, cast_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import add_data_requirement, get_activation_func, get_precision, cast_precision
 from deepmd.utils.network import one_layer, one_layer_rand_seed_shift
 from deepmd.utils.graph import get_fitting_net_variables_from_graph_def
 from deepmd.descriptor import DescrptSeA
@@ -31,13 +30,12 @@ class DipoleFittingSeA (Fitting) :
     seed : int
             Random seed for initializing the network parameters.
     activation_function : str
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision : str
-            The precision of the embedding net parameters. Supported options are {1}        
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   descrpt : tf.Tensor,
                   neuron : List[int] = [120,120,120], 
@@ -123,20 +121,20 @@ def build (self,
                 The atomic dipole.
         """
         start_index = 0
-        inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]])
-        rot_mat = tf.reshape(rot_mat, [-1, self.dim_rot_mat * natoms[0]])
+        inputs = tf.reshape(input_d, [-1, natoms[0], self.dim_descrpt])
+        rot_mat = tf.reshape(rot_mat, [-1, natoms[0], self.dim_rot_mat])
 
         count = 0
         outs_list = []
         for type_i in range(self.ntypes):
             # cut-out inputs
             inputs_i = tf.slice (inputs,
-                                 [ 0, start_index*      self.dim_descrpt],
-                                 [-1, natoms[2+type_i]* self.dim_descrpt] )
+                                 [ 0, start_index, 0],
+                                 [-1, natoms[2+type_i], -1] )
             inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt])
             rot_mat_i = tf.slice (rot_mat,
-                                  [ 0, start_index*      self.dim_rot_mat],
-                                  [-1, natoms[2+type_i]* self.dim_rot_mat] )
+                                  [ 0, start_index, 0],
+                                  [-1, natoms[2+type_i], -1] )
             rot_mat_i = tf.reshape(rot_mat_i, [-1, self.dim_rot_mat_1, 3])
             start_index += natoms[2+type_i]
             if not type_i in self.sel_type :
diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
index bb1a3844b6..4084281865 100644
--- a/deepmd/fit/ener.py
+++ b/deepmd/fit/ener.py
@@ -4,8 +4,7 @@
 from packaging.version import Version
 
 from deepmd.env import tf
-from deepmd.common import add_data_requirement, get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, cast_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import add_data_requirement, get_activation_func, get_precision, cast_precision
 from deepmd.utils.network import one_layer, one_layer_rand_seed_shift
 from deepmd.utils.type_embed import embed_atom_type
 from deepmd.utils.graph import get_fitting_net_variables_from_graph_def, load_graph_def, get_tensor_by_name_from_graph
@@ -72,13 +71,12 @@ class EnerFitting (Fitting):
     atom_ener
             Specifying atomic energy contribution in vacuum. The `set_davg_zero` key in the descrptor should be set.
     activation_function
-            The activation function :math:`\boldsymbol{\phi}` in the embedding net. Supported options are {0}
+            The activation function :math:`\boldsymbol{\phi}` in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}                
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   descrpt : tf.Tensor,
                   neuron : List[int] = [120,120,120],
@@ -276,8 +274,8 @@ def _build_lower(
     ):
         # cut-out inputs
         inputs_i = tf.slice (inputs,
-                             [ 0, start_index*      self.dim_descrpt],
-                             [-1, natoms* self.dim_descrpt] )
+                             [ 0, start_index, 0],
+                             [-1, natoms, -1] )
         inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt])
         layer = inputs_i
         if fparam is not None:
@@ -378,10 +376,16 @@ def build (self,
         if input_dict is None:
             input_dict = {}
         bias_atom_e = self.bias_atom_e
-        if self.numb_fparam > 0 and ( self.fparam_avg is None or self.fparam_inv_std is None ):
-            raise RuntimeError('No data stat result. one should do data statisitic, before build')
-        if self.numb_aparam > 0 and ( self.aparam_avg is None or self.aparam_inv_std is None ):
-            raise RuntimeError('No data stat result. one should do data statisitic, before build')
+        if self.numb_fparam > 0:
+            if self.fparam_avg is None:
+                self.fparam_avg = 0.
+            if self.fparam_inv_std is None:
+                self.fparam_inv_std = 1.
+        if self.numb_aparam > 0:
+            if self.aparam_avg is None:
+                self.aparam_avg = 0.
+            if self.aparam_inv_std is None:
+                self.aparam_inv_std = 1.
 
         with tf.variable_scope('fitting_attr' + suffix, reuse = reuse) :
             t_dfparam = tf.constant(self.numb_fparam, 
@@ -413,13 +417,13 @@ def build (self,
                                                 trainable = False,
                                                 initializer = tf.constant_initializer(self.aparam_inv_std))
             
-        inputs = tf.reshape(inputs, [-1, self.dim_descrpt * natoms[0]])
+        inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
         if len(self.atom_ener):
             # only for atom_ener
             nframes = input_dict.get('nframes')
             if nframes is not None:
                 # like inputs, but we don't want to add a dependency on inputs
-                inputs_zero = tf.zeros((nframes, self.dim_descrpt * natoms[0]), dtype=self.fitting_precision)
+                inputs_zero = tf.zeros((nframes, natoms[0], self.dim_descrpt), dtype=self.fitting_precision)
             else:
                 inputs_zero = tf.zeros_like(inputs, dtype=self.fitting_precision)
         
@@ -484,7 +488,7 @@ def build (self,
                 axis=1
             )
             self.dim_descrpt = self.dim_descrpt + type_shape[1]
-            inputs = tf.reshape(inputs, [-1, self.dim_descrpt * natoms[0]])
+            inputs = tf.reshape(inputs, [-1, natoms[0], self.dim_descrpt])
             final_layer = self._build_lower(
                 0, natoms[0], 
                 inputs, fparam, aparam, 
@@ -527,7 +531,12 @@ def init_variables(self,
             suffix to name scope
         """
         self.fitting_net_variables = get_fitting_net_variables_from_graph_def(graph_def)
-
+        if self.numb_fparam > 0:
+            self.fparam_avg = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_fparam_avg' % suffix)
+            self.fparam_inv_std = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_fparam_istd' % suffix)
+        if self.numb_aparam > 0:
+            self.aparam_avg = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_aparam_avg' % suffix)
+            self.aparam_inv_std = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_aparam_istd' % suffix)
 
     def enable_compression(self,
                            model_file: str,
diff --git a/deepmd/fit/polar.py b/deepmd/fit/polar.py
index 725a276028..3f1b7daa6b 100644
--- a/deepmd/fit/polar.py
+++ b/deepmd/fit/polar.py
@@ -3,8 +3,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import add_data_requirement, cast_precision, get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import add_data_requirement, cast_precision, get_activation_func, get_precision
 from deepmd.utils.network import one_layer, one_layer_rand_seed_shift
 from deepmd.utils.graph import get_fitting_net_variables_from_graph_def
 from deepmd.descriptor import DescrptLocFrame
@@ -56,7 +55,7 @@ def build (self,
                reuse = None,
                suffix = '') :
         start_index = 0
-        inputs = tf.cast(tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision)
+        inputs = tf.cast(tf.reshape(input_d, [-1, natoms[0], self.dim_descrpt]), self.fitting_precision)
         rot_mat = tf.reshape(rot_mat, [-1, 9 * natoms[0]])
 
         count = 0
@@ -64,8 +63,8 @@ def build (self,
         for type_i in range(self.ntypes):
             # cut-out inputs
             inputs_i = tf.slice (inputs,
-                                 [ 0, start_index*      self.dim_descrpt],
-                                 [-1, natoms[2+type_i]* self.dim_descrpt] )
+                                 [ 0, start_index, 0],
+                                 [-1, natoms[2+type_i], -1] )
             inputs_i = tf.reshape(inputs_i, [-1, self.dim_descrpt])
             rot_mat_i = tf.slice (rot_mat,
                                   [ 0, start_index*      9],
@@ -105,8 +104,33 @@ def build (self,
 class PolarFittingSeA (Fitting) :
     """
     Fit the atomic polarizability with descriptor se_a
+
+    Parameters
+    ----------
+    descrpt : tf.Tensor
+            The descriptor
+    neuron : List[int]
+            Number of neurons in each hidden layer of the fitting net
+    resnet_dt : bool
+            Time-step `dt` in the resnet construction:
+            y = x + dt * \phi (Wx + b)
+    sel_type : List[int]
+            The atom types selected to have an atomic polarizability prediction. If None, all atoms are selected.
+    fit_diag : bool
+            Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.
+    scale : List[float]
+            The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i]
+    diag_shift : List[float]
+            The diagonal part of the polarizability matrix of type i will be shifted by diag_shift[i]. The shift operation is carried out after scale.        
+    seed : int
+            Random seed for initializing the network parameters.
+    activation_function : str
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
+    precision : str
+            The precision of the embedding net parameters. Supported options are |PRECISION|
+    uniform_seed
+            Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   descrpt : tf.Tensor,
                   neuron : List[int] = [120,120,120],
@@ -123,32 +147,6 @@ def __init__ (self,
     ) -> None:
         """
         Constructor
-
-        Parameters
-        ----------
-        descrpt : tf.Tensor
-                The descrptor
-        neuron : List[int]
-                Number of neurons in each hidden layer of the fitting net
-        resnet_dt : bool
-                Time-step `dt` in the resnet construction:
-                y = x + dt * \phi (Wx + b)
-        sel_type : List[int]
-                The atom types selected to have an atomic polarizability prediction. If is None, all atoms are selected.
-        fit_diag : bool
-                Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.
-        scale : List[float]
-                The output of the fitting net (polarizability matrix) for type i atom will be scaled by scale[i]
-        diag_shift : List[float]
-                The diagonal part of the polarizability matrix of type i will be shifted by diag_shift[i]. The shift operation is carried out after scale.        
-        seed : int
-                Random seed for initializing the network parameters.
-        activation_function : str
-                The activation function in the embedding net. Supported options are {0}
-        precision : str
-                The precision of the embedding net parameters. Supported options are {1}                
-        uniform_seed
-                Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
         """
         if not isinstance(descrpt, DescrptSeA) :
             raise RuntimeError('PolarFittingSeA only supports DescrptSeA')
@@ -431,11 +429,10 @@ class GlobalPolarFittingSeA () :
     seed : int
             Random seed for initializing the network parameters.
     activation_function : str
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision : str
-            The precision of the embedding net parameters. Supported options are {1}    
+            The precision of the embedding net parameters. Supported options are |PRECISION|
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__ (self, 
                   descrpt : tf.Tensor,
                   neuron : List[int] = [120,120,120],
diff --git a/deepmd/fit/wfc.py b/deepmd/fit/wfc.py
index 9b6b217432..564d601cae 100644
--- a/deepmd/fit/wfc.py
+++ b/deepmd/fit/wfc.py
@@ -3,8 +3,7 @@
 from typing import Tuple, List
 
 from deepmd.env import tf
-from deepmd.common import ClassArg, add_data_requirement, get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import ClassArg, add_data_requirement, get_activation_func, get_precision
 from deepmd.utils.network import one_layer, one_layer_rand_seed_shift
 from deepmd.descriptor import DescrptLocFrame
 
diff --git a/deepmd/infer/model_devi.py b/deepmd/infer/model_devi.py
index 46b22b326b..5bbcd66ee2 100644
--- a/deepmd/infer/model_devi.py
+++ b/deepmd/infer/model_devi.py
@@ -89,7 +89,7 @@ def calc_model_devi(coord,
                     models,
                     fname=None,
                     frequency=1, 
-                    nopbc=True):
+                    ):
     '''
     Python interface to calculate model deviation
 
@@ -107,8 +107,6 @@ def calc_model_devi(coord,
         File to dump results, default None
     frequency : int
         Steps between frames (if the system is given by molecular dynamics engine), default 1
-    nopbc : bool
-        Whether to use pbc conditions
     
     Returns
     -------
@@ -127,8 +125,10 @@ def calc_model_devi(coord,
     >>> graphs = [DP("graph.000.pb"), DP("graph.001.pb")]
     >>> model_devi = calc_model_devi(coord, cell, atype, graphs)
     '''
-    if nopbc:
-        box = None
+    if box is not None:
+        nopbc = False  # a box was supplied, so the system is periodic
+    else:
+        nopbc = True  # no box means no periodic boundary conditions
 
     forces = []
     virials = []
@@ -197,10 +197,6 @@ def make_model_devi(
     for system in all_sys:
         # create data-system
         dp_data = DeepmdData(system, set_prefix, shuffle_test=False, type_map=tmap)
-        if dp_data.pbc:
-            nopbc = False
-        else:
-            nopbc = True
 
         data_sets = [dp_data._load_set(set_name) for set_name in dp_data.dirs]
         nframes_tot = 0
@@ -209,7 +205,9 @@ def make_model_devi(
             coord = data["coord"]
             box = data["box"]
             atype = data["type"][0] 
-            devi = calc_model_devi(coord, box, atype, dp_models, nopbc=nopbc)
+            if not dp_data.pbc:
+                box = None
+            devi = calc_model_devi(coord, box, atype, dp_models)
             nframes_tot += coord.shape[0]
             devis.append(devi)
         devis = np.vstack(devis)
diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py
index 29e1fa4068..be57ce5c18 100644
--- a/deepmd/loss/ener.py
+++ b/deepmd/loss/ener.py
@@ -5,10 +5,17 @@
 from deepmd.env import global_cvt_2_tf_float
 from deepmd.env import global_cvt_2_ener_float
 from deepmd.utils.sess import run_sess
+from .loss import Loss
 
-class EnerStdLoss () :
+
+class EnerStdLoss (Loss) :
     """
     Standard loss function for DP models
+
+    Parameters
+    ----------
+    enable_atom_ener_coeff : bool
+        if true, the energy will be computed as \sum_i c_i E_i
     """
     def __init__ (self, 
                   starter_learning_rate : float, 
@@ -22,7 +29,8 @@ def __init__ (self,
                   limit_pref_ae : float = 0.0,
                   start_pref_pf : float = 0.0,
                   limit_pref_pf : float = 0.0,
-                  relative_f : float = None 
+                  relative_f : float = None,
+                  enable_atom_ener_coeff: bool=False,
     ) -> None:
         self.starter_learning_rate = starter_learning_rate
         self.start_pref_e = start_pref_e
@@ -36,6 +44,7 @@ def __init__ (self,
         self.start_pref_pf = start_pref_pf
         self.limit_pref_pf = limit_pref_pf
         self.relative_f = relative_f
+        self.enable_atom_ener_coeff = enable_atom_ener_coeff
         self.has_e = (self.start_pref_e != 0.0 or self.limit_pref_e != 0.0)
         self.has_f = (self.start_pref_f != 0.0 or self.limit_pref_f != 0.0)
         self.has_v = (self.start_pref_v != 0.0 or self.limit_pref_v != 0.0)
@@ -47,6 +56,8 @@ def __init__ (self,
         add_data_requirement('virial', 9, atomic=False, must=False, high_prec=False)
         add_data_requirement('atom_ener', 1, atomic=True, must=False, high_prec=False)
         add_data_requirement('atom_pref', 1, atomic=True, must=False, high_prec=False, repeat=3)
+        if self.enable_atom_ener_coeff:
+            add_data_requirement('atom_ener_coeff', 1, atomic=True, must=False, high_prec=False, default=1.)
 
     def build (self, 
                learning_rate,
@@ -69,6 +80,18 @@ def build (self,
         find_atom_ener = label_dict['find_atom_ener']                
         find_atom_pref = label_dict['find_atom_pref']                
 
+        if self.enable_atom_ener_coeff:
+            # when atom_ener_coeff (\nu) is enabled, the energy is defined as
+            # E = \sum_i \nu_i E_i
+            # instead of the plain sum of atomic energies.
+            #
+            # One use case is training on reaction energies, e.g. for
+            # A + B -> C + D
+            # E = - E(A) - E(B) + E(C) + E(D)
+            # where A, B, C, D could be put far away from each other
+            atom_ener_coeff = label_dict['atom_ener_coeff']
+            atom_ener_coeff = tf.reshape(atom_ener_coeff, tf.shape(atom_ener))
+            energy = tf.reduce_sum(atom_ener_coeff * atom_ener, 1)
         l2_ener_loss = tf.reduce_mean( tf.square(energy - energy_hat), name='l2_'+suffix)
 
         force_reshape = tf.reshape (force, [-1])
@@ -221,7 +244,7 @@ def print_on_training(self,
         return print_str      
 
 
-class EnerDipoleLoss () :
+class EnerDipoleLoss (Loss) :
     def __init__ (self, 
                   starter_learning_rate : float,
                   start_pref_e : float = 0.1,
diff --git a/deepmd/loss/loss.py b/deepmd/loss/loss.py
new file mode 100644
index 0000000000..6ae9dc7399
--- /dev/null
+++ b/deepmd/loss/loss.py
@@ -0,0 +1,59 @@
+from abc import ABCMeta, abstractmethod
+from typing import Tuple, Dict
+from deepmd.env import tf
+
+
+class Loss(metaclass=ABCMeta):
+    """Abstract base class of the loss functions; subclasses implement `build` and `eval`."""
+    @abstractmethod
+    def build(self, 
+            learning_rate: tf.Tensor,
+            natoms: tf.Tensor,
+            model_dict: Dict[str, tf.Tensor],
+            label_dict: Dict[str, tf.Tensor],
+            suffix: str) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
+        """Build the loss function graph.
+
+        Parameters
+        ----------
+        learning_rate : tf.Tensor
+            learning rate
+        natoms : tf.Tensor
+            number of atoms
+        model_dict : dict[str, tf.Tensor]
+            A dictionary that maps model keys to tensors
+        label_dict : dict[str, tf.Tensor]
+            A dictionary that maps label keys to tensors
+        suffix : str
+            suffix appended to the names of the built loss tensors (as in `EnerStdLoss`)
+
+        Returns
+        -------
+        tf.Tensor
+            the total squared loss
+        dict[str, tf.Tensor]
+            A dictionary that maps loss keys to additional loss tensors
+        """
+
+    @abstractmethod
+    def eval(self,
+             sess: tf.Session,
+             feed_dict: Dict[tf.placeholder, tf.Tensor],
+             natoms: tf.Tensor) -> dict:
+        """Evaluate the loss function.
+
+        Parameters
+        ----------
+        sess : tf.Session
+            TensorFlow session
+        feed_dict : dict[tf.placeholder, tf.Tensor]
+            A dictionary that maps graph elements to values
+        natoms : tf.Tensor
+            number of atoms
+
+        Returns
+        -------
+        dict
+            A dictionary that maps keys to values. It
+            should contain the key `natoms`.
+        """
diff --git a/deepmd/loss/tensor.py b/deepmd/loss/tensor.py
index de4dee6fa8..64763627a3 100644
--- a/deepmd/loss/tensor.py
+++ b/deepmd/loss/tensor.py
@@ -5,8 +5,10 @@
 from deepmd.env import global_cvt_2_tf_float
 from deepmd.env import global_cvt_2_ener_float
 from deepmd.utils.sess import run_sess
+from .loss import Loss
 
-class TensorLoss () :
+
+class TensorLoss(Loss) :
     """
     Loss function for tensorial properties.
     """
diff --git a/deepmd/model/model_stat.py b/deepmd/model/model_stat.py
index 61f151f27b..b7aa66397c 100644
--- a/deepmd/model/model_stat.py
+++ b/deepmd/model/model_stat.py
@@ -16,12 +16,14 @@ def _make_all_stat_ref(data, nbatches):
 def make_stat_input(data, nbatches, merge_sys = True):
     """
     pack data for statistics
+
     Parameters
     ----------
     data:
         The data
     merge_sys: bool (True)
         Merge system data
+
     Returns
     -------
     all_stat:
diff --git a/deepmd/op/__init__.py b/deepmd/op/__init__.py
index 3700bf175e..aa9b309888 100644
--- a/deepmd/op/__init__.py
+++ b/deepmd/op/__init__.py
@@ -13,8 +13,8 @@
 def import_ops():
     """Import all custom TF ops that are present in this submodule.
 
-    Note
-    ----
+    Notes
+    -----
     Initialy this subdir is unpopulated. CMake will install all the op module python
     files and shared libs.
     """
diff --git a/deepmd/train/__init__.py b/deepmd/train/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/deepmd/train/__init__.py
@@ -0,0 +1 @@
+
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index 40583e6223..77d5028051 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -699,4 +699,6 @@ def _init_from_frz_model(self):
             ) from e
         else:
             self.model_type = bytes.decode(t_model_type)
+        if self.model_type == 'compressed_model':
+            self.frz_model = self.run_opt.init_frz_model
         self.model.init_variables(graph, graph_def, model_type=self.model_type)
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 3c99b58196..ed1253d171 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -28,14 +28,14 @@ def type_embedding_args():
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_seed = 'Random seed for parameter initialization'
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
-    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
     
     return [
         Argument("neuron", list, optional = True, default = [2, 4, 8], doc = doc_neuron),
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
-        Argument("precision", str, optional = True, default = "float64", doc = doc_precision),
+        Argument("precision", str, optional = True, default = "default", doc = doc_precision),
         Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
         Argument("seed", [int,None], optional = True, doc = doc_seed),
     ]        
@@ -74,8 +74,13 @@ def descrpt_some_descrpt_args():
             alias = tuple(alias)
         return self.__plugin.register((name, alias))
 
-    def get_all_argument(self) -> List[Argument]:
+    def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]:
         """Get all arguments.
+
+        Parameters
+        ----------
+        exclude_hybrid : bool
+            exclude hybrid descriptor to prevent circular calls
         
         Returns
         -------
@@ -84,6 +89,8 @@ def get_all_argument(self) -> List[Argument]:
         """
         arguments = []
         for (name, alias), metd in self.__plugin.plugins.items():
+            if exclude_hybrid and name == "hybrid":
+                continue
             arguments.append(Argument(name=name, dtype=dict, sub_fields=metd(), alias=alias))
         return arguments
 
@@ -123,7 +130,7 @@ def descrpt_se_a_args():
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
-    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net is trainable'
     doc_seed = 'Random seed for parameter initialization'
     doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.'
@@ -138,7 +145,7 @@ def descrpt_se_a_args():
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
         Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side),
-        Argument("precision", str, optional = True, default = "float64", doc = doc_precision),
+        Argument("precision", str, optional = True, default = "default", doc = doc_precision),
         Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
         Argument("seed", [int,None], optional = True, doc = doc_seed),
         Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types),
@@ -156,7 +163,7 @@ def descrpt_se_t_args():
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
     doc_seed = 'Random seed for parameter initialization'
     doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used'
@@ -168,7 +175,7 @@ def descrpt_se_t_args():
         Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron),
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
-        Argument("precision", str, optional = True, default = "float64", doc = doc_precision),
+        Argument("precision", str, optional = True, default = "default", doc = doc_precision),
         Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
         Argument("seed", [int,None], optional = True, doc = doc_seed),
         Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero)
@@ -200,7 +207,7 @@ def descrpt_se_r_args():
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
-    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
     doc_seed = 'Random seed for parameter initialization'
     doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.'
@@ -214,7 +221,7 @@ def descrpt_se_r_args():
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
         Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side),
-        Argument("precision", str, optional = True, default = "float64", doc = doc_precision),
+        Argument("precision", str, optional = True, default = "default", doc = doc_precision),
         Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
         Argument("seed", [int,None], optional = True, doc = doc_seed),
         Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types),
@@ -231,7 +238,7 @@ def descrpt_hybrid_args():
     ]
 
 
-def descrpt_variant_type_args():
+def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant:
     link_lf = make_link('loc_frame', 'model/descriptor[loc_frame]')
     link_se_e2_a = make_link('se_e2_a', 'model/descriptor[se_e2_a]')
     link_se_e2_r = make_link('se_e2_r', 'model/descriptor[se_e2_r]')
@@ -255,7 +262,7 @@ def fitting_ener():
     doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.'
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
     doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
-    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\
 - bool: True if all parameters of the fitting net are trainable, False otherwise.\n\n\
@@ -269,7 +276,7 @@ def fitting_ener():
         Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam),
         Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
-        Argument("precision", str, optional = True, default = 'float64', doc = doc_precision),
+        Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
         Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
         Argument("trainable", [list,bool], optional = True, default = True, doc = doc_trainable),
         Argument("rcond", float, optional = True, default = 1e-3, doc = doc_rcond),
@@ -282,7 +289,7 @@ def fitting_polar():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
     doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``'
     #doc_diag_shift = 'The diagonal part of the polarizability matrix  will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``.'
     doc_fit_diag = 'Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.'
@@ -296,7 +303,7 @@ def fitting_polar():
         Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
-        Argument("precision", str, optional = True, default = 'float64', doc = doc_precision),
+        Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
         Argument("fit_diag", bool, optional = True, default = True, doc = doc_fit_diag),
         Argument("scale", [list,float], optional = True, default = 1.0, doc = doc_scale),
         #Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift),
@@ -314,14 +321,14 @@ def fitting_dipole():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
     doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())}'
+    doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.'
     doc_seed = 'Random seed for parameter initialization of the fitting net'
     return [
         Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
         Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
         Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
-        Argument("precision", str, optional = True, default = 'float64', doc = doc_precision),
+        Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
         Argument("sel_type", [list,int,None], optional = True, alias = ['dipole_type'], doc = doc_sel_type),
         Argument("seed", [int,None], optional = True, doc = doc_seed)
     ]    
@@ -479,6 +486,7 @@ def loss_ener():
     doc_start_pref_pf = start_pref('atom_pref')
     doc_limit_pref_pf = limit_pref('atom_pref')
     doc_relative_f = 'If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label.'
+    doc_enable_atom_ener_coeff = "If true, the energy will be computed as \\sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1."
     return [
         Argument("start_pref_e", [float,int], optional = True, default = 0.02, doc = doc_start_pref_e),
         Argument("limit_pref_e", [float,int], optional = True, default = 1.00, doc = doc_limit_pref_e),
@@ -490,7 +498,8 @@ def loss_ener():
         Argument("limit_pref_ae", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_ae),
         Argument("start_pref_pf", [float,int], optional = True, default = 0.00, doc = doc_start_pref_pf),
         Argument("limit_pref_pf", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_pf),
-        Argument("relative_f", [float,None], optional = True, doc = doc_relative_f)
+        Argument("relative_f", [float,None], optional = True, doc = doc_relative_f),
+        Argument("enable_atom_ener_coeff", [bool], optional=True, default=False, doc=doc_enable_atom_ener_coeff),
     ]
 
 # YWolfeee: Modified to support tensor type of loss args.
diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py
index 435a90aa49..27b6a830a4 100644
--- a/deepmd/utils/batch_size.py
+++ b/deepmd/utils/batch_size.py
@@ -8,10 +8,10 @@
 class AutoBatchSize:
     """This class allows DeePMD-kit to automatically decide the maximum
     batch size that will not cause an OOM error.
-    
+
     Notes
     -----
-    We assume all OOM error will raise :metd:`OutOfMemoryError`.
+    We assume all OOM error will raise :class:`OutOfMemoryError`.
 
     Parameters
     ----------
diff --git a/deepmd/utils/convert.py b/deepmd/utils/convert.py
index 2c9a653002..7dc8ebb06f 100644
--- a/deepmd/utils/convert.py
+++ b/deepmd/utils/convert.py
@@ -83,6 +83,28 @@ def convert_10_to_21(input_model: str, output_model: str):
     print("the converted output model (2.1 support) is saved in %s" % output_model)
 
 
+def convert_012_to_21(input_model: str, output_model: str):
+    """Convert DP 0.12 graph to 2.1 graph by chaining the per-version graph-text conversions.
+    
+    Parameters
+    ----------
+    input_model : str
+        filename of the input graph
+    output_model : str
+        filename of the output graph
+    """
+    convert_pb_to_pbtxt(input_model, 'frozen_model.pbtxt')  # scratch text file, given as a relative path
+    convert_dp012_to_dp10('frozen_model.pbtxt')
+    convert_dp10_to_dp11('frozen_model.pbtxt')
+    convert_dp12_to_dp13('frozen_model.pbtxt')  # NOTE(review): no dp11 -> dp12 step is called before this one; confirm that is intended
+    convert_dp13_to_dp20('frozen_model.pbtxt')
+    convert_dp20_to_dp21('frozen_model.pbtxt')
+    convert_pbtxt_to_pb('frozen_model.pbtxt', output_model)
+    if os.path.isfile('frozen_model.pbtxt'):  # remove the scratch file after the output is written
+        os.remove('frozen_model.pbtxt')
+    print("the converted output model (2.1 support) is saved in %s" % output_model)
+
+
 def convert_20_to_21(input_model: str, output_model: str):
     """Convert DP 2.0 graph to 2.1 graph.
     
@@ -134,6 +156,24 @@ def convert_pbtxt_to_pb(pbtxtfile: str, pbfile: str):
         tf.train.write_graph(graph_def, './', pbfile, as_text=False)
 
 
+def convert_dp012_to_dp10(file: str):
+    """Convert DP 0.12 graph text to 1.0 graph text by renaming the ``*Norot`` ops to ``*SeA``.
+    
+    Parameters
+    ----------
+    file : str
+        filename of the graph text; the file is rewritten in place
+    """
+    with open(file) as fp:
+        file_content = fp.read()
+    file_content = file_content\
+                   .replace('DescrptNorot', 'DescrptSeA') \
+                   .replace('ProdForceNorot', 'ProdForceSeA') \
+                   .replace('ProdVirialNorot', 'ProdVirialSeA')
+    with open(file, 'w') as fp:  # overwrite the same file with the renamed text
+        fp.write(file_content)
+
+
 def convert_dp10_to_dp11(file: str):
     """Convert DP 1.0 graph text to 1.1 graph text.
     
diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 99337aa41d..0709352d7a 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -92,7 +92,8 @@ def add(self,
             must : bool = False, 
             high_prec : bool = False,
             type_sel : List[int] = None,
-            repeat : int = 1
+            repeat : int = 1,
+            default: float=0.,
     ) :
         """
         Add a data item that to be loaded
@@ -116,6 +117,8 @@ def add(self,
                 Select certain type of atoms
         repeat
                 The data will be repeated `repeat` times.
+        default : float, default=0.
+                default value of data
         """
         self.data_dict[key] = {'ndof': ndof, 
                                'atomic': atomic,
@@ -124,6 +127,7 @@ def add(self,
                                'type_sel': type_sel,
                                'repeat': repeat,
                                'reduce': None,
+                               'default': default,
         }
         return self
 
@@ -438,7 +442,9 @@ def _load_set(self, set_name: DPPath) :
                                       high_prec = self.data_dict[kk]['high_prec'],
                                       must = self.data_dict[kk]['must'], 
                                       type_sel = self.data_dict[kk]['type_sel'],
-                                      repeat = self.data_dict[kk]['repeat'])
+                                      repeat = self.data_dict[kk]['repeat'],
+                                      default=self.data_dict[kk]['default'],
+                                      )
         for kk in self.data_dict.keys():
             if self.data_dict[kk]['reduce'] is not None :
                 k_in = self.data_dict[kk]['reduce']
@@ -450,7 +456,7 @@ def _load_set(self, set_name: DPPath) :
         return data
 
 
-    def _load_data(self, set_name, key, nframes, ndof_, atomic = False, must = True, repeat = 1, high_prec = False, type_sel = None):
+    def _load_data(self, set_name, key, nframes, ndof_, atomic = False, must = True, repeat = 1, high_prec = False, type_sel = None, default: float=0.):
         if atomic:
             natoms = self.natoms
             idx_map = self.idx_map
@@ -487,9 +493,9 @@ def _load_data(self, set_name, key, nframes, ndof_, atomic = False, must = True,
             raise RuntimeError("%s not found!" % path)
         else:
             if high_prec :
-                data = np.zeros([nframes,ndof]).astype(GLOBAL_ENER_FLOAT_PRECISION)                
+                data = np.full([nframes, ndof], default, dtype=GLOBAL_ENER_FLOAT_PRECISION)                
             else :
-                data = np.zeros([nframes,ndof]).astype(GLOBAL_NP_FLOAT_PRECISION)
+                data = np.full([nframes, ndof], default, dtype=GLOBAL_NP_FLOAT_PRECISION)
             if repeat != 1:
                 data = np.repeat(data, repeat).reshape([nframes, -1])
             return np.float32(0.0), data
diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 0103aeab1f..656a7a2e7b 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -205,14 +205,17 @@ def add_dict(self, adict: dict) -> None:
         """
         Add items to the data system by a `dict`.
         `adict` should have items like
-        adict[key] = {
+        .. code-block:: python
+
+           adict[key] = {
                    'ndof': ndof, 
                    'atomic': atomic,
                    'must': must, 
                    'high_prec': high_prec,
                    'type_sel': type_sel,
                    'repeat': repeat,
-        }        
+           }
+
         For the explaination of the keys see `add`
         """
         for kk in adict :
@@ -222,7 +225,9 @@ def add_dict(self, adict: dict) -> None:
                      must=adict[kk]['must'], 
                      high_prec=adict[kk]['high_prec'], 
                      type_sel=adict[kk]['type_sel'], 
-                     repeat=adict[kk]['repeat'])
+                     repeat=adict[kk]['repeat'],
+                     default=adict[kk]['default'],
+                     )
 
     def add(self, 
             key : str, 
@@ -231,7 +236,8 @@ def add(self,
             must : bool = False, 
             high_prec : bool = False,
             type_sel : List[int] = None,
-            repeat : int = 1
+            repeat : int = 1,
+            default: float=0.,
     ) :
         """
         Add a data item that to be loaded
@@ -255,9 +261,11 @@ def add(self,
                 Select certain type of atoms
         repeat
                 The data will be repeated `repeat` times.
+        default : float, default=0.
+                Default value of data
         """
         for ii in self.data_systems:
-            ii.add(key, ndof, atomic=atomic, must=must, high_prec=high_prec, repeat=repeat, type_sel=type_sel)
+            ii.add(key, ndof, atomic=atomic, must=must, high_prec=high_prec, repeat=repeat, type_sel=type_sel, default=default)
 
     def reduce(self, key_out, key_in):
         """
diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index fe582311cf..9088cfc8a4 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -7,7 +7,7 @@
 from deepmd.env import default_tf_session_config
 from deepmd.env import GLOBAL_NP_FLOAT_PRECISION
 from deepmd.utils.data_system import DeepmdDataSystem
-from deepmd.utils.sess import run_sess
+from deepmd.utils.parallel_op import ParallelOp
 
 log = logging.getLogger(__name__)
 
@@ -32,21 +32,28 @@ def __init__(self,
         """
         self.rcut = rcut
         self.ntypes = ntypes
-        self.place_holders = {}
         sub_graph = tf.Graph()
-        with sub_graph.as_default():
+
+        def builder():
+            place_holders = {}
             for ii in ['coord', 'box']:
-                self.place_holders[ii] = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None], name='t_'+ii)
-            self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type')
-            self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms')
-            self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh')
-            self._max_nbor_size, self._min_nbor_dist \
-                = op_module.neighbor_stat(self.place_holders['coord'],
-                                         self.place_holders['type'],
-                                         self.place_holders['natoms_vec'],
-                                         self.place_holders['box'],
-                                         self.place_holders['default_mesh'],
+                place_holders[ii] = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None], name='t_'+ii)
+            place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type')
+            place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms')
+            place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh')
+            _max_nbor_size, _min_nbor_dist \
+                = op_module.neighbor_stat(place_holders['coord'],
+                                         place_holders['type'],
+                                         place_holders['natoms_vec'],
+                                         place_holders['box'],
+                                         place_holders['default_mesh'],
                                          rcut = self.rcut)
+            place_holders['dir'] = tf.placeholder(tf.string)
+            return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders['dir'])
+
+        with sub_graph.as_default():
+            self.p = ParallelOp(builder, config=default_tf_session_config)
+
         self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config)
 
     def get_stat(self,
@@ -69,39 +76,37 @@ def get_stat(self,
         self.min_nbor_dist = 100.0
         self.max_nbor_size = [0] * self.ntypes
 
-        # for ii in tqdm(range(len(data.system_dirs)), desc = 'DEEPMD INFO    |-> deepmd.utils.neighbor_stat\t\t\tgetting neighbor status'):
-        for ii in range(len(data.system_dirs)):
-            for jj in data.data_systems[ii].dirs:
-                data_set = data.data_systems[ii]._load_set(jj)
-                for kk in range(np.array(data_set['type']).shape[0]):
-                    mn, dt \
-                        = run_sess(self.sub_sess, [self._max_nbor_size, self._min_nbor_dist], 
-                                            feed_dict = {
-                                                self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]),
-                                                self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]),
-                                                self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]),
-                                                self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]),
-                                                self.place_holders['default_mesh']: np.array(data.default_mesh[ii]),
-                                            })
-                    if dt.size != 0:
-                        dt = np.min(dt)              
-                    else:
-                        dt = self.rcut
-                        log.warning("Atoms with no neighbors found in %s. Please make sure it's what you expected."%jj)
-                        
-                    if dt < self.min_nbor_dist:
-                        if math.isclose(dt, 0., rel_tol=1e-6):
-                            # it's unexpected that the distance between two atoms is zero
-                            # zero distance will cause nan (#874) 
-                            raise RuntimeError(
-                                "Some atoms in %s are overlapping. Please check your"
-                                " training data to remove duplicated atoms." % jj
-                            )
-                        self.min_nbor_dist = dt
-                    for ww in range(self.ntypes):
-                        var = np.max(mn[:, ww])
-                        if var > self.max_nbor_size[ww]:
-                            self.max_nbor_size[ww] = var
+        def feed():
+            for ii in range(len(data.system_dirs)):
+                for jj in data.data_systems[ii].dirs:
+                    data_set = data.data_systems[ii]._load_set(jj)
+                    for kk in range(np.array(data_set['type']).shape[0]):
+                        yield {
+                            'coord': np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]),
+                            'type': np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]),
+                            'natoms_vec': np.array(data.natoms_vec[ii]),
+                            'box': np.array(data_set['box'])[kk].reshape([-1, 9]),
+                            'default_mesh': np.array(data.default_mesh[ii]),
+                            'dir': str(jj),
+                        }
+
+        for mn, dt, jj in self.p.generate(self.sub_sess, feed()):
+            if dt.size != 0:
+                dt = np.min(dt)
+            else:
+                dt = self.rcut
+                log.warning("Atoms with no neighbors found in %s. Please make sure it's what you expected." % jj)
+            if dt < self.min_nbor_dist:
+                if math.isclose(dt, 0., rel_tol=1e-6):
+                    # it's unexpected that the distance between two atoms is zero
+                    # zero distance will cause nan (#874) 
+                    raise RuntimeError(
+                        "Some atoms are overlapping in %s. Please check your"
+                        " training data to remove duplicated atoms." % jj
+                    )
+                self.min_nbor_dist = dt
+            var = np.max(mn, axis=0)
+            self.max_nbor_size = np.maximum(var, self.max_nbor_size)
 
         log.info('training data with min nbor dist: ' + str(self.min_nbor_dist))
         log.info('training data with max nbor size: ' + str(self.max_nbor_size))
diff --git a/deepmd/utils/parallel_op.py b/deepmd/utils/parallel_op.py
new file mode 100644
index 0000000000..91f3d6d743
--- /dev/null
+++ b/deepmd/utils/parallel_op.py
@@ -0,0 +1,80 @@
+from typing import Callable, Generator, Tuple, Dict, Any
+
+from deepmd.env import tf
+from deepmd.utils.sess import run_sess
+
+
+class ParallelOp:
+    """Run an op with data parallelism: ``builder`` is called ``nthreads`` times and the copies are run together.
+    
+    Parameters
+    ----------
+    builder : Callable[..., Tuple[Dict[str, tf.Tensor], Tuple[tf.Tensor]]]
+        called with no arguments; returns two objects: a dict which stores placeholders by key, and a tuple with the final op(s)
+    nthreads : int, optional
+        the number of copies of the op to build; takes precedence over ``config``
+    config : tf.ConfigProto, optional
+        used when ``nthreads`` is None: ``max(config.inter_op_parallelism_threads, 1)`` copies are built; if both are None, one copy
+    
+    Examples
+    --------
+    >>> from deepmd.env import tf
+    >>> from deepmd.utils.parallel_op import ParallelOp
+    >>> def builder():
+    ...     x = tf.placeholder(tf.int32, [1])
+    ...     return {"x": x}, (x + 1)
+    ...
+    >>> p = ParallelOp(builder, nthreads=4)
+    >>> def feed():
+    ...     for ii in range(10):
+    ...         yield {"x": [ii]}
+    ...
+    >>> print(*p.generate(tf.Session(), feed()))
+    [1] [2] [3] [4] [5] [6] [7] [8] [9] [10]
+    """
+    def __init__(self, builder: Callable[..., Tuple[Dict[str, tf.Tensor], Tuple[tf.Tensor]]], nthreads: int = None, config: tf.ConfigProto = None) -> None:
+        if nthreads is not None:
+            self.nthreads = nthreads
+        elif config is not None:
+            self.nthreads = max(config.inter_op_parallelism_threads, 1)
+        else:
+            self.nthreads = 1
+        
+        self.placeholders = []  # placeholder dict of each copy, indexed by task number
+        self.ops = []  # final op(s) of each copy, same indexing as self.placeholders
+        for ii in range(self.nthreads):
+            with tf.name_scope("task_%d" % ii) as scope:  # each copy is built under its own name scope
+                placeholder, op = builder()
+                self.placeholders.append(placeholder)
+                self.ops.append(op)
+
+    def generate(self, sess: tf.Session, feed: Generator[Dict[str, Any], None, None]) -> Generator[Tuple, None, None]:
+        """Run the op copies in ``sess`` on the data from ``feed``, consuming up to ``nthreads`` feed dicts per session call.
+
+        Parameters
+        ----------
+        feed : Generator[dict, None, None]
+            generator which yields feed_dict, keyed by the same keys as the placeholder dict returned by ``builder``
+        
+        Yields
+        ------
+        Generator[Tuple, None, None]
+            generator which yields session returns, i.e. the ``run_sess`` results of the op copies that received data
+        """
+        nn = self.nthreads  # number of op copies to run in the current round
+        while True:
+            feed_dict = {}
+            for ii in range(self.nthreads):
+                try:
+                    fd = next(feed)
+                except StopIteration:
+                    if ii == 0:  # feed ran out before this round received any data
+                        return
+                    nn = ii  # last, partial round: only the first ii copies received data
+                    break
+                for kk, vv in fd.items():
+                    feed_dict[self.placeholders[ii][kk]] = vv  
+            ops = self.ops[:nn]       
+            for yy in run_sess(sess, ops, feed_dict=feed_dict):
+                yield yy
+        
diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py
index b39fba7e3c..43f2e741ec 100644
--- a/deepmd/utils/path.py
+++ b/deepmd/utils/path.py
@@ -64,7 +64,7 @@ def glob(self, pattern: str) -> List["DPPath"]:
     
     @abstractmethod
     def rglob(self, pattern: str) -> List["DPPath"]:
-        """This is like calling :metd:`DPPath.glob()` with `**/` added in front
+        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
         of the given relative pattern.
         
         Parameters
@@ -161,7 +161,7 @@ def glob(self, pattern: str) -> List["DPPath"]:
         return list([type(self)(p) for p in self.path.glob(pattern)])
 
     def rglob(self, pattern: str) -> List["DPPath"]:
-        """This is like calling :metd:`DPPath.glob()` with `**/` added in front
+        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
         of the given relative pattern.
         
         Parameters
@@ -277,7 +277,7 @@ def glob(self, pattern: str) -> List["DPPath"]:
         return list([type(self)("%s#%s"%(self.root_path, pp)) for pp in globfilter(subpaths, self._connect_path(pattern))])
 
     def rglob(self, pattern: str) -> List["DPPath"]:
-        """This is like calling :metd:`DPPath.glob()` with `**/` added in front
+        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
         of the given relative pattern.
         
         Parameters
diff --git a/deepmd/utils/plugin.py b/deepmd/utils/plugin.py
index 6a40e69fab..af28a4632d 100644
--- a/deepmd/utils/plugin.py
+++ b/deepmd/utils/plugin.py
@@ -32,8 +32,8 @@ def __add__(self, other) -> "Plugin":
     def register(self, key : str) -> Callable[[object], object]:
         """Register a plugin.
         
-        Parameter
-        ---------
+        Parameters
+        ----------
         key : str
             key of the plugin
         
diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py
index ee1088bd3c..41c51569f7 100644
--- a/deepmd/utils/tabulate.py
+++ b/deepmd/utils/tabulate.py
@@ -1,9 +1,8 @@
-import math
 import logging
 import numpy as np
 import deepmd
 from typing import Callable
-from typing import Tuple, List
+from typing import Tuple, List, Dict
 from functools import lru_cache
 from scipy.special import comb
 from deepmd.env import tf
@@ -84,26 +83,17 @@ def __init__(self,
         self.sub_sess = tf.Session(graph = self.sub_graph)
 
         if isinstance(self.descrpt, deepmd.descriptor.DescrptSeR):
-            try:
-                self.sel_a = self.graph.get_operation_by_name('ProdEnvMatR').get_attr('sel')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('ProdEnvMatR')
-            except KeyError:
-                self.sel_a = self.graph.get_operation_by_name('DescrptSeR').get_attr('sel')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('DescrptSeR')
+            self.sel_a = self.descrpt.sel_r
+            self.rcut = self.descrpt.rcut
+            self.rcut_smth = self.descrpt.rcut_smth
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeA):
-            try:
-                self.sel_a = self.graph.get_operation_by_name('ProdEnvMatA').get_attr('sel_a')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('ProdEnvMatA')
-            except KeyError:
-                self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('DescrptSeA')
+            self.sel_a = self.descrpt.sel_a
+            self.rcut = self.descrpt.rcut_r
+            self.rcut_smth = self.descrpt.rcut_r_smth
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT):
-            try:
-                self.sel_a = self.graph.get_operation_by_name('ProdEnvMatA').get_attr('sel_a')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('ProdEnvMatA')
-            except KeyError:
-                self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a')
-                self.prod_env_mat_op = self.graph.get_operation_by_name ('DescrptSeA')
+            self.sel_a = self.descrpt.sel_a
+            self.rcut = self.descrpt.rcut_r
+            self.rcut_smth = self.descrpt.rcut_r_smth
         else:
             raise RuntimeError("Unsupported descriptor")
 
@@ -111,13 +101,6 @@ def __init__(self,
         self.dstd = get_tensor_by_name_from_graph(self.graph, f'descrpt_attr{self.suffix}/t_std')
         self.ntypes = get_tensor_by_name_from_graph(self.graph, 'descrpt_attr/ntypes')
 
-        if isinstance(self.descrpt, deepmd.descriptor.DescrptSeR):
-            self.rcut = self.prod_env_mat_op.get_attr('rcut')
-            self.rcut_smth = self.prod_env_mat_op.get_attr('rcut_smth')
-        else:
-            self.rcut = self.prod_env_mat_op.get_attr('rcut_r')
-            self.rcut_smth = self.prod_env_mat_op.get_attr('rcut_r_smth')
-
         self.embedding_net_nodes = get_embedding_net_nodes_from_graph_def(self.graph_def, suffix=self.suffix)
 
         # move it to the descriptor class
@@ -137,12 +120,15 @@ def __init__(self,
 
         self.data = {}
 
+        self.upper = {}
+        self.lower = {}
+
 
     def build(self, 
               min_nbor_dist : float,
               extrapolate : float, 
               stride0 : float, 
-              stride1 : float) -> Tuple[int, int]:
+              stride1 : float) -> Tuple[Dict[str, int], Dict[str, int]]:
         """
         Build the tables for model compression
 
@@ -161,81 +147,101 @@ def build(self,
 
         Returns
         ----------
-        lower
-                The lower boundary of environment matrix
-        upper
-                The upper boundary of environment matrix
+        lower : dict[str, int]
+                The lower boundary of environment matrix by net
+        upper : dict[str, int]
+                The upper boundary of environment matrix by net
         """
         # tabulate range [lower, upper] with stride0 'stride0'
         lower, upper = self._get_env_mat_range(min_nbor_dist)
-
         if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA):
-            xx = np.arange(lower, upper, stride0, dtype = self.data_type)
-            xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type))
-            xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type))
-            self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1)
             for ii in range(self.table_size):
                 if (self.type_one_side and not self._all_excluded(ii)) or (not self.type_one_side and (ii // self.ntypes, ii % self.ntypes) not in self.exclude_types):
                     if self.type_one_side:
                         net = "filter_-1_net_" + str(ii)
+                        # upper and lower should consider all types which are not excluded and sel>0
+                        idx = [(type_i, ii) not in self.exclude_types and self.sel_a[type_i] > 0 for type_i in range(self.ntypes)]
+                        uu = np.max(upper[idx])
+                        ll = np.min(lower[idx])
                     else:
-                        net = "filter_" + str(ii // self.ntypes) + "_net_" + str(ii % self.ntypes)
-                    self._build_lower(net, xx, ii, upper, lower, stride0, stride1, extrapolate)
+                        ielement = ii // self.ntypes
+                        net = "filter_" + str(ielement) + "_net_" + str(ii % self.ntypes)
+                        uu = upper[ielement]
+                        ll = lower[ielement]
+                    xx = np.arange(ll, uu, stride0, dtype = self.data_type)
+                    xx = np.append(xx, np.arange(uu, extrapolate * uu, stride1, dtype = self.data_type))
+                    xx = np.append(xx, np.array([extrapolate * uu], dtype = self.data_type))
+                    nspline = ((uu - ll) / stride0 + (extrapolate * uu - uu) / stride1).astype(int)
+                    self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT):
-            xx = np.arange(extrapolate * lower, lower, stride1, dtype = self.data_type)
-            xx = np.append(xx, np.arange(lower, upper, stride0, dtype = self.data_type))
-            xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type))
-            xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type))
-            self.nspline = int((upper - lower) / stride0 + 2 * ((extrapolate * upper - upper) / stride1))
+            xx_all = []
+            for ii in range(self.ntypes):
+                xx = np.arange(extrapolate * lower[ii], lower[ii], stride1, dtype = self.data_type)
+                xx = np.append(xx, np.arange(lower[ii], upper[ii], stride0, dtype = self.data_type))
+                xx = np.append(xx, np.arange(upper[ii], extrapolate * upper[ii], stride1, dtype = self.data_type))
+                xx = np.append(xx, np.array([extrapolate * upper[ii]], dtype = self.data_type))
+                xx_all.append(xx)
+            nspline = ((upper - lower) / stride0 + 2 * ((extrapolate * upper - upper) / stride1)).astype(int)
             idx = 0
             for ii in range(self.ntypes):
                 for jj in range(ii, self.ntypes):
                     net = "filter_" + str(ii) + "_net_" + str(jj)
-                    self._build_lower(net, xx, idx, upper, lower, stride0, stride1, extrapolate)
+                    self._build_lower(net, xx_all[ii], idx, upper[ii], lower[ii], stride0, stride1, extrapolate, nspline[ii])
                     idx += 1
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeR):
-            xx = np.arange(lower, upper, stride0, dtype = self.data_type)
-            xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type))
-            xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type))
-            self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1)
             for ii in range(self.table_size):
                 if (self.type_one_side and not self._all_excluded(ii)) or (not self.type_one_side and (ii // self.ntypes, ii % self.ntypes) not in self.exclude_types):
                     if self.type_one_side:
                         net = "filter_-1_net_" + str(ii)
+                        # upper and lower should consider all types which are not excluded and sel>0
+                        idx = [(type_i, ii) not in self.exclude_types and self.sel_a[type_i] > 0 for type_i in range(self.ntypes)]
+                        uu = np.max(upper[idx])
+                        ll = np.min(lower[idx])
                     else:
-                        net = "filter_" + str(ii // self.ntypes) + "_net_" + str(ii % self.ntypes)
-                    self._build_lower(net, xx, ii, upper, lower, stride0, stride1, extrapolate)
+                        ielement = ii // self.ntypes
+                        net = "filter_" + str(ielement) + "_net_" + str(ii % self.ntypes)
+                        uu = upper[ielement]
+                        ll = lower[ielement]
+                    xx = np.arange(ll, uu, stride0, dtype = self.data_type)
+                    xx = np.append(xx, np.arange(uu, extrapolate * uu, stride1, dtype = self.data_type))
+                    xx = np.append(xx, np.array([extrapolate * uu], dtype = self.data_type))
+                    nspline = ((uu - ll) / stride0 + (extrapolate * uu - uu) / stride1).astype(int)
+                    self._build_lower(net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline)
         else:
             raise RuntimeError("Unsupported descriptor")
+        self._convert_numpy_to_tensor()
 
-        return lower, upper
+        return self.lower, self.upper
 
-    def _build_lower(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate):
+    def _build_lower(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline):
         vv, dd, d2 = self._make_data(xx, idx)
-        self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type)
+        self.data[net] = np.zeros([nspline, 6 * self.last_layer_size], dtype = self.data_type)
 
-        # tt.shape: [self.nspline, self.last_layer_size]
+        # tt.shape: [nspline, self.last_layer_size]
         if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA):
-            tt = np.full((self.nspline, self.last_layer_size), stride1)
+            tt = np.full((nspline, self.last_layer_size), stride1)
             tt[:int((upper - lower) / stride0), :] = stride0
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT):
-            tt = np.full((self.nspline, self.last_layer_size), stride1)
+            tt = np.full((nspline, self.last_layer_size), stride1)
             tt[int((lower - extrapolate * lower) / stride1) + 1:(int((lower - extrapolate * lower) / stride1) + int((upper - lower) / stride0)), :] = stride0
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeR):
-            tt = np.full((self.nspline, self.last_layer_size), stride1)
+            tt = np.full((nspline, self.last_layer_size), stride1)
             tt[:int((upper - lower) / stride0), :] = stride0
         else:
             raise RuntimeError("Unsupported descriptor")
 
-        # hh.shape: [self.nspline, self.last_layer_size]
-        hh = vv[1:self.nspline+1, :self.last_layer_size] - vv[:self.nspline, :self.last_layer_size]
+        # hh.shape: [nspline, self.last_layer_size]
+        hh = vv[1:nspline+1, :self.last_layer_size] - vv[:nspline, :self.last_layer_size]
 
-        self.data[net][:, :6 * self.last_layer_size:6] = vv[:self.nspline, :self.last_layer_size]
-        self.data[net][:, 1:6 * self.last_layer_size:6] = dd[:self.nspline, :self.last_layer_size]
-        self.data[net][:, 2:6 * self.last_layer_size:6] = 0.5 * d2[:self.nspline, :self.last_layer_size]
-        self.data[net][:, 3:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[1:self.nspline+1, :self.last_layer_size] + 12 * dd[:self.nspline, :self.last_layer_size]) * tt - (3 * d2[:self.nspline, :self.last_layer_size] - d2[1:self.nspline+1, :self.last_layer_size]) * tt * tt)
-        self.data[net][:, 4:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[1:self.nspline+1, :self.last_layer_size] + 16 * dd[:self.nspline, :self.last_layer_size]) * tt + (3 * d2[:self.nspline, :self.last_layer_size] - 2 * d2[1:self.nspline+1, :self.last_layer_size]) * tt * tt)
-        self.data[net][:, 5:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[1:self.nspline+1, :self.last_layer_size] + dd[:self.nspline, :self.last_layer_size]) * tt + (d2[1:self.nspline+1, :self.last_layer_size] - d2[:self.nspline, :self.last_layer_size]) * tt * tt)
+        self.data[net][:, :6 * self.last_layer_size:6] = vv[:nspline, :self.last_layer_size]
+        self.data[net][:, 1:6 * self.last_layer_size:6] = dd[:nspline, :self.last_layer_size]
+        self.data[net][:, 2:6 * self.last_layer_size:6] = 0.5 * d2[:nspline, :self.last_layer_size]
+        self.data[net][:, 3:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[1:nspline+1, :self.last_layer_size] + 12 * dd[:nspline, :self.last_layer_size]) * tt - (3 * d2[:nspline, :self.last_layer_size] - d2[1:nspline+1, :self.last_layer_size]) * tt * tt)
+        self.data[net][:, 4:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[1:nspline+1, :self.last_layer_size] + 16 * dd[:nspline, :self.last_layer_size]) * tt + (3 * d2[:nspline, :self.last_layer_size] - 2 * d2[1:nspline+1, :self.last_layer_size]) * tt * tt)
+        self.data[net][:, 5:6 * self.last_layer_size:6] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[1:nspline+1, :self.last_layer_size] + dd[:nspline, :self.last_layer_size]) * tt + (d2[1:nspline+1, :self.last_layer_size] - d2[:nspline, :self.last_layer_size]) * tt * tt)
+
+        self.upper[net] = upper
+        self.lower[net] = lower
 
     def _load_sub_graph(self):
         sub_graph_def = tf.GraphDef()
@@ -387,24 +393,23 @@ def _layer_1(self, x, w, b):
     # Change the embedding net range to sw / min_nbor_dist
     def _get_env_mat_range(self,
                            min_nbor_dist):
-        lower = +100.0
-        upper = -100.0
         sw    = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut)
         if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA):
-            lower = np.min(-self.davg[:, 0] / self.dstd[:, 0])
-            upper = np.max(((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0])
+            lower = -self.davg[:, 0] / self.dstd[:, 0]
+            upper = ((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0]
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT):
             var = np.square(sw / (min_nbor_dist * self.dstd[:, 1:4]))
-            lower = np.min(-var)
-            upper = np.max(var)
+            lower = np.min(-var, axis=1)
+            upper = np.max(var, axis=1)
         elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeR):
-            lower = np.min(-self.davg[:, 0] / self.dstd[:, 0])
-            upper = np.max(((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0])
+            lower = -self.davg[:, 0] / self.dstd[:, 0]
+            upper = ((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0]
         else:
             raise RuntimeError("Unsupported descriptor")
         log.info('training data with lower boundary: ' + str(lower))
         log.info('training data with upper boundary: ' + str(upper))
-        return math.floor(lower), math.ceil(upper)
+        # returns element-wise lower and upper
+        return np.floor(lower), np.ceil(upper)
 
     def _spline5_switch(self,
                         xx,
@@ -484,3 +489,8 @@ def _get_last_layer_size(self):
             if len(item) != 0:
                 return item.shape[1]
         return 0
+
+    def _convert_numpy_to_tensor(self):
+        """Convert self.data from np.ndarray to tf.Tensor."""
+        for ii in self.data:
+            self.data[ii] = tf.constant(self.data[ii])
diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py
index 80808c4e8e..5d5b5c9888 100644
--- a/deepmd/utils/type_embed.py
+++ b/deepmd/utils/type_embed.py
@@ -10,8 +10,7 @@
 from deepmd.utils.network import  embedding_net
 
 from deepmd.utils.graph import get_type_embedding_net_variables_from_graph_def
-from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, get_np_precision
-from deepmd.utils.argcheck import list_to_doc
+from deepmd.common import get_activation_func, get_precision
 
 
 def embed_atom_type(
@@ -64,9 +63,9 @@ class TypeEmbedNet():
             Time-step `dt` in the resnet construction:
             y = x + dt * \phi (Wx + b)
     activation_function
-            The activation function in the embedding net. Supported options are {0}
+            The activation function in the embedding net. Supported options are |ACTIVATION_FN|
     precision
-            The precision of the embedding net parameters. Supported options are {1}        
+            The precision of the embedding net parameters. Supported options are |PRECISION| 
     trainable
             If the weights of embedding net are trainable.
     seed
@@ -74,7 +73,6 @@ class TypeEmbedNet():
     uniform_seed
             Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed
     """
-    @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
     def __init__(
             self,
             neuron: List[int]=[],
diff --git a/doc/cli.rst b/doc/cli.rst
new file mode 100644
index 0000000000..4c52a9ede8
--- /dev/null
+++ b/doc/cli.rst
@@ -0,0 +1,7 @@
+Command line interface
+======================
+
+.. argparse::
+   :module: deepmd.entrypoints.main
+   :func: main_parser
+   :prog: dp
diff --git a/doc/conf.py b/doc/conf.py
index 5dc12be1f7..8cf9394ab6 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -16,6 +16,8 @@
 import recommonmark
 from recommonmark.transform import AutoStructify
 from datetime import date
+from deepmd.common import ACTIVATION_FN_DICT, PRECISION_DICT
+from deepmd.utils.argcheck import list_to_doc
 
 def mkindex(dirname):
     dirname = dirname + "/"
@@ -110,31 +112,6 @@ def classify_index_TS():
 copyright = '2017-%d, DeepModeling' % date.today().year
 author = 'DeepModeling'
 
-def run_doxygen(folder):
-    """Run the doxygen make command in the designated folder"""
-
-    try:
-        retcode = subprocess.call("cd %s; doxygen Doxyfile" % folder, shell=True)
-        if retcode < 0:
-            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
-    except OSError as e:
-        sys.stderr.write("doxygen execution failed: %s" % e)
-
-
-def generate_doxygen_xml(app):
-    """Run the doxygen make commands if we're on the ReadTheDocs server"""
-
-    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-
-    if read_the_docs_build:
-        run_doxygen("./")
-    else:
-        subprocess.call("doxygen Doxyfile", shell=True)
-
-def generate_train_input(app):
-    with open("train-input-auto.rst", 'w') as f:
-        f.write(subprocess.check_output((sys.executable, "-m", "deepmd", "doc-train-input"), universal_newlines=True))
-
 def run_apidoc(_):
     from sphinx.ext.apidoc import main
     import sys
@@ -146,9 +123,7 @@ def run_apidoc(_):
 def setup(app):
 
     # Add hook for building doxygen xml when needed
-    app.connect("builder-inited", generate_doxygen_xml)
     app.connect('builder-inited', run_apidoc)
-    app.connect('builder-inited', generate_train_input)
 
 # -- General configuration ---------------------------------------------------
 
@@ -169,12 +144,14 @@ def setup(app):
 
 extensions = [
     "deepmodeling_sphinx",
+    "dargs.sphinx",
     "sphinx_rtd_theme",
     'myst_parser',
     'sphinx.ext.autosummary',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
     'sphinx.ext.intersphinx',
+    'sphinxarg.ext',
     'numpydoc',
     'breathe',
     'exhale'
@@ -235,6 +212,11 @@ def setup(app):
 for typing_type in typing.__all__:
     numpydoc_xref_aliases[typing_type] = "typing.%s" % typing_type
 
+rst_epilog = """
+.. |ACTIVATION_FN| replace:: %s
+.. |PRECISION| replace:: %s
+""" % (list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys()))
+
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
@@ -251,3 +233,16 @@ def setup(app):
 autodoc_default_flags = ['members']
 autosummary_generate = True
 master_doc = 'index'
+mathjax_path = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.0/es5/tex-mml-chtml.min.js'
+myst_enable_extensions = [
+    'dollarmath',
+]
+# fix emoji issue in pdf
+latex_engine = "xelatex"
+latex_elements = {
+    'fontpkg': r'''
+\usepackage{fontspec}
+\setmainfont{Symbola}
+''',
+}
+
diff --git a/doc/data/data-conv.md b/doc/data/data-conv.md
index 6f25e36ba4..d3c0632464 100644
--- a/doc/data/data-conv.md
+++ b/doc/data/data-conv.md
@@ -1,26 +1,21 @@
-# Data conversion
+# Formats of a system
 
-One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units:
+Two binary formats, NumPy and HDF5, are supported for training. The raw format is not directly supported, but a tool is provided to convert data from the raw format to the NumPy format.
 
+## NumPy format
 
-Property | Unit 
----|---
-Time     | ps   
-Length   | Å    
-Energy   | eV   
-Force    | eV/Å 
-Virial   | eV   
-Pressure | Bar  
-
-
-The frames of the system are stored in two formats. A raw file is a plain text file with each information item written in one file and one frame written on one line. The default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. *We recommend you use these file names*. Here is an example of force.raw:
-```bash
-$ cat force.raw
--0.724  2.039 -0.951  0.841 -0.464  0.363
- 6.737  1.554 -5.587 -2.803  0.062  2.222
--1.968 -0.163  1.020 -0.225 -0.789  0.343
+In a system with the NumPy format, the system properties are stored as text files ending with `.raw`, such as `type.raw` and `type_map.raw`, under the system directory. If one needs to train a non-periodic system, an empty `nopbc` file should be put under the system directory. Both input and labeled frame properties are saved as the [NumPy binary data (NPY) files](https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html#npy-format) ending with `.npy` in each of the `set.*` directories. For example, a system may contain the following files:
+```
+type.raw
+type_map.raw
+nopbc
+set.000/coord.npy
+set.000/energy.npy
+set.000/force.npy
+set.001/coord.npy
+set.001/energy.npy
+set.001/force.npy
 ```
-This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. The coordinate file `coord.raw` is organized similarly. In `box.raw`, the 9 components of the box vectors should be provided on each line in the order `XX XY XZ YX YY YZ ZX ZY ZZ`. In `virial.raw`, the 9 components of the virial tensor should be provided on each line in the order `XX XY XZ YX YY YZ ZX ZY ZZ`. The number of lines of all raw files should be identical.
 
 We assume that the atom types do not change in all frames. It is provided by `type.raw`, which has one line with the types of atoms written one by one. The atom types should be integers. For example the `type.raw` of a system that has 2 atoms with 0 and 1:
 ```bash
@@ -35,7 +30,30 @@ O H
 ```
 The type `0` is named by `"O"` and the type `1` is named by `"H"`.
 
-The second format is the data sets of `numpy` binary data that are directly used by the training program. User can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to data sets. For example, if we have a raw file that contains 6000 frames, 
+## HDF5 format
+
+A system with the HDF5 format has the same structure as the NumPy format, but in an HDF5 file, a system is organized as an [HDF5 group](https://docs.h5py.org/en/stable/high/group.html). The file name of a NumPy file is the key in an HDF5 file, and the data is the value of the key. One needs to use `#` in a DP path to separate the path to the HDF5 file from the HDF5 key:
+```
+/path/to/data.hdf5#H2O
+```
+Here, `/path/to/data.hdf5` is the path and `H2O` is the key. There should be some data in the `H2O` group, such as `H2O/type.raw` and `H2O/set.000/force.npy`.
+
+An HDF5 file with a large number of systems has better performance than multiple NumPy files in a large cluster.
+
+## Raw format and data conversion
+
+A raw file is a plain text file with each information item written in one file and one frame written on one line. **It's not directly supported**, but we provide a tool to convert them.
+
+In the raw format, the properties of one frame are provided per line in files ending with `.raw`. For example, the default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. Here is an example of `force.raw`:
+```bash
+$ cat force.raw
+-0.724  2.039 -0.951  0.841 -0.464  0.363
+ 6.737  1.554 -5.587 -2.803  0.062  2.222
+-1.968 -0.163  1.020 -0.225 -0.789  0.343
+```
+This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. Other files are organized similarly. The number of lines of all raw files should be identical.
+
+One can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to the NumPy format. For example, if we have a raw file that contains 6000 frames, 
 ```bash
 $ ls 
 box.raw  coord.raw  energy.raw  force.raw  type.raw  virial.raw
@@ -49,7 +67,4 @@ making set 2 ...
 $ ls 
 box.raw  coord.raw  energy.raw  force.raw  set.000  set.001  set.002  type.raw  virial.raw
 ```
-It generates three sets `set.000`, `set.001` and `set.002`, with each set contains 2000 frames. One do not need to take care of the binary data files in each of the `set.*` directories. The path containing `set.*` and `type.raw` is called a *system*. 
-
-If one needs to train a non-periodic system, an empty `nopbc` file should be put under the system directory. `box.raw` is not necessary in a non-periodic system.
-
+It generates three sets `set.000`, `set.001` and `set.002`, each containing 2000 frames in the NumPy format.
diff --git a/doc/data/index.md b/doc/data/index.md
index d54f52cd8e..256b263aac 100644
--- a/doc/data/index.md
+++ b/doc/data/index.md
@@ -2,7 +2,8 @@
 
 In this section, we will introduce how to convert the DFT labeled data into the data format used by DeePMD-kit.
 
-The DeePMD-kit organize data in `systems`. Each `system` is composed by a number of `frames`. One may roughly view a `frame` as a snap short on an MD trajectory, but it does not necessary come from an MD simulation. A `frame` records the coordinates and types of atoms, cell vectors if the periodic boundary condition is assumed, energy, atomic forces and virial. It is noted that the `frames` in one `system` share the same number of atoms with the same type.
+The DeePMD-kit organizes data in `systems`. Each `system` is composed of a number of `frames`. One may roughly view a `frame` as a snapshot on an MD trajectory, but it does not necessarily come from an MD simulation. A `frame` records the coordinates and types of atoms, cell vectors if the periodic boundary condition is assumed, energy, atomic forces and virial. It is noted that the `frames` in one `system` share the same number of atoms with the same type.
 
-- [Data conversion](data-conv.md)
+- [System](system.md)
+- [Formats of a system](data-conv.md)
 - [Prepare data with dpdata](dpdata.md)
diff --git a/doc/data/index.rst b/doc/data/index.rst
index 0631727546..c9af5011af 100644
--- a/doc/data/index.rst
+++ b/doc/data/index.rst
@@ -2,10 +2,11 @@ Data
 ====
 In this section, we will introduce how to convert the DFT labeled data into the data format used by DeePMD-kit.
 
-The DeePMD-kit organize data in :code:`systems`. Each :code:`system` is composed by a number of :code:`frames`. One may roughly view a :code:`frame` as a snap short on an MD trajectory, but it does not necessary come from an MD simulation. A :code:`frame` records the coordinates and types of atoms, cell vectors if the periodic boundary condition is assumed, energy, atomic forces and virial. It is noted that the :code:`frames` in one :code:`system` share the same number of atoms with the same type.
+The DeePMD-kit organizes data in :code:`systems`. Each :code:`system` is composed of a number of :code:`frames`. One may roughly view a :code:`frame` as a snapshot on an MD trajectory, but it does not necessarily come from an MD simulation. A :code:`frame` records the coordinates and types of atoms, cell vectors if the periodic boundary condition is assumed, energy, atomic forces and virial. It is noted that the :code:`frames` in one :code:`system` share the same number of atoms with the same type.
 
 .. toctree::
    :maxdepth: 1
 
+   system
    data-conv
    dpdata
diff --git a/doc/data/system.md b/doc/data/system.md
new file mode 100644
index 0000000000..b8d318f255
--- /dev/null
+++ b/doc/data/system.md
@@ -0,0 +1,45 @@
+# System
+
+DeePMD-kit takes a **system** as its data structure. A snapshot of a system is called a **frame**. A system may contain multiple frames with the same atom types and numbers, i.e. the same formula (like `H2O`). To contain data with different formulas, one needs to divide the data into multiple systems.
+
+A system should contain system properties, input frame properties, and labeled frame properties. The system properties contain the following properties:
+
+ID       | Property                | Raw file     | Required/Optional    | Shape                    | Description
+-------- | ----------------------  | ------------ | -------------------- | -----------------------  | -----------
+type     | Atom type indexes       | type.raw     | Required             | Natoms                   | Integers that start with 0
+type_map | Atom type names         | type_map.raw | Optional             | Ntypes                   | Atom names that map to atom types, which are not required to be contained in the periodic table
+nopbc    | Non-periodic system     | nopbc        | Optional             | 1                        | If True, this system is non-periodic; otherwise it's periodic
+
+The input frame properties contain the following properties, the first axis of which is the number of frames:
+
+ID       | Property                | Raw file       | Unit | Required/Optional    | Shape                    | Description
+-------- | ----------------------  | -------------- | ---- | -------------------- | -----------------------  | -----------
+coord    | Atomic coordinates      | coord.raw      | Å    | Required             | Nframes \* Natoms \* 3   | 
+box      | Boxes                   | box.raw        | Å    | Required if periodic | Nframes \* 3 \* 3        | in the order `XX XY XZ YX YY YZ ZX ZY ZZ`
+fparam   | Extra frame parameters  | fparam.raw     | Any  | Optional             | Nframes \* Any           |
+aparam   | Extra atomic parameters | aparam.raw     | Any  | Optional             | Nframes \* aparam \* Any |
+
+The labeled frame properties are listed as follows, all of which will be used for training if and only if the loss function contains such a property:
+
+ID                     | Property                 | Raw file                 | Unit   | Shape                    | Description
+---------------------- | -----------------------  | ------------------------ | ----   | -----------------------  | -----------
+energy                 | Frame energies           | energy.raw               | eV     | Nframes                  | 
+force                  | Atomic forces            | force.raw                | eV/Å   | Nframes \* Natoms \* 3   | 
+virial                 | Frame virial             | virial.raw               | eV     | Nframes \* 9             | in the order `XX XY XZ YX YY YZ ZX ZY ZZ`
+atom_ener              | Atomic energies          | atom_ener.raw            | eV     | Nframes \* Natoms        |
+atom_pref              | Weights of atomic forces | atom_pref.raw            | 1      | Nframes \* Natoms        |
+dipole                 | Frame dipole             | dipole.raw               | Any    | Nframes \* 3             |
+atomic_dipole          | Atomic dipole            | atomic_dipole.raw        | Any    | Nframes \* Natoms \* 3   |
+polarizability         | Frame polarizability     | polarizability.raw       | Any    | Nframes \* 9             | in the order `XX XY XZ YX YY YZ ZX ZY ZZ`
+atomic_polarizability  | Atomic polarizability    | atomic_polarizability.raw| Any    | Nframes \* Natoms \* 9   | in the order `XX XY XZ YX YY YZ ZX ZY ZZ`
+
+In general, we always use the following convention of units:
+
+Property | Unit 
+---------| ----
+Time     | ps   
+Length   | Å    
+Energy   | eV   
+Force    | eV/Å 
+Virial   | eV   
+Pressure | Bar  
diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md
index 1fc9c26dd5..8f3c3309d5 100644
--- a/doc/development/type-embedding.md
+++ b/doc/development/type-embedding.md
@@ -1,39 +1,46 @@
 # Atom Type Embedding
 ## Overview
-Here is an overview of the deepmd-kit algorithm. Given a specific centric atom, we can obtain the matrix describing its local environment, named as `R`. It is consist of the distance between centric atom and its neighbors, as well as a direction vector. We can embed each distance into a vector of M1 dimension by a `embedding net`, so the environment matrix `R` can be embed into matrix `G`. We can thus extract a descriptor vector (of M1*M2 dim) of the centric atom from the `G` by some matrix multiplication, and put the descriptor into `fitting net` to get predicted energy `E`. The vanilla version of deepmd-kit build `embedding net` and `fitting net` relying on the atom type, resulting in O(N) memory usage. After applying atom type embedding, in deepmd-kit v2.0, we can share one `embedding net` and one `fitting net` in total, which decline training complexity largely. 
+Here is an overview of the deepmd-kit algorithm. Given a specific centric atom, we can obtain the matrix describing its local environment, named as $\mathcal R$. It consists of the distance between the centric atom and its neighbors, as well as a direction vector. We can embed each distance into a vector of $M_1$ dimension by an `embedding net`, so the environment matrix $\mathcal R$ can be embedded into matrix $\mathcal G$. We can thus extract a descriptor vector (of $M_1 \times M_2$ dim) of the centric atom from the $\mathcal G$ by some matrix multiplication, and put the descriptor into `fitting net` to get predicted energy $E$. The vanilla version of deepmd-kit builds `embedding net` and `fitting net` relying on the atom type, resulting in $O(N)$ memory usage. After applying atom type embedding, in deepmd-kit v2.0, we can share one `embedding net` and one `fitting net` in total, which largely reduces training complexity. 
 
 ## Preliminary
 In the following chart, you can find the meaning of symbols used to clarify the atom type embedding algorithm.
 
-|Symbol| Meaning|
-|---| :---:|
-|i| Type of centric atom|
-|j| Type of neighbor atom|
-|s_ij| Distance between centric atom and neighbor atom|
-|G_ij(·)| Origin embedding net, take s_ij as input and output embedding vector of M1 dim|
-|G(·) | Shared embedding net|
-|Multi(·) | Matrix multiplication and flattening, output the descriptor vector of M1*M2 dim|
-|F_i(·) | Origin fitting net, take the descriptor vector as input and output energy|
-|F(·) | Shared fitting net|
-|A(·) | Atom type embedding net, input is atom type, output is type embedding vector of dim `nchanl`|
+<!-- GitHub Markdown cannot render math in a table... -->
+$i$: Type of centric atom
+
+$j$: Type of neighbor atom
+
+$s_{ij}$: Distance between centric atom and neighbor atom
+
+$\mathcal G_{ij}(\cdot)$: Origin embedding net, take $s_{ij}$ as input and output embedding vector of $M_1$ dim
+
+$\mathcal G(\cdot)$: Shared embedding net
+
+$\text{Multi}(\cdot)$: Matrix multiplication and flattening, output the descriptor vector of $M_1\times M_2$ dim
+
+$F_i(\cdot)$: Origin fitting net, take the descriptor vector as input and output energy
+
+$F(\cdot)$: Shared fitting net
+
+$A(\cdot)$: Atom type embedding net, input is atom type, output is type embedding vector of dim `nchanl`
 
 So, we can formulate the training process as follows.
 Vanilla deepmd-kit algorithm:
-```
-Energy = F_i( Multi( G_ij( s_ij ) ) )
-```
+
+$$E = F_i( \text{Multi}( \mathcal G_{ij}( s_{ij} ) ) )$$
+
 Deepmd-kit applying atom type embedding:
-```
-Energy = F( [ Multi( G( [s_ij, A(i), A(j)] ) ), A(j)] )
-```
+
+$$E = F( [ \text{Multi}( \mathcal G( [s_{ij}, A(i), A(j)] ) ), A(j)] )$$
+
 or 
-```
-Energy = F( [ Multi( G( [s_ij, A(j)] ) ), A(j)] )
-```
+
+$$E = F( [ \text{Multi}( \mathcal G( [s_{ij}, A(j)] ) ), A(j)] )$$
+
 The difference between two variants above is whether using the information of centric atom when generating the descriptor. Users can choose by modifying the `type_one_side` hyper-parameter in the input json file.
 
 ## How to use
-A detailed introduction can be found at [`se_e2_a_tebd`](../train-se-e2-a-tebd.md). Looking for a fast start up, you can simply add a `type_embedding` section in the input json file as displayed in the following, and the algorithm will adopt atom type embedding algorithm automatically.
+A detailed introduction can be found at [`se_e2_a_tebd`](../model/train-se-e2-a-tebd.md). Looking for a fast start up, you can simply add a `type_embedding` section in the input json file as displayed in the following, and the algorithm will adopt atom type embedding algorithm automatically.
 An example of `type_embedding` is like
 ```json
     "type_embedding":{
@@ -50,18 +57,20 @@ Atom type embedding can be applied to varied `embedding net` and `fitting net`,
 ### trainer (train/trainer.py)
 In trainer.py, it will parse the parameter from the input json file. If a `type_embedding` section is detected, it will build a `TypeEmbedNet`, which will be later input in the `model`. `model` will be built in the function `_build_network`.
 ### model (model/ener.py)
-When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of [ntypes * nchanl] dimension). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`.
+When building the operation graph of the `model` in `model.build`, if a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` in order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which outputs the type embedding vector of each atom type (of [$\text{ntypes} \times \text{nchanl}$] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`.
 ### embedding net (descriptor/se*.py)
-In `embedding net`, we shall take local environment `R` as input and output matrix `G`. Functions called in this process by order is 
+In `embedding net`, we shall take local environment $\mathcal R$ as input and output matrix $\mathcal G$. The functions called in this process, in order, are 
 ```
 build -> _pass_filter -> _filter -> _filter_lower 
 ```
-* `_pass_filter`: It will first detect whether an atom type embedding exists, if so, it will apply atom type embedding algorithm and doesn't divide the input by type.
-* `_filter`: It will call `_filter_lower` function to obtain the result of matrix multiplication (`G^T·R` ), do further multiplication involved in Multi(·), and finally output the result of descriptor vector of M1*M2 dim.
-* `_filter_lower`: The main function handling input modification. If type embedding exists, it will call `_concat_type_embedding` function to concat the first column of input `R` (the column of s_ij) with the atom type embedding information. It will decide whether using the atom type embedding vector of centric atom according to the value of `type_one_side` (if set **True**, then we only use the vector of the neighbor atom). The modified input will be put into the `fitting net` to get `G` for further matrix multiplication stage.
+`_pass_filter`: It will first detect whether an atom type embedding exists, if so, it will apply atom type embedding algorithm and doesn't divide the input by type.
+
+`_filter`: It will call `_filter_lower` function to obtain the result of matrix multiplication ($\mathcal G^T\cdot \mathcal R$), do further multiplication involved in $\text{Multi}(\cdot)$, and finally output the result of descriptor vector of $M_1 \times M_2$ dim.
+
+`_filter_lower`: The main function handling input modification. If type embedding exists, it will call `_concat_type_embedding` function to concatenate the first column of input $\mathcal R$ (the column of $s_{ij}$) with the atom type embedding information. It will decide whether to use the atom type embedding vector of the centric atom according to the value of `type_one_side` (if set **True**, then we only use the vector of the neighbor atom). The modified input will be put into the `embedding net` to get $\mathcal G$ for the further matrix multiplication stage.
 
 ### fitting net (fit/ener.py)
-In `fitting net`, it take the descriptor vector as input, whose dimension is [natoms, (M1*M2)]. Because we need to involve information of centric atom in this step, we need to generate a matrix named as `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of centric atom. In the build phrase of fitting net, it will check whether type embedding exist in `input_dict` and fetch them. After that calling `embed_atom_type` function to lookup embedding vector for type vector of centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input go through `fitting net` to get predicted energy.
+In `fitting net`, it takes the descriptor vector as input, whose dimension is [natoms, $M_1\times M_2$]. Because we need to involve information of the centric atom in this step, we need to generate a matrix named as `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, and we also know the number of atoms of a particular type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch it. After that, it calls the `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concatenates the input with it ([input, atype_embed]). The modified input goes through `fitting net` to get predicted energy.
 
 
 **P.S.: You can't apply compression method while using atom type embedding**
diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md
index a441c9571d..5cd6016d32 100644
--- a/doc/freeze/compress.md
+++ b/doc/freeze/compress.md
@@ -82,7 +82,7 @@ The model compression interface requires the version of deepmd-kit used in origi
 
 **Acceptable descriptor type**
 
-Descriptors with `se_e2_a`,`se_e3`,'se_e2_r' type are supported by the model compression feature. Hybrid mixed with above descriptors is also supported.
+Descriptors with `se_e2_a`, `se_e3`, and `se_e2_r` types are supported by the model compression feature. Hybrid mixed with above descriptors is also supported.
 
 
 **Available activation functions for descriptor:**
diff --git a/doc/index.rst b/doc/index.rst
index 2b6cf74fbf..5ba11a764a 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -33,6 +33,7 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
    freeze/index
    test/index
    inference/index
+   cli
    third-party/index
    troubleshooting/index
 
diff --git a/doc/install/build-conda.md b/doc/install/build-conda.md
index e69374d3de..253de51e96 100644
--- a/doc/install/build-conda.md
+++ b/doc/install/build-conda.md
@@ -2,7 +2,7 @@
 
 One may want to keep both convenience and personalization of the DeePMD-kit. To achieve this goal, one can consider building conda packages. We provide building scripts in [deepmd-kit-recipes organization](https://github.com/deepmd-kit-recipes/). These building tools are driven by [conda-build](https://github.com/conda/conda-build) and [conda-smithy](https://github.com/conda-forge/conda-smithy).
 
-For example, if one wants to turn on `MPIIO` package in LAMMPS, go to [`lammps-dp-feedstock`](https://github.com/deepmd-kit-recipes/lammps-dp-feedstock/) repository and modify `recipe/build.sh`. `-D PKG_MPIIO=OFF` should be changed to `-D PKG_MPIIO=ON`. Then go to the main directory and executing
+For example, if one wants to turn on `MPIIO` package in LAMMPS, go to [`lammps-feedstock`](https://github.com/deepmd-kit-recipes/lammps-feedstock/) repository and modify `recipe/build.sh`. `-D PKG_MPIIO=OFF` should be changed to `-D PKG_MPIIO=ON`. Then go to the main directory and execute
 
 ```sh
 ./build-locally.py
@@ -10,7 +10,7 @@ For example, if one wants to turn on `MPIIO` package in LAMMPS, go to [`lammps-d
 
 This requires that Docker has been installed. After the building, the packages will be generated in `build_artifacts/linux-64` and `build_artifacts/noarch`, and then one can install then executing
 ```sh
-conda create -n deepmd lammps-dp -c file:///path/to/build_artifacts -c https://conda.deepmodeling.org -c nvidia
+conda create -n deepmd lammps -c file:///path/to/build_artifacts -c https://conda.deepmodeling.com -c nvidia
 ```
 
 One may also upload packages to one's Anaconda channel, so they can be installed on other machines:
diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md
index 779ae7ff7c..baed662040 100644
--- a/doc/install/easy-install.md
+++ b/doc/install/easy-install.md
@@ -16,7 +16,7 @@ Both CPU and GPU version offline packages are available in [the Releases page](h
 
 Some packages are splited into two files due to size limit of GitHub. One may merge them into one after downloading:
 ```bash
-cat deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh.0 deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh.1 > deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh
+cat deepmd-kit-2.1.1-cuda11.6_gpu-Linux-x86_64.sh.0 deepmd-kit-2.1.1-cuda11.6_gpu-Linux-x86_64.sh.1 > deepmd-kit-2.1.1-cuda11.6_gpu-Linux-x86_64.sh
 ```
 
 ## Install with conda
@@ -24,18 +24,18 @@ DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [An
 
 One may create an environment that contains the CPU version of DeePMD-kit and LAMMPS:
 ```bash
-conda create -n deepmd deepmd-kit=*=*cpu libdeepmd=*=*cpu lammps-dp -c https://conda.deepmodeling.org
+conda create -n deepmd deepmd-kit=*=*cpu libdeepmd=*=*cpu lammps -c https://conda.deepmodeling.com
 ```
 
 Or one may want to create a GPU environment containing [CUDA Toolkit](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver):
 ```bash
-conda create -n deepmd deepmd-kit=*=*gpu libdeepmd=*=*gpu lammps-dp cudatoolkit=11.3 horovod -c https://conda.deepmodeling.org
+conda create -n deepmd deepmd-kit=*=*gpu libdeepmd=*=*gpu lammps cudatoolkit=11.6 horovod -c https://conda.deepmodeling.com
 ```
-One could change the CUDA Toolkit version from `10.1` or `11.3`.
+One could change the CUDA Toolkit version to `10.2` or `11.6`.
 
-One may speficy the DeePMD-kit version such as `2.0.0` using
+One may specify the DeePMD-kit version such as `2.1.1` using
 ```bash
-conda create -n deepmd deepmd-kit=2.0.0=*cpu libdeepmd=2.0.0=*cpu lammps-dp=2.0.0 horovod -c https://conda.deepmodeling.org
+conda create -n deepmd deepmd-kit=2.1.1=*cpu libdeepmd=2.1.1=*cpu lammps horovod -c https://conda.deepmodeling.com
 ```
 
 One may enable the environment using
@@ -48,12 +48,12 @@ A docker for installing the DeePMD-kit is available [here](https://github.com/or
 
 To pull the CPU version:
 ```bash
-docker pull ghcr.io/deepmodeling/deepmd-kit:2.0.0_cpu
+docker pull ghcr.io/deepmodeling/deepmd-kit:2.1.1_cpu
 ```
 
 To pull the GPU version:
 ```bash
-docker pull ghcr.io/deepmodeling/deepmd-kit:2.0.0_cuda10.1_gpu
+docker pull ghcr.io/deepmodeling/deepmd-kit:2.1.1_cuda11.6_gpu
 ```
 
 To pull the ROCm version:
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index a81efd4656..cecce0f3cd 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -22,7 +22,7 @@ First, check the python version on your machine
 python --version
 ```
 
-We follow the virtual environment approach to install TensorFlow's Python interface. The full instruction can be found on the official [TensorFlow website](https://www.tensorflow.org/install/pip). Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv`
+We follow the virtual environment approach to install TensorFlow's Python interface. The full instruction can be found on the official [TensorFlow website](https://www.tensorflow.org/install/pip). TensorFlow 1.8 or later is supported. Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv`
 ```bash
 virtualenv -p python3 $tensorflow_venv
 source $tensorflow_venv/bin/activate
@@ -51,8 +51,18 @@ python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1
 ```
 One should remember to activate the virtual environment every time he/she uses deepmd-kit.
 
+One can also [build TensorFlow Python interface from source](https://www.tensorflow.org/install/source) for custom hardware optimization, such as CUDA, ROCM, or OneDNN support.
+
 ### Install the DeePMD-kit's python interface
 
+Check the compiler version on your machine
+
+```
+gcc --version
+```
+
+The compiler gcc 4.8 or later is supported in the DeePMD-kit. Note that TensorFlow may have specific requirements for the compiler version. It is recommended to use the same compiler version as TensorFlow, which can be printed by `python -c "import tensorflow;print(tensorflow.version.COMPILER_VERSION)"`.
+
 Execute
 ```bash
 cd $deepmd_source_dir
@@ -64,7 +74,7 @@ One may set the following environment variables before executing `pip`:
 | Environment variables | Allowed value          | Default value | Usage                      |
 | --------------------- | ---------------------- | ------------- | -------------------------- |
 | DP_VARIANT            | `cpu`, `cuda`, `rocm`  | `cpu`         | Build CPU variant or GPU variant with CUDA or ROCM support. |
-| CUDA_TOOLKIT_ROOT_DIR | Path                   | Detected automatically | The path to the CUDA toolkit directory. |
+| CUDA_TOOLKIT_ROOT_DIR | Path                   | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
 | ROCM_ROOT             | Path                   | Detected automatically | The path to the ROCM toolkit directory. |
 
 To test the installation, one should firstly jump out of the source directory
@@ -140,15 +150,9 @@ If one does not need to use DeePMD-kit with Lammps or I-Pi, then the python inte
 
 ### Install the Tensorflow's C++ interface
 
-Check the compiler version on your machine
-
-```
-gcc --version
-```
-
-The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.8.
+The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. Note that the I-Pi support is only compiled with gcc >= 4.8, and that TensorFlow may have specific requirements for the compiler version.
 
-First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be consistent with the python interface. You may follow [the instruction](install-tf.2.3.md) to install the corresponding C++ interface.
+First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be consistent with the python interface. You may follow [the instruction](install-tf.2.8.md) or run the script `$deepmd_source_dir/source/install/build_tf.py` to install the corresponding C++ interface.
 
 ### Install the DeePMD-kit's C++ interface
 
@@ -171,11 +175,11 @@ One may add the following arguments to `cmake`:
 | -DTENSORFLOW_ROOT=&lt;value&gt;  | Path              | -             | The Path to TensorFlow's C++ interface. |
 | -DCMAKE_INSTALL_PREFIX=&lt;value&gt; | Path          | -             | The Path where DeePMD-kit will be installed. |
 | -DUSE_CUDA_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with CUDA toolkit. |
-| -DCUDA_TOOLKIT_ROOT_DIR=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. |
+| -DCUDA_TOOLKIT_ROOT_DIR=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
 | -DUSE_ROCM_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with ROCM toolkit. |
 | -DROCM_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the ROCM toolkit directory. |
-| -DLAMMPS_VERSION_NUMBER=&lt;value&gt; | Number         | `20210929` | Only neccessary for LAMMPS built-in mode. The version number of LAMMPS (yyyymmdd). |
-| -DLAMMPS_SOURCE_ROOT=&lt;value&gt; | Path         | - | Only neccessary for LAMMPS plugin mode. The path to the LAMMPS source code (later than 8Apr2021). If not assigned, the plugin mode will not be enabled. |
+| -DLAMMPS_VERSION_NUMBER=&lt;value&gt; | Number         | `20220723` | Only necessary for LAMMPS built-in mode. The version number of LAMMPS (yyyymmdd). LAMMPS 29Oct2020 (20201029) or later is supported. |
+| -DLAMMPS_SOURCE_ROOT=&lt;value&gt; | Path         | - | Only necessary for LAMMPS plugin mode. The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. |
 
 If the cmake has been executed successfully, then run the following make commands to build the package:  
 ```bash
diff --git a/doc/install/install-gromacs.md b/doc/install/install-gromacs.md
index 5df6da385c..cf4a9356f4 100644
--- a/doc/install/install-gromacs.md
+++ b/doc/install/install-gromacs.md
@@ -1,4 +1,7 @@
 # Install GROMACS with DeepMD
+
+Before following this section, [DeePMD-kit C++ interface](install-from-source.md) should have been installed.
+
 ## Patch source code of GROMACS 
 Download source code of a supported gromacs version (2020.2) from https://manual.gromacs.org/2020.2/download.html. Run the following command:
 ```bash
diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md
index 3371708c13..943229383d 100644
--- a/doc/install/install-lammps.md
+++ b/doc/install/install-lammps.md
@@ -3,27 +3,29 @@
 There are two ways to install LAMMPS: the built-in mode and the plugin mode. The built-in mode builds LAMMPS along with the DeePMD-kit and DeePMD-kit will be loaded automatically when running LAMMPS. The plugin mode builds LAMMPS and a plugin separately, so one needs to use `plugin load` command to load the DeePMD-kit's LAMMPS plugin library. 
 
 ## Install LAMMPS's DeePMD-kit module (built-in mode)
+Before following this section, [DeePMD-kit C++ interface](install-from-source.md) should have been installed.
+
 DeePMD-kit provides a module for running MD simulation with LAMMPS. Now make the DeePMD-kit module for LAMMPS.
 
 ```bash
 cd $deepmd_source_dir/source/build
 make lammps
 ```
-DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. If you need the low precision version, move `env_low.sh` to `env.sh` in the directory. Now download the LAMMPS code (`29Oct2020` or later), and uncompress it:
+DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. If you need the low precision version, move `env_low.sh` to `env.sh` in the directory. Now download the LAMMPS code, and uncompress it. The LAMMPS version should be the same as what is specified as the CMAKE argument `LAMMPS_VERSION_NUMBER`.
 ```bash
 cd /some/workspace
-wget https://github.com/lammps/lammps/archive/stable_29Sep2021_update3.tar.gz
-tar xf stable_29Sep2021_update3.tar.gz
+wget https://github.com/lammps/lammps/archive/stable_23Jun2022.tar.gz
+tar xf stable_23Jun2022.tar.gz
 ```
-The source code of LAMMPS is stored in directory `lammps-stable_29Sep2021_update3`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
+The source code of LAMMPS is stored in directory `lammps-stable_23Jun2022`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
 ```bash
-cd lammps-stable_29Sep2021_update3/src/
+cd lammps-stable_23Jun2022/src/
 cp -r $deepmd_source_dir/source/build/USER-DEEPMD .
-```
-Now build LAMMPS
-```bash
 make yes-kspace
 make yes-user-deepmd
+```
+You can enable any other package you want. Now build LAMMPS
+```bash
 make mpi -j4
 ```
 
@@ -43,15 +45,15 @@ Starting from `8Apr2021`, LAMMPS also provides a plugin mode, allowing one to bu
 Now download the LAMMPS code (`8Apr2021` or later), and uncompress it:
 ```bash
 cd /some/workspace
-wget https://github.com/lammps/lammps/archive/stable_29Sep2021_update3.tar.gz
-tar xf stable_29Sep2021_update3.tar.gz
+wget https://github.com/lammps/lammps/archive/stable_23Jun2022.tar.gz
+tar xf stable_23Jun2022.tar.gz
 ```
 
-The source code of LAMMPS is stored in directory `lammps-stable_29Sep2021_update3`. Now go into the LAMMPS directory and create a directory called `build`
+The source code of LAMMPS is stored in directory `lammps-stable_23Jun2022`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build`
 
 ```bash
-mkdir -p lammps-stable_29Sep2021_update3/build/
-cd lammps-stable_29Sep2021_update3/build/
+mkdir -p lammps-stable_23Jun2022/build/
+cd lammps-stable_23Jun2022/build/
 ```
 Now build LAMMPS. Note that `PLUGIN` and `KSPACE` package must be enabled, and `BUILD_SHARED_LIBS` must be set to `yes`. You can install any other package you want.
 ```bash
diff --git a/doc/install/install-tf.2.8.md b/doc/install/install-tf.2.8.md
new file mode 100644
index 0000000000..fec90446f9
--- /dev/null
+++ b/doc/install/install-tf.2.8.md
@@ -0,0 +1,115 @@
+# Install TensorFlow's C++ interface 
+The tensorflow's C++ interface will be compiled from the source code. Firstly one installs bazel. [bazelisk](https://github.com/bazelbuild/bazelisk) can be launched to use [bazel](https://github.com/bazelbuild/bazel).
+
+```bash
+wget https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-linux-amd64 -O /some/workspace/bazel/bin/bazel
+chmod +x /some/workspace/bazel/bin/bazel
+export PATH=/some/workspace/bazel/bin:$PATH
+```
+
+Firstly get the source code of the TensorFlow
+```bash
+git clone https://github.com/tensorflow/tensorflow tensorflow -b v2.8.0 --depth=1
+cd tensorflow
+./configure
+```
+
+You will answer a list of questions that help configure the building of TensorFlow. You may want to answer the question like the following. If you do not want to add CUDA support, please answer no.
+
+```
+Please specify the location of python. [Default is xxx]:
+
+Found possible Python library paths:
+  xxx
+Please input the desired Python library path to use.  Default is [xxx]
+
+Do you wish to build TensorFlow with OpenCL SYCL support? [y/N]:
+No OpenCL SYCL support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with ROCm support? [y/N]:
+No ROCm support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with CUDA support? [y/N]: y
+CUDA support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with TensorRT support? [y/N]:
+No TensorRT support will be enabled for TensorFlow.
+
+Found CUDA 10.2 in:
+    /usr/local/cuda/lib64
+    /usr/local/cuda/include
+Found cuDNN 7 in:
+    /usr/local/cuda/lib64
+    /usr/local/cuda/include
+
+Please specify a list of comma-separated CUDA compute capabilities you want to build with.
+You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
+Please note that each additional compute capability significantly increases your build time and binary size, and that TensorFlow only supports compute capabilities >= 3.5 [Default is: 7.5,7.5]:
+
+Do you want to use clang as CUDA compiler? [y/N]:
+nvcc will be used as CUDA compiler.
+
+Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
+
+Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native -Wno-sign-compare]:
+
+Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]:
+Not configuring the WORKSPACE for Android builds.
+
+Preconfigured Bazel build configs. You can use any of the below by adding "--config=<>" to your build command. See .bazelrc for more details.
+    --config=mkl            # Build with MKL support.
+    --config=monolithic     # Config for mostly static monolithic build.
+    --config=ngraph         # Build with Intel nGraph support.
+    --config=numa           # Build with NUMA support.
+    --config=dynamic_kernels    # (Experimental) Build kernels into separate shared objects.
+    --config=v2             # Build TensorFlow 2.x instead of 1.x.
+Preconfigured Bazel build configs to DISABLE default on features:
+    --config=noaws          # Disable AWS S3 filesystem support.
+    --config=nogcp          # Disable GCP support.
+    --config=nohdfs         # Disable HDFS support.
+    --config=nonccl         # Disable NVIDIA NCCL support.
+Configuration finished
+```
+
+The library path for Python should be set accordingly.
+
+Now build the shared library of tensorflow:
+```bash
+bazel build -c opt --verbose_failures //tensorflow:libtensorflow_cc.so
+```
+You may want to add options `--copt=-msse4.2`,  `--copt=-mavx`, `--copt=-mavx2` and `--copt=-mfma` to enable SSE4.2, AVX, AVX2 and FMA SIMD accelerations, respectively. It is noted that these options should be chosen according to the CPU architecture. If the RAM becomes an issue of your machine, you may limit the RAM usage by using `--local_resources 2048,.5,1.0`. If you want to enable [oneDNN optimization](https://www.oneapi.io/blog/tensorflow-and-onednn-in-partnership/), add `--config=mkl`.
+
+Now I assume you want to install TensorFlow in directory `$tensorflow_root`. Create the directory if it does not exist
+```bash
+mkdir -p $tensorflow_root
+```
+Now, copy the libraries to the tensorflow's installation directory:
+```bash
+mkdir -p $tensorflow_root/lib
+cp -d bazel-bin/tensorflow/libtensorflow_cc.so* $tensorflow_root/lib/
+cp -d bazel-bin/tensorflow/libtensorflow_framework.so* $tensorflow_root/lib/
+cp -d $tensorflow_root/lib/libtensorflow_framework.so.2 $tensorflow_root/lib/libtensorflow_framework.so
+```
+Then copy the headers
+```bash
+mkdir -p $tensorflow_root/include/tensorflow
+rsync -avzh --exclude '_virtual_includes/' --include '*/' --include '*.h' --include '*.inc' --exclude '*' bazel-bin/ $tensorflow_root/include/
+rsync -avzh --include '*/' --include '*.h' --include '*.inc' --exclude '*' tensorflow/cc $tensorflow_root/include/tensorflow/
+rsync -avzh --include '*/' --include '*.h' --include '*.inc' --exclude '*' tensorflow/core $tensorflow_root/include/tensorflow/
+rsync -avzh --include '*/' --include '*' --exclude '*.cc' third_party/ $tensorflow_root/include/third_party/
+rsync -avzh --include '*/' --include '*' --exclude '*.txt' bazel-tensorflow/external/eigen_archive/Eigen/ $tensorflow_root/include/Eigen/
+rsync -avzh --include '*/' --include '*' --exclude '*.txt' bazel-tensorflow/external/eigen_archive/unsupported/ $tensorflow_root/include/unsupported/
+rsync -avzh --include '*/' --include '*.h' --include '*.inc' --exclude '*' bazel-tensorflow/external/com_google_protobuf/src/google/ $tensorflow_root/include/google/
+rsync -avzh --include '*/' --include '*.h' --include '*.inc' --exclude '*' bazel-tensorflow/external/com_google_absl/absl/ $tensorflow_root/include/absl/
+```
+
+If you've enabled oneDNN, also copy `libiomp5.so`:
+```bash
+cp -d bazel-out/k8-opt/bin/external/llvm_openmp/libiomp5.so $tensorflow_root/lib/
+```
+
+# Troubleshooting
+```bash
+git: unknown command -C ...
+```
+This may be an issue with your git version. Early versions of git do not support this command; in this case, upgrading your git to a newer version may resolve the issue.
diff --git a/doc/model/dplr.md b/doc/model/dplr.md
index 07184dc9cd..14db3facfc 100644
--- a/doc/model/dplr.md
+++ b/doc/model/dplr.md
@@ -6,7 +6,7 @@ The method of DPLR is described in [this paper][1]. One is recommended to read t
 
 In the following, we take the DPLR model for example to introduce the training and LAMMPS simulation with the DPLR model. The DPLR model is training in two steps.
 
-### Train a deep Wannier model for Wannier centroids
+## Train a deep Wannier model for Wannier centroids
 
 We use the deep Wannier model (DW) to represent the relative position of the Wannier centroid (WC) with the atom to which it is associated. One may consult the introduction of the [dipole model](train-fitting-tensor.md) for a detailed introduction. An example input `wc.json` and a small dataset `data` for tutorial purposes can be found in
 ```bash
@@ -22,7 +22,7 @@ Two settings make the training input script different from an energy training in
 	    "seed":		1
 	},
 ```
-The type of fitting is set to `"dipole"`. The dipole is associate to type 0 atoms (oxygens), by the setting `"dipole_type": [0]`. What we trained is the displacement of the WC from the corresponding oxygen atom. It shares the same training input as atomic dipole because both are 3-dimensional vectors defined on atoms. 
+The type of fitting is set to {ref}`dipole <model/fitting_net[dipole]>`. The dipole is associated with type 0 atoms (oxygens), by the setting `"dipole_type": [0]`. What we trained is the displacement of the WC from the corresponding oxygen atom. It shares the same training input as atomic dipole because both are 3-dimensional vectors defined on atoms. 
 The loss section is provided as follows
 ```json
     "loss": {
@@ -38,7 +38,7 @@ The training and freezing can be started from the example directory by
 dp train dw.json && dp freeze -o dw.pb
 ```
 
-### Train the DPLR model
+## Train the DPLR model
 
 The training of the DPLR model is very similar to the standard short-range DP models. An example input script can be found in the example directory. The following section is introduced to compute the long-range energy contribution of the DPLR model, and modify the short-range DP model by this part. 
 ```json
@@ -51,7 +51,7 @@ The training of the DPLR model is very similar to the standard short-range DP mo
             "ewald_beta":       0.40
         },
 ```
-The `"model_name"` specifies which DW model is used to predict the position of WCs. `"model_charge_map"` gives the amount of charge assigned to WCs. `"sys_charge_map"` provides the nuclear charge of oxygen (type 0) and hydrogen (type 1) atoms. `"ewald_beta"` (unit A^{-1}) gives the spread parameter controls the spread of Gaussian charges, and `"ewald_h"`  (unit A) assigns the grid size of Fourier transform. 
+The {ref}`model_name <model/modifier[dipole_charge]/model_name>` specifies which DW model is used to predict the position of WCs. {ref}`model_charge_map <model/modifier[dipole_charge]/model_charge_map>` gives the amount of charge assigned to WCs. {ref}`sys_charge_map <model/modifier[dipole_charge]/sys_charge_map>` provides the nuclear charge of oxygen (type 0) and hydrogen (type 1) atoms. {ref}`ewald_beta <model/modifier[dipole_charge]/ewald_beta>` (unit $\text{Å}^{-1}$) gives the spread parameter that controls the spread of Gaussian charges, and {ref}`ewald_h <model/modifier[dipole_charge]/ewald_h>`  (unit Å) assigns the grid size of Fourier transform. 
 The DPLR model can be trained and frozen by (from the example directory)
 ```
 dp train ener.json && dp freeze -o ener.pb
diff --git a/doc/model/dprc.md b/doc/model/dprc.md
new file mode 100644
index 0000000000..2e4c2220e8
--- /dev/null
+++ b/doc/model/dprc.md
@@ -0,0 +1,80 @@
+# Deep Potential - Range Correction (DPRc)
+
+Deep Potential - Range Correction (DPRc) is designed to be combined with QM/MM methods, and corrects energies from a low-level QM/MM method to a high-level QM/MM method:
+
+$$ E=E_\text{QM}(\mathbf R; \mathbf P)  + E_\text{QM/MM}(\mathbf R; \mathbf P) + E_\text{MM}(\mathbf R) + E_\text{DPRc}(\mathbf R) $$
+
+See the [JCTC paper](https://doi.org/10.1021/acs.jctc.1c00201) for details.
+
+## Training data
+
+Instead of the normal _ab initio_ data, one needs to provide the correction from a low-level QM/MM method to a high-level QM/MM method:
+
+$$ E = E_\text{high-level QM/MM} - E_\text{low-level QM/MM} $$
+
+Two levels of data use the same MM method, so $E_\text{MM}$ is eliminated.
+
+## Training the DPRc model
+
+In a DPRc model, QM atoms and MM atoms have different atom types. Assuming we have 4 QM atom types (C, H, O, P) and 2 MM atom types (HW, OW):
+
+```json
+"type_map": ["C", "H", "HW", "O", "OW", "P"]
+```
+
+As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\text{QM/MM}$ within the cutoff, so we use a hybrid descriptor to describe them separately:
+
+```json
+"descriptor" :{
+    "type":             "hybrid",
+    "list" : [
+        {
+            "type":     "se_e2_a",
+            "sel":              [6, 11, 0, 6, 0, 1],
+            "rcut_smth":        1.00,
+            "rcut":             9.00,
+            "neuron":           [12, 25, 50],
+            "exclude_types":    [[2, 2], [2, 4], [4, 4], [0, 2], [0, 4], [1, 2], [1, 4], [3, 2], [3, 4], [5, 2], [5, 4]],
+            "axis_neuron":      12,
+            "set_davg_zero":    true,
+            "_comment": " QM/QM interaction"
+        },
+        {
+            "type":     "se_e2_a",
+            "sel":              [6, 11, 100, 6, 50, 1],
+            "rcut_smth":        0.50,
+            "rcut":             6.00,
+            "neuron":           [12, 25, 50],
+            "exclude_types":    [[0, 0], [0, 1], [0, 3], [0, 5], [1, 1], [1, 3], [1, 5], [3, 3], [3, 5], [5, 5], [2, 2], [2, 4], [4, 4]],
+            "axis_neuron":      12,
+            "set_davg_zero":    true,
+            "_comment": " QM/MM interaction"
+        }
+    ]
+}
+```
+
+{ref}`exclude_types <model/descriptor[se_e2_a]/exclude_types>` can be generated by the following Python script:
+```py
+from itertools import combinations_with_replacement, product
+qm = (0, 1, 3, 5)
+mm = (2, 4)
+print("QM/QM:", list(map(list, list(combinations_with_replacement(mm, 2)) + list(product(qm, mm)))))
+print("QM/MM:", list(map(list, list(combinations_with_replacement(qm, 2)) + list(combinations_with_replacement(mm, 2)))))
+```
+
+Also, DPRc assumes MM atom energies ({ref}`atom_ener <model/fitting_net[ener]/atom_ener>`) are zero:
+
+```json
+"fitting_net": {
+   "neuron": [240, 240, 240],
+   "resnet_dt": true,
+   "atom_ener": [null, null, 0.0, null, 0.0, null]
+}
+```
+
+Note that {ref}`atom_ener <model/fitting_net[ener]/atom_ener>` only works when {ref}`descriptor/set_davg_zero <model/descriptor[se_e2_a]/set_davg_zero>` is `true`.
+
+## Run MD simulations
+
+The DPRc model is best used with the [AMBER](../third-party/out-of-deepmd-kit.md#amber-interface-to-deepmd-kit) QM/MM module. An example is given by [GitLab RutgersLBSR/AmberDPRc](https://gitlab.com/RutgersLBSR/AmberDPRc/). In theory, DPRc is able to be used with any QM/MM package, as long as the DeePMD-kit package accepts QM atoms and MM atoms within the cutoff range and returns energies and forces.
diff --git a/doc/model/index.md b/doc/model/index.md
index 94575f67c6..12df4e22f3 100644
--- a/doc/model/index.md
+++ b/doc/model/index.md
@@ -10,3 +10,4 @@
 - [Fit `tensor` like `Dipole` and `Polarizability`](train-fitting-tensor.md)
 - [Train a Deep Potential model using `type embedding` approach](train-se-e2-a-tebd.md)
 - [Deep potential long-range](dplr.md)
+- [Deep Potential - Range Correction (DPRc)](dprc.md)
diff --git a/doc/model/index.rst b/doc/model/index.rst
index 10f41b375c..6177452342 100644
--- a/doc/model/index.rst
+++ b/doc/model/index.rst
@@ -14,3 +14,4 @@ Model
    train-fitting-tensor
    train-se-e2-a-tebd
    dplr
+   dprc
diff --git a/doc/model/overall.md b/doc/model/overall.md
index 87827363d7..fd043a162b 100644
--- a/doc/model/overall.md
+++ b/doc/model/overall.md
@@ -1,6 +1,6 @@
 # Overall
 
-A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. It's defined in the `model` section of the `input.json`, for example
+A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. It's defined in the {ref}`model <model>` section of the `input.json`, for example,
 ```json
     "model": {
         "type_map":	["O", "H"],
@@ -12,10 +12,9 @@ A model has two parts, a descriptor that maps atomic configuration to a set of s
         }
     }
 ```
+The two subsections, {ref}`descriptor <model/descriptor>` and {ref}`fitting_net <model/fitting_net>`, define the descriptor and the fitting net, respectively.
 
-Assume that we are looking for a model for water, we will have two types of atoms. The atom types are recorded as integers. In this example, we denote `0` for oxygen and `1` for hydrogen. A mapping from the atom type to their names is provided by `type_map`. 
-
-The model has two subsections `descritpor` and `fitting_net`, which defines the descriptor and the fitting net, respectively. The `type_map` is optional, which provides the element names (but not necessarily to be the element name) of the corresponding atom types.
+The {ref}`type_map <model/type_map>` is optional, which provides the element names (but not necessarily the same as the actual names of the elements) of the corresponding atom types. A model for water, as in this example, has two kinds of atoms. The atom types are internally recorded as integers, e.g., `0` for oxygen and `1` for hydrogen here. A mapping from the atom type to their names is provided by {ref}`type_map <model/type_map>`. 
 
 DeePMD-kit implements the following descriptors:
 1. [`se_e2_a`](train-se-e2-a.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes the distance between atoms as input.
@@ -25,6 +24,6 @@ DeePMD-kit implements the following descriptors:
 5. [`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor.
 
 The fitting of the following physical properties are supported
-1. [`ener`](train-energy.md): Fitting the energy of the system. The force (derivative with atom positions) and the virial (derivative with the box tensor) can also be trained. See [the example](train-se-e2-a.md#loss).
+1. [`ener`](train-energy.md): Fit the energy of the system. The force (derivative with atom positions) and the virial (derivative with the box tensor) can also be trained.
 2. [`dipole`](train-fitting-tensor.md): The dipole moment.
 3. [`polar`](train-fitting-tensor.md): The polarizability.
diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md
index 65f80f85e7..20f173bc9d 100644
--- a/doc/model/train-energy.md
+++ b/doc/model/train-energy.md
@@ -2,9 +2,9 @@
 
 In this section, we will take `$deepmd_source_dir/examples/water/se_e2_a/input.json` as an example of the input file.
 
-## Fitting network
+## The fitting network
 
-The construction of the fitting net is give by section `fitting_net`
+The construction of the fitting net is given by section {ref}`fitting_net <model/fitting_net>`
 ```json
 	"fitting_net" : {
 	    "neuron":		[240, 240, 240],
@@ -12,23 +12,26 @@ The construction of the fitting net is give by section `fitting_net`
 	    "seed":		1
 	},
 ```
-* `neuron` specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them. 
-* If the option `resnet_dt` is set `true`, then a timestep is used in the ResNet. 
-* `seed` gives the random seed that is used to generate random numbers when initializing the model parameters.
+* {ref}`neuron <model/fitting_net[ener]/neuron>` specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them. 
+* If the option {ref}`resnet_dt <model/fitting_net[ener]/resnet_dt>` is set to `true`, then a timestep is used in the ResNet. 
+* {ref}`seed <model/fitting_net[ener]/seed>` gives the random seed that is used to generate random numbers when initializing the model parameters.
 
 ## Loss
 
-The loss function for training energy is given by
-```
-loss = pref_e * loss_e + pref_f * loss_f + pref_v * loss_v
-```
-where `loss_e`, `loss_f` and `loss_v` denote the loss in energy, force and virial, respectively. `pref_e`, `pref_f` and `pref_v` give the prefactors of the energy, force and virial losses. The prefectors may not be a constant, rather it changes linearly with the learning rate. Taking the force prefactor for example, at training step `t`, it is given by
+The loss function $L$ for training energy is given by
+
+$$L = p_e L_e + p_f L_f + p_v L_v$$
+
+where $L_e$, $L_f$, and $L_v$ denote the loss in energy, force and virial, respectively. $p_e$, $p_f$, and $p_v$ give the prefactors of the energy, force and virial losses. The prefactors may not be constant; rather, they change linearly with the learning rate. Taking the force prefactor for example, at training step $t$, it is given by
+
+$$p_f(t) = p_f^0 \frac{ \alpha(t) }{ \alpha(0) } + p_f^\infty ( 1 - \frac{ \alpha(t) }{ \alpha(0) })$$
+
+where $\alpha(t)$ denotes the learning rate at step $t$. $p_f^0$ and $p_f^\infty$ specify the $p_f$ at the start of the training and at the limit of $t \to \infty$ (set by {ref}`start_pref_f <loss[ener]/start_pref_f>` and {ref}`limit_pref_f <loss[ener]/limit_pref_f>`, respectively), i.e.
 ```math
 pref_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / start_lr )
 ```
-where `lr(t)` denotes the learning rate at step `t`. `start_pref_f` and `limit_pref_f` specifies the `pref_f` at the start of the training and at the limit of `t -> inf`.
 
-The `loss` section in the `input.json` is 
+The {ref}`loss <loss>` section in the `input.json` is 
 ```json
     "loss" : {
 	"start_pref_e":	0.02,
@@ -39,6 +42,6 @@ The `loss` section in the `input.json` is
 	"limit_pref_v":	0
     }
 ```
-The options `start_pref_e`, `limit_pref_e`, `start_pref_f`, `limit_pref_f`, `start_pref_v` and `limit_pref_v` determine the start and limit prefactors of energy, force and virial, respectively.
+The options {ref}`start_pref_e <loss[ener]/start_pref_e>`, {ref}`limit_pref_e <loss[ener]/limit_pref_e>`, {ref}`start_pref_f <loss[ener]/start_pref_f>`, {ref}`limit_pref_f <loss[ener]/limit_pref_f>`, {ref}`start_pref_v <loss[ener]/start_pref_v>` and {ref}`limit_pref_v <loss[ener]/limit_pref_v>` determine the start and limit prefactors of energy, force and virial, respectively.
 
-If one does not want to train with virial, then he/she may set the virial prefactors `start_pref_v` and `limit_pref_v` to 0.
+If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v <loss[ener]/start_pref_v>` and {ref}`limit_pref_v <loss[ener]/limit_pref_v>` to 0.
diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md
index 4074316186..240f126aa3 100644
--- a/doc/model/train-fitting-tensor.md
+++ b/doc/model/train-fitting-tensor.md
@@ -1,6 +1,6 @@
 # Fit `tensor` like `Dipole` and `Polarizability`
 
-Unlike `energy` which is a scalar, one may want to fit some high dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shorted as `polar`). Deep Potential has provided different API to allow this. In this example we will show you how to train a model to fit them for a water system. A complete training input script of the examples can be found in 
+Unlike `energy`, which is a scalar, one may want to fit some high dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shortened as `polar`). Deep Potential has provided different APIs to do this. In this example, we will show you how to train a model to fit them for a water system. A complete training input script of the examples can be found in 
 
 ```bash
 $deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json
@@ -9,11 +9,11 @@ $deepmd_source_dir/examples/water_tensor/polar/polar_input.json
 
 The training and validation data are also provided our examples. But note that **the data provided along with the examples are of limited amount, and should not be used to train a production model.**
 
-Similar to the `input.json` used in `ener` mode, training json is also divided into `model`, `learning_rate`, `loss` and `training`. Most keywords remains the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md). To fit a tensor, one need to modify `model.fitting_net` and `loss`.
+Similar to the `input.json` used in `ener` mode, training json is also divided into {ref}`model <model>`, {ref}`learning_rate <learning_rate>`, {ref}`loss <loss>` and {ref}`training <training>`. Most keywords remain the same as `ener` mode, and their meaning can be found [here](train-se-e2-a.md). To fit a tensor, one needs to modify {ref}`model/fitting_net <model/fitting_net>` and {ref}`loss <loss>`.
 
-## Fitting Network
+## The fitting network
 
-The `fitting_net` section tells DP which fitting net to use.
+The {ref}`fitting_net <model/fitting_net>` section tells DP which fitting net to use.
 
 The json of `dipole` type should be provided like
 
@@ -47,7 +47,7 @@ The json of `polar` type should be provided like
 
 DP supports a combinational training of global system (only a global `tensor` label, i.e. dipole or polar, is provided in a frame) and atomic system (labels for **each** atom included in `sel_type` are provided). In a global system, each frame has just **one** `tensor` label. For example, when fitting `polar`, each frame will just provide a `1 x 9` vector which gives the elements of the polarizability tensor of that frame in order XX, XY, XZ, YX, YY, YZ, XZ, ZY, ZZ. By contrast, in a atomic system, each atom in `sel_type` has a `tensor` label. For example, when fitting dipole, each frame will provide a `#sel_atom x 3` matrix, where `#sel_atom` is the number of atoms whose type are in `sel_type`.
 
-The `loss` section tells DP the weight of this two kind of loss, i.e.
+The {ref}`loss <loss>` section tells DP the weight of these two kinds of loss, i.e.
 
 ```python
 loss = pref * global_loss + pref_atomic * atomic_loss
@@ -63,8 +63,8 @@ The loss section should be provided like
 	},
 ```
 
--   `type` should be written as `tensor` as a distinction from `ener` mode.
--   `pref` and `pref_atomic` respectively specify the weight of global loss and atomic loss. It can not be left unset. If set to 0, system with corresponding label will NOT be included in the training process.
+-   {ref}`type <loss/type>` should be written as `tensor` as a distinction from `ener` mode.
+-   {ref}`pref <loss[tensor]/pref>` and {ref}`pref_atomic <loss[tensor]/pref_atomic>` respectively specify the weight of global loss and atomic loss. They cannot be left unset. If set to 0, systems with the corresponding label will NOT be included in the training process.
 
 ## Training Data Preparation
 
diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md
index 4ae8806867..b69b49ea21 100644
--- a/doc/model/train-hybrid.md
+++ b/doc/model/train-hybrid.md
@@ -1,8 +1,8 @@
 # Descriptor `"hybrid"`
 
-This descriptor hybridize multiple descriptors to form a new descriptor. For example we have a list of descriptor denoted by D_1, D_2, ..., D_N, the hybrid descriptor this the concatenation of the list, i.e. D = (D_1, D_2, ..., D_N).
+This descriptor hybridizes multiple descriptors to form a new descriptor. For example, if we have a list of descriptors denoted by $\mathcal D_1$, $\mathcal D_2$, ..., $\mathcal D_N$, the hybrid descriptor is the concatenation of the list, i.e. $\mathcal D = (\mathcal D_1, \mathcal D_2, \cdots, \mathcal D_N)$.
 
-To use the descriptor in DeePMD-kit, one firstly set the `type` to `"hybrid"`, then provide the definitions of the descriptors by the items in the `list`,
+To use the descriptor in DeePMD-kit, one first sets the {ref}`type <model/descriptor/type>` to {ref}`hybrid <model/descriptor[hybrid]>`, then provides the definitions of the descriptors by the items in the `list`,
 ```json
         "descriptor" :{
             "type": "hybrid",
diff --git a/doc/model/train-se-e2-a-tebd.md b/doc/model/train-se-e2-a-tebd.md
index 82815b6956..c80127939d 100644
--- a/doc/model/train-se-e2-a-tebd.md
+++ b/doc/model/train-se-e2-a-tebd.md
@@ -2,10 +2,10 @@
  
 We generate specific type embedding vector for each atom type, so that we can share one descriptor embedding net and one fitting net in total, which decline training complexity largely. 
 
-The training input script is similar to that of [`se_e2_a`](train-se-e2-a.md#the-training-input-script), but different by adding the `type_embedding` section. 
+The training input script is similar to that of [`se_e2_a`](train-se-e2-a.md), but different by adding the {ref}`type_embedding <model/type_embedding>` section. 
 
 ## Type embedding net
-The `model` defines how the model is constructed, adding a section of type embedding net:
+The {ref}`model <model>` defines how the model is constructed, adding a section of type embedding net:
 ```json
     "model": {
 	"type_map":	["O", "H"],
@@ -22,7 +22,7 @@ The `model` defines how the model is constructed, adding a section of type embed
 ```
 Model will automatically apply type embedding approach and generate type embedding vectors. If type embedding vector is detected, descriptor and fitting net would take it as a part of input.
 
-The construction of type embedding net is given by `type_embedding`. An example of `type_embedding` is provided as follows
+The construction of type embedding net is given by {ref}`type_embedding <model/type_embedding>`. An example of {ref}`type_embedding <model/type_embedding>` is provided as follows
 ```json
 	"type_embedding":{
 	    "neuron":		[2, 4, 8],
@@ -30,9 +30,9 @@ The construction of type embedding net is given by `type_embedding`. An example
 	    "seed":		1
 	}
 ```
-* The `neuron` specifies the size of the type embedding net. From left to right the members denote the sizes of each hidden layer from input end to the output end, respectively. It takes one-hot vector as input and output dimension equals to the last dimension of the `neuron` list. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
-* If the option `resnet_dt` is set `true`, then a timestep is used in the ResNet.
-* `seed` gives the random seed that is used to generate random numbers when initializing the model parameters.
+* The {ref}`neuron <model/type_embedding/neuron>` specifies the size of the type embedding net. From left to right the members denote the sizes of each hidden layer from input end to the output end, respectively. It takes one-hot vector as input and output dimension equals to the last dimension of the {ref}`neuron <model/type_embedding/neuron>` list. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
+* If the option {ref}`resnet_dt <model/type_embedding/resnet_dt>` is set to `true`, then a timestep is used in the ResNet.
+* {ref}`seed <model/type_embedding/seed>` gives the random seed that is used to generate random numbers when initializing the model parameters.
 
 
 A complete training input script of this example can be find in the directory. 
diff --git a/doc/model/train-se-e2-a.md b/doc/model/train-se-e2-a.md
index c5c3644f15..2a28bf9658 100644
--- a/doc/model/train-se-e2-a.md
+++ b/doc/model/train-se-e2-a.md
@@ -8,7 +8,7 @@ $deepmd_source_dir/examples/water/se_e2_a/input.json
 ```
 With the training input script, data are also provided in the example directory. One may train the model with the DeePMD-kit from the directory.
 
-The construction of the descriptor is given by section `descriptor`. An example of the descriptor is provided as follows
+The construction of the descriptor is given by section {ref}`descriptor <model/descriptor>`. An example of the descriptor is provided as follows
 ```json
 	"descriptor" :{
 	    "type":		"se_e2_a",
@@ -22,12 +22,12 @@ The construction of the descriptor is given by section `descriptor`. An example
 	    "seed":		1
 	}
 ```
-* The `type` of the descriptor is set to `"se_e2_a"`. 
-* `rcut` is the cut-off radius for neighbor searching, and the `rcut_smth` gives where the smoothing starts. 
-* `sel` gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denote the maximum possible number of neighbors with type `i`. 
-* The `neuron` specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layer from input end to the output end, respectively. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
-* If the option `type_one_side` is set to `true`, then descriptor will consider the types of neighbor atoms. Otherwise, both the types of centric and  neighbor atoms are considered.
-* The `axis_neuron` specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper](https://arxiv.org/abs/1805.09003) 
-* If the option `resnet_dt` is set `true`, then a timestep is used in the ResNet.
-* `seed` gives the random seed that is used to generate random numbers when initializing the model parameters.
+* The {ref}`type <model/descriptor/type>` of the descriptor is set to `"se_e2_a"`. 
+* {ref}`rcut <model/descriptor[se_e2_a]/rcut>` is the cut-off radius for neighbor searching, and the {ref}`rcut_smth <model/descriptor[se_e2_a]/rcut_smth>` gives where the smoothing starts. 
+* {ref}`sel <model/descriptor[se_e2_a]/sel>` gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denote the maximum possible number of neighbors with type `i`. 
+* The {ref}`neuron <model/descriptor[se_e2_a]/neuron>` specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layer from input end to the output end, respectively. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them.
+* If the option {ref}`type_one_side <model/descriptor[se_e2_a]/type_one_side>` is set to `true`, then the descriptor will only consider the types of neighbor atoms. Otherwise, both the types of centric and neighbor atoms are considered.
+* The {ref}`axis_neuron <model/descriptor[se_e2_a]/axis_neuron>` specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper](https://arxiv.org/abs/1805.09003) 
+* If the option {ref}`resnet_dt <model/descriptor[se_e2_a]/resnet_dt>` is set to `true`, then a timestep is used in the ResNet.
+* {ref}`seed <model/descriptor[se_e2_a]/seed>` gives the random seed that is used to generate random numbers when initializing the model parameters.
 
diff --git a/doc/model/train-se-e2-r.md b/doc/model/train-se-e2-r.md
index 997f32f2b9..181146e8e9 100644
--- a/doc/model/train-se-e2-r.md
+++ b/doc/model/train-se-e2-r.md
@@ -7,7 +7,7 @@ A complete training input script of this example can be found in the directory
 $deepmd_source_dir/examples/water/se_e2_r/input.json
 ```
 
-The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md#the-training-input-script). The only difference lies in the `descriptor` section
+The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md). The only difference lies in the {ref}`descriptor <model/descriptor>` section
 ```json
 	"descriptor": {
 	    "type":		"se_e2_r",
@@ -20,4 +20,4 @@ The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.m
 	    "_comment": " that's all"
 	},
 ```
-The type of the descriptor is set by the key `"type"`.
+The type of the descriptor is set by the key {ref}`type <model/descriptor/type>`.
diff --git a/doc/model/train-se-e3.md b/doc/model/train-se-e3.md
index 36bbd202db..d59f11b264 100644
--- a/doc/model/train-se-e3.md
+++ b/doc/model/train-se-e3.md
@@ -7,7 +7,7 @@ A complete training input script of this example can be found in the directory
 $deepmd_source_dir/examples/water/se_e3/input.json
 ```
 
-The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md#the-training-input-script). The only difference lies in the `descriptor` section
+The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.md). The only difference lies in the {ref}`descriptor <model/descriptor>` section
 ```json
 	"descriptor": {
 	    "type":		"se_e3",
@@ -20,4 +20,4 @@ The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.m
 	    "_comment":		" that's all"
 	},
 ```
-The type of the descriptor is set by the key `"type"`.
+The type of the descriptor is set by the key {ref}`type <model/descriptor/type>`.
diff --git a/doc/third-party/index.md b/doc/third-party/index.md
index d5e5fb1bdd..5803f3ef95 100644
--- a/doc/third-party/index.md
+++ b/doc/third-party/index.md
@@ -6,4 +6,5 @@ Note that the model for inference is required to be compatible with the DeePMD-k
 - [Run MD with LAMMPS](lammps.md)
 - [LAMMPS commands](lammps-command.md)
 - [Run path-integral MD with i-PI](ipi.md)
-- [Run MD with GROMACS](gromacs.md)
\ No newline at end of file
+- [Run MD with GROMACS](gromacs.md)
+- [Interfaces out of DeePMD-kit](out-of-deepmd-kit.md)
\ No newline at end of file
diff --git a/doc/third-party/index.rst b/doc/third-party/index.rst
index b87d8e3a97..8620058245 100644
--- a/doc/third-party/index.rst
+++ b/doc/third-party/index.rst
@@ -11,3 +11,4 @@ Note that the model for inference is required to be compatible with the DeePMD-k
    lammps-command
    ipi
    gromacs
+   out-of-deepmd-kit
\ No newline at end of file
diff --git a/doc/third-party/ipi.md b/doc/third-party/ipi.md
index cd0448ce90..8553953627 100644
--- a/doc/third-party/ipi.md
+++ b/doc/third-party/ipi.md
@@ -1,4 +1,4 @@
-### Run path-integral MD with i-PI
+# Run path-integral MD with i-PI
 The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` (or `dp_ipi_low` for low precision) that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. Installation instructions of i-PI can be found [here](../install/install-ipi.md). The client can be started by
 ```bash
 i-pi input.xml &
diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md
index c32b018535..1e4f713256 100644
--- a/doc/third-party/lammps-command.md
+++ b/doc/third-party/lammps-command.md
@@ -57,13 +57,11 @@ This pair style takes the deep potential defined in a model file that usually ha
 
 The model deviation evalulate the consistency of the force predictions from multiple models. By default, only the maximal, minimal and averge model deviations are output. If the key `atomic` is set, then the model deviation of force prediction of each atom will be output.
 
-By default, the model deviation is output in absolute value. If the keyword `relative` is set, then the relative model deviation will be output. The relative model deviation of the force on atom `i` is defined by
-```math
-           |Df_i|
-Ef_i = -------------
-       |f_i| + level
-```
-where `Df_i` is the absolute model deviation of the force on atom `i`, `|f_i|` is the norm of the the force and `level` is provided as the parameter of the keyword `relative`.
+By default, the model deviation is output in absolute value. If the keyword `relative` is set, then the relative model deviation will be output. The relative model deviation of the force on atom $i$ is defined by
+
+$$E_{f_i}=\frac{\left|D_{f_i}\right|}{\left|f_i\right|+l}$$
+
+where $D_{f_i}$ is the absolute model deviation of the force on atom $i$, $\left|f_i\right|$ is the norm of the force and $l$ is provided as the parameter of the keyword `relative`.
 
 ### Restrictions
 - The `deepmd` pair style is provided in the USER-DEEPMD package, which is compiled from the DeePMD-kit, visit the [DeePMD-kit website](https://github.com/deepmodeling/deepmd-kit) for more information.
@@ -108,9 +106,9 @@ Please notice that the DeePMD does nothing to the direct space part of the elect
 
 The [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit) allows also the computation of per-atom stress tensor defined as:
 
-<img src="https://render.githubusercontent.com/render/math?math=dvatom=\sum_{m}( \mathbf{r}_n- \mathbf{r}_m) \frac{de_m}{d\mathbf{r}_n} ">
+$$dvatom=\sum_{m}( \mathbf{r}_n- \mathbf{r}_m) \frac{de_m}{d\mathbf{r}_n}$$
 
-Where <img src="https://render.githubusercontent.com/render/math?math=\mathbf{r}_n "> is the atomic position of nth atom, <img src="https://render.githubusercontent.com/render/math?math=\mathbf{v}_n "> velocity of atom and <img src="https://render.githubusercontent.com/render/math?math=\frac{de_m}{d\mathbf{r}_n} "> the derivative of the atomic energy.
+Where $\mathbf{r}_n$ is the atomic position of the $n$-th atom, $\mathbf{v}_n$ is the velocity of the atom and $\frac{de_m}{d\mathbf{r}_n}$ is the derivative of the atomic energy.
 
 In LAMMPS one can get the per-atom stress using the command `centroid/stress/atom`:
 ```bash
@@ -129,7 +127,7 @@ If you use this feature please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R.
 ## Computation of heat flux
 Using per-atom stress tensor one can, for example, compute the heat flux defined as:
 
-<img src="https://render.githubusercontent.com/render/math?math=\mathbf{J}=\sum_n e_n \mathbf{v}_n + \sum_{nm}( \mathbf{r}_m- \mathbf{r}_n) \frac{de_m}{d\mathbf{r}_n} \mathbf{v}_n">
+$$\mathbf J = \sum_n e_n \mathbf v_n + \sum_{n,m} ( \mathbf r_m- \mathbf r_n) \frac{de_m}{d\mathbf r_n} \mathbf v_n$$
 
 to compute the heat flux with LAMMPS: 
 ```bash
@@ -147,7 +145,7 @@ compute pe all pe/atom
 compute stress all centroid/stress/atom NULL virial
 compute flux all heat/flux ke pe stress
 ```
-`c_flux` is a global vector of length 6. The first three components are the `x`, `y` and `z` components of the full heat flux vector. The others are the components of the so-called convective portion, see [LAMMPS doc page](https://docs.lammps.org/compute_heat_flux.html) for more detailes.
+`c_flux` is a global vector of length 6. The first three components are the $x$, $y$ and $z$ components of the full heat flux vector. The others are the components of the so-called convective portion, see [LAMMPS doc page](https://docs.lammps.org/compute_heat_flux.html) for more details.
 
 If you use these features please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R. Car, S. Baroni - arXiv preprint arXiv:2108.10850, 2021](https://arxiv.org/abs/2108.10850)
 
diff --git a/doc/third-party/out-of-deepmd-kit.md b/doc/third-party/out-of-deepmd-kit.md
new file mode 100644
index 0000000000..3561f5bb83
--- /dev/null
+++ b/doc/third-party/out-of-deepmd-kit.md
@@ -0,0 +1,31 @@
+# Interfaces out of DeePMD-kit
+
+The codes of the following interfaces are not a part of the DeePMD-kit package and are maintained in other repositories. We list these interfaces here for user convenience.
+
+## dpdata
+
+[dpdata](https://github.com/deepmodeling/dpdata) provides the `predict` method for `System` class:
+
+```py
+import dpdata
+dsys = dpdata.LabeledSystem('OUTCAR')
+dp_sys = dsys.predict("frozen_model_compressed.pb")
+```
+
+By inferring with the DP model `frozen_model_compressed.pb`, dpdata will generate a new labeled system `dp_sys` with inferred energies, forces, and virials.
+
+## OpenMM plugin for DeePMD-kit
+
+An [OpenMM](https://github.com/openmm/openmm) plugin is provided from [JingHuangLab/openmm_deepmd_plugin](https://github.com/JingHuangLab/openmm_deepmd_plugin), written by the [Huang Lab](http://www.compbiophysics.org/) at the Westlake University.
+
+## AMBER interface to DeePMD-kit
+
+An [AMBER](https://ambermd.org/) interface to DeePMD-kit is written by the [York Lab](https://theory.rutgers.edu/) from the Rutgers University. It is open-source at [GitLab RutgersLBSR/AmberDPRc](https://gitlab.com/RutgersLBSR/AmberDPRc/). Details can be found in [this paper](https://doi.org/10.1021/acs.jctc.1c00201).
+
+## DP-GEN
+
+[DP-GEN](https://github.com/deepmodeling/dpgen) provides a workflow to generate accurate DP models by calling DeePMD-kit's command line interface (CLI) in the local or the remote server. Details can be found in [this paper](https://doi.org/10.1016/j.cpc.2020.107206).
+
+## MLatom
+
+[MLatom](http://mlatom.com/) provides an interface to the DeePMD-kit within MLatom's workflow by calling DeePMD-kit's CLI. Details can be found in [this paper](https://doi.org/10.1007/s41061-021-00339-5).
diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md
index 7fecd364c2..c8d3d29aad 100644
--- a/doc/train/parallel-training.md
+++ b/doc/train/parallel-training.md
@@ -5,11 +5,11 @@ Depend on the number of training processes (according to MPI context) and number
 
 ## Tuning learning rate
 
-Horovod works in the data-parallel mode, resulting in a larger global batch size. For example, the real batch size is 8 when `batch_size` is set to 2 in the input file and you launch 4 workers. Thus, `learning_rate` is automatically scaled by the number of workers for better convergence. Technical details of such heuristic rule are discussed at [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
+Horovod works in the data-parallel mode, resulting in a larger global batch size. For example, the real batch size is 8 when {ref}`batch_size <training/training_data/batch_size>` is set to 2 in the input file and you launch 4 workers. Thus, {ref}`learning_rate <learning_rate>` is automatically scaled by the number of workers for better convergence. Technical details of such heuristic rule are discussed at [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
 
 The number of decay steps required to achieve same accuracy can decrease by the number of cards (e.g., 1/2 of steps in the above case), but needs to be scaled manually in the input file.
 
-In some cases, it won't work well when scale learning rate by worker count in a `linear` way. Then you can try `sqrt` or `none` by setting argument `scale_by_worker` like below.
+In some cases, it won't work well when scale learning rate by worker count in a `linear` way. Then you can try `sqrt` or `none` by setting argument {ref}`scale_by_worker <learning_rate/scale_by_worker>` like below.
 ```json
     "learning_rate" :{
         "scale_by_worker": "none",
diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md
index aa92bfaaab..17b0384d66 100644
--- a/doc/train/tensorboard.md
+++ b/doc/train/tensorboard.md
@@ -19,7 +19,7 @@ DeePMD-kit can now use most of the interesting features enabled by tensorboard!
 ## How to use Tensorboard with DeePMD-kit
 
 Before running TensorBoard, make sure you have generated summary data in a log
-directory by modifying the the input script, set "tensorboard" true in training
+directory by modifying the input script, set {ref}`tensorboard <training/tensorboard>` to true in training
 subsection will enable the tensorboard data analysis. eg. **water_se_a.json**.
 
 ```json
diff --git a/doc/train/train-input.rst b/doc/train/train-input.rst
index 2e3fe83d4b..44d511cc21 100644
--- a/doc/train/train-input.rst
+++ b/doc/train/train-input.rst
@@ -1,6 +1,20 @@
 Training Parameters
 ======================================
 .. note::
-   One can load, modify, and export the input file by using our effective web-based tool `DP-GUI <https://deepmodeling.org/dpgui/input/deepmd-kit-2.0>`_. All training parameters below can be set in DP-GUI. By clicking "SAVE JSON", one can download the input file for furthur training.
+   One can load, modify, and export the input file by using our effective web-based tool `DP-GUI <https://deepmodeling.com/dpgui/input/deepmd-kit-2.0>`_. All training parameters below can be set in DP-GUI. By clicking "SAVE JSON", one can download the input file for further training.
 
-.. include:: ../train-input-auto.rst
+.. dargs::
+   :module: deepmd.utils.argcheck
+   :func: model_args
+
+.. dargs::
+   :module: deepmd.utils.argcheck
+   :func: learning_rate_args
+
+.. dargs::
+   :module: deepmd.utils.argcheck
+   :func: loss_args
+
+.. dargs::
+   :module: deepmd.utils.argcheck
+   :func: training_args
diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md
index 004c6709b7..98e12b4773 100644
--- a/doc/train/training-advanced.md
+++ b/doc/train/training-advanced.md
@@ -4,7 +4,7 @@ In this section, we will take `$deepmd_source_dir/examples/water/se_e2_a/input.j
 
 ## Learning rate
 
-The `learning_rate` section in `input.json` is given as follows
+The {ref}`learning_rate <learning_rate>` section in `input.json` is given as follows
 ```json
     "learning_rate" :{
 	"type":		"exp",
@@ -14,17 +14,21 @@ The `learning_rate` section in `input.json` is given as follows
 	"_comment":	"that's all"
     }
 ```
-* `start_lr` gives the learning rate at the beginning of the training.
-* `stop_lr` gives the learning rate at the end of the training. It should be small enough to ensure that the network parameters satisfactorily converge. 
-* During the training, the learning rate decays exponentially from `start_lr` to `stop_lr` following the formula.
+* {ref}`start_lr <learning_rate[exp]/start_lr>` gives the learning rate at the beginning of the training.
+* {ref}`stop_lr <learning_rate[exp]/stop_lr>` gives the learning rate at the end of the training. It should be small enough to ensure that the network parameters satisfactorily converge. 
+* During the training, the learning rate decays exponentially from {ref}`start_lr <learning_rate[exp]/start_lr>` to {ref}`stop_lr <learning_rate[exp]/stop_lr>` following the formula:
+
+$$ \alpha(t) = \alpha_0 \lambda ^ { t / \tau } $$
+
+where $t$ is the training step, $\alpha$ is the learning rate, $\alpha_0$ is the starting learning rate (set by {ref}`start_lr <learning_rate[exp]/start_lr>`), $\lambda$ is the decay rate, and $\tau$ is the decay steps, i.e.
+
     ```
     lr(t) = start_lr * decay_rate ^ ( t / decay_steps )
     ```
-    where `t` is the training step.
 
 ## Training parameters
 
-Other training parameters are given in the `training` section.
+Other training parameters are given in the {ref}`training <training>` section.
 ```json
     "training": {
  	"training_data": {
@@ -41,18 +45,18 @@ Other training parameters are given in the `training` section.
 	    "compute_prec":     "float16"
 	},
 
-	"numb_step":	1000000,
+	"numb_steps":	1000000,
 	"seed":		1,
 	"disp_file":	"lcurve.out",
 	"disp_freq":	100,
 	"save_freq":	1000
     }
 ```
-The sections `"training_data"` and `"validation_data"` give the training dataset and validation dataset, respectively. Taking the training dataset for example, the keys are explained below:
-* `systems` provide paths of the training data systems. DeePMD-kit allows you to provide multiple systems with different numbers of atoms. This key can be a `list` or a `str`.
-    * `list`: `systems` gives the training data systems.
-    * `str`: `systems` should be a valid path. DeePMD-kit will recursively search all data systems in this path.
-* At each training step, DeePMD-kit randomly pick `batch_size` frame(s) from one of the systems. The probability of using a system is by default in proportion to the number of batches in the system. More optional are available for automatically determining the probability of using systems. One can set the key `auto_prob` to
+The sections {ref}`training_data <training/training_data>` and {ref}`validation_data <training/validation_data>` give the training dataset and validation dataset, respectively. Taking the training dataset for example, the keys are explained below:
+* {ref}`systems <training/training_data/systems>` provide paths of the training data systems. DeePMD-kit allows you to provide multiple systems with different numbers of atoms. This key can be a `list` or a `str`.
+    * `list`: {ref}`systems <training/training_data/systems>` gives the training data systems.
+    * `str`: {ref}`systems <training/training_data/systems>` should be a valid path. DeePMD-kit will recursively search all data systems in this path.
+* At each training step, DeePMD-kit randomly picks {ref}`batch_size <training/training_data/batch_size>` frame(s) from one of the systems. The probability of using a system is by default in proportion to the number of batches in the system. More options are available for automatically determining the probability of using systems. One can set the key {ref}`auto_prob <training/training_data/auto_prob>` to
     * `"prob_uniform"` all systems are used with the same probability.
     * `"prob_sys_size"` the probability of using a system is in proportional to its size (number of frames).
     * `"prob_sys_size; sidx_0:eidx_0:w_0; sidx_1:eidx_1:w_1;..."` the `list` of systems are divided into blocks. The block `i` has systems ranging from `sidx_i` to `eidx_i`. The probability of using a system from block `i` is in proportional to `w_i`. Within one block, the probability of using a system is in proportional to its size.
@@ -64,34 +68,34 @@ The sections `"training_data"` and `"validation_data"` give the training dataset
 	    "batch_size":	"auto"
 	}
 ```
-* The probability of using systems can also be specified explicitly with key `"sys_prob"` that is a list having the length of the number of systems. For example
+* The probability of using systems can also be specified explicitly with key {ref}`sys_probs <training/training_data/sys_probs>` that is a list having the length of the number of systems. For example
 ```json
  	"training_data": {
 	    "systems":		["../data_water/data_0/", "../data_water/data_1/", "../data_water/data_2/"],
-	    "sys_prob":	[0.5, 0.3, 0.2],
+	    "sys_probs":	[0.5, 0.3, 0.2],
 	    "batch_size":	"auto:32"
 	}
 ```
-* The key `batch_size` specifies the number of frames used to train or validate the model in a training step. It can be set to
-    * `list`: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.
+* The key {ref}`batch_size <training/training_data/batch_size>` specifies the number of frames used to train or validate the model in a training step. It can be set to
+    * `list`: the length of which is the same as the {ref}`systems <training/training_data/systems>`. The batch size of each system is given by the elements of the list.
     * `int`: all systems use the same batch size.
     * `"auto"`: the same as `"auto:32"`, see `"auto:N"`
-    * `"auto:N"`: automatically determines the batch size so that the `batch_size` times the number of atoms in the system is no less than `N`.
-* The key `numb_batch` in `validate_data` gives the number of batches of model validation. Note that the batches may not be from the same system
+    * `"auto:N"`: automatically determines the batch size so that the {ref}`batch_size <training/training_data/batch_size>` times the number of atoms in the system is no less than `N`.
+* The key {ref}`numb_btch <training/validation_data/numb_btch>` in {ref}`validation_data <training/validation_data>` gives the number of batches of model validation. Note that the batches may not be from the same system.
 
-The section `mixed_precision` specifies the mixed precision settings, which will enable the mixed precision training workflow for deepmd-kit. The keys are explained below:
-* `output_prec`  precision used in the output tensors, only `float32` is supported currently.
-* `compute_prec` precision used in the computing tensors, only `float16` is supported currently.
+The section {ref}`mixed_precision <training/mixed_precision>` specifies the mixed precision settings, which will enable the mixed precision training workflow for deepmd-kit. The keys are explained below:
+* {ref}`output_prec <training/mixed_precision/output_prec>`  precision used in the output tensors, only `float32` is supported currently.
+* {ref}`compute_prec <training/mixed_precision/compute_prec>` precision used in the computing tensors, only `float16` is supported currently.
 Note there are severial limitations about the mixed precision training:
-* Only 'se_e2_a' type descriptor is supported by the mixed precision training workflow.
+* Only {ref}`se_e2_a <model/descriptor[se_e2_a]>` type descriptor is supported by the mixed precision training workflow.
 * The precision of embedding net and fitting net are forced to be set to `float32`.
 
-Other keys in the `training` section are explained below:
-* `numb_step` The number of training steps.
-* `seed` The random seed for getting frames from the training data set.
-* `disp_file` The file for printing learning curve.
-* `disp_freq` The frequency of printing learning curve. Set in the unit of training steps
-* `save_freq` The frequency of saving check point.
+Other keys in the {ref}`training <training>` section are explained below:
+* {ref}`numb_steps <training/numb_steps>` The number of training steps.
+* {ref}`seed <training/seed>` The random seed for getting frames from the training data set.
+* {ref}`disp_file <training/disp_file>` The file for printing learning curve.
+* {ref}`disp_freq <training/disp_freq>` The frequency of printing learning curve. Set in the unit of training steps
+* {ref}`save_freq <training/save_freq>` The frequency of saving check point.
 
 ## Options and environment variables
 
diff --git a/doc/train/training.md b/doc/train/training.md
index 7fe4012e12..1183e03b81 100644
--- a/doc/train/training.md
+++ b/doc/train/training.md
@@ -1,4 +1,4 @@
-# Training a model
+# Train a model
 
 Several examples of training can be found at the `examples` directory:
 ```bash
@@ -26,9 +26,9 @@ DEEPMD INFO                                        system  natoms  bch_sz   n_bc
 DEEPMD INFO                          ../data_water/data_3     192       1      80  1.000    T
 DEEPMD INFO    --------------------------------------------------------------------------------------
 ```
-The DeePMD-kit prints detailed information on the training and validation data sets. The data sets are defined by `"training_data"` and `"validation_data"` defined in the `"training"` section of the input script. The training data set is composed by three data systems, while the validation data set is composed by one data system. The number of atoms, batch size, number of batches in the system and the probability of using the system are all shown on the screen. The last column presents if the periodic boundary condition is assumed for the system. 
+The DeePMD-kit prints detailed information on the training and validation data sets. The data sets are defined by {ref}`training_data <training/training_data>` and {ref}`validation_data <training/validation_data>` defined in the {ref}`training <training>` section of the input script. The training data set is composed by three data systems, while the validation data set is composed by one data system. The number of atoms, batch size, number of batches in the system and the probability of using the system are all shown on the screen. The last column presents if the periodic boundary condition is assumed for the system. 
 
-During the training, the error of the model is tested every `disp_freq` training steps with the batch used to train the model and with `numb_btch` batches from the validating data. The training error and validation error are printed correspondingly in the file `disp_file` (default is `lcurve.out`). The batch size can be set in the input script by the key `batch_size` in the corresponding sections for training and validation data set. An example of the output 
+During the training, the error of the model is tested every {ref}`disp_freq <training/disp_freq>` training steps with the batch used to train the model and with {ref}`numb_btch <training/validation_data/numb_btch>` batches from the validating data. The training error and validation error are printed correspondingly in the file {ref}`disp_file <training/disp_file>` (default is `lcurve.out`). The batch size can be set in the input script by the key {ref}`batch_size <training/training_data/batch_size>` in the corresponding sections for training and validation data set. An example of the output 
 ```bash
 #  step      rmse_val    rmse_trn    rmse_e_val  rmse_e_trn    rmse_f_val  rmse_f_trn         lr
       0      3.33e+01    3.41e+01      1.03e+01    1.03e+01      8.39e-01    8.72e-01    1.0e-03
@@ -56,7 +56,7 @@ plt.grid()
 plt.show()
 ```
 
-Checkpoints will be written to files with prefix `save_ckpt` every `save_freq` training steps. 
+Checkpoints will be written to files with prefix {ref}`save_ckpt <training/save_ckpt>` every {ref}`save_freq <training/save_freq>` training steps. 
 
 ## Warning
 It is warned that the example water data (in folder `examples/water/data`) is of very limited amount, is provided only for testing purpose, and should not be used to train a production model.
diff --git a/doc/troubleshooting/model-compatability.md b/doc/troubleshooting/model-compatability.md
index 820a79210f..9ed9607bbb 100644
--- a/doc/troubleshooting/model-compatability.md
+++ b/doc/troubleshooting/model-compatability.md
@@ -8,7 +8,7 @@ One can execute `dp convert-from` to convert an old model to a new one.
 
 | Model version | v0.12 | v1.0 | v1.1 | v1.2 | v1.3 | v2.0 | v2.1 |
 |:-:|:-----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|
-| Compatibility  | 😢 | 😊 | 😊 | 😊 | 😊 | 😄 | 😄 |
+| Compatibility  | 😊 | 😊 | 😊 | 😊 | 😊 | 😄 | 😄 |
 
 **Legend**:
 - 😄: The model is compatible with the DeePMD-kit package.
diff --git a/examples/fparam/train/input.json b/examples/fparam/train/input.json
index 3e401aa7d2..c32b0d1c17 100644
--- a/examples/fparam/train/input.json
+++ b/examples/fparam/train/input.json
@@ -37,10 +37,12 @@
 
     "_comment": " traing controls",
     "training" : {
-	"systems":	["../data/e3000_i2000/", "../data/e8000_i2000/"],
-	"set_prefix":	"set",
+		"training_data": {
+			"systems": ["../data/e3000_i2000/", "../data/e8000_i2000/"],
+			"set_prefix":	"set",
+			"batch_size":	1
+		},
 	"stop_batch":	1000000,
-	"batch_size":	1,
 
 	"seed":		1,
 
@@ -48,7 +50,6 @@
 	"_comment": " frequencies counted in batch",
 	"disp_file":	"lcurve.out",
 	"disp_freq":	100,
-	"numb_test":	10,
 	"save_freq":	1000,
 	"save_ckpt":	"model.ckpt",
 	"disp_training":true,
diff --git a/examples/fparam/train/input_aparam.json b/examples/fparam/train/input_aparam.json
index 05eae62216..b978ef055e 100644
--- a/examples/fparam/train/input_aparam.json
+++ b/examples/fparam/train/input_aparam.json
@@ -37,10 +37,12 @@
 
     "_comment": " traing controls",
     "training" : {
-	"systems":	["../data/e3000_i2000/", "../data/e8000_i2000/"],
-	"set_prefix":	"set",
+		"training_data": {
+			"systems":	["../data/e3000_i2000/", "../data/e8000_i2000/"],
+			"set_prefix":	"set",
+			"batch_size":	1
+		},
 	"stop_batch":	1000000,
-	"batch_size":	1,
 
 	"seed":		1,
 
@@ -48,7 +50,6 @@
 	"_comment": " frequencies counted in batch",
 	"disp_file":	"lcurve.out",
 	"disp_freq":	100,
-	"numb_test":	10,
 	"save_freq":	1000,
 	"save_ckpt":	"model.ckpt",
 	"disp_training":true,
diff --git a/examples/water/se_e2_r/input.json b/examples/water/se_e2_r/input.json
index de9b8f877b..c30d07fa6e 100644
--- a/examples/water/se_e2_r/input.json
+++ b/examples/water/se_e2_r/input.json
@@ -56,7 +56,6 @@
 	"seed":		1,
 	"disp_file":	"lcurve.out",
 	"disp_freq":	100,
-	"numb_test":	10,
 	"save_freq":	1000,
 	"_comment":	"that's all"
     },
diff --git a/examples/water/se_e3/input.json b/examples/water/se_e3/input.json
index 4b62f4435b..05a25d9ef9 100644
--- a/examples/water/se_e3/input.json
+++ b/examples/water/se_e3/input.json
@@ -59,7 +59,6 @@
 	"_comment": " frequencies counted in batch",
 	"disp_file":	"lcurve.out",
 	"disp_freq":	10,
-	"numb_test":	4,
 	"save_freq":	1000,
 	"save_ckpt":	"model.ckpt",
 	"disp_training":true,
diff --git a/setup.py b/setup.py
index 2623f0f149..9f8b4a69b9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 """Setup script for DeePMD-kit package."""
 
 import os
+import site
 from distutils.util import get_platform
 from importlib.machinery import FileFinder
 from importlib.util import find_spec
@@ -58,6 +59,13 @@
 
 # get tensorflow spec
 tf_spec = find_spec("tensorflow")
+
+if not tf_spec and site.ENABLE_USER_SITE:
+    # first search TF from user site-packages before global site-packages
+    site_packages = site.getusersitepackages()
+    if site_packages:
+        tf_spec = FileFinder(site_packages).find_spec("tensorflow")
+
 if not tf_spec:
     # purelib gets site-packages path
     site_packages = get_path("purelib")
@@ -127,7 +135,7 @@
     cmake_minimum_required_version="3.0",
     extras_require={
         "test": ["dpdata>=0.1.9", "ase", "pytest", "pytest-cov", "pytest-sugar"],
-        "docs": ["sphinx>=3.1.1", "recommonmark", "sphinx_rtd_theme>=1.0.0rc1", "sphinx_markdown_tables", "myst-parser", "breathe", "exhale", "numpydoc", "ase", "deepmodeling-sphinx"],
+        "docs": ["sphinx>=3.1.1", "recommonmark", "sphinx_rtd_theme>=1.0.0rc1", "sphinx_markdown_tables", "myst-parser", "breathe", "exhale", "numpydoc", "ase", "deepmodeling-sphinx", "dargs>=0.3.1", "sphinx-argparse"],
         **extras_require,
     },
     entry_points={"console_scripts": ["dp = deepmd.entrypoints.main:main"]},
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index d706a6d292..8680931299 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -56,6 +56,7 @@ endif()
 
 # define USE_CUDA_TOOLKIT
 if (USE_CUDA_TOOLKIT)
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
   find_package(CUDA REQUIRED)
   add_definitions("-DGOOGLE_CUDA")
   message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support")
diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h
index 6057cf4ab7..2b76cc4e0b 100644
--- a/source/api_cc/include/DeepPot.h
+++ b/source/api_cc/include/DeepPot.h
@@ -226,16 +226,16 @@ class DeepPotModelDevi
   ~DeepPotModelDevi() ;
   /**
   * @brief DP model deviation constructor with initialization.
-  * @param[in] model The names of the frozen model files.
+  * @param[in] models The names of the frozen model files.
   * @param[in] gpu_rank The GPU rank. Default is 0.
-  * @param[in] file_content The contents of the model files. If it is not empty, DP will read from the strings instead of the files.
+  * @param[in] file_contents The contents of the model files. If it is not empty, DP will read from the strings instead of the files.
   **/
   DeepPotModelDevi  (const std::vector<std::string> & models, const int & gpu_rank = 0, const std::vector<std::string> & file_contents = std::vector<std::string>());
   /**
   * @brief Initialize the DP model deviation contrcutor.
-  * @param[in] model The names of the frozen model files.
+  * @param[in] models The names of the frozen model files.
   * @param[in] gpu_rank The GPU rank. Default is 0.
-  * @param[in] file_content The contents of the model files. If it is not empty, DP will read from the strings instead of the files.
+  * @param[in] file_contents The contents of the model files. If it is not empty, DP will read from the strings instead of the files.
   **/
   void init (const std::vector<std::string> & models, const int & gpu_rank = 0, const std::vector<std::string> & file_contents = std::vector<std::string>());
 public:
diff --git a/source/api_cc/include/DeepTensor.h b/source/api_cc/include/DeepTensor.h
index 4e1e47307b..2f696d5289 100644
--- a/source/api_cc/include/DeepTensor.h
+++ b/source/api_cc/include/DeepTensor.h
@@ -18,7 +18,7 @@ class DeepTensor
   * @brief Deep Tensor constructor with initialization..
   * @param[in] model The name of the frozen model file.
   * @param[in] gpu_rank The GPU rank. Default is 0.
-  * @param[in] file_content The content of the model file. If it is not empty, DP will read from the string instead of the file.
+  * @param[in] name_scope Name scopes of operations.
   **/
   DeepTensor(const std::string & model, 
 	     const int & gpu_rank = 0, 
@@ -27,7 +27,7 @@ class DeepTensor
   * @brief Initialize the Deep Tensor.
   * @param[in] model The name of the frozen model file.
   * @param[in] gpu_rank The GPU rank. Default is 0.
-  * @param[in] file_content The content of the model file. If it is not empty, DP will read from the string instead of the file.
+  * @param[in] name_scope Name scopes of operations.
   **/
   void init (const std::string & model, 
 	     const int & gpu_rank = 0, 
diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h
index a07caac546..9d8faa83c4 100644
--- a/source/api_cc/include/common.h
+++ b/source/api_cc/include/common.h
@@ -109,6 +109,7 @@ select_map_inv(typename std::vector<VT >::iterator out,
 
 /**
 * @brief Get the number of threads from the environment variable.
+* @details A warning will be thrown if environmental variables are not set.
 * @param[out] num_intra_nthreads The number of intra threads. Read from TF_INTRA_OP_PARALLELISM_THREADS.
 * @param[out] num_inter_nthreads The number of inter threads. Read from TF_INTER_OP_PARALLELISM_THREADS.
 **/
diff --git a/source/api_cc/src/DataModifier.cc b/source/api_cc/src/DataModifier.cc
index c6c009b0d8..2fbd58584b 100644
--- a/source/api_cc/src/DataModifier.cc
+++ b/source/api_cc/src/DataModifier.cc
@@ -142,8 +142,8 @@ compute (std::vector<VALUETYPE> &		dfcorr_,
   if (nloc_real == 0){
     dfcorr_.resize(nall * 3);
     dvcorr_.resize(9);
-    fill(dfcorr_.begin(), dfcorr_.end(), 0.0);
-    fill(dvcorr_.begin(), dvcorr_.end(), 0.0);
+    fill(dfcorr_.begin(), dfcorr_.end(), (VALUETYPE)0.0);
+    fill(dvcorr_.begin(), dvcorr_.end(), (VALUETYPE)0.0);
     return;
   }
   // resize to nall_real
@@ -223,7 +223,7 @@ compute (std::vector<VALUETYPE> &		dfcorr_,
   assert(dfcorr_1.size() == nall_real * 3);
   // resize to all and clear
   std::vector<VALUETYPE> dfcorr_2(nall*3);
-  fill(dfcorr_2.begin(), dfcorr_2.end(), 0.0);
+  fill(dfcorr_2.begin(), dfcorr_2.end(), (VALUETYPE)0.0);
   // back map to original position
   for (int ii = 0; ii < nall_real; ++ii){
     for (int dd = 0; dd < 3; ++dd){
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index 047c665e8d..1b99e78920 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -34,10 +34,10 @@ run_model (ENERGYTYPE &			dener,
     // no backward map needed
     // dforce of size nall * 3
     dforce_.resize(nall * 3);
-    fill(dforce_.begin(), dforce_.end(), 0.0);
+    fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0);
     // dvirial of size 9
     dvirial.resize(9);
-    fill(dvirial.begin(), dvirial.end(), 0.0);
+    fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0);
     return;
   }
 
@@ -62,17 +62,17 @@ run_model (ENERGYTYPE &			dener,
     dforce[ii] = of(ii);
   }
   // set dvirial to zero, prevent input vector is not zero (#1123)
-  std::fill(dvirial.begin(), dvirial.end(), 0.);
+  std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.);
   for (int ii = 0; ii < nall; ++ii) {
-    dvirial[0] += 1.0 * oav(9*ii+0);
-    dvirial[1] += 1.0 * oav(9*ii+1);
-    dvirial[2] += 1.0 * oav(9*ii+2);
-    dvirial[3] += 1.0 * oav(9*ii+3);
-    dvirial[4] += 1.0 * oav(9*ii+4);
-    dvirial[5] += 1.0 * oav(9*ii+5);
-    dvirial[6] += 1.0 * oav(9*ii+6);
-    dvirial[7] += 1.0 * oav(9*ii+7);
-    dvirial[8] += 1.0 * oav(9*ii+8);
+    dvirial[0] += (VALUETYPE)1.0 * oav(9*ii+0);
+    dvirial[1] += (VALUETYPE)1.0 * oav(9*ii+1);
+    dvirial[2] += (VALUETYPE)1.0 * oav(9*ii+2);
+    dvirial[3] += (VALUETYPE)1.0 * oav(9*ii+3);
+    dvirial[4] += (VALUETYPE)1.0 * oav(9*ii+4);
+    dvirial[5] += (VALUETYPE)1.0 * oav(9*ii+5);
+    dvirial[6] += (VALUETYPE)1.0 * oav(9*ii+6);
+    dvirial[7] += (VALUETYPE)1.0 * oav(9*ii+7);
+    dvirial[8] += (VALUETYPE)1.0 * oav(9*ii+8);
   }
   dforce_ = dforce;
   atommap.backward (dforce_.begin(), dforce.begin(), 3);
@@ -95,16 +95,16 @@ static void run_model (ENERGYTYPE   &		dener,
         // no backward map needed
         // dforce of size nall * 3
         dforce_.resize(nall * 3);
-        fill(dforce_.begin(), dforce_.end(), 0.0);
+        fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0);
         // dvirial of size 9
         dvirial.resize(9);
-        fill(dvirial.begin(), dvirial.end(), 0.0);
+        fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0);
         // datom_energy_ of size nall
         datom_energy_.resize(nall);
-        fill(datom_energy_.begin(), datom_energy_.end(), 0.0);
+        fill(datom_energy_.begin(), datom_energy_.end(), (VALUETYPE)0.0);
         // datom_virial_ of size nall * 9
         datom_virial_.resize(nall * 9);
-        fill(datom_virial_.begin(), datom_virial_.end(), 0.0);
+        fill(datom_virial_.begin(), datom_virial_.end(), (VALUETYPE)0.0);
         return;
     }
     std::vector<Tensor> output_tensors;
@@ -139,17 +139,17 @@ static void run_model (ENERGYTYPE   &		dener,
         datom_virial[ii] = oav(ii);
     }
     // set dvirial to zero, prevent input vector is not zero (#1123)
-    std::fill(dvirial.begin(), dvirial.end(), 0.);
+    std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.);
     for (int ii = 0; ii < nall; ++ii) {
-        dvirial[0] += 1.0 * datom_virial[9*ii+0];
-        dvirial[1] += 1.0 * datom_virial[9*ii+1];
-        dvirial[2] += 1.0 * datom_virial[9*ii+2];
-        dvirial[3] += 1.0 * datom_virial[9*ii+3];
-        dvirial[4] += 1.0 * datom_virial[9*ii+4];
-        dvirial[5] += 1.0 * datom_virial[9*ii+5];
-        dvirial[6] += 1.0 * datom_virial[9*ii+6];
-        dvirial[7] += 1.0 * datom_virial[9*ii+7];
-        dvirial[8] += 1.0 * datom_virial[9*ii+8];
+        dvirial[0] += (VALUETYPE)1.0 * datom_virial[9*ii+0];
+        dvirial[1] += (VALUETYPE)1.0 * datom_virial[9*ii+1];
+        dvirial[2] += (VALUETYPE)1.0 * datom_virial[9*ii+2];
+        dvirial[3] += (VALUETYPE)1.0 * datom_virial[9*ii+3];
+        dvirial[4] += (VALUETYPE)1.0 * datom_virial[9*ii+4];
+        dvirial[5] += (VALUETYPE)1.0 * datom_virial[9*ii+5];
+        dvirial[6] += (VALUETYPE)1.0 * datom_virial[9*ii+6];
+        dvirial[7] += (VALUETYPE)1.0 * datom_virial[9*ii+7];
+        dvirial[8] += (VALUETYPE)1.0 * datom_virial[9*ii+8];
 	}
     dforce_ = dforce;
     datom_energy_ = datom_energy;
@@ -164,14 +164,12 @@ DeepPot::
 DeepPot ()
     : inited (false), init_nbor (false)
 {
-  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
 }
 
 DeepPot::
 DeepPot (const std::string & model, const int & gpu_rank, const std::string & file_content)
     : inited (false), init_nbor (false)
 {
-  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
   init(model, gpu_rank, file_content);  
 }
 
@@ -369,7 +367,7 @@ compute (ENERGYTYPE &			dener,
   select_map<int>(datype, datype_, fwd_map, 1);
   // aparam
   if (daparam > 0){
-    aparam.resize(bkw_map.size());
+    aparam.resize(bkw_map.size() - nghost_real);
     select_map<VALUETYPE>(aparam, aparam_, fwd_map, daparam);
   }
   // internal nlist
@@ -453,25 +451,52 @@ compute (ENERGYTYPE &			dener,
 	 const InputNlist &	lmp_list,
 	 const int               &	ago,
 	 const std::vector<VALUETYPE> &	fparam,
-	 const std::vector<VALUETYPE> &	aparam)
+	 const std::vector<VALUETYPE> &	aparam_)
 {
   int nall = dcoord_.size() / 3;
   int nloc = nall - nghost;
-    validate_fparam_aparam(nloc, fparam, aparam);
+  validate_fparam_aparam(nloc, fparam, aparam_);
     std::vector<std::pair<std::string, Tensor>> input_tensors;
-
+  // select real atoms
+  std::vector<VALUETYPE> dcoord, dforce, aparam, datom_energy, datom_virial;
+  std::vector<int> datype, fwd_map, bkw_map;
+  int nghost_real;
+  select_real_atoms(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost, ntypes);
+  // resize to nall_real
+  int nall_real = bkw_map.size();
+  int nloc_real = nall_real - nghost_real;
+  dcoord.resize(nall_real * 3);
+  datype.resize(nall_real);
+  datom_energy.resize(nall_real);
+  // fwd map
+  select_map<VALUETYPE>(dcoord, dcoord_, fwd_map, 3);
+  select_map<int>(datype, datype_, fwd_map, 1);
+  select_map<VALUETYPE>(datom_energy, datom_energy_, fwd_map, 1);
+  // aparam
+  if (daparam > 0){
+    aparam.resize(nloc_real);
+    select_map<VALUETYPE>(aparam, aparam_, fwd_map, daparam);
+  }
     if (ago == 0) {
-        atommap = AtomMap<VALUETYPE> (datype_.begin(), datype_.begin() + nloc);
-        assert (nloc == atommap.get_type().size());
+    atommap = AtomMap<VALUETYPE> (datype.begin(), datype.begin() + nloc_real);
+    assert (nloc_real == atommap.get_type().size());
 
         nlist_data.copy_from_nlist(lmp_list);
         nlist_data.shuffle(atommap);
 	nlist_data.make_inlist(nlist);
     }
 
-    int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, atommap, nghost, ago);
-    assert (nloc == ret);
-    run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, atommap, nghost);
+  int ret = session_input_tensors (input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam, atommap, nghost_real, ago);
+  assert (nloc_real == ret);
+  run_model (dener, dforce, dvirial, datom_energy, datom_virial, session, input_tensors, atommap, nghost_real);
+
+  // bkw map
+  dforce_.resize(fwd_map.size() * 3);
+  datom_energy_.resize(fwd_map.size());
+  datom_virial_.resize(fwd_map.size() * 9);
+  select_map<VALUETYPE>(dforce_, dforce, bkw_map, 3);
+  select_map<VALUETYPE>(datom_energy_, datom_energy, bkw_map, 1);
+  select_map<VALUETYPE>(datom_virial_, datom_virial, bkw_map, 9);
 }
 
 void
@@ -488,7 +513,6 @@ DeepPotModelDevi ()
       init_nbor (false),
       numb_models (0)
 {
-  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
 }
 
 DeepPotModelDevi::
@@ -497,7 +521,6 @@ DeepPotModelDevi (const std::vector<std::string> & models, const int & gpu_rank,
       init_nbor(false),
       numb_models (0)
 {
-  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
   init(models, gpu_rank, file_contents);
 }
 
@@ -521,6 +544,7 @@ init (const std::vector<std::string> & models, const int & gpu_rank, const std::
   #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
   SessionOptions options;
+  get_env_nthreads(num_intra_nthreads, num_inter_nthreads);
   options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
   options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
   for (unsigned ii = 0; ii < numb_models; ++ii){
diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc
index b19f79d42e..a83f364e11 100644
--- a/source/api_cc/src/common.cc
+++ b/source/api_cc/src/common.cc
@@ -206,6 +206,14 @@ check_status(const tensorflow::Status& status) {
   }
 }
 
+void
+throw_env_not_set_warning(std::string env_name)
+{
+  std::cerr << "DeePMD-kit WARNING: Environmental variable " << env_name << " is not set. "
+    << "Tune " << env_name << " for the best performance."
+    << std::endl;
+}
+
 void
 deepmd::
 get_env_nthreads(int & num_intra_nthreads,
@@ -215,17 +223,28 @@ get_env_nthreads(int & num_intra_nthreads,
   num_inter_nthreads = 0;
   const char* env_intra_nthreads = std::getenv("TF_INTRA_OP_PARALLELISM_THREADS");
   const char* env_inter_nthreads = std::getenv("TF_INTER_OP_PARALLELISM_THREADS");
+  const char* env_omp_nthreads = std::getenv("OMP_NUM_THREADS");
   if (env_intra_nthreads && 
       std::string(env_intra_nthreads) != std::string("") && 
       atoi(env_intra_nthreads) >= 0
       ) {
     num_intra_nthreads = atoi(env_intra_nthreads);
+  } else {
+    throw_env_not_set_warning("TF_INTRA_OP_PARALLELISM_THREADS");
   }
   if (env_inter_nthreads && 
       std::string(env_inter_nthreads) != std::string("") &&
       atoi(env_inter_nthreads) >= 0
       ) {
     num_inter_nthreads = atoi(env_inter_nthreads);
+  } else {
+    throw_env_not_set_warning("TF_INTER_OP_PARALLELISM_THREADS");
+  }
+  if (!(env_omp_nthreads && 
+      std::string(env_omp_nthreads) != std::string("") &&
+      atoi(env_omp_nthreads) >= 0
+      )) {
+    throw_env_not_set_warning("OMP_NUM_THREADS");
   }
 }
 
diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt
index bd7c23a5a3..5609f0a8f2 100644
--- a/source/api_cc/tests/CMakeLists.txt
+++ b/source/api_cc/tests/CMakeLists.txt
@@ -36,7 +36,6 @@ set(opname "deepmd_op")
 set(OP_BASE_DIR ${CMAKE_SOURCE_DIR}/../../op)
 # file(GLOB OP_SRC ${OP_BASE_DIR}/*.cc)
 file(GLOB OP_SRC ${OP_BASE_DIR}/custom_op.cc ${OP_BASE_DIR}/prod_force.cc ${OP_BASE_DIR}/prod_virial.cc ${OP_BASE_DIR}/descrpt.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef_para.cc ${OP_BASE_DIR}/descrpt_se_a_ef_vert.cc ${OP_BASE_DIR}/pair_tab.cc ${OP_BASE_DIR}/prod_force_multi_device.cc ${OP_BASE_DIR}/prod_virial_multi_device.cc ${OP_BASE_DIR}/soft_min.cc ${OP_BASE_DIR}/soft_min_force.cc ${OP_BASE_DIR}/soft_min_virial.cc ${OP_BASE_DIR}/ewald_recp.cc ${OP_BASE_DIR}/gelu_multi_device.cc ${OP_BASE_DIR}/map_aparam.cc ${OP_BASE_DIR}/neighbor_stat.cc ${OP_BASE_DIR}/unaggregated_grad.cc ${OP_BASE_DIR}/tabulate_multi_device.cc ${OP_BASE_DIR}/prod_env_mat_multi_device.cc)
-add_library(${opname} SHARED ${OP_SRC})
 
 list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake/)
 find_package(tensorflow REQUIRED)
@@ -46,6 +45,7 @@ else()
   set (CMAKE_CXX_STANDARD 11)
 endif()
 include_directories(${TensorFlow_INCLUDE_DIRS})
+add_library(${opname} SHARED ${OP_SRC})
 
 find_package(Threads)
 # find openmp
diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake
index 43fc97894c..87e9918463 100644
--- a/source/cmake/Findtensorflow.cmake
+++ b/source/cmake/Findtensorflow.cmake
@@ -144,7 +144,6 @@ try_run(
   TENSORFLOW_VERSION_RUN_RESULT_VAR TENSORFLOW_VERSION_COMPILE_RESULT_VAR
   ${CMAKE_CURRENT_BINARY_DIR}/tf_version
   "${CMAKE_CURRENT_LIST_DIR}/tf_version.cpp"
-  LINK_LIBRARIES ${TensorFlowFramework_LIBRARY}
   CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${TensorFlow_INCLUDE_DIRS}"
   RUN_OUTPUT_VARIABLE TENSORFLOW_VERSION
   COMPILE_OUTPUT_VARIABLE TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR
diff --git a/source/cmake/tf_cxx_abi.cpp b/source/cmake/tf_cxx_abi.cpp
index 296b335b27..ed3fd848e0 100644
--- a/source/cmake/tf_cxx_abi.cpp
+++ b/source/cmake/tf_cxx_abi.cpp
@@ -2,6 +2,14 @@
 #include "tensorflow/core/public/version.h"
 int main(int argc, char * argv[])
 {
+#if (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION>=9) || TF_MAJOR_VERSION > 2 
+#ifdef _GLIBCXX_USE_CXX11_ABI
+  std::cout << _GLIBCXX_USE_CXX11_ABI;
+#else
+  std::cout << 0;
+#endif
+#else
   std::cout << tf_cxx11_abi_flag();
+#endif
   return 0;
 }
diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh
index 19686b83e7..61a4c166c0 100755
--- a/source/install/build_cc.sh
+++ b/source/install/build_cc.sh
@@ -20,7 +20,7 @@ NPROC=$(nproc --all)
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_TENSORFLOW=TRUE ${CUDA_ARGS} -DLAMMPS_VERSION=stable_29Sep2021_update3 -DUSE_TTM=TRUE ..
+cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_TENSORFLOW=TRUE ${CUDA_ARGS} -DLAMMPS_VERSION=stable_23Jun2022 -DUSE_TTM=TRUE ..
 make -j${NPROC}
 make install
 
diff --git a/source/install/build_lammps.sh b/source/install/build_lammps.sh
index 91c2fc1cd4..8a96c09b31 100755
--- a/source/install/build_lammps.sh
+++ b/source/install/build_lammps.sh
@@ -15,7 +15,7 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_lammps
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 # download LAMMMPS
-LAMMPS_VERSION=stable_29Sep2021_update3
+LAMMPS_VERSION=stable_23Jun2022
 if [ ! -d "lammps-${LAMMPS_VERSION}" ]
 then
 	curl -L -o lammps.tar.gz https://github.com/lammps/lammps/archive/refs/tags/${LAMMPS_VERSION}.tar.gz
diff --git a/source/install/build_tf.py b/source/install/build_tf.py
new file mode 100755
index 0000000000..e2b65eebf6
--- /dev/null
+++ b/source/install/build_tf.py
@@ -0,0 +1,795 @@
+#!/usr/bin/env python3
+"""The easy script to build TensorFlow C++ Library.
+
+Required dependencies:
+- gcc/g++
+- Python3
+- NumPy
+- git
+For CUDA only:
+- CUDA Toolkit
+- cuDNN
+"""
+
+# make sure Python 3 is used
+# https://stackoverflow.com/a/41901923/9567349
+import sys
+if sys.version_info[0] < 3:
+    raise Exception("Python 3 or a more recent version is required.")
+
+# The script should only rely on the standard Python libraries.
+
+from contextlib import contextmanager
+import argparse
+import importlib.util
+import os
+import re
+import stat
+import subprocess as sp
+import hashlib
+import logging
+import urllib.request
+import tarfile
+import shutil
+import tempfile
+from pathlib import Path
+from typing import List, Dict, Optional
+from abc import ABCMeta, abstractmethod, abstractproperty
+from functools import lru_cache
+from shutil import copytree, ignore_patterns, copy2
+from fnmatch import filter
+
+
+# default config
+FILE = Path(__file__).parent.absolute()
+PACKAGE_DIR = FILE.parent / "packages"
+PREFIX = None
+CPU_COUNT = os.cpu_count()
+nvcc_path = shutil.which("nvcc")
+if nvcc_path is not None:
+    CUDA_PATH = Path(shutil.which("nvcc")).parent.parent
+else:
+    CUDA_PATH = None
+CUDNN_PATH = Path("/usr") if os.path.isfile("/usr/include/cudnn.h") else None
+GCC = shutil.which("gcc")
+GXX = shutil.which("g++")
+
+
+dlog = logging.getLogger("TensorFlow C++ Library installer")
+dlog.setLevel(logging.INFO)
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.DEBUG)
+formatter = logging.Formatter(
+    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+dlog.addHandler(handler)
+
+
+# Common utils
+
+def download_file(url: str, filename: str):
+    """Download files from remote URL.
+
+    Parameters
+    ----------
+    url: str
+        The URL that is available to download.
+    filename: str
+        The downloading path of the file.
+
+    Raises
+    ------
+    URLError
+        raises for HTTP error
+    """
+    dlog.info("Download %s from %s" % (filename, url))
+    with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:
+        shutil.copyfileobj(response, out_file)
+
+
+class OnlineResource:
+    """Online resource. Call the instance to download.
+
+    Parameters
+    ----------
+    filename: str
+        The target filename.
+    url : str
+        remote URL
+    sha256 : str
+        expecting sha256
+    executable : bool, default=False
+        if the file is executable
+    gzip : str
+        if not None, decompress to a directory
+    """
+
+    def __init__(self,
+                 filename: str,
+                 url: str,
+                 sha256: str = None,
+                 executable: bool = False,
+                 gzip: str = None,
+                 ) -> None:
+        self.filename = filename
+        self.url = url
+        self.reference_sha256 = sha256
+        self.executable = executable
+        self.gzip = gzip
+
+    def __call__(self):
+        # download if not exists
+        if not self.exists:
+            self.download()
+            if not self.exists:
+                raise RuntimeError(
+                    "Download {} from {} failed! "
+                    "You can manually download it to {} and "
+                    "retry the script.".format(
+                        self.filename, self.url, str(self.path)
+                    ))
+        self.post_process()
+
+    def post_process(self):
+        if self.executable:
+            self.path.chmod(self.path.stat().st_mode | stat.S_IEXEC)
+        if self.gzip is not None:
+            with tarfile.open(self.path) as tar:
+                tar.extractall(path=self.gzip_path)
+
+    def download(self):
+        """Download the target file."""
+        download_file(self.url, self.path)
+
+    @property
+    def path(self) -> Path:
+        """Path to the target file."""
+        return PACKAGE_DIR / self.filename
+
+    @property
+    def gzip_path(self) -> Path:
+        if self.gzip is None:
+            raise RuntimeError("gzip is None for %s" % self.path)
+        return PACKAGE_DIR / self.gzip
+
+    @property
+    def sha256(self) -> str:
+        """Get sha256 of the target file.
+
+        Parameters
+        ----------
+        filename : str
+            The filename.
+
+        Returns
+        -------
+        sha256 : str
+            The sha256.
+        """
+        h = hashlib.sha256()
+        # buffer size: 128 kB
+        b = bytearray(128*1024)
+        mv = memoryview(b)
+        with open(self.path, 'rb', buffering=0) as f:
+            for n in iter(lambda: f.readinto(mv), 0):
+                h.update(mv[:n])
+        return h.hexdigest()
+
+    @property
+    def exists(self) -> bool:
+        """Check if target file exists and, when a reference sha256 is given, matches it."""
+        return self.path.exists() and (self.reference_sha256 is None or self.sha256 == self.reference_sha256)
+
+
+class Build(metaclass=ABCMeta):
+    """Build process."""
+    @abstractproperty
+    def resources(self) -> Dict[str, OnlineResource]:
+        """Required resources."""
+
+    @abstractproperty
+    def dependencies(self) -> Dict[str, "Build"]:
+        """Required dependencies."""
+
+    def download_all_resources(self):
+        """All resources, including dependencies' resources."""
+        for res in self.resources.values():
+            res()
+        for dd in self.dependencies.values():
+            if not dd.built:
+                dd.download_all_resources()
+
+    @abstractproperty
+    def built(self) -> bool:
+        """Check if it has built."""
+
+    @abstractmethod
+    def build(self):
+        """Build process."""
+
+    def __call__(self):
+        if not self.built:
+            # firstly download all resources
+            self.download_all_resources()
+            for dd in self.dependencies.values():
+                if not dd.built:
+                    dd()
+                else:
+                    dlog.info("Skip installing %s, which has been already installed" % dd.__class__.__name__)
+            dlog.info("Start installing %s..." % self.__class__.__name__)
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                self._prefix = Path(tmpdirname)
+                self.build()
+                self.copy_from_tmp_to_prefix()
+            if not self.built:
+                raise RuntimeError("Build failed!")
+
+    @property
+    def prefix(self):
+        """Tmp prefix"""
+        return self._prefix
+
+    def copy_from_tmp_to_prefix(self):
+        """Copy from tmp prefix to real prefix."""
+        copytree2(str(self.prefix), str(PREFIX))
+
+
+@contextmanager
+def set_directory(path: Path):
+    """Sets the current working path within the context.
+
+    Parameters
+    ----------
+    path : Path
+        The path to the cwd
+
+    Yields
+    ------
+    None
+
+    Examples
+    --------
+    >>> with set_directory("some_path"):
+    ...    do_something()
+    """
+    cwd = Path().absolute()
+    path.mkdir(exist_ok=True, parents=True)
+    try:
+        os.chdir(path)
+        yield
+    finally:
+        os.chdir(cwd)
+
+
+def list2env(l: list) -> str:
+    return ':'.join(map(str, l))
+
+
+def get_shlib_ext():
+    """Return the shared library extension."""
+    plat = sys.platform
+    if plat.startswith('win'):
+        return '.dll'
+    elif plat in ['osx', 'darwin']:
+        return '.dylib'
+    elif plat.startswith('linux'):
+        return '.so'
+    else:
+        raise NotImplementedError(plat)
+
+
+def copy3(src: Path, dst: Path, *args, **kwargs):
+    """wrapper to shutil.copy2 to support Pathlib."""
+    return copy2(str(src), str(dst), *args, **kwargs)
+
+
+def copytree2(src: Path, dst: Path, *args, **kwargs):
+    """Wrapper around copytree and /bin/cp to support pathlib, ignore patterns, and overwriting."""
+    with tempfile.TemporaryDirectory() as td:
+        # hack to support overwriting: copytree into a temporary tree, then cp it over dst
+        tmpdst = Path(td) / "dst"
+        copytree(str(src), str(tmpdst), *args, **kwargs)
+        call([
+            "/bin/cp",
+            # archive, recursive, force, do not create a nested directory inside dst
+            # https://stackoverflow.com/a/24486142/9567349
+            "-arfT",
+            str(tmpdst),
+            str(dst),
+        ])
+
+
+def include_patterns(*include_patterns):
+    """Factory for a copytree() ``ignore`` callable that keeps only files matching the patterns.
+
+    Files under a path with a component starting with ``_`` are all ignored; directories never are.
+    """
+    def _ignore_patterns(path, names):
+        keep = set(name for pattern in include_patterns
+                   for name in filter(names, pattern))
+        removed_dir = any([x.startswith("_") for x in path.split(os.path.sep)])
+        ignore = set(name for name in names
+                     if (name not in keep or removed_dir) and not os.path.isdir(os.path.join(path, name)))
+        return ignore
+    return _ignore_patterns
+
+
+def call(commands: List[str], env={}, **kwargs):
+    """Call commands and print to screen for debug.
+
+    Raises
+    ------
+    RuntimeError
+        returned code is not zero
+    """
+    with sp.Popen(commands, stdout=sys.stdout, stderr=sys.stderr, env=env, **kwargs) as p:
+        p.communicate()
+        exit_code = p.wait()
+
+        if exit_code:
+            raise RuntimeError("Run %s failed, return code: %d" %
+                               (" ".join(commands), exit_code))
+
+
+# the detailed steps to build the TensorFlow C++ library
+
+# online resources to download
+RESOURCES = {
+    # bazelisk is used as a wrapper of bazel
+    "bazelisk-1.11.0": OnlineResource(
+        "bazel-linux-amd64-1.11.0",
+        "https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-linux-amd64",
+        "231ec5ca8115e94c75a1f4fbada1a062b48822ca04f21f26e4cb1cd8973cd458",
+        executable=True,
+    ),
+    # tensorflow
+    "tensorflow-2.9.1": OnlineResource(
+        "tensorflow-2.9.1.tar.gz",
+        "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.9.1.tar.gz",
+        "6eaf86ead73e23988fe192da1db68f4d3828bcdd0f3a9dc195935e339c95dbdc",
+        gzip="tensorflow",
+    ),
+}
+
+
+class BuildBazelisk(Build):
+    def __init__(self, version="1.11.0") -> None:
+        self.version = version
+
+    @property
+    @lru_cache()
+    def resources(self) -> Dict[str, OnlineResource]:
+        return {
+            "bazelisk": RESOURCES["bazelisk-" + self.version],
+        }
+
+    @property
+    @lru_cache()
+    def dependencies(self) -> Dict[str, Build]:
+        return {}
+
+    def build(self):
+        bazel_res = self.resources['bazelisk']
+        bin_dst = self.prefix / "bin"
+        bin_dst.mkdir(exist_ok=True)
+        copy3(bazel_res.path, bin_dst / "bazelisk")
+
+    @property
+    def built(self):
+        return (PREFIX / "bin" / "bazelisk").exists()
+
+
+class BuildNumpy(Build):
+    """Build NumPy"""
+    @property
+    @lru_cache()
+    def resources(self) -> Dict[str, OnlineResource]:
+        return {}
+
+    @property
+    @lru_cache()
+    def dependencies(self) -> Dict[str, Build]:
+        return {}
+
+    @property
+    def built(self) -> bool:
+        return importlib.util.find_spec("numpy") is not None
+
+    def build(self):
+        try:
+            call([
+                sys.executable,
+                "-m",
+                "pip",
+                "install",
+                "numpy",
+            ])
+        except RuntimeError as e:
+            raise RuntimeError("Please manually install numpy!") from e
+
+
+class BuildCUDA(Build):
+    """Find CUDA."""
+    @property
+    @lru_cache()
+    def resources(self) -> Dict[str, OnlineResource]:
+        return {}
+
+    @property
+    @lru_cache()
+    def dependencies(self) -> Dict[str, Build]:
+        return {}
+
+    def build(self):
+        """CUDA cannot be installed by this script; always raise with installation hints."""
+        raise RuntimeError(
+            "NVCC is not found. Please manually install CUDA Toolkit and cuDNN!\n"
+            "CUDA Toolkit: https://developer.nvidia.com/cuda-toolkit-archive\n"
+            "cuDNN: https://developer.nvidia.com/rdp/cudnn-archive")
+
+    @property
+    def built(self):
+        return CUDA_PATH is not None and CUDNN_PATH is not None
+
+    @property
+    def cuda_version(self):
+        nvcc_bin = CUDA_PATH / 'bin' / 'nvcc'
+        output = sp.check_output([str(nvcc_bin), '--version'], env={}, encoding='utf8').split('\n')
+        pattern = re.compile('V[0-9]*\\.[0-9]*\\.[0-9]*')
+        for x in output:
+            search = pattern.search(x)
+            if search is not None:
+                # strip "V"
+                version = search.group()[1:]
+                # only return major and minor
+                return ".".join(version.split(".")[:2])
+        raise RuntimeError("Not found version in nvcc --version")
+
+    @property
+    def cudnn_version(self):
+        """Major cuDNN version (str) read from cudnn.h or, failing that, cudnn_version.h."""
+        for header in ('cudnn.h', 'cudnn_version.h'):
+            cudnn_header = CUDNN_PATH / 'include' / header
+            if not cudnn_header.is_file():
+                # a missing header is skipped instead of raising FileNotFoundError
+                continue
+            with open(cudnn_header) as f:
+                for line in f:
+                    if line.startswith("#define CUDNN_MAJOR "):
+                        return line.split()[-1]
+        raise RuntimeError(
+            "cuDNN version is not found!\n"
+            "Download from: https://developer.nvidia.com/rdp/cudnn-archive"
+            )
+
+    @property
+    @lru_cache()
+    def cuda_compute_capabilities(self):
+        """Get cuda compute capabilities matching the detected (major, minor) CUDA version."""
+        cuda_version = tuple(map(int, self.cuda_version.split(".")))  # 2-tuple, so compare with 2-tuples
+        if (10, 0) <= cuda_version < (11, 0):
+            return "sm_35,sm_50,sm_60,sm_62,sm_70,sm_72,sm_75,compute_75"
+        elif (11, 0) <= cuda_version < (11, 1):
+            return "sm_35,sm_50,sm_60,sm_62,sm_70,sm_72,sm_75,sm_80,compute_80"
+        elif (11, 1) <= cuda_version:
+            return "sm_35,sm_50,sm_60,sm_62,sm_70,sm_72,sm_75,sm_80,sm_86,compute_86"
+        else:
+            raise RuntimeError("Unsupported CUDA version")
+
+
+class BuildTensorFlow(Build):
+    """Build TensorFlow C++ interface.
+
+    Parameters
+    ----------
+    version : str, default=2.9.1
+        TensorFlow version
+    enable_mkl : bool, default=True
+        enable OneDNN
+    enable_cuda : bool, default=False
+        Enable CUDA build
+    """
+
+    def __init__(self, version: str ="2.9.1", enable_mkl: bool=True, enable_cuda: bool=False) -> None:
+        self.version = version
+        self.enable_mkl = enable_mkl
+        self.enable_cuda = enable_cuda
+
+    @property
+    @lru_cache()
+    def resources(self) -> Dict[str, OnlineResource]:
+        return {
+            "tensorflow": RESOURCES["tensorflow-" + self.version],
+        }
+
+    @property
+    @lru_cache()
+    def dependencies(self) -> Dict[str, Build]:
+        optional_dep = {}
+        if self.enable_cuda:
+            optional_dep['cuda'] = BuildCUDA()
+        return {
+            "bazelisk": BuildBazelisk(),
+            "numpy": BuildNumpy(),
+            **optional_dep,
+        }
+
+    def build(self):
+        """Configure and build TensorFlow with bazelisk, then stage headers and libraries in the tmp prefix."""
+        tf_res = self.resources['tensorflow']
+        src = tf_res.gzip_path / ("tensorflow-%s" % self.version)
+        with set_directory(src):
+            # configure -- need bazelisk in PATH
+            call([str(src / "configure")], env={
+                "PATH": list2env([PREFIX / "bin", "/usr/bin"]),
+                **self._environments,
+            })
+            # bazel build
+            call([
+                str(PREFIX / "bin" / "bazelisk"),
+                *self._bazel_opts,
+                "build",
+                *self._build_opts,
+                *self._build_targets,
+            ], env={
+                "PATH": list2env(["/usr/bin"]),
+                "HOME": os.environ.get("HOME", os.path.expanduser("~")),
+                "TEST_TMPDIR": str(PACKAGE_DIR / "bazelcache"),
+                # for libstdc++; default to "" because Popen rejects None as an env value
+                "LD_LIBRARY_PATH": os.environ.get("LD_LIBRARY_PATH", ""),
+                "CC": str(Path(GCC).resolve()),
+                "CXX": str(Path(GXX).resolve()),
+            })
+
+        # copy libraries and directories
+        ext = get_shlib_ext()
+        lib_dst = self.prefix / "lib"
+        include_dst = self.prefix / "include"
+        lib_dst.mkdir(exist_ok=True)
+        include_dst.mkdir(exist_ok=True)
+
+        # 1. copy headers
+        (include_dst / "tensorflow").mkdir(exist_ok=True)
+        copytree2(src / "tensorflow" / "cc", include_dst /
+                  "tensorflow" / "cc", ignore=include_patterns('*.h', '*.inc'))
+        copytree2(src / "tensorflow" / "core", include_dst /
+                  "tensorflow" / "core", ignore=include_patterns('*.h', '*.inc'))
+        # bazel-bin includes generated headers like version, pb.h, ..
+        copytree2(src / "bazel-bin", include_dst, ignore=include_patterns('*.h', '*.inc'))
+
+        copytree2(src / "third_party", include_dst /
+                  "third_party", ignore=ignore_patterns('*.cc'))
+        bazel_tensorflow = src / ("bazel-" + src.name)
+        copytree2(bazel_tensorflow / "external" /
+                  "eigen_archive" / "Eigen", include_dst / "Eigen")
+        copytree2(bazel_tensorflow / "external" / "eigen_archive" /
+                  "unsupported", include_dst / "unsupported")
+        copytree2(bazel_tensorflow / "external" / "com_google_protobuf" /
+                  "src" / "google", include_dst / "google")
+        copytree2(bazel_tensorflow / "external" /
+                  "com_google_absl" / "absl", include_dst / "absl")
+
+        # 2. copy libraries
+        if self.enable_mkl:
+            copy3(src / "bazel-out" / "k8-opt" / "bin" / "external" /
+                "llvm_openmp" / ("libiomp5" + ext), lib_dst)
+        lib_src = src / "bazel-bin" / "tensorflow"
+        self.copy_lib("libtensorflow_framework" + ext, lib_src, lib_dst)
+        self.copy_lib("libtensorflow_cc" + ext, lib_src, lib_dst)
+
+    def copy_lib(self, libname, src, dst):
+        """Copy library and make symlink."""
+        copy3(src / (libname + "." + self.version), dst)
+        libname_v = libname + "." + self.version
+        (dst / (libname + "." + self.version.split(".")
+         [0])).symlink_to(libname_v)
+        (dst / libname).symlink_to(libname_v)
+
+    @property
+    def _environments(self) -> dict:
+        if self.enable_cuda:
+            cuda_env = {
+                "TF_NEED_CUDA": "1",
+                # /usr is path to driver
+                "TF_CUDA_PATHS": ",".join((str(CUDA_PATH), str(CUDNN_PATH), "/usr")),
+                "TF_CUDA_VERSION": str(self.dependencies['cuda'].cuda_version),
+                "TF_CUDNN_VERSION": str(self.dependencies['cuda'].cudnn_version),
+                "TF_NCCL_VERSION": "",
+                "TF_CUDA_COMPUTE_CAPABILITIES": self.dependencies['cuda'].cuda_compute_capabilities,
+                "GCC_HOST_COMPILER_PATH": str(Path(GCC).resolve()),
+                "GCC_HOST_COMPILER_PREFIX": str(Path(GCC).resolve().parent.parent),
+            }
+        else:
+            cuda_env = {
+                "TF_NEED_CUDA": "0",
+            }
+        return {
+            "TF_ENABLE_XLA": "1",
+            "CC_OPT_FLAGS": "-Wno-sign-compare",
+            # Python settings
+            "PYTHON_BIN_PATH": sys.executable,
+            "USE_DEFAULT_PYTHON_LIB_PATH": "1",
+            # Additional settings
+            "TF_NEED_OPENCL": "0",
+            "TF_NEED_OPENCL_SYCL": "0",
+            "TF_NEED_COMPUTECPP": "0",
+            "TF_CUDA_CLANG": "0",
+            "TF_NEED_TENSORRT": "0",
+            "TF_NEED_ROCM": "0",
+            "TF_NEED_MPI": "0",
+            "TF_DOWNLOAD_CLANG": "0",
+            "TF_SET_ANDROID_WORKSPACE": "0",
+            "TF_CONFIGURE_IOS": "0",
+            ** cuda_env,
+        }
+
+    @property
+    def _build_targets(self) -> List[str]:
+        # C++ interface
+        return ["//tensorflow:libtensorflow_cc" + get_shlib_ext()]
+
+    @property
+    def _build_opts(self) -> List[str]:
+        opts = [
+            "--logging=6",
+            "--verbose_failures",
+            "--config=opt",
+            "--config=noaws",
+            "--copt=-mtune=generic",
+            "--local_cpu_resources=%d" % CPU_COUNT,
+        ]
+        if self.enable_mkl:
+            # enable oneDNN
+            opts.append("--config=mkl")
+        return opts
+
+    @property
+    def _bazel_opts(self) -> List[str]:
+        return []
+
+    @property
+    def built(self):
+        return (PREFIX / "lib" / ("libtensorflow_cc%s.%s" % (get_shlib_ext(), self.version))).exists()
+
+
+def clean_package():
+    """Clean the unused files."""
+    clean_files = [
+        PACKAGE_DIR,
+        # bazelisk
+        PREFIX / "bin" / "bazelisk",
+        # numpy
+        PREFIX / "numpy",
+        # bazel cache
+        Path.home() / ".cache" / "bazel",
+    ]
+    for f in clean_files:
+        shutil.rmtree(str(f), ignore_errors=True)
+
+
+# interface
+
+def env() -> Dict[str, str]:
+    return {
+        "Python": sys.executable,
+        "CUDA": CUDA_PATH,
+        "cuDNN": CUDNN_PATH,
+        "gcc": GCC,
+        "g++": GXX,
+        "Install prefix": PREFIX,
+        "Packages": PACKAGE_DIR,
+    }
+
+
+def pretty_print_env() -> str:
+    return ("Build configs:\n" +
+            "\n".join(["%s:%s%s" % (kk, " "*(19-len(kk)), vv) for kk, vv in env().items() if vv is not None]))
+
+
+class RawTextArgumentDefaultsHelpFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(args: Optional[List[str]] = None):
+    """TensorFlow C++ Library Installer commandline options argument parser.
+
+    Parameters
+    ----------
+    args: List[str]
+        list of command line arguments, main purpose is testing default option None
+        takes arguments from sys.argv
+    """
+    parser = argparse.ArgumentParser(
+        description="Installer of Tensorflow C++ Library.\n\n" + pretty_print_env(),
+        formatter_class=RawTextArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--prefix",
+        type=str,
+        required=True,
+        help="Prefix of installed paths.",
+    )
+    parser.add_argument(
+        "--packages",
+        type=str,
+        default=str(PACKAGE_DIR),
+        help="Path to download packages.",
+    )
+    parser.add_argument(
+        "--cuda",
+        action='store_true',
+        help="Enable CUDA for TensorFlow and DeePMD-kit",
+    )
+    parser.add_argument(
+        "--cuda-path",
+        type=str,
+        default=CUDA_PATH,
+        help="path to CUDA Toolkit",
+    )
+    parser.add_argument(
+        "--cudnn-path",
+        type=str,
+        default=CUDNN_PATH,
+        help="path to cuDNN",
+    )
+    parser.add_argument(
+        "--gcc",
+        type=str,
+        default=GCC,
+        help="path to gcc",
+    )
+    parser.add_argument(
+        "--gxx",
+        type=str,
+        default=GXX,
+        help="path to gxx",
+    )
+    parser.add_argument(
+        "--cpus",
+        type=int,
+        default=CPU_COUNT,
+        help="Number of CPU cores used to build.",
+    )
+    parser.add_argument(
+        '--clean',
+        action='store_true',
+        help='Clean files after build.',
+    )
+    parsed_args = parser.parse_args(args=args)
+
+    return parsed_args
+
+
+def str_to_path_if_not_none(x: str) -> Path:
+    if x is not None:
+        return Path(x).absolute()
+    return x
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    # override default settings
+    PREFIX = str_to_path_if_not_none(args.prefix)
+    PACKAGE_DIR = str_to_path_if_not_none(args.packages)
+    CPU_COUNT = args.cpus
+    CUDA_PATH = str_to_path_if_not_none(args.cuda_path)
+    CUDNN_PATH = str_to_path_if_not_none(args.cudnn_path)
+    GCC = args.gcc
+    GXX = args.gxx
+    assert GCC is not None
+    assert GXX is not None
+
+    dlog.info(pretty_print_env())
+
+    # create directories
+    PACKAGE_DIR.mkdir(exist_ok=True)
+    PREFIX.mkdir(exist_ok=True)
+
+    # start to build
+    BuildTensorFlow(enable_cuda=args.cuda)()
+    dlog.info("Build TensorFlow C++ Library successfully!")
+
+    # clean
+    if args.clean:
+        clean_package()
+
diff --git a/source/install/codecov.sh b/source/install/codecov.sh
new file mode 100755
index 0000000000..9a4b7ee29a
--- /dev/null
+++ b/source/install/codecov.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+#------------------
+
+SCRIPT_PATH=$(dirname $(realpath -s $0))
+
+#------------------
+# upload to codecov
+cd ${SCRIPT_PATH}
+bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports"
+
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index 6b07b802ba..58fa86eb10 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -34,8 +34,4 @@ make install
 cd ${SCRIPT_PATH}/../api_cc/tests
 ${INSTALL_PREFIX}/bin/runUnitTests
 
-#------------------
-# upload to codecov
-cd ${SCRIPT_PATH}
-bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports"
 
diff --git a/source/lib/include/device.h b/source/lib/include/device.h
index 7fe6cc127c..9493533366 100644
--- a/source/lib/include/device.h
+++ b/source/lib/include/device.h
@@ -6,6 +6,7 @@
 
 #define TPB 256
 #define SQRT_2_PI 0.7978845608028654 
+typedef long long int_64;
 typedef unsigned long long uint_64;
 
 #if GOOGLE_CUDA
diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h
index cf82720434..969cde7ca7 100644
--- a/source/lib/include/gelu.h
+++ b/source/lib/include/gelu.h
@@ -1,4 +1,5 @@
 #pragma once
+#include "device.h"
 
 namespace deepmd{
 
@@ -6,14 +7,14 @@ template<typename FPTYPE>
 void gelu_cpu(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_cpu(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_grad_cpu(
@@ -21,21 +22,21 @@ void gelu_grad_grad_cpu(
     const FPTYPE * xx,
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    const int size);
+    const int_64 size);
 
 #if GOOGLE_CUDA
 template<typename FPTYPE>
 void gelu_gpu_cuda(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_gpu_cuda(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_grad_gpu_cuda(
@@ -43,7 +44,7 @@ void gelu_grad_grad_gpu_cuda(
     const FPTYPE * xx,
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    const int size);
+    const int_64 size);
 #endif // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
@@ -51,14 +52,14 @@ template<typename FPTYPE>
 void gelu_gpu_rocm(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_gpu_rocm(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
-    const int size);
+    const int_64 size);
 
 template<typename FPTYPE>
 void gelu_grad_grad_gpu_rocm(
@@ -66,7 +67,7 @@ void gelu_grad_grad_gpu_rocm(
     const FPTYPE * xx,
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    const int size);
+    const int_64 size);
 
 #endif//TENSORFLOW_USE_ROCM
 }
diff --git a/source/lib/include/utilities.h b/source/lib/include/utilities.h
index e95ca3e684..06e6498ed6 100644
--- a/source/lib/include/utilities.h
+++ b/source/lib/include/utilities.h
@@ -73,7 +73,7 @@ template <>
 inline float
 invsqrt<float> (const float x) 
 {
-  return 1./sqrtf (x);
+  return 1.f/sqrtf (x);
 }
 
 }
diff --git a/source/lib/src/coord.cc b/source/lib/src/coord.cc
index ecaec9b34c..7e40286023 100644
--- a/source/lib/src/coord.cc
+++ b/source/lib/src/coord.cc
@@ -18,8 +18,8 @@ normalize_coord_cpu(
     FPTYPE ri[3];
     convert_to_inter_cpu(ri, region, coord+3*ii);
     for(int dd = 0; dd < 3; ++dd){
-      ri[dd] = fmod(ri[dd], 1.);
-      if (ri[dd] < 0.) ri[dd] += 1.;
+      ri[dd] = fmod(ri[dd], (FPTYPE)1.);
+      if (ri[dd] < (FPTYPE)0.) ri[dd] += (FPTYPE)1.;
     }
     convert_to_phys_cpu(coord+3*ii, region, ri);
   }
diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu
index 76d1d08ca9..660619cbad 100644
--- a/source/lib/src/cuda/coord.cu
+++ b/source/lib/src/cuda/coord.cu
@@ -51,6 +51,10 @@ __device__ inline int compute_pbc_shift(
     return shift;
 }
 
+// Overloaded wrappers: the argument type selects fmod (double) or fmodf (float).
+__device__ inline double _fmod(double x, double y) {return fmod(x, y);}
+__device__ inline float _fmod(float x, float y) {return fmodf(x, y);}
+
 template<typename FPTYPE>
 __global__ void normalize_one(
     FPTYPE *out_c,
@@ -64,8 +68,8 @@ __global__ void normalize_one(
     FPTYPE inter[3];
     phys2Inter(inter,out_c+idy*3,rec_boxt);
     for (int dd = 0; dd < 3; ++dd) {
-        inter[dd]=(FPTYPE)fmod((double)inter[dd], 1.);
-        if (inter[dd] <  0.) inter[dd] += 1.;
+        inter[dd]=_fmod(inter[dd], (FPTYPE)1.);
+        if (inter[dd] <  (FPTYPE)0.) inter[dd] += (FPTYPE)1.;
     }
     inter2Phys(out_c+idy*3,inter,boxt);
 }
@@ -93,7 +97,7 @@ __global__ void _fill_idx_cellmap(
         ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
         global_grid[dd] = nat_end[dd] - nat_stt[dd];
         idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-        cell_size[dd] = 1./global_grid[dd];
+        cell_size[dd] = (FPTYPE)1./global_grid[dd];
         nat_orig[dd] = nat_stt[dd] * cell_size[dd];
     }
     if (idy<nloc)
@@ -104,7 +108,7 @@ __global__ void _fill_idx_cellmap(
         phys2Inter(inter,in_c+idy*3,rec_boxt);
         for (int dd = 0; dd < 3; ++dd){
             idx_noshift[dd] = (inter[dd] - nat_orig[dd]) / cell_size[dd];
-            if (inter[dd] - nat_orig[dd] < 0.) idx_noshift[dd] --;
+            if (inter[dd] - nat_orig[dd] < (FPTYPE)0.) idx_noshift[dd] --;
             if (idx_noshift[dd] < nat_stt[dd]) 
             {
                 idx_noshift[dd] = nat_stt[dd];
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu
index ca96751895..51e580a445 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/cuda/gelu.cu
@@ -1,17 +1,20 @@
 #include "gelu.h"
 #include "device.h"
 
+__device__ inline double _tanh(double x) {return tanh(x);}  // double overload: forwards to tanh
+__device__ inline float _tanh(float x) {return tanhf(x);}  // float overload: forwards to tanhf
+
 template <typename FPTYPE>
 __global__ void gelu(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    int const size) 
+    const int_64 size) // threads whose idx is >= size return without writing
 {
-  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;  // cast first so the product is computed in 64 bits
   if (idx >= size) {
     return;
   }
-  out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
+  out[idx] = xx[idx] * (FPTYPE)0.5 * ((FPTYPE)1.0 + _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])));
 }
 
 template <typename FPTYPE>
@@ -19,15 +22,15 @@ __global__ void gelu_grad(
     FPTYPE * out, 
     const FPTYPE * xx, 
     const FPTYPE * dy, 
-    int const size) 
+    const int_64 size) // threads whose idx is >= size return without writing
 {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;  // cast first so the product is computed in 64 bits
   if (idx >= size) {
     return;
   }
   // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]));
-  out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5);
+  const FPTYPE var = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]));
+  out[idx] = dy[idx] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5);
 }
 
 template <typename FPTYPE>
@@ -36,16 +39,16 @@ __global__ void gelu_grad_grad(
     const FPTYPE * xx, 
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    int const size) 
+    const int_64 size) 
 {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= size) {
     return;
   }
   // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]));
-  const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1);
-  out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2);
+  const FPTYPE var1 = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]));
+  const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.);
+  out[idx] = dy[idx] * dy_2[idx] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[idx] * var2 * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 + var2);
 }
 
 namespace deepmd {
@@ -53,7 +56,7 @@ template<typename FPTYPE>
 void gelu_gpu_cuda(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    const int size)
+    const int_64 size)
 {
   if(size <= 0){
     return;
@@ -71,7 +74,7 @@ void gelu_grad_gpu_cuda(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
-    const int size)
+    const int_64 size)
 {
   if(size <= 0){
     return;
@@ -90,7 +93,7 @@ void gelu_grad_grad_gpu_cuda(
     const FPTYPE * xx,
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    const int size)
+    const int_64 size)
 {
   if(size <= 0){
     return;
@@ -103,10 +106,10 @@ void gelu_grad_grad_gpu_cuda(
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void gelu_gpu_cuda<float>(float * out, const float * x, const int size);
-template void gelu_gpu_cuda<double>(double * out, const double * x, const int size);
-template void gelu_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const int size);
-template void gelu_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const int size);
-template void gelu_grad_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
-template void gelu_grad_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
+template void gelu_gpu_cuda<float>(float * out, const float * x, const int_64 size);
+template void gelu_gpu_cuda<double>(double * out, const double * x, const int_64 size);
+template void gelu_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const int_64 size);
+template void gelu_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const int_64 size);
+template void gelu_grad_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const float * dy_2, const int_64 size);
+template void gelu_grad_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const double * dy_2, const int_64 size);
 }
\ No newline at end of file
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index b2cd4dcaf7..93a2b6a787 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -5,6 +5,11 @@
 #include <cub/block/block_store.cuh>
 #include <cub/block/block_radix_sort.cuh>
 
+__device__ inline double _sqrt(double x) {return sqrt(x);}  // double overload: forwards to sqrt
+__device__ inline float _sqrt(float x) {return sqrtf(x);}  // float overload: forwards to sqrtf
+__device__ inline double _rsqrt(double x) {return rsqrt(x);}  // double overload: forwards to rsqrt
+__device__ inline float _rsqrt(float x) {return rsqrtf(x);}  // float overload: forwards to rsqrtf
+
 // common part of prod_env_mat
 template <
     typename    Key,
@@ -29,7 +34,7 @@ __global__ void BlockSortKernel(
   // Per-thread tile items
   Key items[ITEMS_PER_THREAD];
   // Our current block's offset
-  int block_offset = blockIdx.x * TILE_SIZE;
+  int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE;
   // Load items into a blocked arrangement
   BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
   // Barrier for smem reuse
@@ -57,18 +62,18 @@ __device__ inline void spline5_switch(
     const float & rmax) 
 {
   if (xx < rmin) {
-    dd = 0;
-    vv = 1;
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)1.;
   }
   else if (xx < rmax) {
     FPTYPE uu = (xx - rmin) / (rmax - rmin) ;
-    FPTYPE du = 1. / (rmax - rmin) ;
-    vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1;
-    dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du;
+    FPTYPE du = (FPTYPE)1. / (rmax - rmin) ;
+    vv = uu*uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + (FPTYPE)1.;
+    dd = ( (FPTYPE)3. * uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + uu*uu*uu * ((FPTYPE)-12. * uu + (FPTYPE)15.) ) * du;
   }
   else {
-    dd = 0;
-    vv = 0;
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)0.;
   }
 }
 
@@ -82,7 +87,7 @@ __device__ inline uint_64 encoding_nbor_info(
   // the type of nbor atom must be smaller than 128
   // the distance of center atom between nbor atom must be smaller than 128
   // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24)
-  if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) {
+  if(type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
     asm("trap;");
   }
   return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index;
@@ -122,7 +127,7 @@ __global__ void format_nlist_fill_a(
     const int MAX_NBOR_SIZE)
 {   
   // <<<nloc, MAX_NBOR_SIZE>>>
-  const unsigned int idx = blockIdx.x;
+  const int_64 idx = blockIdx.x;
   const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
   
   const int nsize = numneigh[i_idx[idx]];
@@ -138,7 +143,7 @@ __global__ void format_nlist_fill_a(
   for (int dd = 0; dd < 3; dd++) {
     diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
   }
-  FPTYPE rr = sqrt(dev_dot(diff, diff)); 
+  FPTYPE rr = _sqrt(dev_dot(diff, diff)); 
   if (rr <= rcut) {
     key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
   }
@@ -152,7 +157,7 @@ __global__ void fill_nei_iter(
     const int max_nbor_size,
     const int sec_size)
 {
-  int row = blockIdx.x;
+  int_64 row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   const FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size;
   int nei_type_cur = -1, nbor_idx_cur = 0;
@@ -178,7 +183,7 @@ __global__ void format_nlist_fill_b(
     int * nei_iter_dev,
     const int max_nbor_size)
 { 
-  int row = blockIdx.x;
+  int_64 row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   int * nei_iter = nei_iter_dev + row * sec_size;
   FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size;
@@ -214,6 +219,66 @@ __global__ void encoding_decoding_nbor_info(
   decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
 }
 
+// Launches format_nlist_fill_a to fill key, then BlockSortKernel with nloc blocks (MAX_NBOR_SIZE = 256).
+template<typename FPTYPE>
+void format_nbor_list_256 (
+    uint_64 * key,
+    const FPTYPE* coord,
+    const int* type,
+    const deepmd::InputNlist & gpu_inlist,
+    const int& nloc,
+    const float& rcut,
+    int * i_idx)
+{
+  const int LEN = 256;
+  const int MAX_NBOR_SIZE = 256;
+  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(1, LEN);
+  format_nlist_fill_a<<<block_grid, thread_grid>>> (
+      key,
+      coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  const int ITEMS_PER_THREAD = 4;
+  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
+  // widen nloc before multiplying: nloc * MAX_NBOR_SIZE can exceed the range of int
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
+      key, key + int_64(nloc) * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
+// Launches format_nlist_fill_a to fill key, then BlockSortKernel with nloc blocks (MAX_NBOR_SIZE = 512).
+template<typename FPTYPE>
+void format_nbor_list_512 (
+    uint_64 * key,
+    const FPTYPE* coord,
+    const int* type,
+    const deepmd::InputNlist & gpu_inlist,
+    const int& nloc,
+    const float& rcut,
+    int * i_idx)
+{
+  const int LEN = 256;
+  const int MAX_NBOR_SIZE = 512;
+  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(1, LEN);
+  format_nlist_fill_a<<<block_grid, thread_grid>>> (
+      key,
+      coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  const int ITEMS_PER_THREAD = 4;
+  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
+  // widen nloc before multiplying: nloc * MAX_NBOR_SIZE can exceed the range of int
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
+      key, key + int_64(nloc) * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+
 template<typename FPTYPE>
 void format_nbor_list_1024 (
     uint_64 * key,
@@ -321,7 +386,7 @@ __global__ void compute_env_mat_a(
     const float rmax)
 {   
   // <<<nloc, TPB>>>
-  const unsigned int bid = blockIdx.x;
+  const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
   if (tid >= nnei) {
     return;
@@ -345,14 +410,14 @@ __global__ void compute_env_mat_a(
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
       FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = 1./sqrt(nr2);
+      FPTYPE inr = _rsqrt(nr2);
       FPTYPE nr = nr2 * inr;
       FPTYPE inr2 = inr * inr;
       FPTYPE inr4 = inr2 * inr2;
       FPTYPE inr3 = inr4 * nr;
       FPTYPE sw, dsw;
       spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd[0] = (1./nr)       ;//* sw;
+      dd[0] = ((FPTYPE)1./nr)       ;//* sw;
       dd[1] = (rr[0] / nr2) ;//* sw;
       dd[2] = (rr[1] / nr2) ;//* sw;
       dd[3] = (rr[2] / nr2) ;//* sw;
@@ -360,17 +425,17 @@ __global__ void compute_env_mat_a(
       vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
       vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
       // ****deriv of component x/r2
-      vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
-      vv[4] = ((2. * rr[0] * rr[1] * inr4	) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
-      vv[5] = ((2. * rr[0] * rr[2] * inr4	) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
+      vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
+      vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4	) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
+      vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4	) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
       // ***deriv of component y/r2
-      vv[6] = ((2. * rr[1] * rr[0] * inr4	) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
-      vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
-      vv[8] = ((2. * rr[1] * rr[2] * inr4	) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
+      vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4	) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
+      vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
+      vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4	) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
       // ***deriv of component z/r2 
-      vv[9] = ((2. * rr[2] * rr[0] * inr4	) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
-      vv[10]= ((2. * rr[2] * rr[1] * inr4	) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
-      vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
+      vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4	) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
+      vv[10]= (((FPTYPE)2. * rr[2] * rr[1] * inr4	) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
+      vv[11]= (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
       // 4 value components
       dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0];
       dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1];
@@ -407,7 +472,7 @@ __global__ void compute_env_mat_r(
     const float rmax)
 {
   // <<<nloc, TPB>>>
-  const unsigned int bid = blockIdx.x;
+  const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
   if (tid >= nnei) {
     return;
@@ -431,14 +496,14 @@ __global__ void compute_env_mat_r(
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
       FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = 1./sqrt(nr2);
+      FPTYPE inr = _rsqrt(nr2);
       FPTYPE nr = nr2 * inr;
       FPTYPE inr2 = inr * inr;
       FPTYPE inr4 = inr2 * inr2;
       FPTYPE inr3 = inr4 * nr;
       FPTYPE sw, dsw;
       spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd = (1./nr)       ;//* sw;
+      dd = ((FPTYPE)1./nr)       ;//* sw;
       vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
       vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
       vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
@@ -479,9 +544,9 @@ void format_nbor_list_gpu_cuda(
   int * nei_iter = array_int + sec.size(); // = new int[sec_size];
   int * i_idx = array_int + sec.size() + nloc * sec.size();
   uint_64 * key = array_longlong;
-  assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
-  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei));
-  DPErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
+  assert(max_nbor_size == 256 || max_nbor_size == 512 || max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
+  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
+  DPErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * int_64(nloc) * max_nbor_size));
   DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice));   
 
   get_i_idx<<<nblock, LEN>>>(
@@ -490,7 +555,17 @@ void format_nbor_list_gpu_cuda(
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 
-  if (max_nbor_size == 1024) {
+  if (max_nbor_size == 256) {
+    format_nbor_list_256 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  }
+  else if (max_nbor_size == 512) {
+    format_nbor_list_512 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  } 
+  else if (max_nbor_size == 1024) {
     format_nbor_list_1024 (
         key,
         coord, type, gpu_inlist, nloc, rcut, i_idx); 
@@ -539,9 +614,9 @@ void prod_env_mat_a_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 
@@ -578,9 +653,9 @@ void prod_env_mat_r_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
index 4f634b3fb6..ace49b3d98 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/cuda/prod_force.cu
@@ -11,7 +11,7 @@ __global__ void force_deriv_wrt_center_atom(
     const int ndescrpt)
 {
   __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
-  unsigned int bid = blockIdx.x;
+  int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
   for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
     data[ii] = 0.f;
@@ -49,7 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(
     const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 4;
@@ -78,7 +78,7 @@ __global__ void force_deriv_wrt_neighbors_r(
 		const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu
index 33955faade..f09082c316 100644
--- a/source/lib/src/cuda/prod_force_grad.cu
+++ b/source/lib/src/cuda/prod_force_grad.cu
@@ -17,7 +17,7 @@ __global__ void force_grad_wrt_center_atom(
     const int ndescrpt)
 {
     __shared__ FPTYPE grad_one[3];
-    unsigned int center_idx = blockIdx.x;
+    int_64 center_idx = blockIdx.x;
     unsigned int tid = threadIdx.x;
     if(tid < 3){
         grad_one[tid] = grad[center_idx * 3 + tid];
@@ -39,7 +39,7 @@ __global__ void force_grad_wrt_neighbors_a(
     const int nnei)
 {
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int idy = blockIdx.y;
     const unsigned int idw = threadIdx.y;
     if (idx >= nloc) {
@@ -63,7 +63,7 @@ __global__ void force_grad_wrt_neighbors_r(
     const int nnei)
 {
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int idy = blockIdx.y;
     if (idx >= nloc) {
         return;
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
index e806af4e57..16566027ae 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/cuda/prod_virial.cu
@@ -12,7 +12,7 @@ __global__ void atom_virial_reduction(
     unsigned int bid = blockIdx.x;
     unsigned int tid = threadIdx.x;
     __shared__ FPTYPE data[THREADS_PER_BLOCK];
-    data[tid] = 0.f;
+    data[tid] = (FPTYPE)0.;
     for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
         data[tid] += atom_virial[ii * 9 + bid];
     }
@@ -44,7 +44,7 @@ __global__ void virial_deriv_wrt_neighbors_a(
   // idz = dd0 * 3 + dd1
   // dd0 = idz / 3
   // dd1 = idz % 3
-  const unsigned int idx = blockIdx.x;
+  const int_64 idx = blockIdx.x;
   const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
   const unsigned int idz = threadIdx.y;
   const int ndescrpt = nnei * 4;
@@ -58,7 +58,7 @@ __global__ void virial_deriv_wrt_neighbors_a(
   // atomicAdd(
   //    virial + idz, 
   //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]);
-  FPTYPE virial_tmp = 0.f;
+  FPTYPE virial_tmp = (FPTYPE)0.;
   for (int idw = 0; idw < 4; ++idw) {
       virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
   }
@@ -81,7 +81,7 @@ __global__ void virial_deriv_wrt_neighbors_r(
     // idz = dd0 * 3 + dd1
     // dd0 = idz / 3
     // dd1 = idz % 3
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu
index 0209ba933a..c699c4a09a 100644
--- a/source/lib/src/cuda/prod_virial_grad.cu
+++ b/source/lib/src/cuda/prod_virial_grad.cu
@@ -6,7 +6,7 @@ __device__ inline FPTYPE dev_dot9(
     const FPTYPE * arr1, 
     const FPTYPE * arr2) 
 {
-    FPTYPE result = 0.0;
+    FPTYPE result = (FPTYPE)0.0;
     for(int ii=0; ii<9; ii++){
         result += arr1[ii] * arr2[ii];
     }
@@ -25,7 +25,7 @@ __global__ void virial_grad_wrt_neighbors_a(
 {
     // idy -> nnei
     const unsigned int tid = threadIdx.x;
-    const unsigned int idx = blockIdx.x * blockDim.x + tid;
+    const int_64 idx = blockIdx.x * blockDim.x + tid;
     const unsigned int idy = blockIdx.y;
     const unsigned int idw = threadIdx.y;
     const int ndescrpt = nnei * 4;
@@ -47,7 +47,7 @@ __global__ void virial_grad_wrt_neighbors_a(
             tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0];
         }
     }
-    grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp);
+    grad_net[idx * ndescrpt + idy * 4 + idw] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
 }
 
 template<typename FPTYPE>
@@ -62,7 +62,7 @@ __global__ void virial_grad_wrt_neighbors_r(
 {
     // idy -> nnei
     const unsigned int tid = threadIdx.x;
-    const unsigned int idx = blockIdx.x * blockDim.x + tid;
+    const int_64 idx = blockIdx.x * blockDim.x + tid;
     const unsigned int idy = blockIdx.y;
     const int ndescrpt = nnei;
     __shared__ FPTYPE grad_one[9];
@@ -83,7 +83,7 @@ __global__ void virial_grad_wrt_neighbors_r(
             tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0];
         }
     }
-    grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp);
+    grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
 }
 
 namespace deepmd {
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
index b9b1f80a0e..f8c3b46589 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/cuda/tabulate.cu
@@ -20,7 +20,7 @@ void locate_xx_se_a(
 {
   if (xx < lower) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < upper) {
     table_idx = (int)((xx - lower) / stride0);
@@ -33,7 +33,7 @@ void locate_xx_se_a(
   }
   else {
     table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -51,7 +51,7 @@ void locate_xx_se_t(
 {
   if (xx < min) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < lower) {
     table_idx = (int)((xx - min) / stride1);
@@ -69,7 +69,7 @@ void locate_xx_se_t(
   }
   else {
     table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -86,7 +86,7 @@ void locate_xx_se_r(
 {
   if (xx < lower) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < upper) {
     table_idx = (int)((xx - lower) / stride0);
@@ -99,7 +99,7 @@ void locate_xx_se_r(
   }
   else {
     table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -157,13 +157,13 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const int nnei, 
     const int last_layer_size) 
 {
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
 
-  FPTYPE sum[MTILE] = {0.f};
+  FPTYPE sum[MTILE] = {(FPTYPE)0.};
   int mark_table_idx = -1;
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
@@ -210,7 +210,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
@@ -238,8 +238,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
       em[block_idx * nnei * MTILE + ii * 4 + 2],
       em[block_idx * nnei * MTILE + ii * 4 + 3]
     };
-    FPTYPE Csub = 0.f;
-    FPTYPE sum[MTILE] = {0.f};
+    FPTYPE Csub = (FPTYPE)0.;
+    FPTYPE sum[MTILE] = {(FPTYPE)0.};
     locate_xx_se_a(xx, table_idx, lower, upper, max, stride0, stride1);
 
     FPTYPE var[6]; 
@@ -291,14 +291,14 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const int last_layer_size)
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
   FPTYPE * iteratorC = (FPTYPE*) &_data[0];
   for (int kk = 0; kk < MTILE; kk++)
-    iteratorC[kk * last_layer_size + thread_idx] = 0.f;
+    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
   __syncthreads();
 
   int mark_table_idx = -1;
@@ -349,10 +349,10 @@ __global__ void tabulate_fusion_se_t_fifth_order_polynomial(
     const int nnei_j, 
     const int last_layer_size) 
 {
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
-  FPTYPE sum = 0.f;
+  FPTYPE sum = (FPTYPE)0.;
   for (int ii = 0; ii < nnei_i; ii++) {
     FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
     int breakpoint = nnei_j - 1;
@@ -402,7 +402,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
@@ -423,15 +423,15 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
       }
       int table_idx = 0;
       locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE sum  = 0.f;
-      FPTYPE Csub = 0.f;
+      FPTYPE sum  = (FPTYPE)0.;
+      FPTYPE Csub = (FPTYPE)0.;
       for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) {
         FPTYPE var[6]; 
         load_polynomial_params(var, table, table_idx, kk, last_layer_size);
         FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx;
 
         sum  += iteratorA[kk] * res;
-        Csub += iteratorA[kk] * tmp * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx);
+        Csub += iteratorA[kk] * tmp * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx);
       }
       __syncwarp();
       warp_reduce(sum);
@@ -465,10 +465,10 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
     const int nnei_j,
     const int last_layer_size)
 {
-  const int block_idx  = blockIdx.x;   // nloc
+  const int_64 block_idx  = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
-  FPTYPE sum = 0.f;
+  FPTYPE sum = (FPTYPE)0.;
   for (int ii = 0; ii < nnei_i; ii++) { 
     FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
     bool unloop = false;
@@ -515,7 +515,7 @@ __global__ void tabulate_fusion_se_r_fifth_order_polynomial(
     const int nnei, 
     const int last_layer_size) 
 {
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
   int mark_table_idx = -1;
@@ -550,7 +550,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl_sync(0xffffffff, thread_idx / WARP_SIZE, 0);
   int lane_idx = thread_idx % WARP_SIZE;
@@ -559,7 +559,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     FPTYPE xx = em[block_idx * nnei + ii];
     
     int table_idx = 0;
-    FPTYPE Csub = 0.f;
+    FPTYPE Csub = (FPTYPE)0.;
     locate_xx_se_r(xx, table_idx, lower, upper, max, stride0, stride1);
 
     FPTYPE var[6]; 
@@ -595,7 +595,7 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
     const int last_layer_size)
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   
   int mark_table_idx = -1;
diff --git a/source/lib/src/env_mat.cc b/source/lib/src/env_mat.cc
index 7b0d3e4140..f269056cbb 100644
--- a/source/lib/src/env_mat.cc
+++ b/source/lib/src/env_mat.cc
@@ -108,7 +108,7 @@ env_mat_a_cpu (
 {  
     // compute the diff of the neighbors
     rij_a.resize (sec_a.back() * 3);
-    fill (rij_a.begin(), rij_a.end(), 0.0);
+    fill (rij_a.begin(), rij_a.end(), (FPTYPE)0.0);
     for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) {
         for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) {
             if (fmt_nlist_a[jj] < 0) break;
@@ -120,17 +120,17 @@ env_mat_a_cpu (
     }
     // 1./rr, cos(theta), cos(phi), sin(phi)
     descrpt_a.resize (sec_a.back() * 4);
-    fill (descrpt_a.begin(), descrpt_a.end(), 0.0);
+    fill (descrpt_a.begin(), descrpt_a.end(), (FPTYPE)0.0);
     // deriv wrt center: 3
     descrpt_a_deriv.resize (sec_a.back() * 4 * 3);
-    fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0);
+    fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), (FPTYPE)0.0);
 
     for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) {
         for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {      
             if (fmt_nlist_a[nei_iter] < 0) break;
             const FPTYPE * rr = &rij_a[nei_iter * 3];
             FPTYPE nr2 = deepmd::dot3(rr, rr);
-            FPTYPE inr = 1./sqrt(nr2);
+            FPTYPE inr = (FPTYPE)1./sqrt(nr2);
             FPTYPE nr = nr2 * inr;
             FPTYPE inr2 = inr * inr;
             FPTYPE inr4 = inr2 * inr2;
@@ -140,7 +140,7 @@ env_mat_a_cpu (
             int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
             int idx_value = nei_iter * 4;	// 4 components
             // 4 value components
-            descrpt_a[idx_value + 0] = 1./nr;
+            descrpt_a[idx_value + 0] = (FPTYPE)1./nr;
             descrpt_a[idx_value + 1] = rr[0] / nr2;
             descrpt_a[idx_value + 2] = rr[1] / nr2;
             descrpt_a[idx_value + 3] = rr[2] / nr2;
@@ -149,17 +149,17 @@ env_mat_a_cpu (
             descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr;
             descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr;
             // deriv of component x/r2
-            descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr;
-            descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4	) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr;
-            descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4	) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr;
+            descrpt_a_deriv[idx_deriv + 3] = ((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr;
+            descrpt_a_deriv[idx_deriv + 4] = ((FPTYPE)2. * rr[0] * rr[1] * inr4	) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr;
+            descrpt_a_deriv[idx_deriv + 5] = ((FPTYPE)2. * rr[0] * rr[2] * inr4	) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr;
             // deriv of component y/r2
-            descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4	) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr;
-            descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr;
-            descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4	) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr;
+            descrpt_a_deriv[idx_deriv + 6] = ((FPTYPE)2. * rr[1] * rr[0] * inr4	) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr;
+            descrpt_a_deriv[idx_deriv + 7] = ((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr;
+            descrpt_a_deriv[idx_deriv + 8] = ((FPTYPE)2. * rr[1] * rr[2] * inr4	) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr;
             // deriv of component z/r2
-            descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4	) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr;
-            descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4	) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr;
-            descrpt_a_deriv[idx_deriv +11] = (2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr;
+            descrpt_a_deriv[idx_deriv + 9] = ((FPTYPE)2. * rr[2] * rr[0] * inr4	) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr;
+            descrpt_a_deriv[idx_deriv +10] = ((FPTYPE)2. * rr[2] * rr[1] * inr4	) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr;
+            descrpt_a_deriv[idx_deriv +11] = ((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr;
             // 4 value components
             descrpt_a[idx_value + 0] *= sw;
             descrpt_a[idx_value + 1] *= sw;
@@ -256,7 +256,7 @@ env_mat_r_cpu (
 {
     // compute the diff of the neighbors
     rij_a.resize (sec.back() * 3);
-    fill (rij_a.begin(), rij_a.end(), 0.0);
+    fill (rij_a.begin(), rij_a.end(), (FPTYPE)0.0);
     for (int ii = 0; ii < int(sec.size()) - 1; ++ii) {
         for (int jj = sec[ii]; jj < sec[ii + 1]; ++jj) {
             if (fmt_nlist[jj] < 0) break;
@@ -269,17 +269,17 @@ env_mat_r_cpu (
     }
     // 1./rr, cos(theta), cos(phi), sin(phi)
     descrpt_a.resize (sec.back());
-    fill (descrpt_a.begin(), descrpt_a.end(), 0.0);
+    fill (descrpt_a.begin(), descrpt_a.end(), (FPTYPE)0.0);
     // deriv wrt center: 3
     descrpt_a_deriv.resize (sec.back() * 3);
-    fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0);
+    fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), (FPTYPE)0.0);
 
     for (int sec_iter = 0; sec_iter < int(sec.size()) - 1; ++sec_iter) {
         for (int nei_iter = sec[sec_iter]; nei_iter < sec[sec_iter+1]; ++nei_iter) {      
             if (fmt_nlist[nei_iter] < 0) break;
             const FPTYPE * rr = &rij_a[nei_iter * 3];
             FPTYPE nr2 = deepmd::dot3(rr, rr);
-            FPTYPE inr = 1./sqrt(nr2);
+            FPTYPE inr = (FPTYPE)1./sqrt(nr2);
             FPTYPE nr = nr2 * inr;
             FPTYPE inr2 = inr * inr;
             FPTYPE inr4 = inr2 * inr2;
@@ -289,7 +289,7 @@ env_mat_r_cpu (
             int idx_deriv = nei_iter * 3;	// 1 components time 3 directions
             int idx_value = nei_iter;	    // 1 components
             // 4 value components
-            descrpt_a[idx_value + 0] = 1./nr;
+            descrpt_a[idx_value + 0] = (FPTYPE)1./nr;
             // deriv of component 1/r
             descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr;
             descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr;
diff --git a/source/lib/src/ewald.cc b/source/lib/src/ewald.cc
index 486d2cbb73..08d0354023 100644
--- a/source/lib/src/ewald.cc
+++ b/source/lib/src/ewald.cc
@@ -13,7 +13,7 @@ dir_err_esti(const VALUETYPE & test_q,
   const VALUETYPE & rcut = param.rcut;
   const VALUETYPE & beta = param.beta;
   const VALUETYPE rho_q2 = c2/nn;  
-  VALUETYPE sum = 2 * test_q 
+  VALUETYPE sum = (VALUETYPE)2. * test_q 
       * sqrt (rho_q2 / rcut)
       * exp (- beta*beta*rcut*rcut) * ElectrostaticConvertion;
   return sum;
@@ -215,7 +215,7 @@ ewald_recp(
 	VALUETYPE eincr = expnmm2 * (sqr[mc] * sqr[mc] + sqi[mc] * sqi[mc]);
 	thread_ener[thread_id] += eincr;
 	// virial
-	VALUETYPE vpref = -2. * (1. + M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2;
+	VALUETYPE vpref = (VALUETYPE)-2. * ((VALUETYPE)1. + M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2;
 	for (int dd0 = 0; dd0 < 3; ++dd0){
 	  for (int dd1 = 0; dd1 < 3; ++dd1){	    
 	    VALUETYPE tmp = vpref * rm[dd0] * rm[dd1];
@@ -225,10 +225,10 @@ ewald_recp(
 	}
 	// force
 	for (int ii = 0; ii < natoms; ++ii){
-	  VALUETYPE mdotr = - 2. * M_PI * (coord[ii*3+0]*rm[0] + coord[ii*3+1]*rm[1] + coord[ii*3+2]*rm[2]);
+	  VALUETYPE mdotr = (VALUETYPE)-2. * M_PI * (coord[ii*3+0]*rm[0] + coord[ii*3+1]*rm[1] + coord[ii*3+2]*rm[2]);
 	  VALUETYPE tmpr = charge[ii] * cos(mdotr);
 	  VALUETYPE tmpi = charge[ii] * sin(mdotr);
-	  VALUETYPE cc = 4. * M_PI * (tmpr * sqi[mc] + tmpi * sqr[mc]) * expnmm2;
+	  VALUETYPE cc = (VALUETYPE)4. * M_PI * (tmpr * sqi[mc] + tmpi * sqr[mc]) * expnmm2;
 	  thread_force[thread_id][ii*3+0] -= rm[0] * cc;
 	  thread_force[thread_id][ii*3+1] -= rm[1] * cc;
 	  thread_force[thread_id][ii*3+2] -= rm[2] * cc;
@@ -252,14 +252,14 @@ ewald_recp(
   }
 
   VALUETYPE vol = volume_cpu(region);
-  ener /= 2 * M_PI * vol;
+  ener /= (VALUETYPE)2. * M_PI * vol;
   ener *= ElectrostaticConvertion;
   for (int ii = 0; ii < 3*natoms; ++ii){
-    force[ii] /= 2 * M_PI * vol;
+    force[ii] /= (VALUETYPE)2. * M_PI * vol;
     force[ii] *= ElectrostaticConvertion;
   }  
   for (int ii = 0; ii < 3*3; ++ii){
-    virial[ii] /= 2 * M_PI * vol;
+    virial[ii] /= (VALUETYPE)2. * M_PI * vol;
     virial[ii] *= ElectrostaticConvertion;
   }  
   delete[]sqr;
diff --git a/source/lib/src/fmt_nlist.cc b/source/lib/src/fmt_nlist.cc
index 35155d77d1..9e1d5fc57c 100644
--- a/source/lib/src/fmt_nlist.cc
+++ b/source/lib/src/fmt_nlist.cc
@@ -8,16 +8,17 @@
 
 using namespace deepmd;
 
+template<typename FPTYPE> 
 struct NeighborInfo 
 {
   int type;
-  double dist;
+  FPTYPE dist;
   int index;
   NeighborInfo () 
       : type (0), dist(0), index(0) 
       {
       }
-  NeighborInfo (int tt, double dd, int ii) 
+  NeighborInfo (int tt, FPTYPE dd, int ii) 
       : type (tt), dist(dd), index(ii) 
       {
       }
@@ -60,7 +61,7 @@ int format_nlist_i_fill_a (
   nei_idx.insert (nei_idx.end(), nei_idx_r.begin(), nei_idx_r.end());
   assert (nei_idx.size() == nei_idx_a.size() + nei_idx_r.size());
   // allocate the information for all neighbors
-  std::vector<NeighborInfo > sel_nei ;
+  std::vector<NeighborInfo<double> > sel_nei ;
   sel_nei.reserve (nei_idx_a.size() + nei_idx_r.size());
   for (unsigned kk = 0; kk < nei_idx.size(); ++kk){
     double diff[3];
@@ -75,7 +76,7 @@ int format_nlist_i_fill_a (
     }
     double rr = sqrt(deepmd::dot3(diff, diff));    
     if (rr <= rcut) {
-      sel_nei.push_back(NeighborInfo (type[j_idx], rr, j_idx));
+      sel_nei.push_back(NeighborInfo<double> (type[j_idx], rr, j_idx));
     }
   }
   sort (sel_nei.begin(), sel_nei.end());  
@@ -118,17 +119,19 @@ int format_nlist_i_cpu (
     // gether all neighbors
     std::vector<int > nei_idx (nei_idx_a);
     // allocate the information for all neighbors
-    std::vector<NeighborInfo > sel_nei;
+    std::vector<NeighborInfo<float> > sel_nei;
     sel_nei.reserve (nei_idx_a.size());
+    float rcut2 = rcut * rcut;
     for (unsigned kk = 0; kk < nei_idx.size(); ++kk) {
-        FPTYPE diff[3];
+        // rcut is float in this function, so float rr is enough
+        float diff[3];
         const int & j_idx = nei_idx[kk];
         for (int dd = 0; dd < 3; ++dd) {
-            diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
+            diff[dd] = (float)posi[j_idx * 3 + dd] - (float)posi[i_idx * 3 + dd];
         }
-        FPTYPE rr = sqrt(deepmd::dot3(diff, diff));    
-        if (rr <= rcut) {
-            sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx));
+        float rr2 = deepmd::dot3(diff, diff);    
+        if (rr2 <= rcut2) {
+            sel_nei.push_back(NeighborInfo<float>(type[j_idx], rr2, j_idx));
         }
     }
     sort(sel_nei.begin(), sel_nei.end());  
diff --git a/source/lib/src/gelu.cc b/source/lib/src/gelu.cc
index e86faa882b..13a3711027 100644
--- a/source/lib/src/gelu.cc
+++ b/source/lib/src/gelu.cc
@@ -1,15 +1,15 @@
 #include "gelu.h"
-#include "math.h"
+#include <cmath>
 #include "device.h"
 
 template<typename FPTYPE>
 void deepmd::gelu_cpu(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    const int size)
+    const int_64 size)
 {
-  for (int ii = 0; ii < size; ii++) {
-    out[ii] = xx[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] *xx[ii])));
+  for (int_64 ii = 0; ii < size; ii++) {  // 64-bit counter: an int would overflow before reaching an int_64 size
+    out[ii] = xx[ii] * (FPTYPE)0.5 * ((FPTYPE)1.0 + tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] *xx[ii])));
   }
 }
 
@@ -18,11 +18,11 @@ void deepmd::gelu_grad_cpu(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
-    const int size)
+    const int_64 size)
 {
-  for (int ii = 0; ii < size; ii++) {
-    const FPTYPE var = tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] * xx[ii]));
-    out[ii] = dy[ii] * (0.5 * SQRT_2_PI * xx[ii] * (1 - var * var) * (0.134145 * xx[ii] * xx[ii] + 1) + 0.5 * var + 0.5);
+  for (int_64 ii = 0; ii < size; ii++) {  // 64-bit counter: an int would overflow before reaching an int_64 size
+    const FPTYPE var = tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] * xx[ii]));
+    out[ii] = dy[ii] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[ii] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5);
   }
 }
 
@@ -32,18 +32,18 @@ void deepmd::gelu_grad_grad_cpu(
     const FPTYPE * xx,
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    const int size) 
+    const int_64 size) 
 {
-  for (int ii = 0; ii < size; ii++) {
-    const FPTYPE var1 = tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] *xx[ii]));
-    const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[ii] * xx[ii] + 1);
-    out[ii] = dy[ii] * dy_2[ii] * (0.134145 * SQRT_2_PI * xx[ii] * xx[ii] * (1 - var1 * var1) - SQRT_2_PI * xx[ii] * var2 * (0.134145 * xx[ii] * xx[ii] + 1) * var1 + var2);
+  for (int_64 ii = 0; ii < size; ii++) {  // 64-bit counter: an int would overflow before reaching an int_64 size
+    const FPTYPE var1 = tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] *xx[ii]));
+    const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.);
+    out[ii] = dy[ii] * dy_2[ii] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[ii] * xx[ii] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[ii] * var2 * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.) * var1 + var2);
   }
 }
 
-template void deepmd::gelu_cpu<float>(float * out, const float * x, const int size);
-template void deepmd::gelu_cpu<double>(double * out, const double * x, const int size);
-template void deepmd::gelu_grad_cpu<float>(float * out, const float * x, const float * dy, const int size);
-template void deepmd::gelu_grad_cpu<double>(double * out, const double * x, const double * dy, const int size);
-template void deepmd::gelu_grad_grad_cpu<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
-template void deepmd::gelu_grad_grad_cpu<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
+template void deepmd::gelu_cpu<float>(float * out, const float * x, const int_64 size);
+template void deepmd::gelu_cpu<double>(double * out, const double * x, const int_64 size);
+template void deepmd::gelu_grad_cpu<float>(float * out, const float * x, const float * dy, const int_64 size);
+template void deepmd::gelu_grad_cpu<double>(double * out, const double * x, const double * dy, const int_64 size);
+template void deepmd::gelu_grad_grad_cpu<float>(float * out, const float * x, const float * dy, const float * dy_2, const int_64 size);
+template void deepmd::gelu_grad_grad_cpu<double>(double * out, const double * x, const double * dy, const double * dy_2, const int_64 size);
diff --git a/source/lib/src/pair_tab.cc b/source/lib/src/pair_tab.cc
index 2c48ce957a..22c9bd5390 100644
--- a/source/lib/src/pair_tab.cc
+++ b/source/lib/src/pair_tab.cc
@@ -157,11 +157,11 @@ deepmd::pair_tab_cpu(
   }
   for (int ii = 0; ii < nall; ++ii){
     int i_idx = ii;
-    force[i_idx * 3 + 0] = 0;
-    force[i_idx * 3 + 1] = 0;
-    force[i_idx * 3 + 2] = 0;
+    force[i_idx * 3 + 0] = (FPTYPE)0.;
+    force[i_idx * 3 + 1] = (FPTYPE)0.;
+    force[i_idx * 3 + 2] = (FPTYPE)0.;
     for (int dd = 0; dd < 9; ++dd) {
-      virial[i_idx * 9 + dd] = 0;
+      virial[i_idx * 9 + dd] = (FPTYPE)0.;
     }
   }
   // compute force of a frame
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index 1ac944786c..303542699c 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -284,7 +284,13 @@ void deepmd::env_mat_nbor_update(
     memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum);
     memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum);
     int _max_nbor_size = max_numneigh(inlist);
-    if (_max_nbor_size <= 1024) {
+    if (_max_nbor_size <= 256) {
+      _max_nbor_size = 256;
+    }
+    else if (_max_nbor_size <= 512) {
+      _max_nbor_size = 512;
+    }
+    else if (_max_nbor_size <= 1024) {
       _max_nbor_size = 1024;
     }
     else if (_max_nbor_size <= 2048) {
diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index 6859a5bae3..626b572b00 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -105,9 +105,9 @@ prod_force_r_cpu(
 
   for (int ii = 0; ii < nall; ++ii){
     int i_idx = ii;
-    force[i_idx * 3 + 0] = 0;
-    force[i_idx * 3 + 1] = 0;
-    force[i_idx * 3 + 2] = 0;
+    force[i_idx * 3 + 0] = (FPTYPE)0.;
+    force[i_idx * 3 + 1] = (FPTYPE)0.;
+    force[i_idx * 3 + 2] = (FPTYPE)0.;
   }
 
   // compute force of a frame
diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc
index 78bad3c9ca..88d61a4436 100644
--- a/source/lib/src/prod_force_grad.cc
+++ b/source/lib/src/prod_force_grad.cc
@@ -37,7 +37,7 @@ prod_force_grad_a_cpu(
   // reset the frame to 0
   for (int ii = 0; ii < nloc; ++ii){
     for (int aa = 0; aa < ndescrpt; ++aa){
-      grad_net[ii * ndescrpt + aa] = 0;
+      grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
     }
   }      
 
@@ -116,7 +116,7 @@ prod_force_grad_r_cpu(
   // reset the frame to 0
   for (int ii = 0; ii < nloc; ++ii){
     for (int aa = 0; aa < ndescrpt; ++aa){
-      grad_net[ii * ndescrpt + aa] = 0;
+      grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
     }
   }      
 
diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc
index d715cf9e5b..29b343ba0b 100644
--- a/source/lib/src/prod_virial.cc
+++ b/source/lib/src/prod_virial.cc
@@ -37,10 +37,10 @@ prod_virial_a_cpu(
   const int ndescrpt = 4 * nnei;
 
   for (int ii = 0; ii < 9; ++ ii){
-    virial[ii] = 0.;
+    virial[ii] = (FPTYPE)0.;
   }
   for (int ii = 0; ii < 9 * nall; ++ ii){
-    atom_virial[ii] = 0.;
+    atom_virial[ii] = (FPTYPE)0.;
   }
 
   // compute virial of a frame
@@ -55,7 +55,7 @@ prod_virial_a_cpu(
       int aa_start, aa_end;
       make_index_range (aa_start, aa_end, jj, nnei);
       for (int aa = aa_start; aa < aa_end; ++aa) {
-	FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa];
+	FPTYPE pref = (FPTYPE)-1.0 * net_deriv[i_idx * ndescrpt + aa];
 	for (int dd0 = 0; dd0 < 3; ++dd0){
 	  for (int dd1 = 0; dd1 < 3; ++dd1){
 	    FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] *  env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0];
@@ -116,10 +116,10 @@ prod_virial_r_cpu(
   const int ndescrpt = nnei;
 
   for (int ii = 0; ii < 9; ++ ii){
-    virial[ii] = 0.;
+    virial[ii] = (FPTYPE)0.;
   }
   for (int ii = 0; ii < 9 * nall; ++ ii){
-    atom_virial[ii] = 0.;
+    atom_virial[ii] = (FPTYPE)0.;
   }
 
   // compute virial of a frame
diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu
index 73d85d2111..ab75e7f7a0 100644
--- a/source/lib/src/rocm/coord.hip.cu
+++ b/source/lib/src/rocm/coord.hip.cu
@@ -51,6 +51,9 @@ __device__ inline int compute_pbc_shift(
     return shift;
 }
 
+__device__ inline double _fmod(double x, double y) {return fmod(x, y);}  // double overload: forwards to fmod
+__device__ inline float _fmod(float x, float y) {return fmodf(x, y);}  // float overload: forwards to fmodf, so templated callers stay in single precision
+
 template<typename FPTYPE>
 __global__ void normalize_one(
     FPTYPE *out_c,
@@ -64,8 +67,8 @@ __global__ void normalize_one(
     FPTYPE inter[3];
     phys2Inter(inter,out_c+idy*3,rec_boxt);
     for (int dd = 0; dd < 3; ++dd) {
-        inter[dd]=(FPTYPE)fmod((double)inter[dd], 1.);
-        if (inter[dd] <  0.) inter[dd] += 1.;
+        inter[dd]=_fmod(inter[dd], (FPTYPE)1.);
+        if (inter[dd] <  (FPTYPE)0.) inter[dd] += (FPTYPE)1.;
     }
     inter2Phys(out_c+idy*3,inter,boxt);
 }
@@ -93,7 +96,7 @@ __global__ void _fill_idx_cellmap(
         ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
         global_grid[dd] = nat_end[dd] - nat_stt[dd];
         idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-        cell_size[dd] = 1./global_grid[dd];
+        cell_size[dd] = (FPTYPE)1./global_grid[dd];
         nat_orig[dd] = nat_stt[dd] * cell_size[dd];
     }
     if (idy<nloc)
diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu
index bdfd3767d6..6529d277f1 100644
--- a/source/lib/src/rocm/gelu.hip.cu
+++ b/source/lib/src/rocm/gelu.hip.cu
@@ -1,17 +1,20 @@
 #include "gelu.h"
 #include "device.h"
 
+__device__ inline double _tanh(double x) {return tanh(x);}  // double overload: forwards to tanh
+__device__ inline float _tanh(float x) {return tanhf(x);}  // float overload: forwards to tanhf, so templated callers stay in single precision
+
 template <typename FPTYPE>
 __global__ void gelu(
     FPTYPE * out, 
     const FPTYPE * xx, 
-    int const size) 
+    const int_64 size) 
 {
-  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= size) {
     return;
   }
-  out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
+  out[idx] = xx[idx] * (FPTYPE)0.5 * ((FPTYPE)1.0 + _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])));
 }
 
 template <typename FPTYPE>
@@ -19,15 +22,15 @@ __global__ void gelu_grad(
     FPTYPE * out, 
     const FPTYPE * xx, 
     const FPTYPE * dy, 
-    int const size) 
+    const int_64 size) 
 {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= size) {
     return;
   }
   // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]));
-  out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5);
+  const FPTYPE var = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]));
+  out[idx] = dy[idx] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5);
 }
 
 template <typename FPTYPE>
@@ -36,16 +39,16 @@ __global__ void gelu_grad_grad(
     const FPTYPE * xx, 
     const FPTYPE * dy, 
     const FPTYPE * dy_2,
-    int const size) 
+    const int_64 size) 
 {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= size) {
     return;
   }
   // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]));
-  const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1);
-  out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2);
+  const FPTYPE var1 = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]));
+  const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.);
+  out[idx] = dy[idx] * dy_2[idx] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[idx] * var2 * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 + var2);
 }
 
 namespace deepmd {
@@ -53,7 +56,7 @@ namespace deepmd {
   void gelu_gpu_rocm(
       FPTYPE * out, 
       const FPTYPE * xx, 
-      const int size)
+      const int_64 size)
   {
     if(size <= 0)
     {
@@ -72,7 +75,7 @@ namespace deepmd {
       FPTYPE * out, 
       const FPTYPE * xx,
       const FPTYPE * dy, 
-      const int size)
+      const int_64 size)
   {
     if(size <= 0)
     {
@@ -92,7 +95,7 @@ namespace deepmd {
       const FPTYPE * xx,
       const FPTYPE * dy, 
       const FPTYPE * dy_2,
-      const int size)
+      const int_64 size)
   {
     if(size <= 0)
     {
@@ -106,10 +109,10 @@ namespace deepmd {
     DPErrcheck(hipDeviceSynchronize());
   }
   
-  template void gelu_gpu_rocm<float>(float * out, const float * x, const int size);
-  template void gelu_gpu_rocm<double>(double * out, const double * x, const int size);
-  template void gelu_grad_gpu_rocm<float>(float * out, const float * x, const float * dy, const int size);
-  template void gelu_grad_gpu_rocm<double>(double * out, const double * x, const double * dy, const int size);
-  template void gelu_grad_grad_gpu_rocm<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
-  template void gelu_grad_grad_gpu_rocm<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
+  template void gelu_gpu_rocm<float>(float * out, const float * x, const int_64 size);
+  template void gelu_gpu_rocm<double>(double * out, const double * x, const int_64 size);
+  template void gelu_grad_gpu_rocm<float>(float * out, const float * x, const float * dy, const int_64 size);
+  template void gelu_grad_gpu_rocm<double>(double * out, const double * x, const double * dy, const int_64 size);
+  template void gelu_grad_grad_gpu_rocm<float>(float * out, const float * x, const float * dy, const float * dy_2, const int_64 size);
+  template void gelu_grad_grad_gpu_rocm<double>(double * out, const double * x, const double * dy, const double * dy_2, const int_64 size);
 }
\ No newline at end of file
diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
index 45fa0deb41..506a844a04 100644
--- a/source/lib/src/rocm/prod_env_mat.hip.cu
+++ b/source/lib/src/rocm/prod_env_mat.hip.cu
@@ -3,6 +3,11 @@
 #include "device.h"
 #include "hipcub/hipcub.hpp"
 
+__device__ inline double _sqrt(double x) {return sqrt(x);}  // double overload: forwards to sqrt
+__device__ inline float _sqrt(float x) {return sqrtf(x);}  // float overload: forwards to sqrtf
+__device__ inline double _rsqrt(double x) {return rsqrt(x);}  // double overload: reciprocal square root via rsqrt
+__device__ inline float _rsqrt(float x) {return rsqrtf(x);}  // float overload: reciprocal square root via rsqrtf
+
 // common part of prod_env_mat
 template <
     typename    Key,
@@ -27,7 +32,7 @@ __global__ void BlockSortKernel(
   // Per-thread tile items
   Key items[ITEMS_PER_THREAD];
   // Our current block's offset
-  int block_offset = blockIdx.x * TILE_SIZE;
+  int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE;
   // Load items into a blocked arrangement
   BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
   // Barrier for smem reuse
@@ -55,18 +60,18 @@ __device__ inline void spline5_switch(
     const float & rmax) 
 {
   if (xx < rmin) {
-    dd = 0;
-    vv = 1;
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)1.;
   }
   else if (xx < rmax) {
     FPTYPE uu = (xx - rmin) / (rmax - rmin) ;
-    FPTYPE du = 1. / (rmax - rmin) ;
-    vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1;
-    dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du;
+    FPTYPE du = (FPTYPE)1. / (rmax - rmin) ;
+    vv = uu*uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + (FPTYPE)1.;
+    dd = ( (FPTYPE)3. * uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + uu*uu*uu * ((FPTYPE)-12. * uu + (FPTYPE)15.) ) * du;
   }
   else {
-    dd = 0;
-    vv = 0;
+    dd = (FPTYPE)0.;
+    vv = (FPTYPE)0.;
   }
 }
 
@@ -80,7 +85,7 @@ __device__ inline uint_64 encoding_nbor_info(
   // the type of nbor atom must be smaller than 128
   // the distance of center atom between nbor atom must be smaller than 128
   // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24)
-  if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) {
+  if(type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
     __builtin_trap();
   }
   return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index;
@@ -120,7 +125,7 @@ __global__ void format_nlist_fill_a(
     const int MAX_NBOR_SIZE)
 {   
   // <<<nloc, MAX_NBOR_SIZE>>>
-  const unsigned int idx = blockIdx.x;
+  const int_64 idx = blockIdx.x;
   const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
   
   const int nsize = numneigh[i_idx[idx]];
@@ -136,7 +141,7 @@ __global__ void format_nlist_fill_a(
   for (int dd = 0; dd < 3; dd++) {
     diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
   }
-  FPTYPE rr = sqrt(dev_dot(diff, diff)); 
+  FPTYPE rr = _sqrt(dev_dot(diff, diff)); 
   if (rr <= rcut) {
     key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
   }
@@ -150,7 +155,7 @@ __global__ void fill_nei_iter(
     const int max_nbor_size,
     const int sec_size)
 {
-  int row = blockIdx.x;
+  int_64 row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   const FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size;
   int nei_type_cur = -1, nbor_idx_cur = 0;
@@ -176,7 +181,7 @@ __global__ void format_nlist_fill_b(
     int * nei_iter_dev,
     const int max_nbor_size)
 { 
-  int row = blockIdx.x;
+  int_64 row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   int * nei_iter = nei_iter_dev + row * sec_size;
   FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size;
@@ -212,6 +217,66 @@ __global__ void encoding_decoding_nbor_info(
   decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
 }
 
+template<typename FPTYPE>
+void format_nbor_list_256 (  // fills and block-sorts the neighbor keys for a fixed 256 slots per atom
+    uint_64 * key,
+    const FPTYPE* coord,
+    const int* type,
+    const deepmd::InputNlist & gpu_inlist,
+    const int& nloc,
+    const float& rcut,
+    int * i_idx)
+{
+  const int LEN = 256;
+  const int MAX_NBOR_SIZE = 256;
+  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;  // ceil(MAX_NBOR_SIZE / LEN)
+  dim3 block_grid(nloc, nblock);  // x: one block row per local atom, y: chunks of LEN neighbor slots
+  dim3 thread_grid(1, LEN);
+  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0,
+      key,
+      coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
+  const int ITEMS_PER_THREAD = 4;
+  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
+  // One sort block per atom; the second pointer is the region right after the nloc rows filled above.
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>), nloc, BLOCK_THREADS, 0, 0,
+      key,
+      key + int_64(nloc) * MAX_NBOR_SIZE);  // 64-bit offset so a large nloc cannot overflow int
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
+}
+
+template<typename FPTYPE>
+void format_nbor_list_512 (  // fills and block-sorts the neighbor keys for a fixed 512 slots per atom
+    uint_64 * key,
+    const FPTYPE* coord,
+    const int* type,
+    const deepmd::InputNlist & gpu_inlist,
+    const int& nloc,
+    const float& rcut,
+    int * i_idx)
+{
+  const int LEN = 256;
+  const int MAX_NBOR_SIZE = 512;
+  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;  // ceil(MAX_NBOR_SIZE / LEN)
+  dim3 block_grid(nloc, nblock);  // x: one block row per local atom, y: chunks of LEN neighbor slots
+  dim3 thread_grid(1, LEN);
+  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0,
+      key,
+      coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
+  const int ITEMS_PER_THREAD = 4;
+  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
+  // One sort block per atom; the second pointer is the region right after the nloc rows filled above.
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>), nloc, BLOCK_THREADS, 0, 0,
+      key,
+      key + int_64(nloc) * MAX_NBOR_SIZE);  // 64-bit offset so a large nloc cannot overflow int
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
+}
+
 template<typename FPTYPE>
 void format_nbor_list_1024 (
     uint_64 * key,
@@ -319,7 +384,7 @@ __global__ void compute_env_mat_a(
     const float rmax)
 {   
   // <<<nloc, TPB>>>
-  const unsigned int bid = blockIdx.x;
+  const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
   if (tid >= nnei) {
     return;
@@ -343,14 +408,14 @@ __global__ void compute_env_mat_a(
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
       FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = 1./sqrt(nr2);
+      FPTYPE inr = _rsqrt(nr2);
       FPTYPE nr = nr2 * inr;
       FPTYPE inr2 = inr * inr;
       FPTYPE inr4 = inr2 * inr2;
       FPTYPE inr3 = inr4 * nr;
       FPTYPE sw, dsw;
       spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd[0] = (1./nr)       ;//* sw;
+      dd[0] = ((FPTYPE)1./nr)       ;//* sw;
       dd[1] = (rr[0] / nr2) ;//* sw;
       dd[2] = (rr[1] / nr2) ;//* sw;
       dd[3] = (rr[2] / nr2) ;//* sw;
@@ -358,17 +423,17 @@ __global__ void compute_env_mat_a(
       vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
       vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
       // ****deriv of component x/r2
-      vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
-      vv[4] = ((2. * rr[0] * rr[1] * inr4	) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
-      vv[5] = ((2. * rr[0] * rr[2] * inr4	) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
+      vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
+      vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4	) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
+      vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4	) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
       // ***deriv of component y/r2
-      vv[6] = ((2. * rr[1] * rr[0] * inr4	) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
-      vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
-      vv[8] = ((2. * rr[1] * rr[2] * inr4	) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
+      vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4	) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
+      vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
+      vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4	) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
       // ***deriv of component z/r2 
-      vv[9] = ((2. * rr[2] * rr[0] * inr4	) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
-      vv[10]= ((2. * rr[2] * rr[1] * inr4	) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
-      vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
+      vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4	) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
+      vv[10]= (((FPTYPE)2. * rr[2] * rr[1] * inr4	) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
+      vv[11]= (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
       // 4 value components
       dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0];
       dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1];
@@ -405,7 +470,7 @@ __global__ void compute_env_mat_r(
     const float rmax)
 {
   // <<<nloc, TPB>>>
-  const unsigned int bid = blockIdx.x;
+  const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
   if (tid >= nnei) {
     return;
@@ -419,9 +484,9 @@ __global__ void compute_env_mat_r(
     const int idx_value = ii;	  // 4 components
     const int idx_deriv = ii * 3;	// 4 components time 3 directions
     if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3]  = {0};
-      FPTYPE vv[3]  = {0};
-      FPTYPE dd     = 0;
+      FPTYPE rr[3]  = {(FPTYPE)0.};
+      FPTYPE vv[3]  = {(FPTYPE)0.};
+      FPTYPE dd     = (FPTYPE)0.;
       const int & j_idx = row_nlist[ii];
       for (int kk = 0; kk < 3; kk++) {
         rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
@@ -429,14 +494,14 @@ __global__ void compute_env_mat_r(
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
       FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = 1./sqrt(nr2);
+      FPTYPE inr = _rsqrt(nr2);
       FPTYPE nr = nr2 * inr;
       FPTYPE inr2 = inr * inr;
       FPTYPE inr4 = inr2 * inr2;
       FPTYPE inr3 = inr4 * nr;
       FPTYPE sw, dsw;
       spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd = (1./nr)       ;//* sw;
+      dd = ((FPTYPE)1./nr)       ;//* sw;
       vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
       vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
       vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
@@ -477,9 +542,9 @@ void format_nbor_list_gpu_rocm(
   int * nei_iter = array_int + sec.size(); // = new int[sec_size];
   int * i_idx = array_int + sec.size() + nloc * sec.size();
   uint_64 * key = array_longlong;
-  assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
-  DPErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei));
-  DPErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
+  assert(max_nbor_size == 256 || max_nbor_size == 512 || max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);  // must be one of the supported neighbor-list bucket sizes
+  DPErrcheck(hipMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
+  DPErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * int_64(nloc) * max_nbor_size));
   DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice));   
 
   hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, 
@@ -488,7 +553,17 @@ void format_nbor_list_gpu_rocm(
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 
-  if (max_nbor_size == 1024) {
+  if (max_nbor_size == 256) {
+    format_nbor_list_256 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  }
+  else if (max_nbor_size == 512) {
+    format_nbor_list_512 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  } 
+  else if (max_nbor_size == 1024) {
     format_nbor_list_1024 (
         key,
         coord, type, gpu_inlist, nloc, rcut, i_idx); 
@@ -537,9 +612,9 @@ void prod_env_mat_a_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 
@@ -576,9 +651,9 @@ void prod_env_mat_r_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
index 19ca5f0b89..16db29859e 100644
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ b/source/lib/src/rocm/prod_force.hip.cu
@@ -11,10 +11,10 @@ __global__ void force_deriv_wrt_center_atom(
     const int ndescrpt)
 {
   __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
-  unsigned int bid = blockIdx.x;
+  int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
   for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
-    data[ii] = 0.f;
+    data[ii] = (FPTYPE)0.;
   }
   for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
     for (int jj = 0; jj < 3; jj++) {
@@ -49,7 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(
     const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 4;
@@ -61,7 +61,7 @@ __global__ void force_deriv_wrt_neighbors_a(
     if (j_idx < 0) {
         return;
     }
-    FPTYPE force_tmp = 0.f;
+    FPTYPE force_tmp = (FPTYPE)0.;
     for (int idw = 0; idw < 4; ++idw) {
         force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
     }
@@ -78,7 +78,7 @@ __global__ void force_deriv_wrt_neighbors_r(
 		const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
index 2329552896..f7540c07a2 100644
--- a/source/lib/src/rocm/prod_force_grad.hip.cu
+++ b/source/lib/src/rocm/prod_force_grad.hip.cu
@@ -17,7 +17,7 @@ __global__ void force_grad_wrt_center_atom(
     const int ndescrpt)
 {
     __shared__ FPTYPE grad_one[3];
-    unsigned int center_idx = blockIdx.x;
+    int_64 center_idx = blockIdx.x;
     unsigned int tid = threadIdx.x;
     if(tid < 3){
         grad_one[tid] = grad[center_idx * 3 + tid];
@@ -39,7 +39,7 @@ __global__ void force_grad_wrt_neighbors_a(
     const int nnei)
 {
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;  // widen first so the product is formed in 64 bits
     const unsigned int idy = blockIdx.y;
     const unsigned int idw = threadIdx.y;
     if (idx >= nloc) {
@@ -63,7 +63,7 @@ __global__ void force_grad_wrt_neighbors_r(
     const int nnei)
 {
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;  // widen first so the product is formed in 64 bits
     const unsigned int idy = blockIdx.y;
     if (idx >= nloc) {
         return;
diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
index 5c7cc05721..066e425d81 100644
--- a/source/lib/src/rocm/prod_virial.hip.cu
+++ b/source/lib/src/rocm/prod_virial.hip.cu
@@ -12,7 +12,7 @@ __global__ void atom_virial_reduction(
     unsigned int bid = blockIdx.x;
     unsigned int tid = threadIdx.x;
     __shared__ FPTYPE data[THREADS_PER_BLOCK];
-    data[tid] = 0.f;
+    data[tid] = (FPTYPE)0.;
     for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
         data[tid] += atom_virial[ii * 9 + bid];
     }
@@ -44,7 +44,7 @@ __global__ void virial_deriv_wrt_neighbors_a(
   // idz = dd0 * 3 + dd1
   // dd0 = idz / 3
   // dd1 = idz % 3
-  const unsigned int idx = blockIdx.x;
+  const int_64 idx = blockIdx.x;
   const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
   const unsigned int idz = threadIdx.y;
   const int ndescrpt = nnei * 4;
@@ -55,7 +55,7 @@ __global__ void virial_deriv_wrt_neighbors_a(
   if (j_idx < 0) {
       return;
   }
-  FPTYPE virial_tmp = 0.f;
+  FPTYPE virial_tmp = (FPTYPE)0.;
   for (int idw = 0; idw < 4; ++idw) {
       virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
   }
@@ -78,7 +78,7 @@ __global__ void virial_deriv_wrt_neighbors_r(
     // idz = dd0 * 3 + dd1
     // dd0 = idz / 3
     // dd1 = idz % 3
-    const unsigned int idx = blockIdx.x;
+    const int_64 idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
index 18c009f0c8..ebe4948a50 100644
--- a/source/lib/src/rocm/prod_virial_grad.hip.cu
+++ b/source/lib/src/rocm/prod_virial_grad.hip.cu
@@ -6,7 +6,7 @@ __device__ inline FPTYPE dev_dot9(
     const FPTYPE * arr1, 
     const FPTYPE * arr2) 
 {
-    FPTYPE result = 0.0;
+    FPTYPE result = (FPTYPE)0.0;
     for(int ii=0; ii<9; ii++){
         result += arr1[ii] * arr2[ii];
     }
@@ -25,7 +25,7 @@ __global__ void virial_grad_wrt_neighbors_a(
 {
     // idy -> nnei
     const unsigned int tid = threadIdx.x;
-    const unsigned int idx = blockIdx.x * blockDim.x + tid;
+    const int_64 idx = int_64(blockIdx.x) * blockDim.x + tid;  // widen first so the product is formed in 64 bits
     const unsigned int idy = blockIdx.y;
     const unsigned int idw = threadIdx.y;
     const int ndescrpt = nnei * 4;
@@ -62,7 +62,7 @@ __global__ void virial_grad_wrt_neighbors_r(
 {
     // idy -> nnei
     const unsigned int tid = threadIdx.x;
-    const unsigned int idx = blockIdx.x * blockDim.x + tid;
+    const int_64 idx = int_64(blockIdx.x) * blockDim.x + tid;  // widen first so the product is formed in 64 bits
     const unsigned int idy = blockIdx.y;
     const int ndescrpt = nnei;
     __shared__ FPTYPE grad_one[9];
@@ -83,7 +83,7 @@ __global__ void virial_grad_wrt_neighbors_r(
             tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0];
         }
     }
-    grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp);
+    grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
 }
 
 namespace deepmd {
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
index 0354bc68b3..caa51578dd 100644
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ b/source/lib/src/rocm/tabulate.hip.cu
@@ -20,7 +20,7 @@ void locate_xx(
 {
   if (xx < lower) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < upper) {
     table_idx = (int)((xx - lower) / stride0);
@@ -33,7 +33,7 @@ void locate_xx(
   }
   else {
     table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -51,7 +51,7 @@ void locate_xx_se_t(
 {
   if (xx < min) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < lower) {
     table_idx = (int)((xx - min) / stride1);
@@ -69,7 +69,7 @@ void locate_xx_se_t(
   }
   else {
     table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -110,14 +110,14 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
   FPTYPE * iteratorC = (FPTYPE*) &_data[0];
   for (int kk = 0; kk < MTILE; kk++)
-    iteratorC[kk * last_layer_size + thread_idx] = 0.f;
+    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
   __syncthreads();
 
   for (int ii = 0; ii < nnei; ii++) {
@@ -167,7 +167,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl(threadIdx.x / 64, 0);
   int lane_idx = threadIdx.x % 64;
@@ -190,8 +190,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     
     int table_idx = 0;
     locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    FPTYPE sum[KTILE] = {0.f};
-    FPTYPE Csub = 0.f;
+    FPTYPE sum[KTILE] = {(FPTYPE)0.};
+    FPTYPE Csub = (FPTYPE)0.;
     for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
       FPTYPE var[6]; 
       // load iteratorB through table 
@@ -210,7 +210,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
       res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj];
       res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj];
       res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj];
-      Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res;
+      Csub += (nnei - breakpoint) * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx) * res;
     }
     //__syncwarp();->syncwrap
     __syncthreads();
@@ -248,14 +248,14 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const int last_layer_size)
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   FPTYPE ago = __shfl( em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
   FPTYPE * iteratorC = (FPTYPE*) &_data[0];
   for (int kk = 0; kk < MTILE; kk++)
-    iteratorC[kk * last_layer_size + thread_idx] = 0.f;
+    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
   __syncthreads();
 
   for (int ii = 0; ii < nnei; ii++) {
@@ -275,7 +275,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
     var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
     FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx;
-    FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx;
+    FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx;
 
     for (int kk = 0; kk < MTILE; kk++) {
       int em_index = block_idx * nnei * MTILE + ii * MTILE + kk;
@@ -307,10 +307,10 @@ __global__ void tabulate_fusion_se_t_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
-  FPTYPE sum = 0.f;
+  FPTYPE sum = (FPTYPE)0.;
   for (int ii = 0; ii < nnei_i; ii++) {
     FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
     int breakpoint = nnei_j - 1;
@@ -361,7 +361,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl(threadIdx.x / 64, 0);
   int lane_idx = threadIdx.x % 64;
@@ -382,8 +382,8 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
       }
       int table_idx = 0;
       locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE sum  = 0.f;
-      FPTYPE Csub = 0.f;
+      FPTYPE sum  = (FPTYPE)0.;
+      FPTYPE Csub = (FPTYPE)0.;
       for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) {
         FPTYPE var[6]; 
         // load iteratorB through table 
@@ -396,7 +396,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
         FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx;
 
         sum  += iteratorA[kk] * res;
-        Csub += iteratorA[kk] * tmp * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx);
+        Csub += iteratorA[kk] * tmp * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx);
       }
       __syncthreads();
       warp_reduce(sum);
@@ -430,10 +430,10 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
     const int nnei_j,
     const int last_layer_size)
 {
-  const int block_idx  = blockIdx.x;   // nloc
+  const int_64 block_idx  = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
-  FPTYPE sum = 0.f;
+  FPTYPE sum = (FPTYPE)0.;
   for (int ii = 0; ii < nnei_i; ii++) { 
     FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
     bool unloop = false;
@@ -456,7 +456,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
       var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
       var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
       FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx;
-      FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx;
+      FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx;
   
       sum += (tmp * res_grad * dz_xx + dz_em * res);
       if (unloop) break;
@@ -482,7 +482,7 @@ __global__ void tabulate_fusion_se_r_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
   
   for (int ii = 0; ii < nnei; ii++) {
@@ -519,7 +519,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     const int last_layer_size) 
 {
   HIP_DYNAMIC_SHARED( int, _data)
-  const int block_idx = blockIdx.x;  // nloc
+  const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~
   int warp_idx = __shfl(threadIdx.x / 64, 0);
   int lane_idx = threadIdx.x % 64;
@@ -539,7 +539,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
       var[3]  = table[table_idx * last_layer_size * 6 + 6 * jj + 3];
       var[4]  = table[table_idx * last_layer_size * 6 + 6 * jj + 4];
       var[5]  = table[table_idx * last_layer_size * 6 + 6 * jj + 5];
-      Csub +=(var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
+      Csub +=(var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx) * dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
     }
     //__syncwarp();->syncwrap
     __syncthreads();
@@ -569,7 +569,7 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
     const int last_layer_size)
 {
   extern __shared__ int _data[];
-  const int block_idx = blockIdx.x;   // nloc
+  const int_64 block_idx = blockIdx.x;   // nloc
   const int thread_idx = threadIdx.x; // last_layer_size
 
   __syncthreads();
@@ -585,7 +585,7 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
     var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
     var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
     var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx;
+    FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx;
     dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size + thread_idx] = dz_dy_dem[block_idx * nnei + ii]*res_grad;
 
   }
diff --git a/source/lib/src/soft_min_switch.cc b/source/lib/src/soft_min_switch.cc
index 88471a3d4b..9b37b29cde 100644
--- a/source/lib/src/soft_min_switch.cc
+++ b/source/lib/src/soft_min_switch.cc
@@ -17,12 +17,12 @@ void deepmd::soft_min_switch_cpu(
 {
   // fill results with 0
   for (int ii = 0; ii < nloc; ++ii){
-    sw_value[ii] = 0;
+    sw_value[ii] = (FPTYPE)0.;
   }
   for (int ii = 0; ii < nloc * nnei; ++ii){
-    sw_deriv[ii * 3 + 0] = 0;
-    sw_deriv[ii * 3 + 1] = 0;
-    sw_deriv[ii * 3 + 2] = 0;
+    sw_deriv[ii * 3 + 0] = (FPTYPE)0.;
+    sw_deriv[ii * 3 + 1] = (FPTYPE)0.;
+    sw_deriv[ii * 3 + 2] = (FPTYPE)0.;
   }
   // compute force of a frame      
   for (int ii = 0; ii < nloc; ++ii){
@@ -62,8 +62,8 @@ void deepmd::soft_min_switch_cpu(
       FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2];
       FPTYPE rr = sqrt(rr2);
       FPTYPE ee = exp(-rr / alpha);
-      FPTYPE pref_c = (1./rr - 1./alpha) * ee ;
-      FPTYPE pref_d = 1./(rr * alpha) * ee;
+      FPTYPE pref_c = ((FPTYPE)1./rr - (FPTYPE)1./alpha) * ee ;
+      FPTYPE pref_d = (FPTYPE)1./(rr * alpha) * ee;
       FPTYPE ts;
       ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d);
       sw_deriv[rij_idx_shift + 0] += ts * dr[0];
diff --git a/source/lib/src/soft_min_switch_force.cc b/source/lib/src/soft_min_switch_force.cc
index 724952493d..9a6633daa0 100644
--- a/source/lib/src/soft_min_switch_force.cc
+++ b/source/lib/src/soft_min_switch_force.cc
@@ -19,9 +19,9 @@ void deepmd::soft_min_switch_force_cpu(
   // set zeros
   for (int ii = 0; ii < nall; ++ii){
     int i_idx = ii;
-    force[i_idx * 3 + 0] = 0;
-    force[i_idx * 3 + 1] = 0;
-    force[i_idx * 3 + 2] = 0;
+    force[i_idx * 3 + 0] = (FPTYPE)0.;
+    force[i_idx * 3 + 1] = (FPTYPE)0.;
+    force[i_idx * 3 + 2] = (FPTYPE)0.;
   }
   // compute force of a frame
   for (int ii = 0; ii < nloc; ++ii){
diff --git a/source/lib/src/soft_min_switch_force_grad.cc b/source/lib/src/soft_min_switch_force_grad.cc
index 31e46e9d6d..138d20d93c 100644
--- a/source/lib/src/soft_min_switch_force_grad.cc
+++ b/source/lib/src/soft_min_switch_force_grad.cc
@@ -18,7 +18,7 @@ void deepmd::soft_min_switch_force_grad_cpu(
 {
   // reset the frame to 0
   for (int ii = 0; ii < nloc; ++ii){
-    grad_net[ii] = 0;
+    grad_net[ii] = (FPTYPE)0.;
   }      
 
   // compute grad of one frame
diff --git a/source/lib/src/soft_min_switch_virial.cc b/source/lib/src/soft_min_switch_virial.cc
index a93ab3c1fb..634f6b6d49 100644
--- a/source/lib/src/soft_min_switch_virial.cc
+++ b/source/lib/src/soft_min_switch_virial.cc
@@ -20,10 +20,10 @@ void deepmd::soft_min_switch_virial_cpu(
 //
 {
   for (int ii = 0; ii < 9; ++ ii){
-    virial[ii] = 0.;
+    virial[ii] = (FPTYPE)0.;
   }
   for (int ii = 0; ii < 9 * nall; ++ ii){
-    atom_virial[ii] = 0.;
+    atom_virial[ii] = (FPTYPE)0.;
   }
 
   // compute virial of a frame
diff --git a/source/lib/src/soft_min_switch_virial_grad.cc b/source/lib/src/soft_min_switch_virial_grad.cc
index 1bb28a7c63..5c86376737 100644
--- a/source/lib/src/soft_min_switch_virial_grad.cc
+++ b/source/lib/src/soft_min_switch_virial_grad.cc
@@ -19,7 +19,7 @@ void deepmd::soft_min_switch_virial_grad_cpu(
 {
   // reset the frame to 0
   for (int ii = 0; ii < nloc; ++ii){
-    grad_net[ii] = 0;
+    grad_net[ii] = (FPTYPE)0.;
   }      
 
   // compute grad of one frame
diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc
index e116711466..b75e9155f9 100644
--- a/source/lib/src/tabulate.cc
+++ b/source/lib/src/tabulate.cc
@@ -25,7 +25,7 @@ inline void locate_xx(
 {
   if (xx < lower) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < upper) {
     table_idx = (int)((xx - lower) / stride0);
@@ -38,7 +38,7 @@ inline void locate_xx(
   }
   else {
     table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -56,7 +56,7 @@ inline void locate_xx_se_t(
 {
   if (xx < min) {
     table_idx = 0;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
   else if (xx < lower) {
     table_idx = (int)((xx - min) / stride1);
@@ -74,7 +74,7 @@ inline void locate_xx_se_t(
   }
   else {
     table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = 0;
+    xx = (FPTYPE)0.;
   }
 }
 
@@ -187,7 +187,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(
       }
       int table_idx = 0;
       locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx);
-      FPTYPE grad = 0.0;
+      FPTYPE grad = (FPTYPE)0.0;
       for (int kk = 0; kk < last_layer_size; kk++) {
         rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk];
         rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk];
@@ -273,7 +273,7 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(
         FPTYPE a4  = table[table_idx * last_layer_size * 6 + 6 * kk + 4];
         FPTYPE a5  = table[table_idx * last_layer_size * 6 + 6 * kk + 5];
         FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx;
-        FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx;
+        FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx;
         if (unloop) {
           dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * (var * hh[0] + dz_xx * var_grad * ll[0]);
           dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * (var * hh[1] + dz_xx * var_grad * ll[1]);
@@ -371,8 +371,8 @@ void deepmd::tabulate_fusion_se_t_grad_cpu(
   // FPTYPE * res = new FPTYPE[4 * last_layer_size];
   #pragma omp parallel for
   for (int ii = 0; ii < nloc; ii++) {
-    FPTYPE ll = 0;
-    FPTYPE rr = 0;
+    FPTYPE ll = (FPTYPE)0.;
+    FPTYPE rr = (FPTYPE)0.;
     for (int jj = 0; jj < nnei_i; jj++) {
       FPTYPE ago = em_x[ii * nnei_i * nnei_j + jj * nnei_j + nnei_j - 1];
       bool unloop = false;
@@ -385,7 +385,7 @@ void deepmd::tabulate_fusion_se_t_grad_cpu(
         }
         int table_idx = 0;
         locate_xx_se_t(lower, upper, -_max, _max, stride0, stride1, xx, table_idx);
-        FPTYPE grad = 0.0;
+        FPTYPE grad = (FPTYPE)0.0;
         for (int mm = 0; mm < last_layer_size; mm++) {
           rr = dy[ii * last_layer_size + mm];
           FPTYPE a0  = table[table_idx * last_layer_size * 6 + 6 * mm + 0]; 
@@ -397,11 +397,11 @@ void deepmd::tabulate_fusion_se_t_grad_cpu(
           FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx;
 
           if (unloop) {
-            grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr * (nnei_j - kk);
+            grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx) * ll * rr * (nnei_j - kk);
             dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr * (nnei_j - kk);
           }
           else {
-            grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr;
+            grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx) * ll * rr;
             dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr;
           }
         }
@@ -458,7 +458,7 @@ void deepmd::tabulate_fusion_se_t_grad_grad_cpu(
           FPTYPE a4  = table[table_idx * last_layer_size * 6 + 6 * mm + 4];
           FPTYPE a5  = table[table_idx * last_layer_size * 6 + 6 * mm + 5];
           FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx;
-          FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx;
+          FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx;
           
           dz_dy[ii * last_layer_size + mm] += var * dz_em + dz_xx * var_grad * tmp;
         }
@@ -531,7 +531,7 @@ void deepmd::tabulate_fusion_se_r_grad_cpu(
       FPTYPE xx = em[ii * nnei + jj]; 
       int table_idx = 0;
       locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx);
-      FPTYPE grad = 0.0;
+      FPTYPE grad = (FPTYPE)0.0;
       for (int kk = 0; kk < last_layer_size; kk++) {
         FPTYPE a0  = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; 
         FPTYPE a1  = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; 
@@ -539,7 +539,7 @@ void deepmd::tabulate_fusion_se_r_grad_cpu(
         FPTYPE a3  = table[table_idx * last_layer_size * 6 + 6 * kk + 3];
         FPTYPE a4  = table[table_idx * last_layer_size * 6 + 6 * kk + 4];
         FPTYPE a5  = table[table_idx * last_layer_size * 6 + 6 * kk + 5];
-        grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dy[ii * last_layer_size * nnei + jj * last_layer_size + kk];
+        grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx) * dy[ii * last_layer_size * nnei + jj * last_layer_size + kk];
       }
       dy_dem[ii * nnei + jj] = grad;
     }
@@ -578,7 +578,7 @@ void deepmd::tabulate_fusion_se_r_grad_grad_cpu(
         FPTYPE a3  = table[table_idx * last_layer_size * 6 + 6 * kk + 3];
         FPTYPE a4  = table[table_idx * last_layer_size * 6 + 6 * kk + 4];
         FPTYPE a5  = table[table_idx * last_layer_size * 6 + 6 * kk + 5];
-        FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx;
+        FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx;
         dz_dy[ii * last_layer_size * nnei + jj * last_layer_size + kk] = dz_dy_dem[ii * nnei + jj] * var_grad;
       }
     }
diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt
index 8c3a3e4c16..394c9730c7 100644
--- a/source/lib/tests/CMakeLists.txt
+++ b/source/lib/tests/CMakeLists.txt
@@ -72,13 +72,22 @@ endif()
 
 if (USE_CUDA_TOOLKIT)
   target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config)
+  install(TARGETS deepmd_op_cuda DESTINATION lib/)
+
 elseif (USE_ROCM_TOOLKIT)
   target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES})
+  install(TARGETS deepmd_op_rocm DESTINATION lib/)
 else()
   target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config)
 endif()
 add_test( runUnitTests runUnitTests )
 
+set_target_properties(
+  runUnitTests
+  PROPERTIES 
+  INSTALL_RPATH "$ORIGIN/../lib"
+)
+
 # include(GoogleTest)
 # add_executable(FooTest tests/test_simulation_region.cc)
 # gtest_add_tests(TARGET      FooTest
@@ -108,4 +117,4 @@ else ()
 endif ()
 
 install(TARGETS runUnitTests DESTINATION bin/)
-install(TARGETS runUnitTests DESTINATION lib/)
+
diff --git a/source/lib/tests/test_tabulate_se_r.cc b/source/lib/tests/test_tabulate_se_r.cc
index b7173b0123..ebd4eb2295 100644
--- a/source/lib/tests/test_tabulate_se_r.cc
+++ b/source/lib/tests/test_tabulate_se_r.cc
@@ -122,7 +122,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda)
   std::vector<double> dy_dem(em.size(), 0.0);
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
 
-  * dy_dem_dev = NULL, * table_dev = NULL, * em_dev = NULL, * dy_dev = NULL;
+  double * dy_dem_dev = NULL, * table_dev = NULL, * em_dev = NULL, * dy_dev = NULL;
   deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_dev, em);
diff --git a/source/lmp/CMakeLists.txt b/source/lmp/CMakeLists.txt
index 970959c678..dbf259800a 100644
--- a/source/lmp/CMakeLists.txt
+++ b/source/lmp/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(plugin)
 if (NOT DEFINED LAMMPS_VERSION_NUMBER)
-    # set the default to stable_29Sep2021
-    set(LAMMPS_VERSION_NUMBER 20210929)
+    # set the default to stable_23Jun2022
+    set(LAMMPS_VERSION_NUMBER 20220623)
 endif()
 message(STATUS "LAMMPS version is ${LAMMPS_VERSION_NUMBER}")
 
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index b9793ff7f9..f457d2a183 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -411,7 +411,11 @@ void PairDeepMD::compute(int eflag, int vflag)
 	vector<double > deatom (nall * 1, 0);
 	vector<double > dvatom (nall * 9, 0);
 #ifdef HIGH_PREC
+  try {
 	deep_pot.compute (dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam);
+  } catch(deepmd::deepmd_exception& e) {
+    error->all(FLERR, e.what());
+  }
 #else 
 	vector<float> dcoord_(dcoord.size());
 	vector<float> dbox_(dbox.size());
diff --git a/source/op/custom_op.h b/source/op/custom_op.h
index 8482e92b03..93bfc90c86 100644
--- a/source/op/custom_op.h
+++ b/source/op/custom_op.h
@@ -1,6 +1,7 @@
 #include <vector>
 #include <string>
 #include <iostream>
+#include "device.h"
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
index af49c4ac7e..9865599b4b 100644
--- a/source/op/gelu_multi_device.cc
+++ b/source/op/gelu_multi_device.cc
@@ -45,7 +45,7 @@ class GeluOp : public OpKernel {
     // flat the tensors
     FPTYPE * out = output_tensor->flat<FPTYPE>().data();
     const FPTYPE * x = x_tensor.flat<FPTYPE>().data();
-    const int size = static_cast<int>(output_tensor->NumElements());
+    const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
@@ -98,7 +98,7 @@ class GeluGradOp : public OpKernel {
     FPTYPE * out = output_tensor->flat<FPTYPE>().data();
     const FPTYPE * x = x_tensor.flat<FPTYPE>().data();
     const FPTYPE * dy = dy_tensor.flat<FPTYPE>().data();
-    const int size = static_cast<int>(output_tensor->NumElements());
+    const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
@@ -153,7 +153,7 @@ class GeluGradGradOp : public OpKernel {
     const FPTYPE * x = x_tensor.flat<FPTYPE>().data();
     const FPTYPE * dy = dy_tensor.flat<FPTYPE>().data();
     const FPTYPE * dy_2 = dy_2_tensor.flat<FPTYPE>().data();
-    const int size = static_cast<int>(output_tensor->NumElements());
+    const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
diff --git a/source/op/optimizer/parallel.cc b/source/op/optimizer/parallel.cc
index 55e120e989..317de4b11f 100644
--- a/source/op/optimizer/parallel.cc
+++ b/source/op/optimizer/parallel.cc
@@ -2,6 +2,13 @@
 #include "tensorflow/core/public/version.h"
 #if TF_MAJOR_VERSION >= 2 || (TF_MAJOR_VERSION == 1 && TF_MINOR_VERSION >= 15)
 
+#if TF_MAJOR_VERSION > 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 7)
+// breaking change in TF 2.7: renaming of tensorflow::int64 to int64_t (applies to every TF >= 2.7)
+#define TF_INT64 int64_t
+#else
+#define TF_INT64 tensorflow::int64
+#endif
+
 #include "parallel.h"
 
 #include "tensorflow/core/grappler/devices.h"
@@ -34,10 +41,10 @@ bool FindProdForce(RemapperContext *ctx, int node_index) {
   return IsProdForce(*node_def);
 }
 
-int64_t GetNThreads() {
+TF_INT64 GetNThreads() {
   // the number of threads is based on the session...
   // For convenience, we use environment variable directly
-  int64_t tot = 1;
+  TF_INT64 tot = 1;
   Status status =
       ReadInt64FromEnvVar("TF_INTER_OP_PARALLELISM_THREADS", 1, &tot);
   if (!status.ok()) {
@@ -55,7 +62,7 @@ Status ParallelProdForce(RemapperContext *ctx, int node_index,
 
   const NodeDef *ori_node = ctx->graph_view.GetNode(node_index)->node();
   auto &src_attr = ori_node->attr();
-  int64_t tot = GetNThreads();
+  TF_INT64 tot = GetNThreads();
   if (tot <= 1)
     return Status::OK();
 
diff --git a/source/op/optimizer/parallel.h b/source/op/optimizer/parallel.h
index 7de9f0b7ea..efedf65da8 100644
--- a/source/op/optimizer/parallel.h
+++ b/source/op/optimizer/parallel.h
@@ -16,6 +16,11 @@ class DPParallel : public CustomGraphOptimizer {
   bool UsesFunctionLibrary() const override { return false; }
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override;
+#if TF_MAJOR_VERSION < 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION < 6)
+// no-op override, compiled only for TF < 2.6; see TF commit 3457a2b122e50b4d44ceaaed5a663d635e5c22df
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+#endif
 };
 
 #endif  // DP_REMAPPER_H_
\ No newline at end of file
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 69e08eaa5e..fbee405ca1 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -48,6 +48,26 @@ REGISTER_OP("DescrptSeA")
     .Output("rij: T")
     .Output("nlist: int32");
 
+// alias of ProdEnvMatA -- compatible with v0.12
+REGISTER_OP("DescrptNorot")
+    .Attr("T: {float, double} = DT_DOUBLE")
+    .Input("coord: T")
+    .Input("type: int32")
+    .Input("natoms: int32")
+    .Input("box : T")
+    .Input("mesh : int32")
+    .Input("davg: T")
+    .Input("dstd: T")
+    .Attr("rcut_a: float")
+    .Attr("rcut_r: float")
+    .Attr("rcut_r_smth: float")
+    .Attr("sel_a: list(int)")
+    .Attr("sel_r: list(int)")
+    .Output("descrpt: T")
+    .Output("descrpt_deriv: T")
+    .Output("rij: T")
+    .Output("nlist: int32");
+
 REGISTER_OP("ProdEnvMatR")
     .Attr("T: {float, double} = DT_DOUBLE")
     .Input("coord: T")
@@ -393,16 +413,16 @@ class ProdEnvMatAOp : public OpKernel {
     // Create output tensors
     TensorShape descrpt_shape ;
     descrpt_shape.AddDim (nsamples);
-    descrpt_shape.AddDim (nloc * ndescrpt);
+    descrpt_shape.AddDim (int_64(nloc) * ndescrpt);
     TensorShape descrpt_deriv_shape ;
     descrpt_deriv_shape.AddDim (nsamples);
-    descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3);
+    descrpt_deriv_shape.AddDim (int_64(nloc) * ndescrpt * 3);
     TensorShape rij_shape ;
     rij_shape.AddDim (nsamples);
-    rij_shape.AddDim (nloc * nnei * 3);
+    rij_shape.AddDim (int_64(nloc) * nnei * 3);
     TensorShape nlist_shape ;
     nlist_shape.AddDim (nsamples);
-    nlist_shape.AddDim (nloc * nnei);
+    nlist_shape.AddDim (int_64(nloc) * nnei);
     // define output tensor
     int context_output_index = 0;
     Tensor* descrpt_tensor = NULL;
@@ -437,7 +457,7 @@ class ProdEnvMatAOp : public OpKernel {
     const int * p_type = type_tensor.flat<int>().data();
 
     // loop over samples
-    for(int ff = 0; ff < nsamples; ++ff){
+    for(int_64 ff = 0; ff < nsamples; ++ff){
       FPTYPE * em = p_em + ff*nloc*ndescrpt;
       FPTYPE * em_deriv = p_em_deriv + ff*nloc*ndescrpt*3;
       FPTYPE * rij = p_rij + ff*nloc*nnei*3;
@@ -468,11 +488,11 @@ class ProdEnvMatAOp : public OpKernel {
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
-      int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc);
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
       Tensor uint64_temp;
       TensorShape uint64_shape;
-      uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
       array_int = int_temp.flat<int>().data(); 
       array_longlong = uint64_temp.flat<unsigned long long>().data();
@@ -506,11 +526,11 @@ class ProdEnvMatAOp : public OpKernel {
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
-      int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc);
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
       Tensor uint64_temp;
       TensorShape uint64_shape;
-      uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
       array_int = int_temp.flat<int>().data(); 
       array_longlong = uint64_temp.flat<unsigned long long>().data();
@@ -657,16 +677,16 @@ class ProdEnvMatROp : public OpKernel {
     // Create an output tensor
     TensorShape descrpt_shape ;
     descrpt_shape.AddDim (nsamples);
-    descrpt_shape.AddDim (nloc * ndescrpt);
+    descrpt_shape.AddDim (int_64(nloc) * ndescrpt);
     TensorShape descrpt_deriv_shape ;
     descrpt_deriv_shape.AddDim (nsamples);
-    descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3);
+    descrpt_deriv_shape.AddDim (int_64(nloc) * ndescrpt * 3);
     TensorShape rij_shape ;
     rij_shape.AddDim (nsamples);
-    rij_shape.AddDim (nloc * nnei * 3);
+    rij_shape.AddDim (int_64(nloc) * nnei * 3);
     TensorShape nlist_shape ;
     nlist_shape.AddDim (nsamples);
-    nlist_shape.AddDim (nloc * nnei);
+    nlist_shape.AddDim (int_64(nloc) * nnei);
 
     int context_output_index = 0;
     Tensor* descrpt_tensor = NULL;
@@ -701,7 +721,7 @@ class ProdEnvMatROp : public OpKernel {
     const int * p_type = type_tensor.flat<int>().data();
 
     // loop over samples
-    for(int ff = 0; ff < nsamples; ++ff){
+    for(int_64 ff = 0; ff < nsamples; ++ff){
       FPTYPE * em = p_em + ff*nloc*ndescrpt;
       FPTYPE * em_deriv = p_em_deriv + ff*nloc*ndescrpt*3;
       FPTYPE * rij = p_rij + ff*nloc*nnei*3;
@@ -732,11 +752,11 @@ class ProdEnvMatROp : public OpKernel {
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
-      int_shape.AddDim(sec.size() + nloc * sec.size() + nloc);
+      int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
       Tensor uint64_temp;
       TensorShape uint64_shape;
-      uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
       array_int = int_temp.flat<int>().data(); 
       array_longlong = uint64_temp.flat<unsigned long long>().data();
@@ -771,11 +791,11 @@ class ProdEnvMatROp : public OpKernel {
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
-      int_shape.AddDim(sec.size() + nloc * sec.size() + nloc);
+      int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
       Tensor uint64_temp;
       TensorShape uint64_shape;
-      uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
       OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
       array_int = int_temp.flat<int>().data(); 
       array_longlong = uint64_temp.flat<unsigned long long>().data();
@@ -915,8 +935,8 @@ _map_nlist_cpu(
     const int & nloc,
     const int & nnei)
 {
-  for (int ii = 0; ii < nloc; ++ii){
-    for (int jj = 0; jj < nnei; ++jj){
+  for (int_64 ii = 0; ii < nloc; ++ii){
+    for (int_64 jj = 0; jj < nnei; ++jj){
       int record = nlist[ii*nnei+jj];
       if (record >= 0) {		
 	nlist[ii*nnei+jj] = idx_mapping[record];	      
@@ -1092,11 +1112,11 @@ _build_nlist_gpu(
   int tt;
   for(tt = 0; tt < max_nnei_trial; ++tt){
     TensorShape jlist_shape;
-    jlist_shape.AddDim(3*nloc*mem_nnei);
+    jlist_shape.AddDim(3*int_64(nloc)*mem_nnei);
     context->allocate_temp(DT_INT32, jlist_shape, tensor_list+1);
     jlist = (*(tensor_list+1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
-    for(int ii = 0; ii < nloc; ++ii){
+    for(int_64 ii = 0; ii < nloc; ++ii){
       firstneigh_host[ii] = jlist + ii * mem_nnei;
     }
     deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
@@ -1190,7 +1210,7 @@ _prepare_coord_nlist_gpu(
     deepmd::env_mat_nbor_update(
         inlist_temp, inlist, max_nbor_size, nbor_list_dev,
         mesh_tensor_data, mesh_tensor_size);
-    OP_REQUIRES (context, (max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
+    OP_REQUIRES (context, (max_numneigh(inlist_temp) <= max_nbor_size), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(max_nbor_size) + ", which currently is not supported by deepmd-kit."));
   }
 }
 #endif  // GOOGLE_CUDA
@@ -1307,11 +1327,11 @@ _build_nlist_gpu_rocm(
   int tt;
   for(tt = 0; tt < max_nnei_trial; ++tt){
     TensorShape jlist_shape;
-    jlist_shape.AddDim(3*nloc*mem_nnei);
+    jlist_shape.AddDim(3*int_64(nloc)*mem_nnei);
     context->allocate_temp(DT_INT32, jlist_shape, tensor_list+1);
     jlist = (*(tensor_list+1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
-    for(int ii = 0; ii < nloc; ++ii){
+    for(int_64 ii = 0; ii < nloc; ++ii){
       firstneigh_host[ii] = jlist + ii * mem_nnei;
     }
     deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
@@ -1405,7 +1425,7 @@ _prepare_coord_nlist_gpu_rocm(
     deepmd::env_mat_nbor_update(
         inlist_temp, inlist, max_nbor_size, nbor_list_dev,
         mesh_tensor_data, mesh_tensor_size);
-    OP_REQUIRES (context, (max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
+    OP_REQUIRES (context, (max_numneigh(inlist_temp) <= max_nbor_size), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(max_nbor_size) + ", which currently is not supported by deepmd-kit."));
   }
 }
 #endif  // TENSORFLOW_USE_ROCM
@@ -1423,6 +1443,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(                                                                                  \
     Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint<T>("T"),                                        \
     ProdEnvMatAOp<CPUDevice, T>);                                                                         \
+REGISTER_KERNEL_BUILDER(                                                                                  \
+    Name("DescrptNorot").Device(DEVICE_CPU).TypeConstraint<T>("T"),                                        \
+    ProdEnvMatAOp<CPUDevice, T>);                                                                         \
 REGISTER_KERNEL_BUILDER(                                                                                  \
     Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint<T>("T"),                                        \
     ProdEnvMatROp<CPUDevice, T>);   
@@ -1442,6 +1465,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(                                                                                  \
     Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms").HostMemory("box"), \
     ProdEnvMatAOp<GPUDevice, T>);                                                                         \
+REGISTER_KERNEL_BUILDER(                                                                                  \
+    Name("DescrptNorot").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms").HostMemory("box"), \
+    ProdEnvMatAOp<GPUDevice, T>);                                                                         \
 REGISTER_KERNEL_BUILDER(                                                                                  \
     Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms").HostMemory("box"), \
     ProdEnvMatROp<GPUDevice, T>);
diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc
index 533f6cbf14..2316fa3029 100644
--- a/source/op/prod_force_grad_multi_device.cc
+++ b/source/op/prod_force_grad_multi_device.cc
@@ -69,13 +69,13 @@ class ProdForceSeAGradOp : public OpKernel {
     OP_REQUIRES (context, (nframes == nlist_shape.dim_size(0)),		errors::InvalidArgument ("number of frames should match"));
     
     OP_REQUIRES (context, (nloc * 3 == grad_shape.dim_size(1)),		errors::InvalidArgument ("input grad shape should be 3 x natoms"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
     OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel),			errors::InvalidArgument ("number of neighbors should match"));
 
     // Create an output tensor
     TensorShape grad_net_shape ;
     grad_net_shape.AddDim (nframes);
-    grad_net_shape.AddDim (nloc * ndescrpt);
+    grad_net_shape.AddDim (int_64(nloc) * ndescrpt);
 
     // allocate the output tensor
     Tensor* grad_net_tensor = NULL;
@@ -106,7 +106,7 @@ class ProdForceSeAGradOp : public OpKernel {
     const FPTYPE * p_in_deriv = in_deriv_tensor.flat<FPTYPE>().data();
     const int * p_nlist	= nlist_tensor.flat<int>().data();
 
-    for (int kk = 0; kk < nframes; ++kk){
+    for (int_64 kk = 0; kk < nframes; ++kk){
         FPTYPE * grad_net = p_grad_net + kk * nloc * ndescrpt;
         const FPTYPE * grad = p_grad + kk * nloc * 3;
         const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
@@ -181,12 +181,12 @@ class ProdForceSeRGradOp : public OpKernel
     OP_REQUIRES (context, (nframes == nlist_shape.dim_size(0)),		errors::InvalidArgument ("number of frames should match"));
     
     OP_REQUIRES (context, (nloc * 3 == grad_shape.dim_size(1)),		errors::InvalidArgument ("input grad shape should be 3 x natoms"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
 
     // Create an output tensor
     TensorShape grad_net_shape ;
     grad_net_shape.AddDim (nframes);
-    grad_net_shape.AddDim (nloc * ndescrpt);
+    grad_net_shape.AddDim (int_64(nloc) * ndescrpt);
 
     // allocate the output tensor
     Tensor* grad_net_tensor = NULL;
@@ -218,7 +218,7 @@ class ProdForceSeRGradOp : public OpKernel
     const int * p_nlist	= nlist_tensor.flat<int>().data();
 
     // loop over frames
-    for (int kk = 0; kk < nframes; ++kk){
+    for (int_64 kk = 0; kk < nframes; ++kk){
         FPTYPE * grad_net = p_grad_net + kk * nloc * ndescrpt;
         const FPTYPE * grad = p_grad + kk * nloc * 3;
         const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc
index 38a2ee88a6..08c77bca65 100644
--- a/source/op/prod_force_multi_device.cc
+++ b/source/op/prod_force_multi_device.cc
@@ -12,6 +12,17 @@ REGISTER_OP("ProdForceSeA")
     .Attr("n_r_sel: int")
     .Output("force: T");
 
+// Legacy op name kept for compatibility with v0.12; the kernel builders in this file bind it to ProdForceSeAOp, the same kernel as ProdForceSeA.
+REGISTER_OP("ProdForceNorot")
+    .Attr("T: {float, double} = DT_DOUBLE")
+    .Input("net_deriv: T")
+    .Input("in_deriv: T")
+    .Input("nlist: int32")
+    .Input("natoms: int32")
+    .Attr("n_a_sel: int")
+    .Attr("n_r_sel: int")
+    .Output("force: T");
+
 // rename temp op
 REGISTER_OP("ParallelProdForceSeA")
     .Attr("T: {float, double} = DT_DOUBLE")
@@ -69,7 +80,7 @@ class ProdForceSeAOp : public OpKernel {
     // check the sizes
     OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match"));
     OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)),    errors::InvalidArgument ("number of samples should match"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
     // Create an output tensor
     TensorShape force_shape ;
     force_shape.AddDim (nframes);
@@ -113,7 +124,7 @@ class ProdForceSeAOp : public OpKernel {
       nloc_loc = end_index - start_index;
     }
 
-    for(int kk = 0; kk < nframes; ++kk){
+    for(int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * force = p_force + kk * nall * 3;
       const FPTYPE * net_deriv = p_net_deriv + kk * nloc * ndescrpt;
       const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
@@ -201,7 +212,7 @@ class ProdForceSeROp : public OpKernel {
     const FPTYPE * p_in_deriv = in_deriv_tensor.flat<FPTYPE>().data();
     const int * p_nlist = nlist_tensor.flat<int>().data();
 
-    for(int kk = 0; kk < nframes; ++kk){
+    for(int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * force = p_force + kk * nall * 3;
       const FPTYPE * net_deriv = p_net_deriv + kk * nloc * ndescrpt;
       const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
@@ -235,6 +246,9 @@ class ProdForceSeROp : public OpKernel {
 REGISTER_KERNEL_BUILDER(                                                                 \
     Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint<T>("T"),                      \
     ProdForceSeAOp<CPUDevice, T>);                                                       \
+REGISTER_KERNEL_BUILDER(                                                                 \
+    Name("ProdForceNorot").Device(DEVICE_CPU).TypeConstraint<T>("T"),                      \
+    ProdForceSeAOp<CPUDevice, T>);                                                       \
 REGISTER_KERNEL_BUILDER(                                                                 \
     Name("ParallelProdForceSeA").Device(DEVICE_CPU).TypeConstraint<T>("T"),             \
     ProdForceSeAOp<CPUDevice, T>);                                                       \
@@ -249,6 +263,9 @@ REGISTER_CPU(double);
 REGISTER_KERNEL_BUILDER(                                                                 \
     Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
     ProdForceSeAOp<GPUDevice, T>);                                                       \
+REGISTER_KERNEL_BUILDER(                                                                 \
+    Name("ProdForceNorot").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
+    ProdForceSeAOp<GPUDevice, T>);                                                       \
 REGISTER_KERNEL_BUILDER(                                                                 \
     Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
     ProdForceSeROp<GPUDevice, T>);
diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc
index 9afd4462eb..8007d72acc 100644
--- a/source/op/prod_virial_grad_multi_device.cc
+++ b/source/op/prod_virial_grad_multi_device.cc
@@ -76,14 +76,14 @@ class ProdVirialSeAGradOp : public OpKernel
     OP_REQUIRES (context, (nframes == nlist_shape.dim_size(0)),		errors::InvalidArgument ("number of frames should match"));
     
     OP_REQUIRES (context, (9 == grad_shape.dim_size(1)),		errors::InvalidArgument ("input grad shape should be 3 x natoms"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
-    OP_REQUIRES (context, (nloc * nnei * 3 == rij_shape.dim_size(1)),	errors::InvalidArgument ("dim of rij should be  nnei * 3"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * nnei * 3 == rij_shape.dim_size(1)),	errors::InvalidArgument ("dim of rij should be  nnei * 3"));
     OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel),			errors::InvalidArgument ("number of neighbors should match"));
 
     // Create an output tensor
     TensorShape grad_net_shape ;
     grad_net_shape.AddDim (nframes);
-    grad_net_shape.AddDim (nloc * ndescrpt);
+    grad_net_shape.AddDim (int_64(nloc) * ndescrpt);
 
     // allocate the output tensor
     Tensor* grad_net_tensor = NULL;
@@ -119,7 +119,7 @@ class ProdVirialSeAGradOp : public OpKernel
     const int * p_nlist	= nlist_tensor.flat<int>().data();
 
     // loop over frames
-    for (int kk = 0; kk < nframes; ++kk){
+    for (int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * grad_net = p_grad_net + kk * nloc * ndescrpt;
       const FPTYPE * grad = p_grad + kk * 9;
       const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
@@ -199,13 +199,13 @@ class ProdVirialSeRGradOp : public OpKernel
     OP_REQUIRES (context, (nframes == nlist_shape.dim_size(0)),		errors::InvalidArgument ("number of frames should match"));
     
     OP_REQUIRES (context, (9 == grad_shape.dim_size(1)),		errors::InvalidArgument ("input grad shape should be 3 x natoms"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
-    OP_REQUIRES (context, (nloc * nnei * 3 == rij_shape.dim_size(1)),	errors::InvalidArgument ("dim of rij should be  nnei * 3"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_shape.dim_size(1)),errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * nnei * 3 == rij_shape.dim_size(1)),	errors::InvalidArgument ("dim of rij should be  nnei * 3"));
 
     // Create an output tensor
     TensorShape grad_net_shape ;
     grad_net_shape.AddDim (nframes);
-    grad_net_shape.AddDim (nloc * ndescrpt);
+    grad_net_shape.AddDim (int_64(nloc) * ndescrpt);
 
     // allocate the output tensor
     Tensor* grad_net_tensor = NULL;
@@ -241,7 +241,7 @@ class ProdVirialSeRGradOp : public OpKernel
     const int * p_nlist	= nlist_tensor.flat<int>().data();
 
     // loop over frames
-    for (int kk = 0; kk < nframes; ++kk){
+    for (int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * grad_net = p_grad_net + kk * nloc * ndescrpt;
       const FPTYPE * grad = p_grad + kk * 9;
       const FPTYPE * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc
index 33c263ef84..31cf5fff9a 100644
--- a/source/op/prod_virial_multi_device.cc
+++ b/source/op/prod_virial_multi_device.cc
@@ -12,6 +12,18 @@ REGISTER_OP("ProdVirialSeA")
     .Attr("n_r_sel: int")
     .Output("virial: T")
     .Output("atom_virial: T");
+// Legacy op name kept for compatibility with v0.12; the kernel builders in this file bind it to ProdVirialSeAOp, the same kernel as ProdVirialSeA.
+REGISTER_OP("ProdVirialNorot")
+    .Attr("T: {float, double} = DT_DOUBLE")
+    .Input("net_deriv: T")
+    .Input("in_deriv: T")
+    .Input("rij: T")
+    .Input("nlist: int32")
+    .Input("natoms: int32")
+    .Attr("n_a_sel: int")
+    .Attr("n_r_sel: int")
+    .Output("virial: T")
+    .Output("atom_virial: T");
 
 REGISTER_OP("ProdVirialSeR")
     .Attr("T: {float, double} = DT_DOUBLE")
@@ -56,8 +68,8 @@ class ProdVirialSeAOp : public OpKernel {
     OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match"));
     OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)),      errors::InvalidArgument ("number of samples should match"));
     OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)),    errors::InvalidArgument ("number of samples should match"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
-    OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)),  errors::InvalidArgument ("dim of rij should be nnei * 3"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * nnei * 3 == rij_tensor.shape().dim_size(1)),  errors::InvalidArgument ("dim of rij should be nnei * 3"));
     // Create an output tensor
     TensorShape virial_shape ;
     virial_shape.AddDim (nframes);
@@ -88,7 +100,7 @@ class ProdVirialSeAOp : public OpKernel {
     const FPTYPE * p_rij = rij_tensor.flat<FPTYPE>().data();
     const int * p_nlist = nlist_tensor.flat<int>().data();
     
-    for(int kk = 0; kk < nframes; ++kk){
+    for(int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * virial = p_virial + kk * 9;
       FPTYPE * atom_virial = p_atom_virial + kk * nall * 9;
       const FPTYPE * net_deriv = p_net_deriv + kk * nloc * ndescrpt;
@@ -152,8 +164,8 @@ class ProdVirialSeROp : public OpKernel {
     OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match"));
     OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)),      errors::InvalidArgument ("number of samples should match"));
     OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)),    errors::InvalidArgument ("number of samples should match"));
-    OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
-    OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)),  errors::InvalidArgument ("dim of rij should be nnei * 3"));
+    OP_REQUIRES (context, (int_64(nloc) * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match"));
+    OP_REQUIRES (context, (int_64(nloc) * nnei * 3 == rij_tensor.shape().dim_size(1)),  errors::InvalidArgument ("dim of rij should be nnei * 3"));
     // Create an output tensor
     TensorShape virial_shape ;
     virial_shape.AddDim (nframes);
@@ -184,7 +196,7 @@ class ProdVirialSeROp : public OpKernel {
     const FPTYPE * p_rij = rij_tensor.flat<FPTYPE>().data();
     const int * p_nlist = nlist_tensor.flat<int>().data();
     
-    for(int kk = 0; kk < nframes; ++kk){
+    for(int_64 kk = 0; kk < nframes; ++kk){
       FPTYPE * virial = p_virial + kk * 9;
       FPTYPE * atom_virial = p_atom_virial + kk * nall * 9;
       const FPTYPE * net_deriv = p_net_deriv + kk * nloc * ndescrpt;
@@ -220,6 +232,9 @@ class ProdVirialSeROp : public OpKernel {
 REGISTER_KERNEL_BUILDER(                                                                  \
     Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint<T>("T"),                      \
     ProdVirialSeAOp<CPUDevice, T>);                                                       \
+REGISTER_KERNEL_BUILDER(                                                                  \
+    Name("ProdVirialNorot").Device(DEVICE_CPU).TypeConstraint<T>("T"),                      \
+    ProdVirialSeAOp<CPUDevice, T>);                                                       \
 REGISTER_KERNEL_BUILDER(                                                                  \
     Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint<T>("T"),                      \
     ProdVirialSeROp<CPUDevice, T>);
@@ -231,6 +246,9 @@ REGISTER_CPU(double);
 REGISTER_KERNEL_BUILDER(                                                                  \
     Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
     ProdVirialSeAOp<GPUDevice, T>);                                                       \
+REGISTER_KERNEL_BUILDER(                                                                  \
+    Name("ProdVirialNorot").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
+    ProdVirialSeAOp<GPUDevice, T>);                                                       \
 REGISTER_KERNEL_BUILDER(                                                                  \
     Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"), \
     ProdVirialSeROp<GPUDevice, T>);
diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
index 7bf190661c..2a2640d286 100644
--- a/source/op/tabulate_multi_device.cc
+++ b/source/op/tabulate_multi_device.cc
@@ -666,7 +666,7 @@ class TabulateFusionSeRGradGradOp : public OpKernel {
     const Tensor& dz_dy_dem_tensor	= context->input(context_input_index++);
     const Tensor& descriptor_tensor = context->input(context_input_index++);
     // set size of the sample
-    OP_REQUIRES (context, (dz_dy_dem_tensor.shape().dims() == 3),      errors::InvalidArgument ("Dim of input should be 3"));
+    OP_REQUIRES (context, (dz_dy_dem_tensor.shape().dims() == 2),      errors::InvalidArgument ("Dim of input should be 2"));
     int context_output_index = 0;
     Tensor* dz_dy_tensor = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(
diff --git a/source/tests/model_compression/input.json b/source/tests/model_compression/input.json
index 599311e784..3ba7d7daf7 100644
--- a/source/tests/model_compression/input.json
+++ b/source/tests/model_compression/input.json
@@ -53,11 +53,11 @@
 		"numb_btch":	3,
 		"_comment":		"that's all"
 },
-"numb_steps":	100,
+"numb_steps":	1,
 "seed":		10,
 "disp_file":	"lcurve.out",
-"disp_freq":	100,
-"save_freq":	100,
+"disp_freq":	1,
+"save_freq":	1,
 "_comment":	"that's all"
 	},    
 
diff --git a/source/tests/test_data_requirement.py b/source/tests/test_data_requirement.py
index c1a61dc8c1..96e192af3e 100644
--- a/source/tests/test_data_requirement.py
+++ b/source/tests/test_data_requirement.py
@@ -11,3 +11,4 @@ def test_add(self) :
         self.assertEqual(data_requirement['test']['must'], False)
         self.assertEqual(data_requirement['test']['high_prec'], False)
         self.assertEqual(data_requirement['test']['repeat'], 1)
+        self.assertEqual(data_requirement['test']['default'], 0.)
diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py
index 5f78e6e116..5bab4d74e5 100644
--- a/source/tests/test_deepmd_data_sys.py
+++ b/source/tests/test_deepmd_data_sys.py
@@ -82,6 +82,7 @@ def test_get_test(self):
         ds = DeepmdDataSystem(self.sys_name, batch_size, test_size, 2.0)
         ds.add('test', self.test_ndof, atomic = True, must = True)
         ds.add('null', self.test_ndof, atomic = True, must = False)
+        ds.add('ones', self.test_ndof, atomic = True, must = False, default=1.)
         sys_idx = 0
         data = ds.get_test(sys_idx=sys_idx)
         self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx])))
@@ -98,6 +99,11 @@ def test_get_test(self):
                                               -
                                               data['null']
         ), 0.0)
+        self.assertAlmostEqual(np.linalg.norm(np.ones([self.nframes[sys_idx]+2,
+                                                self.natoms[sys_idx]*self.test_ndof])
+                                        -
+                                        data['ones']
+        ), 0.0)
 
         sys_idx = 2
         data = ds.get_test(sys_idx=sys_idx)
diff --git a/source/tests/test_examples.py b/source/tests/test_examples.py
new file mode 100644
index 0000000000..9bcbc2b3d7
--- /dev/null
+++ b/source/tests/test_examples.py
@@ -0,0 +1,36 @@
+"""This module ensures input in the examples directory
+passes the argument checking.
+"""
+import unittest
+import json
+from pathlib import Path
+
+from deepmd.common import j_loader
+from deepmd.utils.argcheck import normalize
+
+
+p_examples = Path(__file__).parent.parent.parent / "examples"
+
+input_files = (
+    p_examples / "water" / "se_e2_a" / "input.json",
+    p_examples / "water" / "se_e2_r" / "input.json",
+    p_examples / "water" / "se_e3" / "input.json",
+    p_examples / "water" / "se_e2_a_tebd" / "input.json",
+    p_examples / "water" / "se_e2_a_mixed_prec" / "input.json",
+    p_examples / "water" / "dplr" / "train" / "dw.json",
+    p_examples / "water" / "dplr" / "train" / "ener.json",
+    p_examples / "nopbc" / "train" / "input.json",
+    p_examples / "water_tensor" / "dipole" / "dipole_input.json",
+    p_examples / "water_tensor" / "polar" / "polar_input.json",
+    p_examples / "fparam" / "train" / "input.json",
+    p_examples / "fparam" / "train" / "input_aparam.json",
+)
+
+
+class TestExamples(unittest.TestCase):
+    def test_arguments(self):
+        # Each example input must load and survive argument normalization.
+        for example in map(str, input_files):
+            with self.subTest(fn=example):
+                config = j_loader(example)
+                normalize(config)
diff --git a/source/tests/test_model_devi.py b/source/tests/test_model_devi.py
index 6843874d57..07fa69014a 100644
--- a/source/tests/test_model_devi.py
+++ b/source/tests/test_model_devi.py
@@ -30,12 +30,12 @@ def setUp(self):
     
     def test_calc_model_devi(self):
         model_devi = calc_model_devi(self.coord,
-                                     self.box, 
+                                     None, 
                                      self.atype, 
                                      self.graphs,
                                      frequency=self.freq,
-                                     nopbc=True,
-                                     fname=self.output)
+                                     fname=self.output,
+                                     )
         self.assertAlmostEqual(model_devi[0][0], 0)
         self.assertAlmostEqual(model_devi[1][0], self.freq)
         np.testing.assert_almost_equal(model_devi[0][1:7], self.expect[1:7], 6)