deepmodeling · njzjz · Jul 1, 2026 · Jun 23, 2026 · Jun 26, 2026 · Jun 27, 2026
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
@@ -29,9 +29,26 @@ void format_nbor_list_gpu(int* nlist,
                           const int max_nbor_size,
                           const int nloc,
                           const int nall,
+                          const int nframes,
                           const float rcut,
                           const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void format_nbor_list_gpu(int* nlist,
+                                 const FPTYPE* coord,
+                                 const int* type,
+                                 const deepmd::InputNlist& gpu_inlist,
+                                 int* array_int,
+                                 uint_64* array_longlong,
+                                 const int max_nbor_size,
+                                 const int nloc,
+                                 const int nall,
+                                 const float rcut,
+                                 const std::vector<int> sec) {
+  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, 1, rcut, sec);
+}
+
 template <typename FPTYPE>
 void test_encoding_decoding_nbor_info_gpu(uint_64* key,
                                           int* out_type,

diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
@@ -144,22 +144,35 @@ int max_numneigh(const InputNlist& to_nlist);
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
+//	nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
+//	stores the neighbors of local atom atom_idx in that frame.
 //	max_list_size is the maximal size of jlist.
 // inputs
-//	c_cpy, nloc, nall, mem_size, rcut, region
-//	mem_size is the size of allocated memory for jlist.
+//	c_cpy, nloc, nall, mem_size, rcut, nframes, type
+//	c_cpy stores nframes coordinate blocks with nall atoms each, including
+//	ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
+//	nloc is the number of local atoms per frame; nall is the total number of
+//	local and ghost atoms per frame.
+//	mem_size is the size of allocated memory for each jlist row.
+//	nframes is the number of coordinate frames.
+//	type may be nullptr/NULL, or a per-atom type array.
+//	The type array uses the same frame-major nall stride as c_cpy:
+//	type[frame_idx * nall + atom].
+//	When the center or neighbor atom has type < 0, that pair is excluded.
 // returns
 //	0: successful
 //	1: the memory is not large enough to hold all neighbors.
-//	   i.e. max_list_size > mem_nall
+//	   i.e. max_list_size > mem_size
 template <typename FPTYPE>
 int build_nlist_cpu(InputNlist& nlist,
                     int* max_list_size,
                     const FPTYPE* c_cpy,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut);
+                    const float& rcut,
+                    const int& nframes = 1,
+                    const int* type = nullptr);
 
 void use_nei_info_cpu(int* nlist,
                       int* ntype,
@@ -208,14 +221,27 @@ void use_nlist_map(int* nlist,
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
+//	nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
+//	stores the neighbors of local atom atom_idx in that frame.
 //	max_list_size is the maximal size of jlist.
 // inputs
-//	c_cpy, nloc, nall, mem_size, rcut, region
-//	mem_size is the size of allocated memory for jlist.
+//	nlist_data, c_cpy, nloc, nall, mem_size, rcut, nframes, type
+//	nlist_data is temporary GPU workspace of at least
+//	2 * nframes * nloc * mem_size ints.
+//	c_cpy stores nframes coordinate blocks with nall atoms each, including
+//	ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
+//	nloc is the number of local atoms per frame; nall is the total number of
+//	local and ghost atoms per frame.
+//	mem_size is the size of allocated memory for each jlist row.
+//	nframes is the number of coordinate frames.
+//	type may be nullptr/NULL, or a per-atom type array.
+//	The type array uses the same frame-major nall stride as c_cpy:
+//	type[frame_idx * nall + atom].
+//	When the center or neighbor atom has type < 0, that pair is excluded.
 // returns
 //	0: successful
-//	1: the memory is not large enough to hold all neighbors.
-//	   i.e. max_list_size > mem_nall
+//	1: the temporary row storage is not large enough.
+//	   i.e. mem_size < nall
 template <typename FPTYPE>
 int build_nlist_gpu(InputNlist& nlist,
                     int* max_list_size,
@@ -224,7 +250,9 @@ int build_nlist_gpu(InputNlist& nlist,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut);
+                    const float& rcut,
+                    const int& nframes = 1,
+                    const int* type = NULL);
 
 /**
  * @brief Filter the fake atom type.

diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
@@ -20,11 +20,34 @@ void prod_env_mat_a_cpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type = NULL);
 
+template <typename FPTYPE>
+inline void prod_env_mat_a_cpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& inlist,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec,
+                               const int* f_type = NULL) {
+  prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
+                     max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
+                     sec, f_type);
+}
+
 template <typename FPTYPE>
 void prod_env_mat_r_cpu(FPTYPE* em,
                         FPTYPE* em_deriv,
@@ -38,10 +61,32 @@ void prod_env_mat_r_cpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void prod_env_mat_r_cpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& inlist,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec) {
+  prod_env_mat_r_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
+                     max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
+                     sec);
+}
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_env_mat_a_gpu(FPTYPE* em,
@@ -58,11 +103,36 @@ void prod_env_mat_a_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type = NULL);
 
+template <typename FPTYPE>
+inline void prod_env_mat_a_gpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& gpu_inlist,
+                               int* array_int,
+                               unsigned long long* array_longlong,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec,
+                               const int* f_type = NULL) {
+  prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
+                     array_int, array_longlong, max_nbor_size, avg, std, nloc,
+                     nall, 1, rcut, rcut_smth, sec, f_type);
+}
+
 template <typename FPTYPE>
 void prod_env_mat_r_gpu(FPTYPE* em,
                         FPTYPE* em_deriv,
@@ -78,10 +148,34 @@ void prod_env_mat_r_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void prod_env_mat_r_gpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& gpu_inlist,
+                               int* array_int,
+                               unsigned long long* array_longlong,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec) {
+  prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
+                     array_int, array_longlong, max_nbor_size, avg, std, nloc,
+                     nall, 1, rcut, rcut_smth, sec);
+}
+
 void env_mat_nbor_update(InputNlist& inlist,
                          InputNlist& gpu_inlist,
                          int& max_nbor_size,

diff --git a/source/lib/src/gpu/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu
@@ -78,19 +78,28 @@ template <typename FPTYPE>
 __global__ void build_nlist(int* ilist,
                             int* temp_nlist,
                             const FPTYPE* c_cpy,
+                            const int* type,
                             const FPTYPE rcut2,
                             const int nloc,
                             const int nall,
                             const int mem_size) {
-  const unsigned int atom_idx = blockIdx.x;
+  const unsigned int row_idx = blockIdx.x;
+  const unsigned int atom_idx = row_idx % nloc;
+  const unsigned int frame_idx = row_idx / nloc;
   const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
   if (neighbor_idx < nall) {
-    int* neighbor_row = temp_nlist + atom_idx * mem_size;
+    int* neighbor_row = temp_nlist + row_idx * mem_size;
     if (neighbor_idx == atom_idx) {
-      ilist[atom_idx] = atom_idx;
+      ilist[row_idx] = atom_idx;
     } else {
-      const FPTYPE* ccoord = c_cpy + atom_idx * 3;
-      const FPTYPE* ncoord = c_cpy + neighbor_idx * 3;
+      const int_64 frame_offset = int_64(frame_idx) * nall;
+      if (type != NULL && (type[frame_offset + atom_idx] < 0 ||
+                           type[frame_offset + neighbor_idx] < 0)) {
+        return;
+      }
+      const FPTYPE* frame_coord = c_cpy + int_64(frame_idx) * nall * 3;
+      const FPTYPE* ccoord = frame_coord + atom_idx * 3;
+      const FPTYPE* ncoord = frame_coord + neighbor_idx * 3;
       FPTYPE diff[3];
       for (int kk = 0; kk < 3; kk++) {
         diff[kk] = ccoord[kk] - ncoord[kk];
@@ -193,41 +202,44 @@ int build_nlist_gpu(InputNlist& nlist,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut) {
+                    const float& rcut,
+                    const int& nframes,
+                    const int* type) {
   if (mem_size < nall) {
     return 1;
   }
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int nblock = (nall + TPB - 1) / TPB;
+  const int_64 nrows = int_64(nframes) * nloc;
   int* ilist = nlist.ilist;
   int* numneigh = nlist.numneigh;
   int** firstneigh = nlist.firstneigh;
-  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
-  int* temp_nlist = nlist_data;  // nloc*mem_size
-  int* nei_order = temp_nlist + nloc * mem_size;
-  nlist.inum = nloc;
+  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nrows * mem_size));
+  int* temp_nlist = nlist_data;  // nrows*mem_size
+  int* nei_order = temp_nlist + nrows * mem_size;
+  nlist.inum = nrows;
   FPTYPE rcut2 = rcut * rcut;
 
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nrows, nblock);
   dim3 thread_grid(1, TPB);
-  build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, rcut2,
-                                           nloc, nall, mem_size);
+  build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, type,
+                                           rcut2, nloc, nall, mem_size);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   parallel_prefix_scan<TPB>
-      <<<nloc, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
+      <<<nrows, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   fill_nlist<<<block_grid, thread_grid>>>(firstneigh, temp_nlist, nei_order,
                                           mem_size, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
-  int* numneigh_host = new int[nloc];
-  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
+  int* numneigh_host = new int[nrows];
+  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nrows,
                        gpuMemcpyDeviceToHost));
   int max_nei = 0;
-  for (int ii = 0; ii < nloc; ii++) {
+  for (int_64 ii = 0; ii < nrows; ii++) {
     if (numneigh_host[ii] > max_nei) {
       max_nei = numneigh_host[ii];
     }
@@ -285,15 +297,19 @@ template int build_nlist_gpu<float>(InputNlist& nlist,
                                     const int& nloc,
                                     const int& nall,
                                     const int& mem_size,
-                                    const float& rcut);
+                                    const float& rcut,
+                                    const int& nframes,
+                                    const int* type);
 template int build_nlist_gpu<double>(InputNlist& nlist,
                                      int* max_list_size,
                                      int* nlist_data,
                                      const double* c_cpy,
                                      const int& nloc,
                                      const int& nall,
                                      const int& mem_size,
-                                     const float& rcut);
+                                     const float& rcut,
+                                     const int& nframes,
+                                     const int* type);
 
 __global__ void map_filter_ftype(int* ftype_out,
                                  const int* ftype_in,