From 048376baa460d0082c3fbfef1a31c244d3f1a727 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@ustc.edu.cn>
Date: Tue, 23 Jun 2026 20:38:39 +0800
Subject: [PATCH 1/5] feat(core): batch prod env mat over frames

---
 source/lib/include/fmt_nlist.h            |   17 +
 source/lib/include/neighbor_list.h        |    8 +-
 source/lib/include/prod_env_mat.h         |   94 ++
 source/lib/src/gpu/neighbor_list.cu       |   54 +-
 source/lib/src/gpu/prod_env_mat.cu        |  209 ++--
 source/lib/src/neighbor_list.cc           |   28 +-
 source/lib/src/prod_env_mat.cc            |  290 +++--
 source/lib/tests/test_env_mat_a.cc        |  235 +++-
 source/lib/tests/test_env_mat_a_mix.cc    |  251 +++-
 source/lib/tests/test_env_mat_r.cc        |  134 ++-
 source/lib/tests/test_fmt_nlist.cc        |    4 +-
 source/lib/tests/test_neighbor_list.cc    |  113 ++
 source/op/tf/custom_op.h                  |   50 +-
 source/op/tf/neighbor_stat.cc             |   27 +-
 source/op/tf/prod_env_mat_multi_device.cc | 1293 +++++++++++++++++++--
 15 files changed, 2401 insertions(+), 406 deletions(-)
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index 18cb319304..60138bf54a 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -29,9 +29,26 @@ void format_nbor_list_gpu(int* nlist,
                           const int max_nbor_size,
                           const int nloc,
                           const int nall,
+                          const int nframes,
                           const float rcut,
                           const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void format_nbor_list_gpu(int* nlist,
+                                 const FPTYPE* coord,
+                                 const int* type,
+                                 const deepmd::InputNlist& gpu_inlist,
+                                 int* array_int,
+                                 uint_64* array_longlong,
+                                 const int max_nbor_size,
+                                 const int nloc,
+                                 const int nall,
+                                 const float rcut,
+                                 const std::vector<int> sec) {
+  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, 1, rcut, sec);
+}
+
 template <typename FPTYPE>
 void test_encoding_decoding_nbor_info_gpu(uint_64* key,
                                           int* out_type,
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 39682bcd9a..f7c1ddde10 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -159,7 +159,9 @@ int build_nlist_cpu(InputNlist& nlist,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut);
+                    const float& rcut,
+                    const int& nframes = 1,
+                    const int* type = nullptr);
 
 void use_nei_info_cpu(int* nlist,
                       int* ntype,
@@ -224,7 +226,9 @@ int build_nlist_gpu(InputNlist& nlist,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut);
+                    const float& rcut,
+                    const int& nframes = 1,
+                    const int* type = NULL);
 
 /**
  * @brief Filter the fake atom type.
diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
index d8ca4d1861..a64605b8e9 100644
--- a/source/lib/include/prod_env_mat.h
+++ b/source/lib/include/prod_env_mat.h
@@ -20,11 +20,34 @@ void prod_env_mat_a_cpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type = NULL);
 
+template <typename FPTYPE>
+inline void prod_env_mat_a_cpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& inlist,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec,
+                               const int* f_type = NULL) {
+  prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
+                     max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
+                     sec, f_type);
+}
+
 template <typename FPTYPE>
 void prod_env_mat_r_cpu(FPTYPE* em,
                         FPTYPE* em_deriv,
@@ -38,10 +61,32 @@ void prod_env_mat_r_cpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void prod_env_mat_r_cpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& inlist,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec) {
+  prod_env_mat_r_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
+                     max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
+                     sec);
+}
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_env_mat_a_gpu(FPTYPE* em,
@@ -58,11 +103,36 @@ void prod_env_mat_a_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type = NULL);
 
+template <typename FPTYPE>
+inline void prod_env_mat_a_gpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& gpu_inlist,
+                               int* array_int,
+                               unsigned long long* array_longlong,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec,
+                               const int* f_type = NULL) {
+  prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
+                     array_int, array_longlong, max_nbor_size, avg, std, nloc,
+                     nall, 1, rcut, rcut_smth, sec, f_type);
+}
+
 template <typename FPTYPE>
 void prod_env_mat_r_gpu(FPTYPE* em,
                         FPTYPE* em_deriv,
@@ -78,10 +148,34 @@ void prod_env_mat_r_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec);
 
+template <typename FPTYPE>
+inline void prod_env_mat_r_gpu(FPTYPE* em,
+                               FPTYPE* em_deriv,
+                               FPTYPE* rij,
+                               int* nlist,
+                               const FPTYPE* coord,
+                               const int* type,
+                               const InputNlist& gpu_inlist,
+                               int* array_int,
+                               unsigned long long* array_longlong,
+                               const int max_nbor_size,
+                               const FPTYPE* avg,
+                               const FPTYPE* std,
+                               const int nloc,
+                               const int nall,
+                               const float rcut,
+                               const float rcut_smth,
+                               const std::vector<int> sec) {
+  prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
+                     array_int, array_longlong, max_nbor_size, avg, std, nloc,
+                     nall, 1, rcut, rcut_smth, sec);
+}
+
 void env_mat_nbor_update(InputNlist& inlist,
                          InputNlist& gpu_inlist,
                          int& max_nbor_size,
diff --git a/source/lib/src/gpu/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu
index 8f38017d4c..c9352a8c7b 100644
--- a/source/lib/src/gpu/neighbor_list.cu
+++ b/source/lib/src/gpu/neighbor_list.cu
@@ -78,19 +78,28 @@ template <typename FPTYPE>
 __global__ void build_nlist(int* ilist,
                             int* temp_nlist,
                             const FPTYPE* c_cpy,
+                            const int* type,
                             const FPTYPE rcut2,
                             const int nloc,
                             const int nall,
                             const int mem_size) {
-  const unsigned int atom_idx = blockIdx.x;
+  const unsigned int row_idx = blockIdx.x;
+  const unsigned int atom_idx = row_idx % nloc;
+  const unsigned int frame_idx = row_idx / nloc;
   const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
   if (neighbor_idx < nall) {
-    int* neighbor_row = temp_nlist + atom_idx * mem_size;
+    int* neighbor_row = temp_nlist + row_idx * mem_size;
     if (neighbor_idx == atom_idx) {
-      ilist[atom_idx] = atom_idx;
+      ilist[row_idx] = atom_idx;
     } else {
-      const FPTYPE* ccoord = c_cpy + atom_idx * 3;
-      const FPTYPE* ncoord = c_cpy + neighbor_idx * 3;
+      const int_64 frame_offset = int_64(frame_idx) * nall;
+      if (type != NULL && (type[frame_offset + atom_idx] < 0 ||
+                           type[frame_offset + neighbor_idx] < 0)) {
+        return;
+      }
+      const FPTYPE* frame_coord = c_cpy + int_64(frame_idx) * nall * 3;
+      const FPTYPE* ccoord = frame_coord + atom_idx * 3;
+      const FPTYPE* ncoord = frame_coord + neighbor_idx * 3;
       FPTYPE diff[3];
       for (int kk = 0; kk < 3; kk++) {
         diff[kk] = ccoord[kk] - ncoord[kk];
@@ -193,41 +202,44 @@ int build_nlist_gpu(InputNlist& nlist,
                     const int& nloc,
                     const int& nall,
                     const int& mem_size,
-                    const float& rcut) {
+                    const float& rcut,
+                    const int& nframes,
+                    const int* type) {
   if (mem_size < nall) {
     return 1;
   }
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int nblock = (nall + TPB - 1) / TPB;
+  const int_64 nrows = int_64(nframes) * nloc;
   int* ilist = nlist.ilist;
   int* numneigh = nlist.numneigh;
   int** firstneigh = nlist.firstneigh;
-  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
-  int* temp_nlist = nlist_data;  // nloc*mem_size
-  int* nei_order = temp_nlist + nloc * mem_size;
-  nlist.inum = nloc;
+  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nrows * mem_size));
+  int* temp_nlist = nlist_data;  // nrows*mem_size
+  int* nei_order = temp_nlist + nrows * mem_size;
+  nlist.inum = nrows;
   FPTYPE rcut2 = rcut * rcut;
 
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nrows, nblock);
   dim3 thread_grid(1, TPB);
-  build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, rcut2,
-                                           nloc, nall, mem_size);
+  build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, type,
+                                           rcut2, nloc, nall, mem_size);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   parallel_prefix_scan<TPB>
-      <<<nloc, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
+      <<<nrows, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   fill_nlist<<<block_grid, thread_grid>>>(firstneigh, temp_nlist, nei_order,
                                           mem_size, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
-  int* numneigh_host = new int[nloc];
-  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
+  int* numneigh_host = new int[nrows];
+  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nrows,
                        gpuMemcpyDeviceToHost));
   int max_nei = 0;
-  for (int ii = 0; ii < nloc; ii++) {
+  for (int_64 ii = 0; ii < nrows; ii++) {
     if (numneigh_host[ii] > max_nei) {
       max_nei = numneigh_host[ii];
     }
@@ -285,7 +297,9 @@ template int build_nlist_gpu<float>(InputNlist& nlist,
                                     const int& nloc,
                                     const int& nall,
                                     const int& mem_size,
-                                    const float& rcut);
+                                    const float& rcut,
+                                    const int& nframes,
+                                    const int* type);
 template int build_nlist_gpu<double>(InputNlist& nlist,
                                      int* max_list_size,
                                      int* nlist_data,
@@ -293,7 +307,9 @@ template int build_nlist_gpu<double>(InputNlist& nlist,
                                      const int& nloc,
                                      const int& nall,
                                      const int& mem_size,
-                                     const float& rcut);
+                                     const float& rcut,
+                                     const int& nframes,
+                                     const int* type);
 
 __global__ void map_filter_ftype(int* ftype_out,
                                  const int* ftype_in,
diff --git a/source/lib/src/gpu/prod_env_mat.cu b/source/lib/src/gpu/prod_env_mat.cu
index e8909edb44..6e8fc1df85 100644
--- a/source/lib/src/gpu/prod_env_mat.cu
+++ b/source/lib/src/gpu/prod_env_mat.cu
@@ -110,13 +110,16 @@ __device__ inline void decoding_nbor_info(int& type,
   index = key & 0xFFFFFF;
 }
 
-template <typename FPTYPE>
-__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) {
+__global__ void get_i_idx(int* i_idx,
+                          const int nloc,
+                          const int nframes,
+                          const int* ilist) {
   const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= nloc) {
+  if (idx >= nframes * nloc) {
     return;
   }
-  i_idx[ilist[idx]] = idx;
+  const int frame_idx = idx / nloc;
+  i_idx[frame_idx * nloc + ilist[idx]] = idx;
 }
 
 template <typename FPTYPE>
@@ -127,30 +130,37 @@ __global__ void format_nlist_fill_a(uint_64* key,
                                     int** firstneigh,
                                     const float rcut,
                                     int* i_idx,
-                                    const int MAX_NBOR_SIZE) {
-  // <<<nloc, MAX_NBOR_SIZE>>>
+                                    const int MAX_NBOR_SIZE,
+                                    const int nloc,
+                                    const int nall) {
+  // <<<nframes * nloc, MAX_NBOR_SIZE>>>
   const int_64 idx = blockIdx.x;
   const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
+  const int frame_idx = idx / nloc;
+  const int atom_idx = idx % nloc;
+  const int nlist_row = i_idx[idx];
 
-  const int nsize = numneigh[i_idx[idx]];
+  const int nsize = numneigh[nlist_row];
   if (idy >= nsize) {
     return;
   }
 
-  const int* nei_idx = firstneigh[i_idx[idx]];
+  const int* nei_idx = firstneigh[nlist_row];
   // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
   uint_64* key_in = key + idx * MAX_NBOR_SIZE;
   FPTYPE diff[3];
   const int& j_idx = nei_idx[idy];
-  if (type[j_idx] < 0) {
+  const int_64 j_idx_nall = int_64(frame_idx) * nall + j_idx;
+  if (type[j_idx_nall] < 0) {
     return;
   }
+  const int_64 i_idx_nall = int_64(frame_idx) * nall + atom_idx;
   for (int dd = 0; dd < 3; dd++) {
-    diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
+    diff[dd] = coord[j_idx_nall * 3 + dd] - coord[i_idx_nall * 3 + dd];
   }
   FPTYPE rr = _sqrt(dev_dot(diff, diff));
   if (rr <= rcut) {
-    key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
+    key_in[idy] = encoding_nbor_info(type[j_idx_nall], rr, j_idx);
   }
 }
 
@@ -225,16 +235,18 @@ void format_nbor_list_256(uint_64* key,
                           const int* type,
                           const deepmd::InputNlist& gpu_inlist,
                           const int& nloc,
+                          const int& nall,
+                          const int& nframes,
                           const float& rcut,
                           int* i_idx) {
   const int LEN = 256;
   const int MAX_NBOR_SIZE = 256;
   const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(1, LEN);
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
-      MAX_NBOR_SIZE);
+      MAX_NBOR_SIZE, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
@@ -242,7 +254,8 @@ void format_nbor_list_256(uint_64* key,
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
-      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+      <<<nframes * nloc, BLOCK_THREADS>>>(
+          key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -253,16 +266,18 @@ void format_nbor_list_512(uint_64* key,
                           const int* type,
                           const deepmd::InputNlist& gpu_inlist,
                           const int& nloc,
+                          const int& nall,
+                          const int& nframes,
                           const float& rcut,
                           int* i_idx) {
   const int LEN = 256;
   const int MAX_NBOR_SIZE = 512;
   const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(1, LEN);
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
-      MAX_NBOR_SIZE);
+      MAX_NBOR_SIZE, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
@@ -270,7 +285,8 @@ void format_nbor_list_512(uint_64* key,
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
-      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+      <<<nframes * nloc, BLOCK_THREADS>>>(
+          key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -281,16 +297,18 @@ void format_nbor_list_1024(uint_64* key,
                            const int* type,
                            const deepmd::InputNlist& gpu_inlist,
                            const int& nloc,
+                           const int& nall,
+                           const int& nframes,
                            const float& rcut,
                            int* i_idx) {
   const int LEN = 256;
   const int MAX_NBOR_SIZE = 1024;
   const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(1, LEN);
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
-      MAX_NBOR_SIZE);
+      MAX_NBOR_SIZE, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
@@ -298,7 +316,8 @@ void format_nbor_list_1024(uint_64* key,
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
-      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+      <<<nframes * nloc, BLOCK_THREADS>>>(
+          key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -309,16 +328,18 @@ void format_nbor_list_2048(uint_64* key,
                            const int* type,
                            const deepmd::InputNlist& gpu_inlist,
                            const int& nloc,
+                           const int& nall,
+                           const int& nframes,
                            const float& rcut,
                            int* i_idx) {
   const int LEN = 256;
   const int MAX_NBOR_SIZE = 2048;
   const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(1, LEN);
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
-      MAX_NBOR_SIZE);
+      MAX_NBOR_SIZE, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
@@ -326,7 +347,8 @@ void format_nbor_list_2048(uint_64* key,
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
-      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+      <<<nframes * nloc, BLOCK_THREADS>>>(
+          key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -337,16 +359,18 @@ void format_nbor_list_4096(uint_64* key,
                            const int* type,
                            const deepmd::InputNlist& gpu_inlist,
                            const int& nloc,
+                           const int& nall,
+                           const int& nframes,
                            const float& rcut,
                            int* i_idx) {
   const int LEN = 256;
   const int MAX_NBOR_SIZE = 4096;
   const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(1, LEN);
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
-      MAX_NBOR_SIZE);
+      MAX_NBOR_SIZE, nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 16;
@@ -354,7 +378,8 @@ void format_nbor_list_4096(uint_64* key,
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
-      <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
+      <<<nframes * nloc, BLOCK_THREADS>>>(
+          key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -370,11 +395,16 @@ __global__ void compute_env_mat_a(FPTYPE* em,
                                   const int* nlist,
                                   const int nnei,
                                   const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
+                                  const float rmax,
+                                  const int nloc,
+                                  const int nall) {
+  // <<<nframes * nloc, TPB>>>
   const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
-  if (type[bid] < 0) {
+  const int_64 frame_idx = bid / nloc;
+  const int_64 atom_idx = bid % nloc;
+  const int_64 i_idx_nall = frame_idx * nall + atom_idx;
+  if (type[i_idx_nall] < 0) {
     return;
   }
   if (tid >= nnei) {
@@ -392,9 +422,9 @@ __global__ void compute_env_mat_a(FPTYPE* em,
       FPTYPE rr[3] = {(FPTYPE)0.};
       FPTYPE dd[4] = {(FPTYPE)0.};
       FPTYPE vv[12] = {(FPTYPE)0.};
-      const int j_idx = row_nlist[ii];
+      const int_64 j_idx = frame_idx * nall + row_nlist[ii];
       for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
+        rr[kk] = coord[j_idx * 3 + kk] - coord[i_idx_nall * 3 + kk];
         row_rij[ii * 3 + kk] = rr[kk];
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
@@ -478,16 +508,19 @@ __global__ void compute_env_mat_a(FPTYPE* em,
                     // idx_value + 3];
       for (int ii = 0; ii < 12; ii++) {
         row_descript_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
+            vv[ii] / std[type[i_idx_nall] * ndescrpt + idx_value + ii / 3];
       }
       for (int ii = 0; ii < 4; ii++) {
         row_descript[idx_value + ii] =
-            (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) /
-            std[type[bid] * ndescrpt + idx_value + ii];
+            (dd[ii] - avg[type[i_idx_nall] * ndescrpt + idx_value + ii]) /
+            std[type[i_idx_nall] * ndescrpt + idx_value + ii];
       }
     } else {
-      row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                                 std[type[bid] * ndescrpt + idx_value];
+      for (int ii = 0; ii < 4; ii++) {
+        row_descript[idx_value + ii] -=
+            avg[type[i_idx_nall] * ndescrpt + idx_value + ii] /
+            std[type[i_idx_nall] * ndescrpt + idx_value + ii];
+      }
     }
   }
 }
@@ -503,13 +536,21 @@ __global__ void compute_env_mat_r(FPTYPE* em,
                                   const int* nlist,
                                   const int nnei,
                                   const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
+                                  const float rmax,
+                                  const int nloc,
+                                  const int nall) {
+  // <<<nframes * nloc, TPB>>>
   const int_64 bid = blockIdx.x;
   const unsigned int tid = threadIdx.x;
   if (tid >= nnei) {
     return;
   }
+  const int_64 frame_idx = bid / nloc;
+  const int_64 atom_idx = bid % nloc;
+  const int_64 i_idx_nall = frame_idx * nall + atom_idx;
+  if (type[i_idx_nall] < 0) {
+    return;
+  }
   const int ndescrpt = nnei;
   const int* row_nlist = nlist + bid * nnei;
   FPTYPE* row_rij = rij + bid * nnei * 3;
@@ -522,9 +563,9 @@ __global__ void compute_env_mat_r(FPTYPE* em,
       FPTYPE rr[3] = {0};
       FPTYPE vv[3] = {0};
       FPTYPE dd = 0;
-      const int& j_idx = row_nlist[ii];
+      const int_64 j_idx = frame_idx * nall + row_nlist[ii];
       for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
+        rr[kk] = coord[j_idx * 3 + kk] - coord[i_idx_nall * 3 + kk];
         row_rij[ii * 3 + kk] = rr[kk];
       }
       // const FPTYPE * rr = &row_rij[ii * 3];
@@ -556,13 +597,13 @@ __global__ void compute_env_mat_r(FPTYPE* em,
                  // idx_value + 0];
       for (int ii = 0; ii < 3; ii++) {
         row_em_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
+            vv[ii] / std[type[i_idx_nall] * ndescrpt + idx_value + ii / 3];
       }
-      row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) /
-                          std[type[bid] * ndescrpt + idx_value];
+      row_em[idx_value] = (dd - avg[type[i_idx_nall] * ndescrpt + idx_value]) /
+                          std[type[i_idx_nall] * ndescrpt + idx_value];
     } else {
-      row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                           std[type[bid] * ndescrpt + idx_value];
+      row_em[idx_value] -= avg[type[i_idx_nall] * ndescrpt + idx_value] /
+                           std[type[i_idx_nall] * ndescrpt + idx_value];
     }
   }
 }
@@ -578,47 +619,55 @@ void format_nbor_list_gpu(int* nlist,
                           const int max_nbor_size,
                           const int nloc,
                           const int nall,
+                          const int nframes,
                           const float rcut,
                           const std::vector<int> sec) {
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
   const int LEN = 256;
   const int nnei = sec.back();
-  const int nblock = (nloc + LEN - 1) / LEN;
+  const int nblock = (int_64(nframes) * nloc + LEN - 1) / LEN;
   int* sec_dev = array_int;
   int* nei_iter = array_int + sec.size();  // = new int[sec_size];
-  int* i_idx = array_int + sec.size() + nloc * sec.size();
+  int* i_idx = array_int + sec.size() + int_64(nframes) * nloc * sec.size();
   uint_64* key = array_longlong;
   assert(max_nbor_size == 256 || max_nbor_size == 512 ||
          max_nbor_size == 1024 || max_nbor_size == 2048 ||
          max_nbor_size == 4096);
-  DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
-  DPErrcheck(gpuMemset(key, 0xffffffff,
-                       sizeof(uint_64) * int_64(nloc) * max_nbor_size));
+  DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nframes) * nloc * nnei));
+  DPErrcheck(
+      gpuMemset(key, 0xffffffff,
+                sizeof(uint_64) * int_64(nframes) * nloc * max_nbor_size));
   DPErrcheck(gpuMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
                        gpuMemcpyHostToDevice));
 
-  get_i_idx<<<nblock, LEN>>>(i_idx, nloc, gpu_inlist.ilist);
+  get_i_idx<<<nblock, LEN>>>(i_idx, nloc, nframes, gpu_inlist.ilist);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 
   if (max_nbor_size == 256) {
-    format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+    format_nbor_list_256(key, coord, type, gpu_inlist, nloc, nall, nframes,
+                         rcut, i_idx);
   } else if (max_nbor_size == 512) {
-    format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+    format_nbor_list_512(key, coord, type, gpu_inlist, nloc, nall, nframes,
+                         rcut, i_idx);
   } else if (max_nbor_size == 1024) {
-    format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+    format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, nall, nframes,
+                          rcut, i_idx);
   } else if (max_nbor_size == 2048) {
-    format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+    format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, nall, nframes,
+                          rcut, i_idx);
   } else if (max_nbor_size == 4096) {
-    format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
+    format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, nall, nframes,
+                          rcut, i_idx);
   }
 
-  fill_nei_iter<<<dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
-      nei_iter, key, nloc, max_nbor_size, sec.size());
+  const int_64 nrows = int_64(nframes) * nloc;
+  fill_nei_iter<<<dim3(nrows, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
+      nei_iter, key, nrows, max_nbor_size, sec.size());
 
-  format_nlist_fill_b<<<dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
-      nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
+  format_nlist_fill_b<<<dim3(nrows, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
+      nlist, nnei, nrows, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -638,6 +687,7 @@ void prod_env_mat_a_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec,
@@ -649,18 +699,22 @@ void prod_env_mat_a_gpu(FPTYPE* em,
   }
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(em_deriv, 0,
+                       sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt * 3));
+  DPErrcheck(
+      gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
+                       array_longlong, max_nbor_size, nloc, nall, nframes, rcut,
+                       sec);
   nborErrcheck(gpuGetLastError());
   nborErrcheck(gpuDeviceSynchronize());
 
-  compute_env_mat_a<FPTYPE, TPB><<<nloc, TPB>>>(
-      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  compute_env_mat_a<FPTYPE, TPB><<<int_64(nframes) * nloc, TPB>>>(
+      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut,
+      nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -680,6 +734,7 @@ void prod_env_mat_r_gpu(FPTYPE* em,
                         const FPTYPE* std,
                         const int nloc,
                         const int nall,
+                        const int nframes,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec) {
@@ -687,18 +742,22 @@ void prod_env_mat_r_gpu(FPTYPE* em,
   DPErrcheck(gpuDeviceSynchronize());
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(em_deriv, 0,
+                       sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt * 3));
+  DPErrcheck(
+      gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
+                       array_longlong, max_nbor_size, nloc, nall, nframes, rcut,
+                       sec);
   nborErrcheck(gpuGetLastError());
   nborErrcheck(gpuDeviceSynchronize());
 
-  compute_env_mat_r<FPTYPE, TPB><<<nloc, TPB>>>(
-      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  compute_env_mat_r<FPTYPE, TPB><<<int_64(nframes) * nloc, TPB>>>(
+      em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut,
+      nloc, nall);
   DPErrcheck(gpuGetLastError());
   DPErrcheck(gpuDeviceSynchronize());
 }
@@ -732,6 +791,7 @@ template void prod_env_mat_a_gpu<float>(float* em,
                                         const float* std,
                                         const int nloc,
                                         const int nall,
+                                        const int nframes,
                                         const float rcut,
                                         const float rcut_smth,
                                         const std::vector<int> sec,
@@ -750,6 +810,7 @@ template void prod_env_mat_a_gpu<double>(double* em,
                                          const double* std,
                                          const int nloc,
                                          const int nall,
+                                         const int nframes,
                                          const float rcut,
                                          const float rcut_smth,
                                          const std::vector<int> sec,
@@ -768,6 +829,7 @@ template void prod_env_mat_r_gpu<float>(float* em,
                                         const float* std,
                                         const int nloc,
                                         const int nall,
+                                        const int nframes,
                                         const float rcut,
                                         const float rcut_smth,
                                         const std::vector<int> sec);
@@ -785,6 +847,7 @@ template void prod_env_mat_r_gpu<double>(double* em,
                                          const double* std,
                                          const int nloc,
                                          const int nall,
+                                         const int nframes,
                                          const float rcut,
                                          const float rcut_smth,
                                          const std::vector<int> sec);
@@ -797,6 +860,7 @@ template void format_nbor_list_gpu<float>(int* nlist,
                                           const int max_nbor_size,
                                           const int nloc,
                                           const int nall,
+                                          const int nframes,
                                           const float rcut,
                                           const std::vector<int> sec);
 template void format_nbor_list_gpu<double>(int* nlist,
@@ -808,6 +872,7 @@ template void format_nbor_list_gpu<double>(int* nlist,
                                            const int max_nbor_size,
                                            const int nloc,
                                            const int nall,
+                                           const int nframes,
                                            const float rcut,
                                            const std::vector<int> sec);
 template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index 3a0d8eb122..5f40ae6472 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -877,23 +877,33 @@ int deepmd::build_nlist_cpu(InputNlist& nlist,
                             const int& nloc,
                             const int& nall,
                             const int& mem_size_,
-                            const float& rcut) {
+                            const float& rcut,
+                            const int& nframes,
+                            const int* type) {
   const int mem_size = mem_size_;
   *max_list_size = 0;
-  nlist.inum = nloc;
+  nlist.inum = nframes * nloc;
   FPTYPE rcut2 = rcut * rcut;
   std::vector<int> jlist;
   jlist.reserve(mem_size);
   for (int ii = 0; ii < nlist.inum; ++ii) {
-    nlist.ilist[ii] = ii;
+    const int atom_idx = ii % nloc;
+    const int frame_idx = ii / nloc;
+    const int frame_offset = frame_idx * nall;
+    nlist.ilist[ii] = atom_idx;
     jlist.clear();
     for (int jj = 0; jj < nall; ++jj) {
-      if (jj == ii) {
+      if (jj == atom_idx) {
+        continue;
+      }
+      if (type != nullptr &&
+          (type[frame_offset + atom_idx] < 0 || type[frame_offset + jj] < 0)) {
         continue;
       }
       FPTYPE diff[3];
       for (int dd = 0; dd < 3; ++dd) {
-        diff[dd] = c_cpy[ii * 3 + dd] - c_cpy[jj * 3 + dd];
+        diff[dd] = c_cpy[(frame_offset + atom_idx) * 3 + dd] -
+                   c_cpy[(frame_offset + jj) * 3 + dd];
       }
       FPTYPE diff2 = deepmd::dot3(diff, diff);
       if (diff2 < rcut2) {
@@ -963,7 +973,9 @@ template int deepmd::build_nlist_cpu<double>(InputNlist& nlist,
                                              const int& nloc,
                                              const int& nall,
                                              const int& mem_size,
-                                             const float& rcut);
+                                             const float& rcut,
+                                             const int& nframes,
+                                             const int* type);
 
 template int deepmd::build_nlist_cpu<float>(InputNlist& nlist,
                                             int* max_list_size,
@@ -971,7 +983,9 @@ template int deepmd::build_nlist_cpu<float>(InputNlist& nlist,
                                             const int& nloc,
                                             const int& nall,
                                             const int& mem_size,
-                                            const float& rcut);
+                                            const float& rcut,
+                                            const int& nframes,
+                                            const int* type);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 void deepmd::convert_nlist_gpu_device(InputNlist& gpu_nlist,
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index 302fac4bc9..b01f1276b9 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -24,6 +24,7 @@ void deepmd::prod_env_mat_a_cpu(FPTYPE* em,
                                 const FPTYPE* std,
                                 const int nloc,
                                 const int nall,
+                                const int nframes,
                                 const float rcut,
                                 const float rcut_smth,
                                 const std::vector<int> sec,
@@ -33,76 +34,99 @@ void deepmd::prod_env_mat_a_cpu(FPTYPE* em,
   }
   const int nnei = sec.back();
   const int nem = nnei * 4;
+  assert(nframes * nloc == inlist.inum);
 
-  // set & normalize coord
-  std::vector<FPTYPE> d_coord3(nall * 3);
-  for (int ii = 0; ii < nall; ++ii) {
-    for (int dd = 0; dd < 3; ++dd) {
-      d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
-    }
-  }
+  std::vector<std::vector<FPTYPE> > frame_coords(nframes);
+  std::vector<std::vector<int> > frame_f_types(nframes);
+  std::vector<std::vector<std::vector<int> > > frame_nlists(nframes);
 
-  // set type
-  std::vector<int> d_f_type(nall);
-  for (int ii = 0; ii < nall; ++ii) {
-    d_f_type[ii] = f_type[ii];
-  }
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int ff = 0; ff < nframes; ++ff) {
+      const FPTYPE* frame_coord = coord + static_cast<size_t>(ff) * nall * 3;
+      const int* frame_f_type = f_type + static_cast<size_t>(ff) * nall;
+      const int_64 row_offset = static_cast<int_64>(ff) * nloc;
+
+      frame_coords[ff].resize(static_cast<size_t>(nall) * 3);
+      for (int ii = 0; ii < nall; ++ii) {
+        for (int dd = 0; dd < 3; ++dd) {
+          frame_coords[ff][ii * 3 + dd] = frame_coord[ii * 3 + dd];
+        }
+      }
 
-  // build nlist
-  std::vector<std::vector<int> > d_nlist_a(nloc);
+      frame_f_types[ff].resize(nall);
+      for (int ii = 0; ii < nall; ++ii) {
+        frame_f_types[ff][ii] = frame_f_type[ii];
+      }
 
-  assert(nloc == inlist.inum);
-  for (unsigned ii = 0; ii < nloc; ++ii) {
-    d_nlist_a[ii].reserve(max_nbor_size);
-  }
-  for (unsigned ii = 0; ii < nloc; ++ii) {
-    int i_idx = inlist.ilist[ii];
-    for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) {
-      int j_idx = inlist.firstneigh[ii][jj];
-      d_nlist_a[i_idx].push_back(j_idx);
+      frame_nlists[ff].resize(nloc);
+      for (int ii = 0; ii < nloc; ++ii) {
+        frame_nlists[ff][ii].reserve(max_nbor_size);
+      }
+      for (int ii = 0; ii < nloc; ++ii) {
+        const int_64 row = row_offset + ii;
+        const int i_idx = inlist.ilist[row];
+        for (int jj = 0; jj < inlist.numneigh[row]; ++jj) {
+          const int j_idx = inlist.firstneigh[row][jj];
+          frame_nlists[ff][i_idx].push_back(j_idx);
+        }
+      }
     }
-  }
 
-#pragma omp parallel for
-  for (int ii = 0; ii < nloc; ++ii) {
-    std::vector<int> fmt_nlist_a;
-    int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii,
-                                 d_nlist_a[ii], rcut, sec);
-    std::vector<FPTYPE> d_em_a;
-    std::vector<FPTYPE> d_em_a_deriv;
-    std::vector<FPTYPE> d_em_r;
-    std::vector<FPTYPE> d_em_r_deriv;
-    std::vector<FPTYPE> d_rij_a;
-    env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii,
-                  fmt_nlist_a, sec, rcut_smth, rcut);
+#pragma omp for
+    for (int_64 row = 0; row < static_cast<int_64>(nframes) * nloc; ++row) {
+      const int ff = row / nloc;
+      const int ii = row % nloc;
+      const int_64 row_offset = static_cast<int_64>(ff) * nloc;
+      const int* frame_type = type + static_cast<size_t>(ff) * nall;
+      FPTYPE* frame_em = em + static_cast<size_t>(row_offset) * nem;
+      FPTYPE* frame_em_deriv =
+          em_deriv + static_cast<size_t>(row_offset) * nem * 3;
+      FPTYPE* frame_rij = rij + static_cast<size_t>(row_offset) * nnei * 3;
+      int* frame_nlist = nlist + static_cast<size_t>(row_offset) * nnei;
+      const std::vector<FPTYPE>& d_coord3 = frame_coords[ff];
+      const std::vector<int>& d_f_type = frame_f_types[ff];
+      std::vector<int> fmt_nlist_a;
+      format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii,
+                         frame_nlists[ff][ii], rcut, sec);
+      std::vector<FPTYPE> d_em_a;
+      std::vector<FPTYPE> d_em_a_deriv;
+      std::vector<FPTYPE> d_em_r;
+      std::vector<FPTYPE> d_em_r_deriv;
+      std::vector<FPTYPE> d_rij_a;
+      env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii,
+                    fmt_nlist_a, sec, rcut_smth, rcut);
 
-    // check sizes
-    assert(d_em_a.size() == nem);
-    assert(d_em_a_deriv.size() == nem * 3);
-    assert(d_rij_a.size() == nnei * 3);
-    assert(fmt_nlist_a.size() == nnei);
-    // record outputs
-    for (int jj = 0; jj < nem; ++jj) {
-      if (type[ii] >= 0) {
-        em[ii * nem + jj] =
-            (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj];
-      } else {
-        em[ii * nem + jj] = 0;
+      // check sizes
+      assert(d_em_a.size() == nem);
+      assert(d_em_a_deriv.size() == nem * 3);
+      assert(d_rij_a.size() == nnei * 3);
+      assert(fmt_nlist_a.size() == nnei);
+      // record outputs
+      for (int jj = 0; jj < nem; ++jj) {
+        if (frame_type[ii] >= 0) {
+          frame_em[ii * nem + jj] =
+              (d_em_a[jj] - avg[frame_type[ii] * nem + jj]) /
+              std[frame_type[ii] * nem + jj];
+        } else {
+          frame_em[ii * nem + jj] = 0;
+        }
       }
-    }
-    for (int jj = 0; jj < nem * 3; ++jj) {
-      if (type[ii] >= 0) {
-        em_deriv[ii * nem * 3 + jj] =
-            d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3];
-      } else {
-        em_deriv[ii * nem * 3 + jj] = 0;
+      for (int jj = 0; jj < nem * 3; ++jj) {
+        if (frame_type[ii] >= 0) {
+          frame_em_deriv[ii * nem * 3 + jj] =
+              d_em_a_deriv[jj] / std[frame_type[ii] * nem + jj / 3];
+        } else {
+          frame_em_deriv[ii * nem * 3 + jj] = 0;
+        }
+      }
+      for (int jj = 0; jj < nnei * 3; ++jj) {
+        frame_rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+      }
+      for (int jj = 0; jj < nnei; ++jj) {
+        frame_nlist[ii * nnei + jj] = fmt_nlist_a[jj];
       }
-    }
-    for (int jj = 0; jj < nnei * 3; ++jj) {
-      rij[ii * nnei * 3 + jj] = d_rij_a[jj];
-    }
-    for (int jj = 0; jj < nnei; ++jj) {
-      nlist[ii * nnei + jj] = fmt_nlist_a[jj];
     }
   }
 }
@@ -120,73 +144,103 @@ void deepmd::prod_env_mat_r_cpu(FPTYPE* em,
                                 const FPTYPE* std,
                                 const int nloc,
                                 const int nall,
+                                const int nframes,
                                 const float rcut,
                                 const float rcut_smth,
                                 const std::vector<int> sec) {
   const int nnei = sec.back();
   const int nem = nnei * 1;
+  assert(nframes * nloc == inlist.inum);
 
-  // set & normalize coord
-  std::vector<FPTYPE> d_coord3(nall * 3);
-  for (int ii = 0; ii < nall; ++ii) {
-    for (int dd = 0; dd < 3; ++dd) {
-      d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
-    }
-  }
+  std::vector<std::vector<FPTYPE> > frame_coords(nframes);
+  std::vector<std::vector<int> > frame_types(nframes);
+  std::vector<std::vector<std::vector<int> > > frame_nlists(nframes);
 
-  // set type
-  std::vector<int> d_type(nall);
-  for (int ii = 0; ii < nall; ++ii) {
-    d_type[ii] = type[ii];
-  }
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int ff = 0; ff < nframes; ++ff) {
+      const FPTYPE* frame_coord = coord + static_cast<size_t>(ff) * nall * 3;
+      const int* frame_type = type + static_cast<size_t>(ff) * nall;
+      const int_64 row_offset = static_cast<int_64>(ff) * nloc;
 
-  // build nlist
-  std::vector<std::vector<int> > d_nlist_a(nloc);
+      frame_coords[ff].resize(static_cast<size_t>(nall) * 3);
+      for (int ii = 0; ii < nall; ++ii) {
+        for (int dd = 0; dd < 3; ++dd) {
+          frame_coords[ff][ii * 3 + dd] = frame_coord[ii * 3 + dd];
+        }
+      }
 
-  assert(nloc == inlist.inum);
-  for (unsigned ii = 0; ii < nloc; ++ii) {
-    d_nlist_a[ii].reserve(max_nbor_size);
-  }
-  for (unsigned ii = 0; ii < nloc; ++ii) {
-    int i_idx = inlist.ilist[ii];
-    for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) {
-      int j_idx = inlist.firstneigh[ii][jj];
-      d_nlist_a[i_idx].push_back(j_idx);
+      frame_types[ff].resize(nall);
+      for (int ii = 0; ii < nall; ++ii) {
+        frame_types[ff][ii] = frame_type[ii];
+      }
+
+      frame_nlists[ff].resize(nloc);
+      for (int ii = 0; ii < nloc; ++ii) {
+        frame_nlists[ff][ii].reserve(max_nbor_size);
+      }
+      for (int ii = 0; ii < nloc; ++ii) {
+        const int_64 row = row_offset + ii;
+        const int i_idx = inlist.ilist[row];
+        for (int jj = 0; jj < inlist.numneigh[row]; ++jj) {
+          const int j_idx = inlist.firstneigh[row][jj];
+          frame_nlists[ff][i_idx].push_back(j_idx);
+        }
+      }
     }
-  }
 
-#pragma omp parallel for
-  for (int ii = 0; ii < nloc; ++ii) {
-    std::vector<int> fmt_nlist_a;
-    int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii,
-                                 d_nlist_a[ii], rcut, sec);
-    std::vector<FPTYPE> d_em_a;
-    std::vector<FPTYPE> d_em_a_deriv;
-    std::vector<FPTYPE> d_em_r;
-    std::vector<FPTYPE> d_em_r_deriv;
-    std::vector<FPTYPE> d_rij_a;
-    env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii,
-                  fmt_nlist_a, sec, rcut_smth, rcut);
+#pragma omp for
+    for (int_64 row = 0; row < static_cast<int_64>(nframes) * nloc; ++row) {
+      const int ff = row / nloc;
+      const int ii = row % nloc;
+      const int_64 row_offset = static_cast<int_64>(ff) * nloc;
+      FPTYPE* frame_em = em + static_cast<size_t>(row_offset) * nem;
+      FPTYPE* frame_em_deriv =
+          em_deriv + static_cast<size_t>(row_offset) * nem * 3;
+      FPTYPE* frame_rij = rij + static_cast<size_t>(row_offset) * nnei * 3;
+      int* frame_nlist = nlist + static_cast<size_t>(row_offset) * nnei;
+      const std::vector<FPTYPE>& d_coord3 = frame_coords[ff];
+      const std::vector<int>& d_type = frame_types[ff];
+      std::vector<int> fmt_nlist_a;
+      format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii,
+                         frame_nlists[ff][ii], rcut, sec);
+      std::vector<FPTYPE> d_em_a;
+      std::vector<FPTYPE> d_em_a_deriv;
+      std::vector<FPTYPE> d_em_r;
+      std::vector<FPTYPE> d_em_r_deriv;
+      std::vector<FPTYPE> d_rij_a;
+      env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii,
+                    fmt_nlist_a, sec, rcut_smth, rcut);
 
-    // check sizes
-    assert(d_em_a.size() == nem);
-    assert(d_em_a_deriv.size() == nem * 3);
-    assert(d_rij_a.size() == nnei * 3);
-    assert(fmt_nlist_a.size() == nnei);
-    // record outputs
-    for (int jj = 0; jj < nem; ++jj) {
-      em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) /
-                          std[d_type[ii] * nem + jj];
-    }
-    for (int jj = 0; jj < nem * 3; ++jj) {
-      em_deriv[ii * nem * 3 + jj] =
-          d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3];
-    }
-    for (int jj = 0; jj < nnei * 3; ++jj) {
-      rij[ii * nnei * 3 + jj] = d_rij_a[jj];
-    }
-    for (int jj = 0; jj < nnei; ++jj) {
-      nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+      // check sizes
+      assert(d_em_a.size() == nem);
+      assert(d_em_a_deriv.size() == nem * 3);
+      assert(d_rij_a.size() == nnei * 3);
+      assert(fmt_nlist_a.size() == nnei);
+      // record outputs
+      for (int jj = 0; jj < nem; ++jj) {
+        if (d_type[ii] >= 0) {
+          frame_em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) /
+                                    std[d_type[ii] * nem + jj];
+        } else {
+          frame_em[ii * nem + jj] = 0;
+        }
+      }
+      for (int jj = 0; jj < nem * 3; ++jj) {
+        if (d_type[ii] >= 0) {
+          frame_em_deriv[ii * nem * 3 + jj] =
+              d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3];
+        } else {
+          frame_em_deriv[ii * nem * 3 + jj] = 0;
+        }
+      }
+      for (int jj = 0; jj < nnei * 3; ++jj) {
+        frame_rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+      }
+      for (int jj = 0; jj < nnei; ++jj) {
+        frame_nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+      }
     }
   }
 }
@@ -203,6 +257,7 @@ template void deepmd::prod_env_mat_a_cpu<double>(double* em,
                                                  const double* std,
                                                  const int nloc,
                                                  const int nall,
+                                                 const int nframes,
                                                  const float rcut,
                                                  const float rcut_smth,
                                                  const std::vector<int> sec,
@@ -220,6 +275,7 @@ template void deepmd::prod_env_mat_a_cpu<float>(float* em,
                                                 const float* std,
                                                 const int nloc,
                                                 const int nall,
+                                                const int nframes,
                                                 const float rcut,
                                                 const float rcut_smth,
                                                 const std::vector<int> sec,
@@ -237,6 +293,7 @@ template void deepmd::prod_env_mat_r_cpu<double>(double* em,
                                                  const double* std,
                                                  const int nloc,
                                                  const int nall,
+                                                 const int nframes,
                                                  const float rcut,
                                                  const float rcut_smth,
                                                  const std::vector<int> sec);
@@ -253,6 +310,7 @@ template void deepmd::prod_env_mat_r_cpu<float>(float* em,
                                                 const float* std,
                                                 const int nloc,
                                                 const int nall,
+                                                const int nframes,
                                                 const float rcut,
                                                 const float rcut_smth,
                                                 const std::vector<int> sec);
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 3c309ca9ae..203291d58c 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <iostream>
 
 #include "device.h"
@@ -9,6 +10,17 @@
 #include "neighbor_list.h"
 #include "prod_env_mat.h"
 
+template <typename T>
+static std::vector<T> repeat_vector(const std::vector<T>& values,
+                                    const int repeats) {
+  std::vector<T> result;
+  result.reserve(static_cast<size_t>(repeats) * values.size());
+  for (int ii = 0; ii < repeats; ++ii) {
+    result.insert(result.end(), values.begin(), values.end());
+  }
+  return result;
+}
+
 class TestEnvMatA : public ::testing::Test {
  protected:
   std::vector<double> posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
@@ -592,6 +604,87 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) {
   // }
 }
 
+TEST_F(TestEnvMatA, prod_cpu_multiple_frames) {
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  for (int ii = 0; ii < nall; ++ii) {
+    const size_t offset = (static_cast<size_t>(nall) + ii) * 3;
+    posi_multi[offset] += 0.01 * (ii + 1);
+    posi_multi[offset + 1] -= 0.02 * (ii % 3);
+    posi_multi[offset + 2] += 0.015 * (ii % 5);
+  }
+  std::vector<int> atype_multi = repeat_vector(atype_cpy, nframes);
+
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
+  deepmd::prod_env_mat_a_cpu(
+      em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(),
+      atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc,
+      nall, nframes, rc, rc_smth, sec_a);
+
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_a_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a);
+
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    for (size_t jj = 0; jj < frame_em.size(); ++jj) {
+      EXPECT_LT(fabs(em[em_offset + jj] - frame_em[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_em_deriv.size(); ++jj) {
+      EXPECT_LT(fabs(em_deriv[deriv_offset + jj] - frame_em_deriv[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_rij.size(); ++jj) {
+      EXPECT_LT(fabs(rij[rij_offset + jj] - frame_rij[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_nlist.size(); ++jj) {
+      EXPECT_EQ(nlist[nlist_offset + jj], frame_nlist[jj]);
+    }
+  }
+}
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatA, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
@@ -643,10 +736,10 @@ TEST_F(TestEnvMatA, prod_gpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
+      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -671,6 +764,132 @@ TEST_F(TestEnvMatA, prod_gpu) {
   }
 }
 
+TEST_F(TestEnvMatA, prod_gpu_multiple_frames) {
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  } else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  } else {
+    max_nbor_size = 4096;
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data()),
+      gpu_inlist;
+
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  for (int ii = 0; ii < nall; ++ii) {
+    const size_t offset = (static_cast<size_t>(nall) + ii) * 3;
+    posi_multi[offset] += 0.01 * (ii + 1);
+    posi_multi[offset + 1] -= 0.02 * (ii % 3);
+    posi_multi[offset + 2] += 0.015 * (ii % 5);
+  }
+  std::vector<int> atype_multi = repeat_vector(atype_cpy, nframes);
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt, 0.0),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3, 0.0),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3, 0.0);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei, 0);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
+  std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
+                                     ndescrpt);
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_a_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a);
+    std::copy(
+        frame_em.begin(), frame_em.end(),
+        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+  }
+
+  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
+  double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
+  int *atype_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
+      *memory_dev = NULL;
+  uint_64* array_longlong_dev = NULL;
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_dev, posi_multi);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(atype_dev, atype_multi);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(
+      array_int_dev, sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc * sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc);
+  deepmd::malloc_device_memory(
+      array_longlong_dev,
+      static_cast<size_t>(nframes) * nloc * max_nbor_size * 2);
+  deepmd::malloc_device_memory(
+      memory_dev, static_cast<size_t>(nframes) * nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
+                                   max_nbor_size);
+
+  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev,
+                             atype_dev, gpu_inlist, array_int_dev,
+                             array_longlong_dev, max_nbor_size, avg_dev,
+                             std_dev, nloc, nall, nframes, rc, rc_smth, sec_a);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_dev);
+  deepmd::delete_device_memory(atype_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
+
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      for (int jj = 0; jj < nnei; ++jj) {
+        for (int dd = 0; dd < 4; ++dd) {
+          const int_64 idx =
+              (static_cast<int_64>(ff) * nloc + ii) * nnei * 4 + jj * 4 + dd;
+          EXPECT_LT(fabs(em[idx] - expected_multi[idx]), 1e-5);
+        }
+      }
+    }
+  }
+}
+
 TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
@@ -722,10 +941,10 @@ TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
+      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc
index e96311dafd..ae91256c1a 100644
--- a/source/lib/tests/test_env_mat_a_mix.cc
+++ b/source/lib/tests/test_env_mat_a_mix.cc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <iostream>
 
 #include "device.h"
@@ -9,6 +10,17 @@
 #include "neighbor_list.h"
 #include "prod_env_mat.h"
 
+template <typename T>
+static std::vector<T> repeat_vector(const std::vector<T>& values,
+                                    const int repeats) {
+  std::vector<T> result;
+  result.reserve(static_cast<size_t>(repeats) * values.size());
+  for (int ii = 0; ii < repeats; ++ii) {
+    result.insert(result.end(), values.begin(), values.end());
+  }
+  return result;
+}
+
 class TestEnvMatAMix : public ::testing::Test {
  protected:
   std::vector<double> posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
@@ -17,7 +29,7 @@ class TestEnvMatAMix : public ::testing::Test {
   std::vector<int> atype = {0, 1, 1, 0, 1, 1};
   std::vector<int> f_atype = {0, 0, 0, 0, 0, 0};
   std::vector<double> posi_cpy;
-  //   std::vector<int > atype_cpy;
+  std::vector<int> atype_cpy;
   std::vector<int> f_atype_cpy;
   int nloc, nall;
   double rc = 6;
@@ -125,6 +137,10 @@ class TestEnvMatAMix : public ::testing::Test {
     region.reinitBox(box);
     copy_coord(posi_cpy, f_atype_cpy, mapping, ncell, ngcell, posi, f_atype, rc,
                region);
+    atype_cpy.resize(mapping.size());
+    for (int ii = 0; ii < mapping.size(); ++ii) {
+      atype_cpy[ii] = atype[mapping[ii]];
+    }
     nloc = posi.size() / 3;
     nall = posi_cpy.size() / 3;
     nat_stt.resize(3);
@@ -542,7 +558,7 @@ TEST_F(TestEnvMatAMix, prod_cpu) {
   std::vector<double> avg(ntypes * ndescrpt, 0);
   std::vector<double> std(ntypes * ndescrpt, 1);
   deepmd::prod_env_mat_a_cpu(&em[0], &em_deriv[0], &rij[0], &nlist[0],
-                             &posi_cpy[0], &atype[0], inlist, max_nbor_size,
+                             &posi_cpy[0], &atype_cpy[0], inlist, max_nbor_size,
                              &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a,
                              &f_atype_cpy[0]);
   deepmd::use_nei_info_cpu(&nlist[0], &ntype[0], nmask, &atype[0], &mapping[0],
@@ -583,7 +599,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) {
   std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
   std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
   deepmd::prod_env_mat_a_cpu(&em[0], &em_deriv[0], &rij[0], &nlist[0],
-                             &posi_cpy[0], &atype[0], inlist, max_nbor_size,
+                             &posi_cpy[0], &atype_cpy[0], inlist, max_nbor_size,
                              &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a,
                              &f_atype_cpy[0]);
 
@@ -630,6 +646,83 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) {
   // }
 }
 
+TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) {
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  std::vector<int> atype_multi = repeat_vector(atype_cpy, nframes);
+  for (int ii = nloc; ii < nall; ++ii) {
+    atype_multi[ii] = 0;
+  }
+  for (int ii = 0; ii < nloc; ++ii) {
+    atype_multi[static_cast<size_t>(nall) + ii] = 1;
+  }
+  std::vector<int> f_atype_multi = repeat_vector(f_atype_cpy, nframes);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0.0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1.0);
+  for (int jj = 0; jj < ndescrpt; ++jj) {
+    avg[static_cast<size_t>(ndescrpt) + jj] = 0.125 + 0.001 * (jj % 17);
+  }
+
+  std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
+                                     ndescrpt);
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_a_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a,
+        f_atype_multi.data() + static_cast<size_t>(ff) * nall);
+    std::copy(
+        frame_em.begin(), frame_em.end(),
+        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+  }
+
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei);
+  deepmd::prod_env_mat_a_cpu(
+      em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(),
+      atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc,
+      nall, nframes, rc, rc_smth, sec_a, f_atype_multi.data());
+
+  for (int ii = 0; ii < em.size(); ++ii) {
+    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10);
+  }
+}
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatAMix, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
@@ -667,9 +760,9 @@ TEST_F(TestEnvMatAMix, prod_gpu) {
   double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
   bool* nmask_dev = NULL;
   double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL,
-      *ntype_dev = NULL, *mapping_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
+  int *f_atype_cpy_dev = NULL, *atype_cpy_dev = NULL, *atype_dev = NULL,
+      *nlist_dev = NULL, *ntype_dev = NULL, *mapping_dev = NULL,
+      *array_int_dev = NULL, *memory_dev = NULL;
   uint_64* array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
@@ -678,6 +771,7 @@ TEST_F(TestEnvMatAMix, prod_gpu) {
   deepmd::malloc_device_memory_sync(avg_dev, avg);
   deepmd::malloc_device_memory_sync(std_dev, std);
   deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
   deepmd::malloc_device_memory_sync(atype_dev, atype);
   deepmd::malloc_device_memory_sync(nlist_dev, nlist);
   deepmd::malloc_device_memory_sync(ntype_dev, ntype);
@@ -692,9 +786,9 @@ TEST_F(TestEnvMatAMix, prod_gpu) {
                                    max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a, f_atype_cpy_dev);
 
   deepmd::use_nei_info_gpu(nlist_dev, ntype_dev, nmask_dev, atype_dev,
                            mapping_dev, nloc, nnei, ntypes, true);
@@ -709,6 +803,7 @@ TEST_F(TestEnvMatAMix, prod_gpu) {
   deepmd::delete_device_memory(nmask_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
   deepmd::delete_device_memory(f_atype_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
   deepmd::delete_device_memory(atype_dev);
   deepmd::delete_device_memory(mapping_dev);
   deepmd::delete_device_memory(array_int_dev);
@@ -763,7 +858,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
 
   double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
   double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL,
+  int *f_atype_cpy_dev = NULL, *atype_cpy_dev = NULL, *nlist_dev = NULL,
       *array_int_dev = NULL, *memory_dev = NULL;
   uint_64* array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
@@ -774,7 +869,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
   deepmd::malloc_device_memory_sync(std_dev, std);
 
   deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy);
-  deepmd::malloc_device_memory_sync(atype_dev, atype);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
   deepmd::malloc_device_memory_sync(nlist_dev, nlist);
   deepmd::malloc_device_memory(array_int_dev,
                                sec_a.size() + nloc * sec_a.size() + nloc);
@@ -785,9 +880,9 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
                                    max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a, f_atype_cpy_dev);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
@@ -797,7 +892,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
   deepmd::delete_device_memory(f_atype_cpy_dev);
-  deepmd::delete_device_memory(atype_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
   deepmd::delete_device_memory(array_int_dev);
   deepmd::delete_device_memory(array_longlong_dev);
   deepmd::delete_device_memory(avg_dev);
@@ -847,4 +942,134 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
     }
   }
 }
+
+TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) {
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  } else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  } else {
+    max_nbor_size = 4096;
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data()),
+      gpu_inlist;
+
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  std::vector<int> atype_multi = repeat_vector(atype_cpy, nframes);
+  for (int ii = nloc; ii < nall; ++ii) {
+    atype_multi[ii] = 0;
+  }
+  for (int ii = 0; ii < nloc; ++ii) {
+    atype_multi[static_cast<size_t>(nall) + ii] = 1;
+  }
+  std::vector<int> f_atype_multi = repeat_vector(f_atype_cpy, nframes);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0.0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1.0);
+  for (int jj = 0; jj < ndescrpt; ++jj) {
+    avg[static_cast<size_t>(ndescrpt) + jj] = 0.125 + 0.001 * (jj % 17);
+  }
+
+  std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
+                                     ndescrpt);
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_a_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a,
+        f_atype_multi.data() + static_cast<size_t>(ff) * nall);
+    std::copy(
+        frame_em.begin(), frame_em.end(),
+        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+  }
+
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt, 0.0),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3, 0.0),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3, 0.0);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei, 0);
+
+  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
+  double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
+  int *atype_dev = NULL, *f_atype_dev = NULL, *nlist_dev = NULL,
+      *array_int_dev = NULL, *memory_dev = NULL;
+  uint_64* array_longlong_dev = NULL;
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_dev, posi_multi);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(atype_dev, atype_multi);
+  deepmd::malloc_device_memory_sync(f_atype_dev, f_atype_multi);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(
+      array_int_dev, sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc * sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc);
+  deepmd::malloc_device_memory(
+      array_longlong_dev,
+      static_cast<size_t>(nframes) * nloc * max_nbor_size * 2);
+  deepmd::malloc_device_memory(
+      memory_dev, static_cast<size_t>(nframes) * nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
+                                   max_nbor_size);
+
+  deepmd::prod_env_mat_a_gpu(
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev, atype_dev, gpu_inlist,
+      array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc,
+      nall, nframes, rc, rc_smth, sec_a, f_atype_dev);
+
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(rij_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_dev);
+  deepmd::delete_device_memory(atype_dev);
+  deepmd::delete_device_memory(f_atype_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
+
+  for (int ii = 0; ii < em.size(); ++ii) {
+    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-5)
+        << "index " << ii << " em " << em[ii] << " expected "
+        << expected_multi[ii];
+  }
+}
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 96da7e6963..82d6b6c229 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -302,6 +302,41 @@ TEST_F(TestEnvMatR, prod_cpu) {
   }
 }
 
+TEST_F(TestEnvMatR, prod_cpu_negative_center_type) {
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+  convert_nlist(inlist, nlist_a_cpy);
+
+  std::vector<int> atype_neg = atype_cpy;
+  atype_neg[0] = -1;
+  std::vector<double> em(static_cast<size_t>(nloc) * ndescrpt, -1.0),
+      em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3, -1.0),
+      rij(static_cast<size_t>(nloc) * nnei * 3);
+  std::vector<int> nlist(static_cast<size_t>(nloc) * nnei);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
+  deepmd::prod_env_mat_r_cpu(em.data(), em_deriv.data(), rij.data(),
+                             nlist.data(), posi_cpy.data(), atype_neg.data(),
+                             inlist, max_nbor_size, avg.data(), std.data(),
+                             nloc, nall, rc, rc_smth, sec_a);
+
+  for (int jj = 0; jj < ndescrpt; ++jj) {
+    EXPECT_EQ(em[jj], 0.0);
+  }
+  for (int jj = 0; jj < ndescrpt * 3; ++jj) {
+    EXPECT_EQ(em_deriv[jj], 0.0);
+  }
+}
+
 TEST_F(TestEnvMatR, prod_cpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
@@ -358,6 +393,89 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) {
   }
 }
 
+TEST_F(TestEnvMatR, prod_cpu_multiple_frames) {
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data());
+
+  std::vector<double> posi_multi = posi_cpy;
+  posi_multi.insert(posi_multi.end(), posi_cpy.begin(), posi_cpy.end());
+  for (int ii = 0; ii < nall; ++ii) {
+    const size_t offset = (static_cast<size_t>(nall) + ii) * 3;
+    posi_multi[offset] += 0.01 * (ii + 1);
+    posi_multi[offset + 1] -= 0.02 * (ii % 3);
+    posi_multi[offset + 2] += 0.015 * (ii % 5);
+  }
+  std::vector<int> atype_multi = atype_cpy;
+  atype_multi.insert(atype_multi.end(), atype_cpy.begin(), atype_cpy.end());
+
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei);
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
+  deepmd::prod_env_mat_r_cpu(
+      em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(),
+      atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc,
+      nall, nframes, rc, rc_smth, sec_a);
+
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_r_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a);
+
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    for (size_t jj = 0; jj < frame_em.size(); ++jj) {
+      EXPECT_LT(fabs(em[em_offset + jj] - frame_em[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_em_deriv.size(); ++jj) {
+      EXPECT_LT(fabs(em_deriv[deriv_offset + jj] - frame_em_deriv[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_rij.size(); ++jj) {
+      EXPECT_LT(fabs(rij[rij_offset + jj] - frame_rij[jj]), 1e-10);
+    }
+    for (size_t jj = 0; jj < frame_nlist.size(); ++jj) {
+      EXPECT_EQ(nlist[nlist_offset + jj], frame_nlist[jj]);
+    }
+  }
+}
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatR, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
@@ -410,10 +528,10 @@ TEST_F(TestEnvMatR, prod_gpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
+      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -489,10 +607,10 @@ TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(
+      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
+      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
+      std_dev, nloc, nall, 1, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index 6cd24b556a..511a2be949 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -359,7 +359,7 @@ TEST_F(TestFormatNlist, gpu) {
   // format nlist
   format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
                        array_int_dev, array_longlong_dev, max_nbor_size, nloc,
-                       nall, rc, sec_a);
+                       nall, 1, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
@@ -419,7 +419,7 @@ TEST_F(TestFormatNlistShortSel, gpu) {
   // format nlist
   format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
                        array_int_dev, array_longlong_dev, max_nbor_size, nloc,
-                       nall, rc, sec_a);
+                       nall, 1, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc
index a700df97b5..a643c8c321 100644
--- a/source/lib/tests/test_neighbor_list.cc
+++ b/source/lib/tests/test_neighbor_list.cc
@@ -1,10 +1,23 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <gtest/gtest.h>
 
+#include <vector>
+
 #include "device.h"
 #include "fmt_nlist.h"
 #include "neighbor_list.h"
 
+template <typename T>
+static std::vector<T> repeat_vector(const std::vector<T>& values,
+                                    const int repeats) {
+  std::vector<T> result;
+  result.reserve(static_cast<size_t>(repeats) * values.size());
+  for (int ii = 0; ii < repeats; ++ii) {
+    result.insert(result.end(), values.begin(), values.end());
+  }
+  return result;
+}
+
 class TestNeighborList : public ::testing::Test {
  protected:
   std::vector<double> posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
@@ -110,6 +123,41 @@ TEST_F(TestNeighborList, cpu) {
   delete[] firstneigh;
 }
 
+TEST_F(TestNeighborList, cpu_multiple_frames) {
+  constexpr int nframes = 2;
+  const int nrows = nframes * nloc;
+  int mem_size = 10;
+  std::vector<int> ilist(nrows);
+  std::vector<int> numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  std::vector<int> jlist(static_cast<size_t>(nrows) * mem_size);
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  for (int ii = 0; ii < nrows; ++ii) {
+    firstneigh[ii] = jlist.data() + ii * mem_size;
+  }
+
+  deepmd::InputNlist nlist(nrows, ilist.data(), numneigh.data(),
+                           firstneigh.data());
+  int max_list_size;
+  int ret = build_nlist_cpu(nlist, &max_list_size, posi_multi.data(), nloc,
+                            nall, mem_size, rc, nframes);
+  EXPECT_EQ(ret, 0);
+  EXPECT_EQ(nlist.inum, nrows);
+  EXPECT_EQ(max_list_size, 5);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      EXPECT_EQ(nlist.ilist[row], ii);
+      EXPECT_EQ(nlist.numneigh[row], expect_nlist_cpy[ii].size());
+      std::sort(nlist.firstneigh[row],
+                nlist.firstneigh[row] + nlist.numneigh[row]);
+      for (int jj = 0; jj < nlist.numneigh[row]; ++jj) {
+        EXPECT_EQ(nlist.firstneigh[row][jj], expect_nlist_cpy[ii][jj]);
+      }
+    }
+  }
+}
+
 TEST_F(TestNeighborList, cpu_lessmem) {
   int mem_size = 2;
   int* ilist = new int[nloc];
@@ -196,6 +244,71 @@ TEST_F(TestNeighborList, gpu) {
   deepmd::delete_device_memory(c_cpy_dev);
 }
 
+TEST_F(TestNeighborList, gpu_multiple_frames) {
+  constexpr int nframes = 2;
+  const int nrows = nframes * nloc;
+  int mem_size = 48;
+
+  int *nlist_data_dev = NULL, *jlist_dev = NULL, *ilist_dev = NULL,
+      *numneigh_dev = NULL;
+  int** firstneigh_dev = NULL;
+  std::vector<int*> temp_firstneigh(nrows);
+  std::vector<double> posi_multi = repeat_vector(posi_cpy, nframes);
+  double* c_cpy_dev = NULL;
+
+  deepmd::malloc_device_memory(nlist_data_dev, 2 * nrows * mem_size);
+  deepmd::malloc_device_memory(jlist_dev, nrows * mem_size);
+  deepmd::malloc_device_memory(ilist_dev, nrows);
+  deepmd::malloc_device_memory(numneigh_dev, nrows);
+  for (int ii = 0; ii < nrows; ++ii) {
+    temp_firstneigh[ii] = jlist_dev + ii * mem_size;
+  }
+  deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh);
+  deepmd::malloc_device_memory_sync(c_cpy_dev, posi_multi);
+  deepmd::InputNlist nlist_dev(nrows, ilist_dev, numneigh_dev, firstneigh_dev);
+
+  int max_list_size;
+  int ret =
+      deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev,
+                              c_cpy_dev, nloc, nall, mem_size, rc, nframes);
+
+  EXPECT_EQ(ret, 0);
+  std::vector<int> ilist(nrows);
+  std::vector<int> numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  std::vector<int> jlist(nrows * mem_size);
+  deepmd::memcpy_device_to_host(jlist_dev, jlist.data(), nrows * mem_size);
+  deepmd::memcpy_device_to_host(ilist_dev, ilist.data(), nrows);
+  deepmd::memcpy_device_to_host(numneigh_dev, numneigh.data(), nrows);
+  for (int ii = 0; ii < nrows; ++ii) {
+    firstneigh[ii] = jlist.data() + ii * mem_size;
+  }
+
+  deepmd::InputNlist nlist(nlist_dev.inum, ilist.data(), numneigh.data(),
+                           firstneigh.data());
+  EXPECT_EQ(nlist.inum, nrows);
+  EXPECT_EQ(max_list_size, 5);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      EXPECT_EQ(nlist.ilist[row], ii);
+      EXPECT_EQ(nlist.numneigh[row], expect_nlist_cpy[ii].size());
+      std::sort(nlist.firstneigh[row],
+                nlist.firstneigh[row] + nlist.numneigh[row]);
+      for (int jj = 0; jj < nlist.numneigh[row]; ++jj) {
+        EXPECT_EQ(nlist.firstneigh[row][jj], expect_nlist_cpy[ii][jj]);
+      }
+    }
+  }
+
+  deepmd::delete_device_memory(nlist_data_dev);
+  deepmd::delete_device_memory(jlist_dev);
+  deepmd::delete_device_memory(ilist_dev);
+  deepmd::delete_device_memory(numneigh_dev);
+  deepmd::delete_device_memory(firstneigh_dev);
+  deepmd::delete_device_memory(c_cpy_dev);
+}
+
 TEST_F(TestNeighborList, gpu_lessmem) {
   int mem_size = 47;
 
diff --git a/source/op/tf/custom_op.h b/source/op/tf/custom_op.h
index baf95f3fa3..1dfb680247 100644
--- a/source/op/tf/custom_op.h
+++ b/source/op/tf/custom_op.h
@@ -28,28 +28,28 @@ void safe_compute(OpKernelContext* context,
 };
 
 template <typename FPTYPE>
-void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                              Tensor* tensor_list,
-                              FPTYPE const** coord,
-                              FPTYPE*& coord_cpy,
-                              int const** type,
-                              int*& type_cpy,
-                              int*& idx_mapping,
-                              deepmd::InputNlist& inlist,
-                              int*& ilist,
-                              int*& numneigh,
-                              int**& firstneigh,
-                              int*& jlist,
-                              int*& nbor_list_dev,
-                              int& new_nall,
-                              int& mem_cpy,
-                              int& mem_nnei,
-                              int& max_nbor_size,
-                              const FPTYPE* box,
-                              const int* mesh_tensor_data,
-                              const int mesh_tensor_size,
-                              const int& nloc,
-                              const int& nei_mode,
-                              const float& rcut_r,
-                              const int& max_cpy_trial,
-                              const int& max_nnei_trial);
+tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context,
+                                            Tensor* tensor_list,
+                                            FPTYPE const** coord,
+                                            FPTYPE*& coord_cpy,
+                                            int const** type,
+                                            int*& type_cpy,
+                                            int*& idx_mapping,
+                                            deepmd::InputNlist& inlist,
+                                            int*& ilist,
+                                            int*& numneigh,
+                                            int**& firstneigh,
+                                            int*& jlist,
+                                            int*& nbor_list_dev,
+                                            int& new_nall,
+                                            int& mem_cpy,
+                                            int& mem_nnei,
+                                            int& max_nbor_size,
+                                            const FPTYPE* box,
+                                            const int* mesh_tensor_data,
+                                            const int mesh_tensor_size,
+                                            const int& nloc,
+                                            const int& nei_mode,
+                                            const float& rcut_r,
+                                            const int& max_cpy_trial,
+                                            const int& max_nnei_trial);
diff --git a/source/op/tf/neighbor_stat.cc b/source/op/tf/neighbor_stat.cc
index 26f13b0c84..0e3ec0a942 100644
--- a/source/op/tf/neighbor_stat.cc
+++ b/source/op/tf/neighbor_stat.cc
@@ -149,8 +149,16 @@ class NeighborStatOp : public OpKernel {
 
       int* idx_mapping = NULL;
       int *ilist = NULL, *numneigh = NULL;
-      int** firstneigh = NULL;
-      deepmd::malloc_device_memory(firstneigh, nloc);
+      struct FirstneighGuard {
+        int** ptr = NULL;
+        ~FirstneighGuard() {
+          if (ptr != NULL) {
+            deepmd::delete_device_memory(ptr);
+          }
+        }
+      } firstneigh_guard;
+      deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc);
+      int** firstneigh = firstneigh_guard.ptr;
       int* jlist = NULL;
       FPTYPE* coord_cpy;
       int* type_cpy;
@@ -159,12 +167,14 @@ class NeighborStatOp : public OpKernel {
       deepmd::InputNlist gpu_inlist;
       int* nbor_list_dev = NULL;
       // prepare coord and nlist
-      _prepare_coord_nlist_gpu<FPTYPE>(
-          context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
-          idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-          nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist,
-          box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-          rcut, max_cpy_trial, max_nnei_trial);
+      OP_REQUIRES_OK(
+          context,
+          _prepare_coord_nlist_gpu<FPTYPE>(
+              context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
+              idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+              nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist,
+              box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc,
+              nei_mode, rcut, max_cpy_trial, max_nnei_trial));
 
       TensorShape min_nbor_dist_shape;
       min_nbor_dist_shape.AddDim(static_cast<int64_t>(nloc) * mem_nnei);
@@ -177,7 +187,6 @@ class NeighborStatOp : public OpKernel {
       deepmd::neighbor_stat_gpu<FPTYPE>(coord, type, nloc, gpu_inlist,
                                         max_nbor_size, min_nbor_dist, ntypes,
                                         mem_nnei);
-      deepmd::delete_device_memory(firstneigh);
 #endif
     } else {
       for (int ii = 0;
diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc
index e374102224..d83df10977 100644
--- a/source/op/tf/prod_env_mat_multi_device.cc
+++ b/source/op/tf/prod_env_mat_multi_device.cc
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
+#include <algorithm>
+
 #include "coord.h"
 #include "custom_op.h"
 #include "device.h"
@@ -192,6 +194,19 @@ static int _norm_copy_coord_cpu(std::vector<FPTYPE>& coord_cpy,
                                 const int& max_cpy_trial,
                                 const float& rcut_r);
 
+template <typename FPTYPE>
+static int _norm_copy_coord_cpu_frame(FPTYPE* coord_cpy,
+                                      int* type_cpy,
+                                      int* idx_mapping,
+                                      int& frame_nall,
+                                      const int& mem_cpy,
+                                      const FPTYPE* coord,
+                                      const FPTYPE* box,
+                                      const int* type,
+                                      const int& nall,
+                                      const int& nloc,
+                                      const float& rcut_r);
+
 template <typename FPTYPE>
 static int _build_nlist_cpu(std::vector<int>& ilist,
                             std::vector<int>& numneigh,
@@ -203,7 +218,9 @@ static int _build_nlist_cpu(std::vector<int>& ilist,
                             const int& nloc,
                             const int& new_nall,
                             const int& max_nnei_trial,
-                            const float& rcut_r);
+                            const float& rcut_r,
+                            const int& nframes = 1,
+                            const int* type = NULL);
 
 static void _map_nlist_cpu(int* nlist,
                            const int* idx_mapping,
@@ -220,6 +237,18 @@ static void _map_nei_info_cpu(int* nlist,
                               const int& ntypes,
                               const bool& b_nlist_map);
 
+static tensorflow::Status _prepare_mesh_nlist_cpu_batch(
+    deepmd::InputNlist& inlist,
+    std::vector<int>& ilist,
+    std::vector<int>& numneigh,
+    std::vector<int*>& firstneigh,
+    std::vector<std::vector<int>>& jlist,
+    int& max_nbor_size,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int nloc,
+    const int nframes);
+
 template <typename FPTYPE>
 static void _prepare_coord_nlist_cpu(OpKernelContext* context,
                                      FPTYPE const** coord,
@@ -244,6 +273,33 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context,
                                      const int& max_cpy_trial,
                                      const int& max_nnei_trial);
 
+template <typename FPTYPE>
+static tensorflow::Status _prepare_coord_nlist_cpu_batch(
+    FPTYPE const** coord,
+    std::vector<FPTYPE>& coord_cpy,
+    int const** type,
+    std::vector<int>& type_cpy,
+    std::vector<int>& idx_mapping,
+    deepmd::InputNlist& inlist,
+    std::vector<int>& ilist,
+    std::vector<int>& numneigh,
+    std::vector<int*>& firstneigh,
+    std::vector<std::vector<int>>& jlist,
+    int& new_nall,
+    int& mem_cpy,
+    int& mem_nnei,
+    int& max_nbor_size,
+    const FPTYPE* box,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int& nloc,
+    const int& nall,
+    const int& nframes,
+    const int& nei_mode,
+    const float& rcut_r,
+    const int& max_cpy_trial,
+    const int& max_nnei_trial);
+
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 static int _norm_copy_coord_gpu(OpKernelContext* context,
@@ -260,6 +316,19 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 const int& max_cpy_trial,
                                 const float& rcut_r);
 
+template <typename FPTYPE>
+static int _norm_copy_coord_gpu_frame(OpKernelContext* context,
+                                      FPTYPE* coord_cpy,
+                                      int* type_cpy,
+                                      int* idx_mapping,
+                                      int& frame_nall,
+                                      const int& mem_cpy,
+                                      const FPTYPE* coord,
+                                      const FPTYPE* box,
+                                      const int* type,
+                                      const int& nloc,
+                                      const float& rcut_r);
+
 template <typename FPTYPE>
 static int _build_nlist_gpu(OpKernelContext* context,
                             Tensor* tensor_list,
@@ -273,7 +342,9 @@ static int _build_nlist_gpu(OpKernelContext* context,
                             const int& nloc,
                             const int& new_nall,
                             const int& max_nnei_trial,
-                            const float& rcut_r);
+                            const float& rcut_r,
+                            const int& nframes = 1,
+                            const int* type = NULL);
 
 static void _map_nlist_gpu(int* nlist,
                            const int* idx_mapping,
@@ -290,32 +361,58 @@ static void _map_nei_info_gpu(int* nlist,
                               const int& ntypes,
                               const bool& b_nlist_map);
 
+static tensorflow::Status _prepare_mesh_nlist_gpu_batch(
+    OpKernelContext* context,
+    Tensor* tensor_list,
+    deepmd::InputNlist& gpu_inlist,
+    int** firstneigh,
+    int& max_nbor_size,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int nloc,
+    const int nframes);
+
+static tensorflow::Status _round_built_gpu_nbor_size(int& max_nbor_size);
+
+template <typename T>
+struct DeviceMemoryGuard {
+  T* ptr = NULL;
+  DeviceMemoryGuard() = default;
+  DeviceMemoryGuard(const DeviceMemoryGuard&) = delete;
+  DeviceMemoryGuard& operator=(const DeviceMemoryGuard&) = delete;
+  ~DeviceMemoryGuard() {
+    if (ptr != NULL) {
+      deepmd::delete_device_memory(ptr);
+    }
+  }
+};
+
 template <typename FPTYPE>
-void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                              Tensor* tensor_list,
-                              FPTYPE const** coord,
-                              FPTYPE*& coord_cpy,
-                              int const** type,
-                              int*& type_cpy,
-                              int*& idx_mapping,
-                              deepmd::InputNlist& inlist,
-                              int*& ilist,
-                              int*& numneigh,
-                              int**& firstneigh,
-                              int*& jlist,
-                              int*& nbor_list_dev,
-                              int& new_nall,
-                              int& mem_cpy,
-                              int& mem_nnei,
-                              int& max_nbor_size,
-                              const FPTYPE* box,
-                              const int* mesh_tensor_data,
-                              const int mesh_tensor_size,
-                              const int& nloc,
-                              const int& nei_mode,
-                              const float& rcut_r,
-                              const int& max_cpy_trial,
-                              const int& max_nnei_trial);
+tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context,
+                                            Tensor* tensor_list,
+                                            FPTYPE const** coord,
+                                            FPTYPE*& coord_cpy,
+                                            int const** type,
+                                            int*& type_cpy,
+                                            int*& idx_mapping,
+                                            deepmd::InputNlist& inlist,
+                                            int*& ilist,
+                                            int*& numneigh,
+                                            int**& firstneigh,
+                                            int*& jlist,
+                                            int*& nbor_list_dev,
+                                            int& new_nall,
+                                            int& mem_cpy,
+                                            int& mem_nnei,
+                                            int& max_nbor_size,
+                                            const FPTYPE* box,
+                                            const int* mesh_tensor_data,
+                                            const int mesh_tensor_size,
+                                            const int& nloc,
+                                            const int& nei_mode,
+                                            const float& rcut_r,
+                                            const int& max_cpy_trial,
+                                            const int& max_nnei_trial);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -485,6 +582,176 @@ class ProdEnvMatAOp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    if (device == "CPU" && nei_mode != 3) {
+      const FPTYPE* coord = p_coord;
+      const int* type = p_type;
+      int frame_nall = nall;
+      std::vector<FPTYPE> coord_cpy;
+      std::vector<int> type_cpy;
+      std::vector<int> idx_mapping;
+      std::vector<int> ilist;
+      std::vector<int> numneigh;
+      std::vector<int*> firstneigh;
+      std::vector<std::vector<int>> jlist;
+      deepmd::InputNlist batch_inlist;
+      int batch_max_nbor_size = max_nbor_size;
+      OP_REQUIRES_OK(
+          context,
+          _prepare_coord_nlist_cpu_batch<FPTYPE>(
+              &coord, coord_cpy, &type, type_cpy, idx_mapping, batch_inlist,
+              ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei,
+              batch_max_nbor_size, p_box, mesh_tensor.flat<int>().data(),
+              static_cast<int>(mesh_tensor.NumElements()), nloc, nall, nsamples,
+              nei_mode, rcut_r, max_cpy_trial, max_nnei_trial));
+      max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size);
+
+      deepmd::prod_env_mat_a_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type,
+                                 batch_inlist, batch_max_nbor_size, avg, std,
+                                 nloc, frame_nall, nsamples, rcut_r,
+                                 rcut_r_smth, sec_a);
+      if (nei_mode == 1) {
+        for (int kk = 0; kk < nsamples; ++kk) {
+          _map_nlist_cpu(
+              p_nlist + static_cast<int_64>(kk) * nloc * nnei,
+              idx_mapping.data() + static_cast<int_64>(kk) * frame_nall, nloc,
+              nnei);
+        }
+      }
+      return;
+    }
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) {
+      const int frame_chunk = nsamples;
+      for (int ff = 0; ff < nsamples; ff += frame_chunk) {
+        const int chunk_nframes = std::min(frame_chunk, nsamples - ff);
+        const int nrows = chunk_nframes * nloc;
+        FPTYPE* em = p_em + static_cast<int_64>(ff) * nloc * ndescrpt;
+        FPTYPE* em_deriv =
+            p_em_deriv + static_cast<int_64>(ff) * nloc * ndescrpt * 3;
+        FPTYPE* rij = p_rij + static_cast<int_64>(ff) * nloc * nnei * 3;
+        int* nlist = p_nlist + static_cast<int_64>(ff) * nloc * nnei;
+        const FPTYPE* coord = p_coord + static_cast<int_64>(ff) * nall * 3;
+        const int* type = p_type + static_cast<int_64>(ff) * nall;
+        int* idx_mapping = NULL;
+        int frame_nall = nall;
+        Tensor coord_cpy_tensor;
+        Tensor type_cpy_tensor;
+        if (nei_mode == 1) {
+          int copy_ok = 0;
+          for (int tt = 0; tt < max_cpy_trial; ++tt) {
+            TensorShape cpy_shape;
+            cpy_shape.AddDim(static_cast<int64_t>(chunk_nframes) * mem_cpy * 3);
+            OP_REQUIRES_OK(
+                context, context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &coord_cpy_tensor));
+            TensorShape t_shape;
+            t_shape.AddDim(static_cast<int64_t>(chunk_nframes) * mem_cpy * 2);
+            OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                           &type_cpy_tensor));
+            FPTYPE* coord_cpy = coord_cpy_tensor.flat<FPTYPE>().data();
+            int* type_cpy = type_cpy_tensor.flat<int>().data();
+            idx_mapping = type_cpy + int_64(chunk_nframes) * mem_cpy;
+            DPErrcheck(
+                gpuMemset(type_cpy, -1,
+                          sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2));
+
+            copy_ok = 1;
+            for (int kk = 0; kk < chunk_nframes; ++kk) {
+              int frame_copied_nall = nall;
+              int ret = _norm_copy_coord_gpu_frame(
+                  context, coord_cpy + int_64(kk) * mem_cpy * 3,
+                  type_cpy + int_64(kk) * mem_cpy,
+                  idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall,
+                  mem_cpy, p_coord + static_cast<int_64>(ff + kk) * nall * 3,
+                  p_box + static_cast<int_64>(ff + kk) * 9,
+                  p_type + static_cast<int_64>(ff + kk) * nall, nloc, rcut_r);
+              OP_REQUIRES(
+                  context, ret >= 0,
+                  errors::Aborted("cannot allocate mem for copied coords"));
+              if (ret != 0) {
+                copy_ok = 0;
+                break;
+              }
+            }
+            if (copy_ok) {
+              coord = coord_cpy;
+              type = type_cpy;
+              frame_nall = mem_cpy;
+              break;
+            }
+            mem_cpy *= 2;
+          }
+          OP_REQUIRES(context, copy_ok,
+                      errors::Aborted("cannot allocate mem for copied coords"));
+        }
+
+        std::vector<Tensor> nlist_tensors(2);
+        int *ilist = NULL, *numneigh = NULL, *jlist = NULL;
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows);
+        int** firstneigh = firstneigh_guard.ptr;
+        int chunk_max_nbor_size = max_nbor_size;
+        deepmd::InputNlist chunk_gpu_inlist;
+        if (nei_mode == 4) {
+          OP_REQUIRES_OK(
+              context,
+              _prepare_mesh_nlist_gpu_batch(
+                  context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh,
+                  chunk_max_nbor_size, mesh_tensor.flat<int>().data(),
+                  static_cast<int>(mesh_tensor.NumElements()), nloc,
+                  chunk_nframes));
+        } else {
+          TensorShape ilist_shape;
+          ilist_shape.AddDim(static_cast<int64_t>(nrows) * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape,
+                                                         &nlist_tensors[0]));
+          TensorShape jlist_shape;
+          jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                         &nlist_tensors[1]));
+          int build_ok =
+              _build_nlist_gpu(context, nlist_tensors.data(), ilist, numneigh,
+                               firstneigh, jlist, chunk_max_nbor_size, mem_nnei,
+                               coord, nloc, frame_nall, max_nnei_trial, rcut_r,
+                               chunk_nframes, nei_mode == 1 ? type : NULL);
+          OP_REQUIRES(context, build_ok,
+                      errors::Aborted("cannot allocate mem for nlist"));
+          OP_REQUIRES_OK(context,
+                         _round_built_gpu_nbor_size(chunk_max_nbor_size));
+          chunk_gpu_inlist =
+              deepmd::InputNlist(nrows, ilist, numneigh, firstneigh);
+        }
+        max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size);
+
+        Tensor int_temp;
+        TensorShape int_shape;
+        int_shape.AddDim(sec_a.size() + int_64(nrows) * sec_a.size() + nrows);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
+        Tensor uint64_temp;
+        TensorShape uint64_shape;
+        uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                       &uint64_temp));
+        array_int = int_temp.flat<int>().data();
+        array_longlong = uint64_temp.flat<unsigned long long>().data();
+
+        deepmd::prod_env_mat_a_gpu(
+            em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int,
+            array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall,
+            chunk_nframes, rcut_r, rcut_r_smth, sec_a);
+        if (nei_mode == 1) {
+          for (int kk = 0; kk < chunk_nframes; ++kk) {
+            _map_nlist_gpu(nlist + int_64(kk) * nloc * nnei,
+                           idx_mapping + int_64(kk) * mem_cpy, nloc, nnei);
+          }
+        }
+      }
+      return;
+    }
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
     // must declare out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
@@ -548,7 +815,8 @@ class ProdEnvMatAOp : public OpKernel {
       array_longlong = uint64_temp.flat<unsigned long long>().data();
     }
 
-    // loop over samples
+    // LAMMPS external nlists are updated outside this op, so keep their
+    // existing per-sample implementation.
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
       FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3;
@@ -562,20 +830,23 @@ class ProdEnvMatAOp : public OpKernel {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc);
+        int** firstneigh = firstneigh_guard.ptr;
         int* jlist = NULL;
         FPTYPE* coord_cpy;
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut_r, max_cpy_trial, max_nnei_trial);
+        OP_REQUIRES_OK(
+            context,
+            _prepare_coord_nlist_gpu<FPTYPE>(
+                context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
+                idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+                nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size,
+                box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc,
+                nei_mode, rcut_r, max_cpy_trial, max_nnei_trial));
 
         // max_nbor_size may be changed after _prepare_coord_nlist_gpu
         // So we need to update the uint64_temp tensor if necessary
@@ -589,12 +860,11 @@ class ProdEnvMatAOp : public OpKernel {
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   max_nbor_size, avg, std, nloc, frame_nall, 1,
                                    rcut_r, rcut_r_smth, sec_a);
         if (b_nlist_map) {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
-        deepmd::delete_device_memory(firstneigh);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
@@ -791,6 +1061,175 @@ class ProdEnvMatROp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    if (device == "CPU" && nei_mode != 3) {
+      const FPTYPE* coord = p_coord;
+      const int* type = p_type;
+      int frame_nall = nall;
+      std::vector<FPTYPE> coord_cpy;
+      std::vector<int> type_cpy;
+      std::vector<int> idx_mapping;
+      std::vector<int> ilist;
+      std::vector<int> numneigh;
+      std::vector<int*> firstneigh;
+      std::vector<std::vector<int>> jlist;
+      deepmd::InputNlist batch_inlist;
+      int batch_max_nbor_size = max_nbor_size;
+      OP_REQUIRES_OK(
+          context,
+          _prepare_coord_nlist_cpu_batch<FPTYPE>(
+              &coord, coord_cpy, &type, type_cpy, idx_mapping, batch_inlist,
+              ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei,
+              batch_max_nbor_size, p_box, mesh_tensor.flat<int>().data(),
+              static_cast<int>(mesh_tensor.NumElements()), nloc, nall, nsamples,
+              nei_mode, rcut, max_cpy_trial, max_nnei_trial));
+      max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size);
+
+      deepmd::prod_env_mat_r_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type,
+                                 batch_inlist, batch_max_nbor_size, avg, std,
+                                 nloc, frame_nall, nsamples, rcut, rcut_smth,
+                                 sec);
+      if (nei_mode == 1) {
+        for (int kk = 0; kk < nsamples; ++kk) {
+          _map_nlist_cpu(
+              p_nlist + static_cast<int_64>(kk) * nloc * nnei,
+              idx_mapping.data() + static_cast<int_64>(kk) * frame_nall, nloc,
+              nnei);
+        }
+      }
+      return;
+    }
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) {
+      const int frame_chunk = nsamples;
+      for (int ff = 0; ff < nsamples; ff += frame_chunk) {
+        const int chunk_nframes = std::min(frame_chunk, nsamples - ff);
+        const int nrows = chunk_nframes * nloc;
+        FPTYPE* em = p_em + static_cast<int_64>(ff) * nloc * ndescrpt;
+        FPTYPE* em_deriv =
+            p_em_deriv + static_cast<int_64>(ff) * nloc * ndescrpt * 3;
+        FPTYPE* rij = p_rij + static_cast<int_64>(ff) * nloc * nnei * 3;
+        int* nlist = p_nlist + static_cast<int_64>(ff) * nloc * nnei;
+        const FPTYPE* coord = p_coord + static_cast<int_64>(ff) * nall * 3;
+        const int* type = p_type + static_cast<int_64>(ff) * nall;
+        int* idx_mapping = NULL;
+        int frame_nall = nall;
+        Tensor coord_cpy_tensor;
+        Tensor type_cpy_tensor;
+        if (nei_mode == 1) {
+          int copy_ok = 0;
+          for (int tt = 0; tt < max_cpy_trial; ++tt) {
+            TensorShape cpy_shape;
+            cpy_shape.AddDim(static_cast<int64_t>(chunk_nframes) * mem_cpy * 3);
+            OP_REQUIRES_OK(
+                context, context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &coord_cpy_tensor));
+            TensorShape t_shape;
+            t_shape.AddDim(static_cast<int64_t>(chunk_nframes) * mem_cpy * 2);
+            OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                           &type_cpy_tensor));
+            FPTYPE* coord_cpy = coord_cpy_tensor.flat<FPTYPE>().data();
+            int* type_cpy = type_cpy_tensor.flat<int>().data();
+            idx_mapping = type_cpy + int_64(chunk_nframes) * mem_cpy;
+            DPErrcheck(
+                gpuMemset(type_cpy, -1,
+                          sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2));
+
+            copy_ok = 1;
+            for (int kk = 0; kk < chunk_nframes; ++kk) {
+              int frame_copied_nall = nall;
+              int ret = _norm_copy_coord_gpu_frame(
+                  context, coord_cpy + int_64(kk) * mem_cpy * 3,
+                  type_cpy + int_64(kk) * mem_cpy,
+                  idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall,
+                  mem_cpy, p_coord + static_cast<int_64>(ff + kk) * nall * 3,
+                  p_box + static_cast<int_64>(ff + kk) * 9,
+                  p_type + static_cast<int_64>(ff + kk) * nall, nloc, rcut);
+              OP_REQUIRES(
+                  context, ret >= 0,
+                  errors::Aborted("cannot allocate mem for copied coords"));
+              if (ret != 0) {
+                copy_ok = 0;
+                break;
+              }
+            }
+            if (copy_ok) {
+              coord = coord_cpy;
+              type = type_cpy;
+              frame_nall = mem_cpy;
+              break;
+            }
+            mem_cpy *= 2;
+          }
+          OP_REQUIRES(context, copy_ok,
+                      errors::Aborted("cannot allocate mem for copied coords"));
+        }
+
+        std::vector<Tensor> nlist_tensors(2);
+        int *ilist = NULL, *numneigh = NULL, *jlist = NULL;
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows);
+        int** firstneigh = firstneigh_guard.ptr;
+        int chunk_max_nbor_size = max_nbor_size;
+        deepmd::InputNlist chunk_gpu_inlist;
+        if (nei_mode == 4) {
+          OP_REQUIRES_OK(
+              context,
+              _prepare_mesh_nlist_gpu_batch(
+                  context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh,
+                  chunk_max_nbor_size, mesh_tensor.flat<int>().data(),
+                  static_cast<int>(mesh_tensor.NumElements()), nloc,
+                  chunk_nframes));
+        } else {
+          TensorShape ilist_shape;
+          ilist_shape.AddDim(static_cast<int64_t>(nrows) * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape,
+                                                         &nlist_tensors[0]));
+          TensorShape jlist_shape;
+          jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                         &nlist_tensors[1]));
+          int build_ok = _build_nlist_gpu(
+              context, nlist_tensors.data(), ilist, numneigh, firstneigh, jlist,
+              chunk_max_nbor_size, mem_nnei, coord, nloc, frame_nall,
+              max_nnei_trial, rcut, chunk_nframes, nei_mode == 1 ? type : NULL);
+          OP_REQUIRES(context, build_ok,
+                      errors::Aborted("cannot allocate mem for nlist"));
+          OP_REQUIRES_OK(context,
+                         _round_built_gpu_nbor_size(chunk_max_nbor_size));
+          chunk_gpu_inlist =
+              deepmd::InputNlist(nrows, ilist, numneigh, firstneigh);
+        }
+        max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size);
+
+        Tensor int_temp;
+        TensorShape int_shape;
+        int_shape.AddDim(sec.size() + int_64(nrows) * sec.size() + nrows);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
+        Tensor uint64_temp;
+        TensorShape uint64_shape;
+        uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                       &uint64_temp));
+        array_int = int_temp.flat<int>().data();
+        array_longlong = uint64_temp.flat<unsigned long long>().data();
+
+        deepmd::prod_env_mat_r_gpu(
+            em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int,
+            array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall,
+            chunk_nframes, rcut, rcut_smth, sec);
+        if (nei_mode == 1) {
+          for (int kk = 0; kk < chunk_nframes; ++kk) {
+            _map_nlist_gpu(nlist + int_64(kk) * nloc * nnei,
+                           idx_mapping + int_64(kk) * mem_cpy, nloc, nnei);
+          }
+        }
+      }
+      return;
+    }
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
     // must declare out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
@@ -855,7 +1294,8 @@ class ProdEnvMatROp : public OpKernel {
       array_longlong = uint64_temp.flat<unsigned long long>().data();
     }
 
-    // loop over samples
+    // LAMMPS external nlists are updated outside this op, so keep their
+    // existing per-sample implementation.
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
       FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3;
@@ -869,20 +1309,23 @@ class ProdEnvMatROp : public OpKernel {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc);
+        int** firstneigh = firstneigh_guard.ptr;
         int* jlist = NULL;
         FPTYPE* coord_cpy;
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut, max_cpy_trial, max_nnei_trial);
+        OP_REQUIRES_OK(
+            context,
+            _prepare_coord_nlist_gpu<FPTYPE>(
+                context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
+                idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+                nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size,
+                box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc,
+                nei_mode, rcut, max_cpy_trial, max_nnei_trial));
 
         // max_nbor_size may be changed after _prepare_coord_nlist_gpu
         // So we need to update the uint64_temp tensor if necessary
@@ -897,12 +1340,11 @@ class ProdEnvMatROp : public OpKernel {
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   max_nbor_size, avg, std, nloc, frame_nall, 1,
                                    rcut, rcut_smth, sec);
         if (b_nlist_map) {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
-        deepmd::delete_device_memory(firstneigh);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
@@ -1144,6 +1586,216 @@ class ProdEnvMatAMixOp : public OpKernel {
       }
     }
 
+    if (device == "CPU" && nei_mode != 3) {
+      const FPTYPE* coord = p_coord;
+      const int* type = p_type;
+      const int* f_type = p_f_type;
+      int frame_nall = nall;
+      std::vector<FPTYPE> coord_cpy;
+      std::vector<int> f_type_cpy;
+      std::vector<int> real_type_cpy;
+      std::vector<int> idx_mapping;
+      std::vector<int> ilist;
+      std::vector<int> numneigh;
+      std::vector<int*> firstneigh;
+      std::vector<std::vector<int>> jlist;
+      deepmd::InputNlist batch_inlist;
+      int batch_max_nbor_size = max_nbor_size;
+      OP_REQUIRES_OK(
+          context,
+          _prepare_coord_nlist_cpu_batch<FPTYPE>(
+              &coord, coord_cpy, &f_type, f_type_cpy, idx_mapping, batch_inlist,
+              ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei,
+              batch_max_nbor_size, p_box, mesh_tensor.flat<int>().data(),
+              static_cast<int>(mesh_tensor.NumElements()), nloc, nall, nsamples,
+              nei_mode, rcut_r, max_cpy_trial, max_nnei_trial));
+      max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size);
+
+      if (nei_mode == 1) {
+        real_type_cpy.assign(static_cast<size_t>(nsamples) * frame_nall, -1);
+        for (int kk = 0; kk < nsamples; ++kk) {
+          std::copy(
+              p_type + static_cast<int_64>(kk) * nall,
+              p_type + static_cast<int_64>(kk + 1) * nall,
+              real_type_cpy.begin() + static_cast<size_t>(kk) * frame_nall);
+        }
+        type = real_type_cpy.data();
+      }
+
+      deepmd::prod_env_mat_a_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type,
+                                 batch_inlist, batch_max_nbor_size, avg, std,
+                                 nloc, frame_nall, nsamples, rcut_r,
+                                 rcut_r_smth, sec_a, f_type);
+      for (int kk = 0; kk < nsamples; ++kk) {
+        _map_nei_info_cpu(
+            p_nlist + static_cast<int_64>(kk) * nloc * nnei,
+            p_ntype + static_cast<int_64>(kk) * nloc * nnei,
+            p_nmask + static_cast<int_64>(kk) * nloc * nnei,
+            p_type + static_cast<int_64>(kk) * nall,
+            nei_mode == 1
+                ? idx_mapping.data() + static_cast<int_64>(kk) * frame_nall
+                : NULL,
+            nloc, nnei, ntypes, nei_mode == 1);
+      }
+      return;
+    }
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) {
+      const int frame_chunk = nsamples;
+      for (int ff = 0; ff < nsamples; ff += frame_chunk) {
+        const int chunk_nframes = std::min(frame_chunk, nsamples - ff);
+        const int nrows = chunk_nframes * nloc;
+        FPTYPE* em = p_em + static_cast<int_64>(ff) * nloc * ndescrpt;
+        FPTYPE* em_deriv =
+            p_em_deriv + static_cast<int_64>(ff) * nloc * ndescrpt * 3;
+        FPTYPE* rij = p_rij + static_cast<int_64>(ff) * nloc * nnei * 3;
+        int* nlist = p_nlist + static_cast<int_64>(ff) * nloc * nnei;
+        int* ntype = p_ntype + static_cast<int_64>(ff) * nloc * nnei;
+        bool* nmask = p_nmask + static_cast<int_64>(ff) * nloc * nnei;
+        const FPTYPE* coord = p_coord + static_cast<int_64>(ff) * nall * 3;
+        const int* type = p_type + static_cast<int_64>(ff) * nall;
+        const int* f_type = p_f_type + static_cast<int_64>(ff) * nall;
+        int* idx_mapping = NULL;
+        int frame_nall = nall;
+        Tensor coord_cpy_tensor;
+        Tensor f_type_cpy_tensor;
+        Tensor real_type_cpy_tensor;
+        if (nei_mode == 1) {
+          int copy_ok = 0;
+          for (int tt = 0; tt < max_cpy_trial; ++tt) {
+            TensorShape cpy_shape;
+            cpy_shape.AddDim(static_cast<int64_t>(chunk_nframes) * mem_cpy * 3);
+            OP_REQUIRES_OK(
+                context, context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &coord_cpy_tensor));
+            TensorShape fake_type_cpy_shape;
+            fake_type_cpy_shape.AddDim(static_cast<int64_t>(chunk_nframes) *
+                                       mem_cpy * 2);
+            OP_REQUIRES_OK(context,
+                           context->allocate_temp(DT_INT32, fake_type_cpy_shape,
+                                                  &f_type_cpy_tensor));
+            TensorShape real_type_shape;
+            real_type_shape.AddDim(static_cast<int64_t>(chunk_nframes) *
+                                   mem_cpy);
+            OP_REQUIRES_OK(context,
+                           context->allocate_temp(DT_INT32, real_type_shape,
+                                                  &real_type_cpy_tensor));
+            FPTYPE* coord_cpy = coord_cpy_tensor.flat<FPTYPE>().data();
+            int* f_type_cpy = f_type_cpy_tensor.flat<int>().data();
+            int* real_type_cpy = real_type_cpy_tensor.flat<int>().data();
+            idx_mapping = f_type_cpy + int_64(chunk_nframes) * mem_cpy;
+            DPErrcheck(
+                gpuMemset(f_type_cpy, -1,
+                          sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2));
+            DPErrcheck(
+                gpuMemset(real_type_cpy, -1,
+                          sizeof(int) * int_64(chunk_nframes) * mem_cpy));
+
+            copy_ok = 1;
+            for (int kk = 0; kk < chunk_nframes; ++kk) {
+              int frame_copied_nall = nall;
+              int ret = _norm_copy_coord_gpu_frame(
+                  context, coord_cpy + int_64(kk) * mem_cpy * 3,
+                  f_type_cpy + int_64(kk) * mem_cpy,
+                  idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall,
+                  mem_cpy, p_coord + static_cast<int_64>(ff + kk) * nall * 3,
+                  p_box + static_cast<int_64>(ff + kk) * 9,
+                  p_f_type + static_cast<int_64>(ff + kk) * nall, nloc, rcut_r);
+              OP_REQUIRES(
+                  context, ret >= 0,
+                  errors::Aborted("cannot allocate mem for copied coords"));
+              if (ret != 0) {
+                copy_ok = 0;
+                break;
+              }
+              DPErrcheck(gpuMemcpy(real_type_cpy + int_64(kk) * mem_cpy,
+                                   p_type + static_cast<int_64>(ff + kk) * nall,
+                                   sizeof(int) * nall,
+                                   gpuMemcpyDeviceToDevice));
+            }
+            if (copy_ok) {
+              coord = coord_cpy;
+              type = real_type_cpy;
+              f_type = f_type_cpy;
+              frame_nall = mem_cpy;
+              break;
+            }
+            mem_cpy *= 2;
+          }
+          OP_REQUIRES(context, copy_ok,
+                      errors::Aborted("cannot allocate mem for copied coords"));
+        }
+
+        std::vector<Tensor> nlist_tensors(2);
+        int *ilist = NULL, *numneigh = NULL, *jlist = NULL;
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows);
+        int** firstneigh = firstneigh_guard.ptr;
+        int chunk_max_nbor_size = max_nbor_size;
+        deepmd::InputNlist chunk_gpu_inlist;
+        if (nei_mode == 4) {
+          OP_REQUIRES_OK(
+              context,
+              _prepare_mesh_nlist_gpu_batch(
+                  context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh,
+                  chunk_max_nbor_size, mesh_tensor.flat<int>().data(),
+                  static_cast<int>(mesh_tensor.NumElements()), nloc,
+                  chunk_nframes));
+        } else {
+          TensorShape ilist_shape;
+          ilist_shape.AddDim(static_cast<int64_t>(nrows) * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape,
+                                                         &nlist_tensors[0]));
+          TensorShape jlist_shape;
+          jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                         &nlist_tensors[1]));
+          int build_ok =
+              _build_nlist_gpu(context, nlist_tensors.data(), ilist, numneigh,
+                               firstneigh, jlist, chunk_max_nbor_size, mem_nnei,
+                               coord, nloc, frame_nall, max_nnei_trial, rcut_r,
+                               chunk_nframes, nei_mode == 1 ? f_type : NULL);
+          OP_REQUIRES(context, build_ok,
+                      errors::Aborted("cannot allocate mem for nlist"));
+          OP_REQUIRES_OK(context,
+                         _round_built_gpu_nbor_size(chunk_max_nbor_size));
+          chunk_gpu_inlist =
+              deepmd::InputNlist(nrows, ilist, numneigh, firstneigh);
+        }
+        max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size);
+
+        Tensor int_temp;
+        TensorShape int_shape;
+        int_shape.AddDim(sec_a.size() + int_64(nrows) * sec_a.size() + nrows);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
+        Tensor uint64_temp;
+        TensorShape uint64_shape;
+        uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                       &uint64_temp));
+        array_int = int_temp.flat<int>().data();
+        array_longlong = uint64_temp.flat<unsigned long long>().data();
+
+        deepmd::prod_env_mat_a_gpu(
+            em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int,
+            array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall,
+            chunk_nframes, rcut_r, rcut_r_smth, sec_a, f_type);
+        for (int kk = 0; kk < chunk_nframes; ++kk) {
+          _map_nei_info_gpu(
+              nlist + int_64(kk) * nloc * nnei,
+              ntype + int_64(kk) * nloc * nnei,
+              nmask + int_64(kk) * nloc * nnei,
+              p_type + static_cast<int_64>(ff + kk) * nall,
+              nei_mode == 1 ? idx_mapping + int_64(kk) * mem_cpy : NULL, nloc,
+              nnei, ntypes, nei_mode == 1);
+        }
+      }
+      return;
+    }
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
     // must declare out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
@@ -1208,7 +1860,8 @@ class ProdEnvMatAMixOp : public OpKernel {
       array_longlong = uint64_temp.flat<unsigned long long>().data();
     }
 
-    // loop over samples
+    // LAMMPS external nlists are updated outside this op, so keep their
+    // existing per-sample implementation.
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
       FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3;
@@ -1225,20 +1878,23 @@ class ProdEnvMatAMixOp : public OpKernel {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
+        DeviceMemoryGuard<int*> firstneigh_guard;
+        deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc);
+        int** firstneigh = firstneigh_guard.ptr;
         int* jlist = NULL;
         FPTYPE* coord_cpy;
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut_r, max_cpy_trial, max_nnei_trial);
+        OP_REQUIRES_OK(
+            context,
+            _prepare_coord_nlist_gpu<FPTYPE>(
+                context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
+                idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+                nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size,
+                box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc,
+                nei_mode, rcut_r, max_cpy_trial, max_nnei_trial));
 
         // max_nbor_size may be changed after _prepare_coord_nlist_gpu
         // So we need to update the uint64_temp tensor if necessary
@@ -1253,11 +1909,10 @@ class ProdEnvMatAMixOp : public OpKernel {
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   max_nbor_size, avg, std, nloc, frame_nall, 1,
                                    rcut_r, rcut_r_smth, sec_a, f_type);
         _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
                           ntypes, b_nlist_map);
-        deepmd::delete_device_memory(firstneigh);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
@@ -1340,6 +1995,28 @@ static int _norm_copy_coord_cpu(std::vector<FPTYPE>& coord_cpy,
   return (tt != max_cpy_trial);
 }
 
+template <typename FPTYPE>
+static int _norm_copy_coord_cpu_frame(FPTYPE* coord_cpy,
+                                      int* type_cpy,
+                                      int* idx_mapping,
+                                      int& frame_nall,
+                                      const int& mem_cpy,
+                                      const FPTYPE* coord,
+                                      const FPTYPE* box,
+                                      const int* type,
+                                      const int& nall,
+                                      const int& nloc,
+                                      const float& rcut_r) {
+  std::vector<FPTYPE> tmp_coord(static_cast<size_t>(nall) * 3);
+  std::copy(coord, coord + static_cast<size_t>(nall) * 3, tmp_coord.begin());
+  deepmd::Region<FPTYPE> region;
+  init_region_cpu(region, box);
+  normalize_coord_cpu(&tmp_coord[0], nall, region);
+  frame_nall = nall;
+  return copy_coord_cpu(coord_cpy, type_cpy, idx_mapping, &frame_nall,
+                        &tmp_coord[0], type, nloc, mem_cpy, rcut_r, region);
+}
+
 template <typename FPTYPE>
 static int _build_nlist_cpu(std::vector<int>& ilist,
                             std::vector<int>& numneigh,
@@ -1351,16 +2028,19 @@ static int _build_nlist_cpu(std::vector<int>& ilist,
                             const int& nloc,
                             const int& new_nall,
                             const int& max_nnei_trial,
-                            const float& rcut_r) {
+                            const float& rcut_r,
+                            const int& nframes,
+                            const int* type) {
+  const int nrows = nframes * nloc;
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
-    for (int ii = 0; ii < nloc; ++ii) {
+    for (int ii = 0; ii < nrows; ++ii) {
       jlist[ii].resize(mem_nnei);
       firstneigh[ii] = &jlist[ii][0];
     }
-    deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+    deepmd::InputNlist inlist(nrows, &ilist[0], &numneigh[0], &firstneigh[0]);
     int ret = build_nlist_cpu(inlist, &max_nnei, coord, nloc, new_nall,
-                              mem_nnei, rcut_r);
+                              mem_nnei, rcut_r, nframes, type);
     if (ret == 0) {
       break;
     } else {
@@ -1397,6 +2077,173 @@ static void _map_nei_info_cpu(int* nlist,
                            ntypes, b_nlist_map);
 }
 
+static tensorflow::Status _validate_mesh_neighbor_counts(
+    int& max_numneigh,
+    int_64& neighbor_count,
+    const int* numneigh_in,
+    const int mesh_tensor_size,
+    const int_64 header_size,
+    const int nloc) {
+  max_numneigh = 0;
+  neighbor_count = 0;
+  for (int ii = 0; ii < nloc; ++ii) {
+    const int_64 numneigh = numneigh_in[ii];
+    if (numneigh < 0 || neighbor_count > static_cast<int_64>(mesh_tensor_size) -
+                                             header_size - numneigh) {
+      return errors::InvalidArgument("invalid mesh tensor");
+    }
+    max_numneigh = std::max(max_numneigh, static_cast<int>(numneigh_in[ii]));
+    neighbor_count += numneigh;
+  }
+  return tensorflow::Status();
+}
+
+static tensorflow::Status _prepare_mesh_nlist_cpu_batch(
+    deepmd::InputNlist& inlist,
+    std::vector<int>& ilist,
+    std::vector<int>& numneigh,
+    std::vector<int*>& firstneigh,
+    std::vector<std::vector<int>>& jlist,
+    int& max_nbor_size,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int nloc,
+    const int nframes) {
+  const int_64 header_size = 16 + static_cast<int_64>(2) * nloc;
+  if (static_cast<int_64>(mesh_tensor_size) < header_size) {
+    return errors::InvalidArgument("invalid mesh tensor");
+  }
+
+  const int* ilist_in = mesh_tensor_data + 16;
+  const int* numneigh_in = mesh_tensor_data + 16 + nloc;
+  const int* neighbors_in = mesh_tensor_data + header_size;
+
+  int max_numneigh = 0;
+  int_64 neighbor_count = 0;
+  tensorflow::Status count_status =
+      _validate_mesh_neighbor_counts(max_numneigh, neighbor_count, numneigh_in,
+                                     mesh_tensor_size, header_size, nloc);
+  if (!count_status.ok()) {
+    return count_status;
+  }
+
+  const int nrows = nframes * nloc;
+  ilist.resize(nrows);
+  numneigh.resize(nrows);
+  firstneigh.resize(nrows);
+  jlist.resize(nrows);
+  max_nbor_size = std::max(max_nbor_size, max_numneigh);
+
+  std::vector<int_64> neighbor_offset(nloc + 1, 0);
+  for (int ii = 0; ii < nloc; ++ii) {
+    neighbor_offset[ii + 1] = neighbor_offset[ii] + numneigh_in[ii];
+  }
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = ilist_in[ii];
+      numneigh[row] = numneigh_in[ii];
+      jlist[row].resize(numneigh_in[ii]);
+      std::copy(neighbors_in + neighbor_offset[ii],
+                neighbors_in + neighbor_offset[ii + 1], jlist[row].begin());
+      firstneigh[row] = jlist[row].data();
+    }
+  }
+  inlist = deepmd::InputNlist(nrows, ilist.data(), numneigh.data(),
+                              firstneigh.data());
+  return tensorflow::Status();
+}
+
+template <typename FPTYPE>
+static tensorflow::Status _prepare_coord_nlist_cpu_batch(
+    FPTYPE const** coord,
+    std::vector<FPTYPE>& coord_cpy,
+    int const** type,
+    std::vector<int>& type_cpy,
+    std::vector<int>& idx_mapping,
+    deepmd::InputNlist& inlist,
+    std::vector<int>& ilist,
+    std::vector<int>& numneigh,
+    std::vector<int*>& firstneigh,
+    std::vector<std::vector<int>>& jlist,
+    int& new_nall,
+    int& mem_cpy,
+    int& mem_nnei,
+    int& max_nbor_size,
+    const FPTYPE* box,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int& nloc,
+    const int& nall,
+    const int& nframes,
+    const int& nei_mode,
+    const float& rcut_r,
+    const int& max_cpy_trial,
+    const int& max_nnei_trial) {
+  const int nrows = nframes * nloc;
+  new_nall = nall;
+  if (nei_mode == 1) {
+    int copy_ok = 0;
+    for (int tt = 0; tt < max_cpy_trial; ++tt) {
+      coord_cpy.assign(static_cast<size_t>(nframes) * mem_cpy * 3,
+                       static_cast<FPTYPE>(0));
+      type_cpy.assign(static_cast<size_t>(nframes) * mem_cpy, -1);
+      idx_mapping.assign(static_cast<size_t>(nframes) * mem_cpy, -1);
+
+      copy_ok = 1;
+      for (int kk = 0; kk < nframes; ++kk) {
+        int frame_copied_nall = nall;
+        int ret = _norm_copy_coord_cpu_frame(
+            coord_cpy.data() + static_cast<size_t>(kk) * mem_cpy * 3,
+            type_cpy.data() + static_cast<size_t>(kk) * mem_cpy,
+            idx_mapping.data() + static_cast<size_t>(kk) * mem_cpy,
+            frame_copied_nall, mem_cpy,
+            *coord + static_cast<size_t>(kk) * nall * 3,
+            box + static_cast<size_t>(kk) * 9,
+            *type + static_cast<size_t>(kk) * nall, nall, nloc, rcut_r);
+        if (ret != 0) {
+          copy_ok = 0;
+          break;
+        }
+      }
+      if (copy_ok) {
+        *coord = coord_cpy.data();
+        *type = type_cpy.data();
+        new_nall = mem_cpy;
+        break;
+      }
+      mem_cpy *= 2;
+    }
+    if (!copy_ok) {
+      return errors::Aborted("cannot allocate mem for copied coords");
+    }
+  }
+
+  ilist.resize(nrows);
+  numneigh.resize(nrows);
+  firstneigh.resize(nrows);
+  jlist.resize(nrows);
+  if (nei_mode == 4) {
+    tensorflow::Status status = _prepare_mesh_nlist_cpu_batch(
+        inlist, ilist, numneigh, firstneigh, jlist, max_nbor_size,
+        mesh_tensor_data, mesh_tensor_size, nloc, nframes);
+    if (!status.ok()) {
+      return status;
+    }
+  } else {
+    int build_ok =
+        _build_nlist_cpu(ilist, numneigh, firstneigh, jlist, max_nbor_size,
+                         mem_nnei, *coord, nloc, new_nall, max_nnei_trial,
+                         rcut_r, nframes, nei_mode == 1 ? *type : NULL);
+    if (!build_ok) {
+      return errors::Aborted("cannot allocate mem for nlist");
+    }
+    inlist = deepmd::InputNlist(nrows, ilist.data(), numneigh.data(),
+                                firstneigh.data());
+  }
+  return tensorflow::Status();
+}
+
 /**
  * @param[in] nei_mode -1, 1, 3, or 4.
  *   - -1: Build neighbor list without PBC. The size of mesh should
@@ -1559,6 +2406,74 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   return (tt != max_cpy_trial);
 }
 
+template <typename FPTYPE>
+static int _norm_copy_coord_gpu_frame(OpKernelContext* context,
+                                      FPTYPE* coord_cpy,
+                                      int* type_cpy,
+                                      int* idx_mapping,
+                                      int& frame_nall,
+                                      const int& mem_cpy,
+                                      const FPTYPE* coord,
+                                      const FPTYPE* box,
+                                      const int* type,
+                                      const int& nloc,
+                                      const float& rcut_r) {
+  Tensor tmp_coord_tensor;
+  TensorShape tmp_coord_shape;
+  tmp_coord_shape.AddDim(static_cast<int64_t>(nloc) * 3);
+  tensorflow::Status status = context->allocate_temp(
+      DataTypeToEnum<FPTYPE>::value, tmp_coord_shape, &tmp_coord_tensor);
+  if (!status.ok()) {
+    return -1;
+  }
+  FPTYPE* tmp_coord = tmp_coord_tensor.flat<FPTYPE>().data();
+  DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nloc * 3,
+                       gpuMemcpyDeviceToDevice));
+
+  deepmd::Region<FPTYPE> region;
+  init_region_cpu(region, box);
+  FPTYPE box_info[18];
+  std::copy(region.boxt, region.boxt + 9, box_info);
+  std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9);
+  int cell_info[23];
+  deepmd::compute_cell_info(cell_info, rcut_r, region);
+  const int loc_cellnum = cell_info[21];
+  const int total_cellnum = cell_info[22];
+
+  Tensor box_info_tensor;
+  TensorShape box_info_shape;
+  box_info_shape.AddDim(18);
+  status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, box_info_shape,
+                                  &box_info_tensor);
+  if (!status.ok()) {
+    return -1;
+  }
+
+  Tensor int_tensor;
+  TensorShape int_shape;
+  int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
+                   total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
+                   nloc);
+  status = context->allocate_temp(DT_INT32, int_shape, &int_tensor);
+  if (!status.ok()) {
+    return -1;
+  }
+
+  FPTYPE* box_info_dev = box_info_tensor.flat<FPTYPE>().data();
+  int* cell_info_dev = int_tensor.flat<int>().data();
+  int* int_data_dev = cell_info_dev + 23;
+  deepmd::memcpy_host_to_device(box_info_dev, box_info, 18);
+  deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23);
+  deepmd::Region<FPTYPE> region_dev(box_info_dev, box_info_dev + 9);
+  deepmd::normalize_coord_gpu(tmp_coord, nloc, region_dev);
+
+  frame_nall = nloc;
+  return deepmd::copy_coord_gpu(coord_cpy, type_cpy, idx_mapping, &frame_nall,
+                                int_data_dev, tmp_coord, type, nloc, mem_cpy,
+                                loc_cellnum, total_cellnum, cell_info_dev,
+                                region_dev);
+}
+
 template <typename FPTYPE>
 static int _build_nlist_gpu(OpKernelContext* context,
                             Tensor* tensor_list,
@@ -1572,30 +2487,34 @@ static int _build_nlist_gpu(OpKernelContext* context,
                             const int& nloc,
                             const int& new_nall,
                             const int& max_nnei_trial,
-                            const float& rcut_r) {
+                            const float& rcut_r,
+                            const int& nframes,
+                            const int* type) {
+  const int nrows = nframes * nloc;
   ilist = (*tensor_list).flat<int>().data();
-  numneigh = ilist + nloc;
+  numneigh = ilist + nrows;
   // Tensor jlist_temp;
   int* ind_data = NULL;
 
-  std::vector<int*> firstneigh_host(nloc);
+  std::vector<int*> firstneigh_host(nrows);
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
     jlist = (*(tensor_list + 1)).flat<int>().data();
-    ind_data = jlist + nloc * mem_nnei;
-    for (int_64 ii = 0; ii < nloc; ++ii) {
+    ind_data = jlist + int_64(nrows) * mem_nnei;
+    for (int_64 ii = 0; ii < nrows; ++ii) {
       firstneigh_host[ii] = jlist + ii * mem_nnei;
     }
     deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
-    deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh);
-    int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc,
-                                      new_nall, mem_nnei, rcut_r);
+    deepmd::InputNlist inlist(nrows, ilist, numneigh, firstneigh);
+    int ret =
+        deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc,
+                                new_nall, mem_nnei, rcut_r, nframes, type);
     if (ret == 0) {
       break;
     } else {
       mem_nnei *= 2;
       TensorShape jlist_shape;
-      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei);
       tensorflow::Status status =
           context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
       if (!status.ok()) {
@@ -1626,32 +2545,155 @@ static void _map_nei_info_gpu(int* nlist,
                            ntypes, b_nlist_map);
 }
 
+static tensorflow::Status _prepare_mesh_nlist_gpu_batch(
+    OpKernelContext* context,
+    Tensor* tensor_list,
+    deepmd::InputNlist& gpu_inlist,
+    int** firstneigh,
+    int& max_nbor_size,
+    const int* mesh_tensor_data,
+    const int mesh_tensor_size,
+    const int nloc,
+    const int nframes) {
+  const int_64 header_size = 16 + static_cast<int_64>(2) * nloc;
+  if (static_cast<int_64>(mesh_tensor_size) < header_size) {
+    return errors::InvalidArgument("invalid mesh tensor");
+  }
+
+  // Decode the external mesh on host first so malformed neighbor counts are
+  // rejected before allocating the flattened GPU list.
+  std::vector<int> mesh_tensor_data_host(mesh_tensor_size);
+  deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host);
+  const int* ilist_in = mesh_tensor_data_host.data() + 16;
+  const int* numneigh_in = mesh_tensor_data_host.data() + 16 + nloc;
+  const int* neighbors_in = mesh_tensor_data_host.data() + header_size;
+
+  int max_numneigh = 0;
+  int_64 neighbor_count = 0;
+  tensorflow::Status count_status =
+      _validate_mesh_neighbor_counts(max_numneigh, neighbor_count, numneigh_in,
+                                     mesh_tensor_size, header_size, nloc);
+  if (!count_status.ok()) {
+    return count_status;
+  }
+  if (max_numneigh > GPU_MAX_NBOR_SIZE) {
+    return errors::InvalidArgument(
+        "Assert failed, max neighbor size of atom(lammps) " +
+        std::to_string(max_numneigh) + " is larger than " +
+        std::to_string(GPU_MAX_NBOR_SIZE) +
+        ", which currently is not supported by deepmd-kit.");
+  }
+
+  if (max_numneigh <= 256) {
+    max_nbor_size = 256;
+  } else if (max_numneigh <= 512) {
+    max_nbor_size = 512;
+  } else if (max_numneigh <= 1024) {
+    max_nbor_size = 1024;
+  } else if (max_numneigh <= 2048) {
+    max_nbor_size = 2048;
+  } else {
+    max_nbor_size = 4096;
+  }
+
+  const int nrows = nframes * nloc;
+  TensorShape ilist_shape;
+  ilist_shape.AddDim(static_cast<int64_t>(nrows) * 2);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, ilist_shape, tensor_list);
+  if (!status.ok()) {
+    return status;
+  }
+  TensorShape jlist_shape;
+  jlist_shape.AddDim(int_64(nrows) * max_nbor_size);
+  status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Repeat the single-frame external mesh layout for each frame in the batch.
+  std::vector<int> ilist_host(nrows);
+  std::vector<int> numneigh_host(nrows);
+  std::vector<int> nbor_list_host(static_cast<size_t>(nrows) * max_nbor_size,
+                                  0);
+  std::vector<int_64> neighbor_offset(nloc + 1, 0);
+  for (int ii = 0; ii < nloc; ++ii) {
+    neighbor_offset[ii + 1] = neighbor_offset[ii] + numneigh_in[ii];
+  }
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist_host[row] = ilist_in[ii];
+      numneigh_host[row] = numneigh_in[ii];
+      for (int jj = 0; jj < numneigh_in[ii]; ++jj) {
+        nbor_list_host[static_cast<size_t>(row) * max_nbor_size + jj] =
+            neighbors_in[neighbor_offset[ii] + jj];
+      }
+    }
+  }
+
+  int* ilist = (*tensor_list).flat<int>().data();
+  int* numneigh = ilist + nrows;
+  int* nbor_list = (*(tensor_list + 1)).flat<int>().data();
+  deepmd::memcpy_host_to_device(ilist, ilist_host);
+  deepmd::memcpy_host_to_device(numneigh, numneigh_host);
+  deepmd::memcpy_host_to_device(nbor_list, nbor_list_host);
+
+  // Store device-side row pointers separately; InputNlist expects firstneigh
+  // to point to each row in the contiguous neighbor buffer.
+  std::vector<int*> firstneigh_host(nrows);
+  for (int ii = 0; ii < nrows; ++ii) {
+    firstneigh_host[ii] = nbor_list + static_cast<int_64>(ii) * max_nbor_size;
+  }
+  deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
+  gpu_inlist = deepmd::InputNlist(nrows, ilist, numneigh, firstneigh);
+  return tensorflow::Status();
+}
+
+static tensorflow::Status _round_built_gpu_nbor_size(int& max_nbor_size) {
+  if (max_nbor_size > GPU_MAX_NBOR_SIZE) {
+    return errors::InvalidArgument(
+        "Assert failed, max neighbor size of atom(lammps) " +
+        std::to_string(max_nbor_size) + " is larger than " +
+        std::to_string(GPU_MAX_NBOR_SIZE) +
+        ", which currently is not supported by deepmd-kit.");
+  }
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  } else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  } else {
+    max_nbor_size = 4096;
+  }
+  return tensorflow::Status();
+}
+
 template <typename FPTYPE>
-void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                              Tensor* tensor_list,
-                              FPTYPE const** coord,
-                              FPTYPE*& coord_cpy,
-                              int const** type,
-                              int*& type_cpy,
-                              int*& idx_mapping,
-                              deepmd::InputNlist& inlist,
-                              int*& ilist,
-                              int*& numneigh,
-                              int**& firstneigh,
-                              int*& jlist,
-                              int*& nbor_list_dev,
-                              int& new_nall,
-                              int& mem_cpy,
-                              int& mem_nnei,
-                              int& max_nbor_size,
-                              const FPTYPE* box,
-                              const int* mesh_tensor_data,
-                              const int mesh_tensor_size,
-                              const int& nloc,
-                              const int& nei_mode,
-                              const float& rcut_r,
-                              const int& max_cpy_trial,
-                              const int& max_nnei_trial) {
+tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context,
+                                            Tensor* tensor_list,
+                                            FPTYPE const** coord,
+                                            FPTYPE*& coord_cpy,
+                                            int const** type,
+                                            int*& type_cpy,
+                                            int*& idx_mapping,
+                                            deepmd::InputNlist& inlist,
+                                            int*& ilist,
+                                            int*& numneigh,
+                                            int**& firstneigh,
+                                            int*& jlist,
+                                            int*& nbor_list_dev,
+                                            int& new_nall,
+                                            int& mem_cpy,
+                                            int& mem_nnei,
+                                            int& max_nbor_size,
+                                            const FPTYPE* box,
+                                            const int* mesh_tensor_data,
+                                            const int mesh_tensor_size,
+                                            const int& nloc,
+                                            const int& nei_mode,
+                                            const float& rcut_r,
+                                            const int& max_cpy_trial,
+                                            const int& max_nnei_trial) {
   if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself
@@ -1660,8 +2702,9 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context,
       int copy_ok = _norm_copy_coord_gpu(
           context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall,
           mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r);
-      OP_REQUIRES(context, copy_ok,
-                  errors::Aborted("cannot allocate mem for copied coords"));
+      if (!copy_ok) {
+        return errors::Aborted("cannot allocate mem for copied coords");
+      }
       *coord = coord_cpy;
       *type = type_cpy;
     }
@@ -1670,14 +2713,12 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context,
         _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh,
                          jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall,
                          max_nnei_trial, rcut_r);
-    OP_REQUIRES(context, build_ok,
-                errors::Aborted("cannot allocate mem for nlist"));
-    if (max_nbor_size <= 1024) {
-      max_nbor_size = 1024;
-    } else if (max_nbor_size <= 2048) {
-      max_nbor_size = 2048;
-    } else {
-      max_nbor_size = 4096;
+    if (!build_ok) {
+      return errors::Aborted("cannot allocate mem for nlist");
+    }
+    tensorflow::Status status = _round_built_gpu_nbor_size(max_nbor_size);
+    if (!status.ok()) {
+      return status;
     }
     inlist.ilist = ilist;
     inlist.numneigh = numneigh;
@@ -1706,22 +2747,22 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context,
     std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*));
     std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**));
     // copy from cpu to gpu
-    int* fake_mesh_dev = NULL;
-    deepmd::malloc_device_memory(fake_mesh_dev, 16);
-    deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh);
+    DeviceMemoryGuard<int> fake_mesh_guard;
+    deepmd::malloc_device_memory(fake_mesh_guard.ptr, 16);
+    deepmd::memcpy_host_to_device(fake_mesh_guard.ptr, fake_mesh);
 
     deepmd::InputNlist inlist_temp;
     inlist_temp.inum = nloc;
     // everything should be copied to GPU...
     deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
-                                nbor_list_dev, fake_mesh_dev, 16);
-    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
-                errors::InvalidArgument(
-                    "Assert failed, max neighbor size of atom(lammps) " +
-                    std::to_string(max_numneigh(inlist_temp)) +
-                    " is larger than " + std::to_string(max_nbor_size) +
-                    ", which currently is not supported by deepmd-kit."));
-    deepmd::delete_device_memory(fake_mesh_dev);
+                                nbor_list_dev, fake_mesh_guard.ptr, 16);
+    if (max_numneigh(inlist_temp) > max_nbor_size) {
+      return errors::InvalidArgument(
+          "Assert failed, max neighbor size of atom(lammps) " +
+          std::to_string(max_numneigh(inlist_temp)) + " is larger than " +
+          std::to_string(max_nbor_size) +
+          ", which currently is not supported by deepmd-kit.");
+    }
   } else {
     // update nbor list
     deepmd::InputNlist inlist_temp;
@@ -1729,13 +2770,15 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context,
     deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
                                 nbor_list_dev, mesh_tensor_data,
                                 mesh_tensor_size);
-    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
-                errors::InvalidArgument(
-                    "Assert failed, max neighbor size of atom(lammps) " +
-                    std::to_string(max_numneigh(inlist_temp)) +
-                    " is larger than " + std::to_string(max_nbor_size) +
-                    ", which currently is not supported by deepmd-kit."));
+    if (max_numneigh(inlist_temp) > max_nbor_size) {
+      return errors::InvalidArgument(
+          "Assert failed, max neighbor size of atom(lammps) " +
+          std::to_string(max_numneigh(inlist_temp)) + " is larger than " +
+          std::to_string(max_nbor_size) +
+          ", which currently is not supported by deepmd-kit.");
+    }
   }
+  return tensorflow::Status();
 }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 

From 0c4e30d69c17aef51f45d2a62cd6eea2e7069531 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@ustc.edu.cn>
Date: Fri, 26 Jun 2026 18:27:01 +0800
Subject: [PATCH 2/5] test(core): cover batched GPU env mat outputs

---
 source/lib/tests/test_env_mat_a.cc     |  52 ++++++---
 source/lib/tests/test_env_mat_a_mix.cc |  73 ++++++++++--
 source/lib/tests/test_env_mat_r.cc     | 156 +++++++++++++++++++++++++
 3 files changed, 256 insertions(+), 25 deletions(-)

diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 203291d58c..f67ac096ac 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -818,7 +818,11 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) {
   std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
   std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
   std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
-                                     ndescrpt);
+                                     ndescrpt),
+      expected_deriv_multi(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      expected_rij_multi(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> expected_nlist_multi(static_cast<size_t>(nframes) * nloc *
+                                        nnei);
   for (int ff = 0; ff < nframes; ++ff) {
     std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
         frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
@@ -830,9 +834,18 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) {
         posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
         atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
         max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a);
-    std::copy(
-        frame_em.begin(), frame_em.end(),
-        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    std::copy(frame_em.begin(), frame_em.end(),
+              expected_multi.begin() + em_offset);
+    std::copy(frame_em_deriv.begin(), frame_em_deriv.end(),
+              expected_deriv_multi.begin() + deriv_offset);
+    std::copy(frame_rij.begin(), frame_rij.end(),
+              expected_rij_multi.begin() + rij_offset);
+    std::copy(frame_nlist.begin(), frame_nlist.end(),
+              expected_nlist_multi.begin() + nlist_offset);
   }
 
   double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
@@ -865,8 +878,12 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) {
                              array_longlong_dev, max_nbor_size, avg_dev,
                              std_dev, nloc, nall, nframes, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
+  deepmd::memcpy_device_to_host(rij_dev, rij);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(rij_dev);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_dev);
   deepmd::delete_device_memory(atype_dev);
@@ -877,16 +894,23 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) {
   deepmd::delete_device_memory(memory_dev);
   deepmd::free_nlist_gpu_device(gpu_inlist);
 
-  for (int ff = 0; ff < nframes; ++ff) {
-    for (int ii = 0; ii < nloc; ++ii) {
-      for (int jj = 0; jj < nnei; ++jj) {
-        for (int dd = 0; dd < 4; ++dd) {
-          const int_64 idx =
-              (static_cast<int_64>(ff) * nloc + ii) * nnei * 4 + jj * 4 + dd;
-          EXPECT_LT(fabs(em[idx] - expected_multi[idx]), 1e-5);
-        }
-      }
-    }
+  for (size_t ii = 0; ii < em.size(); ++ii) {
+    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10)
+        << "index " << ii << " em " << em[ii] << " expected "
+        << expected_multi[ii];
+  }
+  for (size_t ii = 0; ii < em_deriv.size(); ++ii) {
+    EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10)
+        << "index " << ii << " em_deriv " << em_deriv[ii] << " expected "
+        << expected_deriv_multi[ii];
+  }
+  for (size_t ii = 0; ii < rij.size(); ++ii) {
+    EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10)
+        << "index " << ii << " rij " << rij[ii] << " expected "
+        << expected_rij_multi[ii];
+  }
+  for (size_t ii = 0; ii < nlist.size(); ++ii) {
+    EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii;
   }
 }
 
diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc
index ae91256c1a..fdb4abe9c2 100644
--- a/source/lib/tests/test_env_mat_a_mix.cc
+++ b/source/lib/tests/test_env_mat_a_mix.cc
@@ -691,7 +691,11 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) {
   }
 
   std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
-                                     ndescrpt);
+                                     ndescrpt),
+      expected_deriv_multi(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      expected_rij_multi(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> expected_nlist_multi(static_cast<size_t>(nframes) * nloc *
+                                        nnei);
   for (int ff = 0; ff < nframes; ++ff) {
     std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
         frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
@@ -704,9 +708,18 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) {
         atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
         max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a,
         f_atype_multi.data() + static_cast<size_t>(ff) * nall);
-    std::copy(
-        frame_em.begin(), frame_em.end(),
-        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    std::copy(frame_em.begin(), frame_em.end(),
+              expected_multi.begin() + em_offset);
+    std::copy(frame_em_deriv.begin(), frame_em_deriv.end(),
+              expected_deriv_multi.begin() + deriv_offset);
+    std::copy(frame_rij.begin(), frame_rij.end(),
+              expected_rij_multi.begin() + rij_offset);
+    std::copy(frame_nlist.begin(), frame_nlist.end(),
+              expected_nlist_multi.begin() + nlist_offset);
   }
 
   std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt),
@@ -718,9 +731,18 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) {
       atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc,
       nall, nframes, rc, rc_smth, sec_a, f_atype_multi.data());
 
-  for (int ii = 0; ii < em.size(); ++ii) {
+  for (size_t ii = 0; ii < em.size(); ++ii) {
     EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10);
   }
+  for (size_t ii = 0; ii < em_deriv.size(); ++ii) {
+    EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10);
+  }
+  for (size_t ii = 0; ii < rij.size(); ++ii) {
+    EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10);
+  }
+  for (size_t ii = 0; ii < nlist.size(); ++ii) {
+    EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]);
+  }
 }
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -997,7 +1019,11 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) {
   }
 
   std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
-                                     ndescrpt);
+                                     ndescrpt),
+      expected_deriv_multi(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      expected_rij_multi(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> expected_nlist_multi(static_cast<size_t>(nframes) * nloc *
+                                        nnei);
   for (int ff = 0; ff < nframes; ++ff) {
     std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
         frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
@@ -1010,9 +1036,18 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) {
         atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
         max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a,
         f_atype_multi.data() + static_cast<size_t>(ff) * nall);
-    std::copy(
-        frame_em.begin(), frame_em.end(),
-        expected_multi.begin() + static_cast<size_t>(ff) * nloc * ndescrpt);
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    std::copy(frame_em.begin(), frame_em.end(),
+              expected_multi.begin() + em_offset);
+    std::copy(frame_em_deriv.begin(), frame_em_deriv.end(),
+              expected_deriv_multi.begin() + deriv_offset);
+    std::copy(frame_rij.begin(), frame_rij.end(),
+              expected_rij_multi.begin() + rij_offset);
+    std::copy(frame_nlist.begin(), frame_nlist.end(),
+              expected_nlist_multi.begin() + nlist_offset);
   }
 
   std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt, 0.0),
@@ -1052,6 +1087,9 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) {
       nall, nframes, rc, rc_smth, sec_a, f_atype_dev);
 
   deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
+  deepmd::memcpy_device_to_host(rij_dev, rij);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
   deepmd::delete_device_memory(rij_dev);
@@ -1066,10 +1104,23 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) {
   deepmd::delete_device_memory(memory_dev);
   deepmd::free_nlist_gpu_device(gpu_inlist);
 
-  for (int ii = 0; ii < em.size(); ++ii) {
-    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-5)
+  for (size_t ii = 0; ii < em.size(); ++ii) {
+    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10)
         << "index " << ii << " em " << em[ii] << " expected "
         << expected_multi[ii];
   }
+  for (size_t ii = 0; ii < em_deriv.size(); ++ii) {
+    EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10)
+        << "index " << ii << " em_deriv " << em_deriv[ii] << " expected "
+        << expected_deriv_multi[ii];
+  }
+  for (size_t ii = 0; ii < rij.size(); ++ii) {
+    EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10)
+        << "index " << ii << " rij " << rij[ii] << " expected "
+        << expected_rij_multi[ii];
+  }
+  for (size_t ii = 0; ii < nlist.size(); ++ii) {
+    EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii;
+  }
 }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 82d6b6c229..f3659db074 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <iostream>
 
 #include "env_mat.h"
@@ -556,6 +557,161 @@ TEST_F(TestEnvMatR, prod_gpu) {
   }
 }
 
+TEST_F(TestEnvMatR, prod_gpu_multiple_frames) {
+  EXPECT_EQ(nlist_r_cpy.size(), nloc);
+  constexpr int nframes = 2;
+  int max_nbor_size = 0;
+  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
+    if (nlist_a_cpy[ii].size() > max_nbor_size) {
+      max_nbor_size = nlist_a_cpy[ii].size();
+    }
+  }
+  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  } else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  } else {
+    max_nbor_size = 4096;
+  }
+
+  std::vector<int> base_ilist(nloc), base_numneigh(nloc);
+  std::vector<int*> base_firstneigh(nloc);
+  deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(),
+                                 base_firstneigh.data());
+  convert_nlist(base_inlist, nlist_a_cpy);
+
+  const int nrows = nframes * nloc;
+  std::vector<int> ilist(nrows), numneigh(nrows);
+  std::vector<int*> firstneigh(nrows);
+  for (int ff = 0; ff < nframes; ++ff) {
+    for (int ii = 0; ii < nloc; ++ii) {
+      const int row = ff * nloc + ii;
+      ilist[row] = base_ilist[ii];
+      numneigh[row] = base_numneigh[ii];
+      firstneigh[row] = base_firstneigh[ii];
+    }
+  }
+  deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(),
+                            firstneigh.data()),
+      gpu_inlist;
+
+  std::vector<double> posi_multi = posi_cpy;
+  posi_multi.insert(posi_multi.end(), posi_cpy.begin(), posi_cpy.end());
+  for (int ii = 0; ii < nall; ++ii) {
+    const size_t offset = (static_cast<size_t>(nall) + ii) * 3;
+    posi_multi[offset] += 0.01 * (ii + 1);
+    posi_multi[offset + 1] -= 0.02 * (ii % 3);
+    posi_multi[offset + 2] += 0.015 * (ii % 5);
+  }
+  std::vector<int> atype_multi = atype_cpy;
+  atype_multi.insert(atype_multi.end(), atype_cpy.begin(), atype_cpy.end());
+  std::vector<double> avg(static_cast<size_t>(ntypes) * ndescrpt, 0);
+  std::vector<double> std(static_cast<size_t>(ntypes) * ndescrpt, 1);
+
+  std::vector<double> expected_multi(static_cast<size_t>(nframes) * nloc *
+                                     ndescrpt),
+      expected_deriv_multi(static_cast<size_t>(nframes) * nloc * ndescrpt * 3),
+      expected_rij_multi(static_cast<size_t>(nframes) * nloc * nnei * 3);
+  std::vector<int> expected_nlist_multi(static_cast<size_t>(nframes) * nloc *
+                                        nnei);
+  for (int ff = 0; ff < nframes; ++ff) {
+    std::vector<double> frame_em(static_cast<size_t>(nloc) * ndescrpt),
+        frame_em_deriv(static_cast<size_t>(nloc) * ndescrpt * 3),
+        frame_rij(static_cast<size_t>(nloc) * nnei * 3);
+    std::vector<int> frame_nlist(static_cast<size_t>(nloc) * nnei);
+    deepmd::prod_env_mat_r_cpu(
+        frame_em.data(), frame_em_deriv.data(), frame_rij.data(),
+        frame_nlist.data(),
+        posi_multi.data() + static_cast<size_t>(ff) * nall * 3,
+        atype_multi.data() + static_cast<size_t>(ff) * nall, base_inlist,
+        max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a);
+
+    const size_t em_offset = static_cast<size_t>(ff) * nloc * ndescrpt;
+    const size_t deriv_offset = em_offset * 3;
+    const size_t rij_offset = static_cast<size_t>(ff) * nloc * nnei * 3;
+    const size_t nlist_offset = static_cast<size_t>(ff) * nloc * nnei;
+    std::copy(frame_em.begin(), frame_em.end(),
+              expected_multi.begin() + em_offset);
+    std::copy(frame_em_deriv.begin(), frame_em_deriv.end(),
+              expected_deriv_multi.begin() + deriv_offset);
+    std::copy(frame_rij.begin(), frame_rij.end(),
+              expected_rij_multi.begin() + rij_offset);
+    std::copy(frame_nlist.begin(), frame_nlist.end(),
+              expected_nlist_multi.begin() + nlist_offset);
+  }
+
+  std::vector<double> em(static_cast<size_t>(nframes) * nloc * ndescrpt, 0.0),
+      em_deriv(static_cast<size_t>(nframes) * nloc * ndescrpt * 3, 0.0),
+      rij(static_cast<size_t>(nframes) * nloc * nnei * 3, 0.0);
+  std::vector<int> nlist(static_cast<size_t>(nframes) * nloc * nnei, 0);
+
+  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
+  double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
+  int *atype_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
+      *memory_dev = NULL;
+  uint_64* array_longlong_dev = NULL;
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_dev, posi_multi);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(atype_dev, atype_multi);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(
+      array_int_dev, sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc * sec_a.size() +
+                         static_cast<size_t>(nframes) * nloc);
+  deepmd::malloc_device_memory(
+      array_longlong_dev,
+      static_cast<size_t>(nframes) * nloc * max_nbor_size * 2);
+  deepmd::malloc_device_memory(
+      memory_dev, static_cast<size_t>(nframes) * nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
+                                   max_nbor_size);
+
+  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev,
+                             atype_dev, gpu_inlist, array_int_dev,
+                             array_longlong_dev, max_nbor_size, avg_dev,
+                             std_dev, nloc, nall, nframes, rc, rc_smth, sec_a);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
+  deepmd::memcpy_device_to_host(rij_dev, rij);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(rij_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_dev);
+  deepmd::delete_device_memory(atype_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
+
+  for (size_t ii = 0; ii < em.size(); ++ii) {
+    EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10)
+        << "index " << ii << " em " << em[ii] << " expected "
+        << expected_multi[ii];
+  }
+  for (size_t ii = 0; ii < em_deriv.size(); ++ii) {
+    EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10)
+        << "index " << ii << " em_deriv " << em_deriv[ii] << " expected "
+        << expected_deriv_multi[ii];
+  }
+  for (size_t ii = 0; ii < rij.size(); ++ii) {
+    EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10)
+        << "index " << ii << " rij " << rij[ii] << " expected "
+        << expected_rij_multi[ii];
+  }
+  for (size_t ii = 0; ii < nlist.size(); ++ii) {
+    EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii;
+  }
+}
+
 TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;

From d6ea9eb5d9639ae971c2e07d3eabbe031879973c Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@ustc.edu.cn>
Date: Sun, 28 Jun 2026 02:07:44 +0800
Subject: [PATCH 3/5] docs(lib): document batched neighbor list inputs

---
 source/lib/include/neighbor_list.h | 38 ++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index f7c1ddde10..7eb3f86361 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -144,14 +144,25 @@ int max_numneigh(const InputNlist& to_nlist);
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
+//	nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
+//	stores the neighbors of local atom atom_idx in that frame.
 //	max_list_size is the maximal size of jlist.
 // inputs
-//	c_cpy, nloc, nall, mem_size, rcut, region
-//	mem_size is the size of allocated memory for jlist.
+//	c_cpy, nloc, nall, mem_size, rcut, nframes, type
+//	c_cpy stores nframes coordinate blocks with nall atoms each, including
+//	ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
+//	nloc is the number of local atoms per frame; nall is the total number of
+//	local and ghost atoms per frame.
+//	mem_size is the size of allocated memory for each jlist row.
+//	nframes is the number of coordinate frames.
+//	type may be nullptr/NULL, or a per-atom type array.
+//	The type array uses the same frame-major nall stride as c_cpy:
+//	type[frame_idx * nall + atom].
+//	When the center or neighbor atom has type < 0, that pair is excluded.
 // returns
 //	0: successful
 //	1: the memory is not large enough to hold all neighbors.
-//	   i.e. max_list_size > mem_nall
+//	   i.e. max_list_size > mem_size
 template <typename FPTYPE>
 int build_nlist_cpu(InputNlist& nlist,
                     int* max_list_size,
@@ -210,14 +221,27 @@ void use_nlist_map(int* nlist,
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
+//	nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
+//	stores the neighbors of local atom atom_idx in that frame.
 //	max_list_size is the maximal size of jlist.
 // inputs
-//	c_cpy, nloc, nall, mem_size, rcut, region
-//	mem_size is the size of allocated memory for jlist.
+//	nlist_data, c_cpy, nloc, nall, mem_size, rcut, nframes, type
+//	nlist_data is temporary GPU workspace of at least
+//	2 * nframes * nloc * mem_size ints.
+//	c_cpy stores nframes coordinate blocks with nall atoms each, including
+//	ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
+//	nloc is the number of local atoms per frame; nall is the total number of
+//	local and ghost atoms per frame.
+//	mem_size is the size of allocated memory for each jlist row.
+//	nframes is the number of coordinate frames.
+//	type may be nullptr/NULL, or a per-atom type array.
+//	The type array uses the same frame-major nall stride as c_cpy:
+//	type[frame_idx * nall + atom].
+//	When the center or neighbor atom has type < 0, that pair is excluded.
 // returns
 //	0: successful
-//	1: the memory is not large enough to hold all neighbors.
-//	   i.e. max_list_size > mem_nall
+//	1: the temporary row storage is not large enough.
+//	   i.e. mem_size < nall
 template <typename FPTYPE>
 int build_nlist_gpu(InputNlist& nlist,
                     int* max_list_size,

From c0accc2aa53d187aba4a58884720645acd774445 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@ustc.edu.cn>
Date: Wed, 1 Jul 2026 14:45:35 +0800
Subject: [PATCH 4/5] Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: A bot of @njzjz <48687836+njzjz-bot@users.noreply.github.com>
Signed-off-by: Jinzhe Zeng <njzjz@qq.com>
---
 source/lib/tests/test_neighbor_list.cc    |  1 +
 source/op/tf/prod_env_mat_multi_device.cc | 40 +++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc
index a643c8c321..a229a3b746 100644
--- a/source/lib/tests/test_neighbor_list.cc
+++ b/source/lib/tests/test_neighbor_list.cc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <vector>
 
 #include "device.h"
diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc
index 799f9709f3..7422d73745 100644
--- a/source/op/tf/prod_env_mat_multi_device.cc
+++ b/source/op/tf/prod_env_mat_multi_device.cc
@@ -2128,6 +2128,34 @@ static tensorflow::Status _validate_mesh_neighbor_counts(
   return tensorflow::Status();
 }
 
+static tensorflow::Status _validate_mesh_indices(
+    const int* ilist_in,
+    const int* numneigh_in,
+    const int* neighbors_in,
+    const int nloc,
+    const int_64 neighbor_count) {
+  std::vector<unsigned char> seen(static_cast<size_t>(nloc), 0);
+  int_64 neighbor_offset = 0;
+  for (int ii = 0; ii < nloc; ++ii) {
+    const int i_idx = ilist_in[ii];
+    if (i_idx < 0 || i_idx >= nloc || seen[static_cast<size_t>(i_idx)]) {
+      return errors::InvalidArgument("invalid mesh tensor");
+    }
+    seen[static_cast<size_t>(i_idx)] = 1;
+
+    for (int jj = 0; jj < numneigh_in[ii]; ++jj) {
+      if (neighbors_in[neighbor_offset + jj] < 0) {
+        return errors::InvalidArgument("invalid mesh tensor");
+      }
+    }
+    neighbor_offset += numneigh_in[ii];
+  }
+  if (neighbor_offset != neighbor_count) {
+    return errors::InvalidArgument("invalid mesh tensor");
+  }
+  return tensorflow::Status();
+}
+
 static tensorflow::Status _prepare_mesh_nlist_cpu_batch(
     deepmd::InputNlist& inlist,
     std::vector<int>& ilist,
@@ -2157,6 +2185,12 @@ static tensorflow::Status _prepare_mesh_nlist_cpu_batch(
     return count_status;
   }
 
+  tensorflow::Status index_status = _validate_mesh_indices(
+      ilist_in, numneigh_in, neighbors_in, nloc, neighbor_count);
+  if (!index_status.ok()) {
+    return index_status;
+  }
+
   const int nrows = nframes * nloc;
   ilist.resize(nrows);
   numneigh.resize(nrows);
@@ -2614,6 +2648,12 @@ static tensorflow::Status _prepare_mesh_nlist_gpu_batch(
         ", which currently is not supported by deepmd-kit.");
   }
 
+  tensorflow::Status index_status = _validate_mesh_indices(
+      ilist_in, numneigh_in, neighbors_in, nloc, neighbor_count);
+  if (!index_status.ok()) {
+    return index_status;
+  }
+
   if (max_numneigh <= 256) {
     max_nbor_size = 256;
   } else if (max_numneigh <= 512) {

From c98dc29adec4af241c50bcb7e0f7ab02656c0616 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 06:46:21 +0000
Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/op/tf/prod_env_mat_multi_device.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc
index 7422d73745..66a0a45c88 100644
--- a/source/op/tf/prod_env_mat_multi_device.cc
+++ b/source/op/tf/prod_env_mat_multi_device.cc
@@ -2128,12 +2128,11 @@ static tensorflow::Status _validate_mesh_neighbor_counts(
   return tensorflow::Status();
 }
 
-static tensorflow::Status _validate_mesh_indices(
-    const int* ilist_in,
-    const int* numneigh_in,
-    const int* neighbors_in,
-    const int nloc,
-    const int_64 neighbor_count) {
+static tensorflow::Status _validate_mesh_indices(const int* ilist_in,
+                                                 const int* numneigh_in,
+                                                 const int* neighbors_in,
+                                                 const int nloc,
+                                                 const int_64 neighbor_count) {
   std::vector<unsigned char> seen(static_cast<size_t>(nloc), 0);
   int_64 neighbor_offset = 0;
   for (int ii = 0; ii < nloc; ++ii) {