diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index a4ac190946..fa9325937e 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -82,6 +82,8 @@ def builder():
                 rcut=self.rcut,
             )
             place_holders["dir"] = tf.placeholder(tf.string)
+            _min_nbor_dist = tf.reduce_min(_min_nbor_dist)
+            _max_nbor_size = tf.reduce_max(_max_nbor_size, axis=0)
             return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"])
 
         with sub_graph.as_default():
@@ -128,10 +130,7 @@ def feed():
                         }
 
         for mn, dt, jj in self.p.generate(self.sub_sess, feed()):
-            if dt.size != 0:
-                dt = np.min(dt)
-            else:
-                dt = self.rcut
+            if np.isinf(dt):
                 log.warning(
                     "Atoms with no neighbors found in %s. Please make sure it's what you expected."
                     % jj
@@ -145,9 +144,10 @@ def feed():
                         " training data to remove duplicated atoms." % jj
                     )
                 self.min_nbor_dist = dt
-            var = np.max(mn, axis=0)
-            self.max_nbor_size = np.maximum(var, self.max_nbor_size)
+            self.max_nbor_size = np.maximum(mn, self.max_nbor_size)
 
+        # do sqrt in the final
+        self.min_nbor_dist = math.sqrt(self.min_nbor_dist)
         log.info("training data with min nbor dist: " + str(self.min_nbor_dist))
         log.info("training data with max nbor size: " + str(self.max_nbor_size))
         return self.min_nbor_dist, self.max_nbor_size
diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h
new file mode 100644
index 0000000000..79e241f783
--- /dev/null
+++ b/source/lib/include/neighbor_stat.h
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+#include "neighbor_list.h"
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
+namespace deepmd {
+template <typename FPTYPE>
+void neighbor_stat_gpu(const FPTYPE* coord,
+                       const int* type,
+                       const int nloc,
+                       const deepmd::InputNlist& gpu_nlist,
+                       int* max_nbor_size,
+                       FPTYPE* min_nbor_dist,
+                       const int ntypes,
+                       const int MAX_NNEI);
+}  // namespace deepmd
+
+#endif
diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
new file mode 100644
index 0000000000..ef7d3c5f8a
--- /dev/null
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -0,0 +1,103 @@
+#include <cmath>
+
+#include "device.h"
+#include "neighbor_list.h"
+
+template <typename FPTYPE>
+__global__ void neighbor_stat_g(const FPTYPE* coord,
+                                const int* type,
+                                const int nloc,
+                                const int* ilist,
+                                int** firstneigh,
+                                const int* numneigh,
+                                int* max_nbor_size,
+                                FPTYPE* min_nbor_dist,
+                                const int ntypes,
+                                const int MAX_NNEI) {
+  int ithread = blockIdx.x * blockDim.x + threadIdx.x;
+  int ii = ithread / MAX_NNEI;
+  int jj = ithread % MAX_NNEI;
+  // assume the same block has the same ii
+  __shared__ int cache[TPB];
+  cache[threadIdx.x] = 0;
+  if (ii >= nloc) {
+    return;
+  }
+  int idx_i = ilist[ii];
+  if (type[idx_i] < 0) {
+    // set all to 10000
+    min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+    return;  // virtual atom
+  }
+  if (jj < numneigh[ii]) {
+    int idx_j = firstneigh[ii][jj];
+    int type_j = type[idx_j];
+    if (type_j < 0) {
+      min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+      return;  // virtual atom
+    }
+    __syncthreads();
+    FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
+                     coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
+                     coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};
+    // we do not need to use the real index
+    // we do not need to do slow sqrt for every dist; instead do sqrt in the
+    // final
+    min_nbor_dist[ii * MAX_NNEI + jj] =
+        rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2];
+
+    // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1);
+    // See https://www.cnblogs.com/neopenx/p/4705320.html
+    atomicAdd(&cache[type_j], 1);
+    __syncthreads();
+    if (threadIdx.x < ntypes) {
+      atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]);
+    }
+  } else {
+    // set others to 10000
+    min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+  }
+}
+
+namespace deepmd {
+
+template <typename FPTYPE>
+void neighbor_stat_gpu(const FPTYPE* coord,
+                       const int* type,
+                       const int nloc,
+                       const deepmd::InputNlist& gpu_nlist,
+                       int* max_nbor_size,
+                       FPTYPE* min_nbor_dist,
+                       const int ntypes,
+                       const int MAX_NNEI) {
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+
+  DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes));
+  const int nblock_loc = (nloc * MAX_NNEI + TPB - 1) / TPB;
+  neighbor_stat_g<<<nblock_loc, TPB>>>(
+      coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh,
+      gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI);
+
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+}
+
+template void neighbor_stat_gpu<float>(const float* coord,
+                                       const int* type,
+                                       const int nloc,
+                                       const deepmd::InputNlist& gpu_nlist,
+                                       int* max_nbor_size,
+                                       float* min_nbor_dist,
+                                       const int ntypes,
+                                       const int MAX_NNEI);
+
+template void neighbor_stat_gpu<double>(const double* coord,
+                                        const int* type,
+                                        const int nloc,
+                                        const deepmd::InputNlist& gpu_nlist,
+                                        int* max_nbor_size,
+                                        double* min_nbor_dist,
+                                        const int ntypes,
+                                        const int MAX_NNEI);
+}  // namespace deepmd
diff --git a/source/op/custom_op.h b/source/op/custom_op.h
index b2fb290f63..baf95f3fa3 100644
--- a/source/op/custom_op.h
+++ b/source/op/custom_op.h
@@ -5,6 +5,7 @@
 #include <vector>
 
 #include "device.h"
+#include "neighbor_list.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -25,3 +26,30 @@ namespace deepmd {
 void safe_compute(OpKernelContext* context,
                   std::function<void(OpKernelContext*)> ff);
 };
+
+template <typename FPTYPE>
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial);
diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc
index bb368cf65e..d917c60a5f 100644
--- a/source/op/neighbor_stat.cc
+++ b/source/op/neighbor_stat.cc
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
+#include "neighbor_stat.h"
+
 #include "custom_op.h"
 #include "errors.h"
 #include "neighbor_list.h"
@@ -22,6 +24,11 @@ class NeighborStatOp : public OpKernel {
  public:
   explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut));
+    max_nbor_size_nlist = 1024;
+    max_cpy_trial = 100;
+    mem_cpy = 256;
+    max_nnei_trial = 100;
+    mem_nnei = 256;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -66,7 +73,7 @@ class NeighborStatOp : public OpKernel {
                 errors::InvalidArgument("number of atoms should match"));
     OP_REQUIRES(context, (9 == box_tensor.shape().dim_size(1)),
                 errors::InvalidArgument("number of box should be 9"));
-
+    DeviceFunctor()(device, context->eigen_device<Device>());
     int nei_mode = 0;
     if (mesh_tensor.shape().dim_size(0) == 6 ||
         mesh_tensor.shape().dim_size(0) == 7) {
@@ -99,108 +106,184 @@ class NeighborStatOp : public OpKernel {
     const FPTYPE* box = box_tensor.flat<FPTYPE>().data();
     const int* mesh = mesh_tensor.flat<int>().data();
     int* max_nbor_size = max_nbor_size_tensor->flat<int>().data();
+    if (device == "GPU") {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+      std::vector<Tensor> tensor_list(7);
+      if (nei_mode == 1) {
+        // Tensor FPTYPE_temp;
+        TensorShape FPTYPE_shape;
+        FPTYPE_shape.AddDim(nall * 3);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              FPTYPE_shape, &tensor_list[0]));
 
-    for (int ii = 0; ii < static_cast<int>(max_nbor_size_tensor->NumElements());
-         ii++) {
-      max_nbor_size[ii] = 0;
-    }
+        // Tensor double_temp;
+        TensorShape double_shape;
+        double_shape.AddDim(18);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              double_shape, &tensor_list[1]));
+        // Tensor cpy_temp;
+        TensorShape cpy_shape;
+        cpy_shape.AddDim(mem_cpy * 3);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              cpy_shape, &tensor_list[3]));
+        // Tensor t_temp;
+        TensorShape t_shape;
+        t_shape.AddDim(mem_cpy * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                       &tensor_list[4]));
+      }
 
-    // set region
-    boxtensor_t boxt[9] = {0};
-    for (int dd = 0; dd < 9; ++dd) {
-      boxt[dd] = box[dd];
-    }
-    SimulationRegion<compute_t> region;
-    region.reinitBox(boxt);
-    // set & normalize coord
-    std::vector<compute_t> d_coord3(nall * 3);
-    for (int ii = 0; ii < nall; ++ii) {
-      for (int dd = 0; dd < 3; ++dd) {
-        d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+      // Tensor nlist_temp;
+      TensorShape nlist_shape;
+      nlist_shape.AddDim(nloc * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                     &tensor_list[5]));
+
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                     &tensor_list[6]));
+
+      int* idx_mapping = NULL;
+      int *ilist = NULL, *numneigh = NULL;
+      int** firstneigh = NULL;
+      deepmd::malloc_device_memory(firstneigh, nloc);
+      int* jlist = NULL;
+      FPTYPE* coord_cpy;
+      int* type_cpy;
+      int frame_nall = nall;
+      int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
+      deepmd::InputNlist gpu_inlist;
+      int* nbor_list_dev = NULL;
+      // prepare coord and nlist
+      _prepare_coord_nlist_gpu<FPTYPE>(
+          context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
+          idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+          nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist,
+          box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
+          rcut, max_cpy_trial, max_nnei_trial);
+
+      TensorShape min_nbor_dist_shape;
+      min_nbor_dist_shape.AddDim(nloc * mem_nnei);
+      Tensor* min_nbor_dist_tensor = NULL;
+      OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                       min_nbor_dist_shape,
+                                                       &min_nbor_dist_tensor));
+      FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
+
+      deepmd::neighbor_stat_gpu<FPTYPE>(coord, type, nloc, gpu_inlist,
+                                        max_nbor_size, min_nbor_dist, ntypes,
+                                        mem_nnei);
+      deepmd::delete_device_memory(firstneigh);
+#endif
+    } else {
+      for (int ii = 0;
+           ii < static_cast<int>(max_nbor_size_tensor->NumElements()); ii++) {
+        max_nbor_size[ii] = 0;
+      }
+
+      // set region
+      boxtensor_t boxt[9] = {0};
+      for (int dd = 0; dd < 9; ++dd) {
+        boxt[dd] = box[dd];
       }
-      if (b_norm_atom) {
-        compute_t inter[3];
-        region.phys2Inter(inter, &d_coord3[3 * ii]);
+      SimulationRegion<compute_t> region;
+      region.reinitBox(boxt);
+      // set & normalize coord
+      std::vector<compute_t> d_coord3(nall * 3);
+      for (int ii = 0; ii < nall; ++ii) {
         for (int dd = 0; dd < 3; ++dd) {
-          if (inter[dd] < 0) {
-            inter[dd] += 1.;
-          } else if (inter[dd] >= 1) {
-            inter[dd] -= 1.;
+          d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+        }
+        if (b_norm_atom) {
+          compute_t inter[3];
+          region.phys2Inter(inter, &d_coord3[3 * ii]);
+          for (int dd = 0; dd < 3; ++dd) {
+            if (inter[dd] < 0) {
+              inter[dd] += 1.;
+            } else if (inter[dd] >= 1) {
+              inter[dd] -= 1.;
+            }
           }
+          region.inter2Phys(&d_coord3[3 * ii], inter);
         }
-        region.inter2Phys(&d_coord3[3 * ii], inter);
       }
-    }
 
-    // set type
-    std::vector<int> d_type(nall);
-    for (int ii = 0; ii < nall; ++ii) {
-      d_type[ii] = type[ii];
-    }
+      // set type
+      std::vector<int> d_type(nall);
+      for (int ii = 0; ii < nall; ++ii) {
+        d_type[ii] = type[ii];
+      }
+
+      // build nlist
+      std::vector<std::vector<int> > d_nlist_a;
+      std::vector<std::vector<int> > d_nlist_r;
+      std::vector<int> nlist_map;
+      bool b_nlist_map = false;
 
-    // build nlist
-    std::vector<std::vector<int> > d_nlist_a;
-    std::vector<std::vector<int> > d_nlist_r;
-    std::vector<int> nlist_map;
-    bool b_nlist_map = false;
-
-    if (nei_mode == 1) {
-      // std::cout << "I'm in nei_mode 1" << std::endl;
-      std::vector<double> bk_d_coord3 = d_coord3;
-      std::vector<int> bk_d_type = d_type;
-      std::vector<int> ncell, ngcell;
-      copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3,
-                 bk_d_type, rcut, region);
-      b_nlist_map = true;
-      std::vector<int> nat_stt(3, 0);
-      std::vector<int> ext_stt(3), ext_end(3);
-      for (int dd = 0; dd < 3; ++dd) {
-        ext_stt[dd] = -ngcell[dd];
-        ext_end[dd] = ncell[dd] + ngcell[dd];
+      if (nei_mode == 1) {
+        // std::cout << "I'm in nei_mode 1" << std::endl;
+        std::vector<double> bk_d_coord3 = d_coord3;
+        std::vector<int> bk_d_type = d_type;
+        std::vector<int> ncell, ngcell;
+        copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3,
+                   bk_d_type, rcut, region);
+        b_nlist_map = true;
+        std::vector<int> nat_stt(3, 0);
+        std::vector<int> ext_stt(3), ext_end(3);
+        for (int dd = 0; dd < 3; ++dd) {
+          ext_stt[dd] = -ngcell[dd];
+          ext_end[dd] = ncell[dd] + ngcell[dd];
+        }
+        ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt,
+                      ncell, ext_stt, ext_end, region, ncell);
+      } else if (nei_mode == -1) {
+        ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL);
+      } else {
+        throw deepmd::deepmd_exception("unknow neighbor mode");
       }
-      ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt,
-                    ncell, ext_stt, ext_end, region, ncell);
-    } else if (nei_mode == -1) {
-      ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL);
-    } else {
-      throw deepmd::deepmd_exception("unknow neighbor mode");
-    }
 
-    int MAX_NNEI = 0;
-    for (int ii = 0; ii < nloc; ii++) {
-      MAX_NNEI =
-          MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI;
-    }
-    // allocate output tensor for deepmd-kit
-    TensorShape min_nbor_dist_shape;
-    min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
-    Tensor* min_nbor_dist_tensor = NULL;
-    OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     min_nbor_dist_shape,
-                                                     &min_nbor_dist_tensor));
-    FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
-    for (int ii = 0; ii < static_cast<int>(min_nbor_dist_tensor->NumElements());
-         ii++) {
-      min_nbor_dist[ii] = 10000.0;
-    }
+      int MAX_NNEI = 0;
+      for (int ii = 0; ii < nloc; ii++) {
+        MAX_NNEI =
+            MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI;
+      }
+      // allocate output tensor for deepmd-kit
+      TensorShape min_nbor_dist_shape;
+      min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
+      Tensor* min_nbor_dist_tensor = NULL;
+      OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                       min_nbor_dist_shape,
+                                                       &min_nbor_dist_tensor));
+      FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
+      for (int ii = 0;
+           ii < static_cast<int>(min_nbor_dist_tensor->NumElements()); ii++) {
+        min_nbor_dist[ii] = 10000.0;
+      }
 
 #pragma omp parallel for
-    for (int ii = 0; ii < nloc; ii++) {
-      if (d_type[ii] < 0) {
-        continue;  // virtual atom
-      }
-      for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) {
-        int type = d_type[d_nlist_r[ii][jj]];
-        if (type < 0) {
+      for (int ii = 0; ii < nloc; ii++) {
+        if (d_type[ii] < 0) {
           continue;  // virtual atom
         }
-        max_nbor_size[ii * ntypes + type] += 1;
-        compute_t rij[3] = {
-            d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0],
-            d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1],
-            d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]};
-        min_nbor_dist[ii * MAX_NNEI + jj] =
-            sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+        for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) {
+          int type = d_type[d_nlist_r[ii][jj]];
+          if (type < 0) {
+            continue;  // virtual atom
+          }
+          max_nbor_size[ii * ntypes + type] += 1;
+          compute_t rij[3] = {
+              d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0],
+              d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1],
+              d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]};
+          // we do not need to do slow sqrt for every dist; instead do sqrt in
+          // the final step
+          min_nbor_dist[ii * MAX_NNEI + jj] =
+              rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2];
+        }
       }
     }
   }
@@ -208,6 +291,8 @@ class NeighborStatOp : public OpKernel {
  private:
   int nnei;
   float rcut;
+  std::string device;
+  int max_nbor_size_nlist, max_cpy_trial, mem_cpy, max_nnei_trial, mem_nnei;
 };
 
 #define REGISTER_CPU(T)                                               \
@@ -216,3 +301,14 @@ class NeighborStatOp : public OpKernel {
       NeighborStatOp<CPUDevice, T>);
 REGISTER_CPU(float);
 REGISTER_CPU(double);
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#define REGISTER_GPU(T)                               \
+  REGISTER_KERNEL_BUILDER(Name("NeighborStat")        \
+                              .Device(DEVICE_GPU)     \
+                              .TypeConstraint<T>("T") \
+                              .HostMemory("natoms")   \
+                              .HostMemory("box"),     \
+                          NeighborStatOp<GPUDevice, T>);
+REGISTER_GPU(float);
+REGISTER_GPU(double);
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 47541bc69f..048237e042 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -291,31 +291,31 @@ static void _map_nei_info_gpu(int* nlist,
                               const bool& b_nlist_map);
 
 template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial);
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -1604,31 +1604,31 @@ static void _map_nei_info_gpu(int* nlist,
 }
 
 template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial) {
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial) {
   if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself