From 70f87edfa65c31a174a42b0359f02595d7a0fb5b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 22:11:43 -0400
Subject: [PATCH 01/10] add the GPU kernel for NeighborStat (first working
 version)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h          |   2 +
 source/lib/include/gpu_rocm.h          |   2 +
 source/lib/include/neighbor_stat.h     |  19 ++
 source/lib/src/gpu/neighbor_stat.cu    | 125 +++++++++++
 source/op/custom_op.h                  |  28 +++
 source/op/neighbor_stat.cc             | 273 +++++++++++++++++--------
 source/op/prod_env_mat_multi_device.cc | 100 ++++-----
 7 files changed, 411 insertions(+), 138 deletions(-)
 create mode 100644 source/lib/include/neighbor_stat.h
 create mode 100644 source/lib/src/gpu/neighbor_stat.cu
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index fb467674cb..375be14e85 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -16,6 +16,8 @@
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemset cudaMemset
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
 
 #define GPU_MAX_NBOR_SIZE 4096
 #define DPErrcheck(res) \
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index fbd5e1ce3f..86aa719673 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -19,6 +19,8 @@
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemset hipMemset
+#define gpuMalloc hipMalloc
+#define gpuFree cudaFree
 
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }
diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h
new file mode 100644
index 0000000000..e95eccac96
--- /dev/null
+++ b/source/lib/include/neighbor_stat.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+#include "neighbor_list.h"
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
+namespace deepmd {
+void get_largest_numnei_gpu(int& numnei, const deepmd::InputNlist& gpu_nlist);
+template <typename FPTYPE>
+void neighbor_stat_gpu(const FPTYPE* coord,
+                       const int* type,
+                       const int nloc,
+                       const deepmd::InputNlist& gpu_nlist,
+                       int* max_nbor_size,
+                       FPTYPE* min_nbor_dist,
+                       const int ntypes,
+                       const int MAX_NNEI);
+}  // namespace deepmd
+
+#endif
diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
new file mode 100644
index 0000000000..092571361b
--- /dev/null
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -0,0 +1,125 @@
+#include <cmath>
+
+#if GOOGLE_CUDA
+#include <cub/cub.cuh>
+#elif TENSORFLOW_USE_ROCM
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
+#error "should not touch here"
+#endif
+
+#include "device.h"
+#include "neighbor_list.h"
+
+template <typename FPTYPE>
+__global__ void neighbor_stat_g(const FPTYPE* coord,
+                                const int* type,
+                                const int nloc,
+                                const int* ilist,
+                                int** firstneigh,
+                                const int* numneigh,
+                                int* max_nbor_size,
+                                FPTYPE* min_nbor_dist,
+                                const int ntypes,
+                                const int MAX_NNEI) {
+  int ii = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ii >= nloc) {
+    return;
+  }
+  int idx_i = ilist[ii];
+  if (type[idx_i] < 0) {
+    // set all to 10000
+    for (int jj = 0; jj < MAX_NNEI; jj++) {
+      min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+    }
+    return;  // virtual atom
+  }
+  for (int jj = 0; jj < numneigh[ii]; jj++) {
+    int idx_j = firstneigh[ii][jj];
+    int type_j = type[idx_j];
+    if (type_j < 0) {
+      min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+      continue;  // virtual atom
+    }
+    ++max_nbor_size[ii * ntypes + type_j];
+    FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
+                     coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
+                     coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};
+    // we do not need to use the real index
+    min_nbor_dist[ii * MAX_NNEI + jj] =
+        sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+  }
+  // set others to 10000
+  for (int jj = numneigh[ii]; jj < MAX_NNEI; ++jj) {
+    min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
+  }
+}
+
+namespace deepmd {
+
+void get_largest_numnei_gpu(int& largest_numnei,
+                            const deepmd::InputNlist& gpu_nlist) {
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+
+  // TODO: maybe allocate memory using TensorFlow?
+  int* d_max = NULL;
+  gpuMalloc(&d_max, sizeof(int));
+
+  void* d_temp_storage = NULL;
+  size_t temp_storage_bytes = 0;
+  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh,
+                         d_max, gpu_nlist.inum);
+  gpuMalloc(&d_temp_storage, temp_storage_bytes);
+  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh,
+                         d_max, gpu_nlist.inum);
+  gpuFree(d_temp_storage);
+  // copy d_max to largest_numnei
+  gpuMemcpy(&largest_numnei, d_max, sizeof(int), gpuMemcpyDeviceToHost);
+  gpuFree(d_max);
+
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+}
+
+template <typename FPTYPE>
+void neighbor_stat_gpu(const FPTYPE* coord,
+                       const int* type,
+                       const int nloc,
+                       const deepmd::InputNlist& gpu_nlist,
+                       int* max_nbor_size,
+                       FPTYPE* min_nbor_dist,
+                       const int ntypes,
+                       const int MAX_NNEI) {
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+
+  DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes));
+  const int nblock_loc = (nloc + TPB - 1) / TPB;
+  neighbor_stat_g<<<nblock_loc, TPB>>>(
+      coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh,
+      gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI);
+
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+}
+
+template void neighbor_stat_gpu<float>(const float* coord,
+                                       const int* type,
+                                       const int nloc,
+                                       const deepmd::InputNlist& gpu_nlist,
+                                       int* max_nbor_size,
+                                       float* min_nbor_dist,
+                                       const int ntypes,
+                                       const int MAX_NNEI);
+
+template void neighbor_stat_gpu<double>(const double* coord,
+                                        const int* type,
+                                        const int nloc,
+                                        const deepmd::InputNlist& gpu_nlist,
+                                        int* max_nbor_size,
+                                        double* min_nbor_dist,
+                                        const int ntypes,
+                                        const int MAX_NNEI);
+}  // namespace deepmd
diff --git a/source/op/custom_op.h b/source/op/custom_op.h
index b2fb290f63..baf95f3fa3 100644
--- a/source/op/custom_op.h
+++ b/source/op/custom_op.h
@@ -5,6 +5,7 @@
 #include <vector>
 
 #include "device.h"
+#include "neighbor_list.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -25,3 +26,30 @@ namespace deepmd {
 void safe_compute(OpKernelContext* context,
                   std::function<void(OpKernelContext*)> ff);
 };
+
+template <typename FPTYPE>
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial);
diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc
index bb368cf65e..ee095699e9 100644
--- a/source/op/neighbor_stat.cc
+++ b/source/op/neighbor_stat.cc
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
+#include "neighbor_stat.h"
+
 #include "custom_op.h"
 #include "errors.h"
 #include "neighbor_list.h"
@@ -66,7 +68,7 @@ class NeighborStatOp : public OpKernel {
                 errors::InvalidArgument("number of atoms should match"));
     OP_REQUIRES(context, (9 == box_tensor.shape().dim_size(1)),
                 errors::InvalidArgument("number of box should be 9"));
-
+    DeviceFunctor()(device, context->eigen_device<Device>());
     int nei_mode = 0;
     if (mesh_tensor.shape().dim_size(0) == 6 ||
         mesh_tensor.shape().dim_size(0) == 7) {
@@ -99,108 +101,191 @@ class NeighborStatOp : public OpKernel {
     const FPTYPE* box = box_tensor.flat<FPTYPE>().data();
     const int* mesh = mesh_tensor.flat<int>().data();
     int* max_nbor_size = max_nbor_size_tensor->flat<int>().data();
+    if (device == "GPU") {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+      int max_nbor_size_nlist = 1024;
+      int max_cpy_trial = 100;
+      int mem_cpy = 256;
+      int max_nnei_trial = 100;
+      int mem_nnei = 256;
 
-    for (int ii = 0; ii < static_cast<int>(max_nbor_size_tensor->NumElements());
-         ii++) {
-      max_nbor_size[ii] = 0;
-    }
+      std::vector<Tensor> tensor_list(7);
+      if (nei_mode == 1) {
+        // Tensor FPTYPE_temp;
+        TensorShape FPTYPE_shape;
+        FPTYPE_shape.AddDim(nall * 3);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              FPTYPE_shape, &tensor_list[0]));
 
-    // set region
-    boxtensor_t boxt[9] = {0};
-    for (int dd = 0; dd < 9; ++dd) {
-      boxt[dd] = box[dd];
-    }
-    SimulationRegion<compute_t> region;
-    region.reinitBox(boxt);
-    // set & normalize coord
-    std::vector<compute_t> d_coord3(nall * 3);
-    for (int ii = 0; ii < nall; ++ii) {
-      for (int dd = 0; dd < 3; ++dd) {
-        d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+        // Tensor double_temp;
+        TensorShape double_shape;
+        double_shape.AddDim(18);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              double_shape, &tensor_list[1]));
+        // Tensor cpy_temp;
+        TensorShape cpy_shape;
+        cpy_shape.AddDim(mem_cpy * 3);
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                              cpy_shape, &tensor_list[3]));
+        // Tensor t_temp;
+        TensorShape t_shape;
+        t_shape.AddDim(mem_cpy * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                       &tensor_list[4]));
+      }
+
+      // Tensor nlist_temp;
+      TensorShape nlist_shape;
+      nlist_shape.AddDim(nloc * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                     &tensor_list[5]));
+
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                     &tensor_list[6]));
+
+      int* idx_mapping = NULL;
+      int *ilist = NULL, *numneigh = NULL;
+      int** firstneigh = NULL;
+      deepmd::malloc_device_memory(firstneigh, nloc);
+      int* jlist = NULL;
+      FPTYPE* coord_cpy;
+      int* type_cpy;
+      int frame_nall = nall;
+      int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
+      deepmd::InputNlist gpu_inlist;
+      int* nbor_list_dev = NULL;
+      // prepare coord and nlist
+      _prepare_coord_nlist_gpu<FPTYPE>(
+          context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
+          idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
+          nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist,
+          box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
+          rcut, max_cpy_trial, max_nnei_trial);
+
+      int MAX_NNEI;
+      deepmd::get_largest_numnei_gpu(MAX_NNEI, gpu_inlist);
+
+      TensorShape min_nbor_dist_shape;
+      min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
+      Tensor* min_nbor_dist_tensor = NULL;
+      OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                       min_nbor_dist_shape,
+                                                       &min_nbor_dist_tensor));
+      FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
+
+      deepmd::neighbor_stat_gpu<FPTYPE>(coord, type, nloc, gpu_inlist,
+                                        max_nbor_size, min_nbor_dist, ntypes,
+                                        MAX_NNEI);
+      deepmd::delete_device_memory(firstneigh);
+#endif
+    } else {
+      for (int ii = 0;
+           ii < static_cast<int>(max_nbor_size_tensor->NumElements()); ii++) {
+        max_nbor_size[ii] = 0;
       }
-      if (b_norm_atom) {
-        compute_t inter[3];
-        region.phys2Inter(inter, &d_coord3[3 * ii]);
+
+      // set region
+      boxtensor_t boxt[9] = {0};
+      for (int dd = 0; dd < 9; ++dd) {
+        boxt[dd] = box[dd];
+      }
+      SimulationRegion<compute_t> region;
+      region.reinitBox(boxt);
+      // set & normalize coord
+      std::vector<compute_t> d_coord3(nall * 3);
+      for (int ii = 0; ii < nall; ++ii) {
         for (int dd = 0; dd < 3; ++dd) {
-          if (inter[dd] < 0) {
-            inter[dd] += 1.;
-          } else if (inter[dd] >= 1) {
-            inter[dd] -= 1.;
+          d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+        }
+        if (b_norm_atom) {
+          compute_t inter[3];
+          region.phys2Inter(inter, &d_coord3[3 * ii]);
+          for (int dd = 0; dd < 3; ++dd) {
+            if (inter[dd] < 0) {
+              inter[dd] += 1.;
+            } else if (inter[dd] >= 1) {
+              inter[dd] -= 1.;
+            }
           }
+          region.inter2Phys(&d_coord3[3 * ii], inter);
         }
-        region.inter2Phys(&d_coord3[3 * ii], inter);
       }
-    }
 
-    // set type
-    std::vector<int> d_type(nall);
-    for (int ii = 0; ii < nall; ++ii) {
-      d_type[ii] = type[ii];
-    }
+      // set type
+      std::vector<int> d_type(nall);
+      for (int ii = 0; ii < nall; ++ii) {
+        d_type[ii] = type[ii];
+      }
 
-    // build nlist
-    std::vector<std::vector<int> > d_nlist_a;
-    std::vector<std::vector<int> > d_nlist_r;
-    std::vector<int> nlist_map;
-    bool b_nlist_map = false;
-
-    if (nei_mode == 1) {
-      // std::cout << "I'm in nei_mode 1" << std::endl;
-      std::vector<double> bk_d_coord3 = d_coord3;
-      std::vector<int> bk_d_type = d_type;
-      std::vector<int> ncell, ngcell;
-      copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3,
-                 bk_d_type, rcut, region);
-      b_nlist_map = true;
-      std::vector<int> nat_stt(3, 0);
-      std::vector<int> ext_stt(3), ext_end(3);
-      for (int dd = 0; dd < 3; ++dd) {
-        ext_stt[dd] = -ngcell[dd];
-        ext_end[dd] = ncell[dd] + ngcell[dd];
+      // build nlist
+      std::vector<std::vector<int> > d_nlist_a;
+      std::vector<std::vector<int> > d_nlist_r;
+      std::vector<int> nlist_map;
+      bool b_nlist_map = false;
+
+      if (nei_mode == 1) {
+        // std::cout << "I'm in nei_mode 1" << std::endl;
+        std::vector<double> bk_d_coord3 = d_coord3;
+        std::vector<int> bk_d_type = d_type;
+        std::vector<int> ncell, ngcell;
+        copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3,
+                   bk_d_type, rcut, region);
+        b_nlist_map = true;
+        std::vector<int> nat_stt(3, 0);
+        std::vector<int> ext_stt(3), ext_end(3);
+        for (int dd = 0; dd < 3; ++dd) {
+          ext_stt[dd] = -ngcell[dd];
+          ext_end[dd] = ncell[dd] + ngcell[dd];
+        }
+        ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt,
+                      ncell, ext_stt, ext_end, region, ncell);
+      } else if (nei_mode == -1) {
+        ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL);
+      } else {
+        throw deepmd::deepmd_exception("unknow neighbor mode");
       }
-      ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt,
-                    ncell, ext_stt, ext_end, region, ncell);
-    } else if (nei_mode == -1) {
-      ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL);
-    } else {
-      throw deepmd::deepmd_exception("unknow neighbor mode");
-    }
 
-    int MAX_NNEI = 0;
-    for (int ii = 0; ii < nloc; ii++) {
-      MAX_NNEI =
-          MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI;
-    }
-    // allocate output tensor for deepmd-kit
-    TensorShape min_nbor_dist_shape;
-    min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
-    Tensor* min_nbor_dist_tensor = NULL;
-    OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     min_nbor_dist_shape,
-                                                     &min_nbor_dist_tensor));
-    FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
-    for (int ii = 0; ii < static_cast<int>(min_nbor_dist_tensor->NumElements());
-         ii++) {
-      min_nbor_dist[ii] = 10000.0;
-    }
+      int MAX_NNEI = 0;
+      for (int ii = 0; ii < nloc; ii++) {
+        MAX_NNEI =
+            MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI;
+      }
+      // allocate output tensor for deepmd-kit
+      TensorShape min_nbor_dist_shape;
+      min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
+      Tensor* min_nbor_dist_tensor = NULL;
+      OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
+                                                       min_nbor_dist_shape,
+                                                       &min_nbor_dist_tensor));
+      FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat<FPTYPE>().data();
+      for (int ii = 0;
+           ii < static_cast<int>(min_nbor_dist_tensor->NumElements()); ii++) {
+        min_nbor_dist[ii] = 10000.0;
+      }
 
 #pragma omp parallel for
-    for (int ii = 0; ii < nloc; ii++) {
-      if (d_type[ii] < 0) {
-        continue;  // virtual atom
-      }
-      for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) {
-        int type = d_type[d_nlist_r[ii][jj]];
-        if (type < 0) {
+      for (int ii = 0; ii < nloc; ii++) {
+        if (d_type[ii] < 0) {
           continue;  // virtual atom
         }
-        max_nbor_size[ii * ntypes + type] += 1;
-        compute_t rij[3] = {
-            d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0],
-            d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1],
-            d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]};
-        min_nbor_dist[ii * MAX_NNEI + jj] =
-            sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+        for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) {
+          int type = d_type[d_nlist_r[ii][jj]];
+          if (type < 0) {
+            continue;  // virtual atom
+          }
+          max_nbor_size[ii * ntypes + type] += 1;
+          compute_t rij[3] = {
+              d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0],
+              d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1],
+              d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]};
+          min_nbor_dist[ii * MAX_NNEI + jj] =
+              sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+        }
       }
     }
   }
@@ -208,6 +293,7 @@ class NeighborStatOp : public OpKernel {
  private:
   int nnei;
   float rcut;
+  std::string device;
 };
 
 #define REGISTER_CPU(T)                                               \
@@ -216,3 +302,14 @@ class NeighborStatOp : public OpKernel {
       NeighborStatOp<CPUDevice, T>);
 REGISTER_CPU(float);
 REGISTER_CPU(double);
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#define REGISTER_GPU(T)                               \
+  REGISTER_KERNEL_BUILDER(Name("NeighborStat")        \
+                              .Device(DEVICE_GPU)     \
+                              .TypeConstraint<T>("T") \
+                              .HostMemory("natoms")   \
+                              .HostMemory("box"),     \
+                          NeighborStatOp<GPUDevice, T>);
+REGISTER_GPU(float);
+REGISTER_GPU(double);
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 47541bc69f..048237e042 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -291,31 +291,31 @@ static void _map_nei_info_gpu(int* nlist,
                               const bool& b_nlist_map);
 
 template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial);
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -1604,31 +1604,31 @@ static void _map_nei_info_gpu(int* nlist,
 }
 
 template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial) {
+void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                              Tensor* tensor_list,
+                              FPTYPE const** coord,
+                              FPTYPE*& coord_cpy,
+                              int const** type,
+                              int*& type_cpy,
+                              int*& idx_mapping,
+                              deepmd::InputNlist& inlist,
+                              int*& ilist,
+                              int*& numneigh,
+                              int**& firstneigh,
+                              int*& jlist,
+                              int*& nbor_list_dev,
+                              int& new_nall,
+                              int& mem_cpy,
+                              int& mem_nnei,
+                              int& max_nbor_size,
+                              const FPTYPE* box,
+                              const int* mesh_tensor_data,
+                              const int mesh_tensor_size,
+                              const int& nloc,
+                              const int& nei_mode,
+                              const float& rcut_r,
+                              const int& max_cpy_trial,
+                              const int& max_nnei_trial) {
   if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself

From fd4f62d38885d35307b89469ac49837e3d9c0c2b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 22:22:39 -0400
Subject: [PATCH 02/10] we don't need to calculate the maximum neighbor number
 of each frame; using a constant is fine

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/neighbor_stat.h  |  1 -
 source/lib/src/gpu/neighbor_stat.cu | 25 -------------------------
 source/op/neighbor_stat.cc          |  7 ++-----
 3 files changed, 2 insertions(+), 31 deletions(-)

diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h
index e95eccac96..79e241f783 100644
--- a/source/lib/include/neighbor_stat.h
+++ b/source/lib/include/neighbor_stat.h
@@ -4,7 +4,6 @@
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 namespace deepmd {
-void get_largest_numnei_gpu(int& numnei, const deepmd::InputNlist& gpu_nlist);
 template <typename FPTYPE>
 void neighbor_stat_gpu(const FPTYPE* coord,
                        const int* type,
diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
index 092571361b..9604ef3f16 100644
--- a/source/lib/src/gpu/neighbor_stat.cu
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -58,31 +58,6 @@ __global__ void neighbor_stat_g(const FPTYPE* coord,
 
 namespace deepmd {
 
-void get_largest_numnei_gpu(int& largest_numnei,
-                            const deepmd::InputNlist& gpu_nlist) {
-  DPErrcheck(gpuGetLastError());
-  DPErrcheck(gpuDeviceSynchronize());
-
-  // TODO: maybe allocate memory using TensorFlow?
-  int* d_max = NULL;
-  gpuMalloc(&d_max, sizeof(int));
-
-  void* d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh,
-                         d_max, gpu_nlist.inum);
-  gpuMalloc(&d_temp_storage, temp_storage_bytes);
-  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh,
-                         d_max, gpu_nlist.inum);
-  gpuFree(d_temp_storage);
-  // copy d_max to largest_numnei
-  gpuMemcpy(&largest_numnei, d_max, sizeof(int), gpuMemcpyDeviceToHost);
-  gpuFree(d_max);
-
-  DPErrcheck(gpuGetLastError());
-  DPErrcheck(gpuDeviceSynchronize());
-}
-
 template <typename FPTYPE>
 void neighbor_stat_gpu(const FPTYPE* coord,
                        const int* type,
diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc
index ee095699e9..8835f00261 100644
--- a/source/op/neighbor_stat.cc
+++ b/source/op/neighbor_stat.cc
@@ -167,11 +167,8 @@ class NeighborStatOp : public OpKernel {
           box, mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
           rcut, max_cpy_trial, max_nnei_trial);
 
-      int MAX_NNEI;
-      deepmd::get_largest_numnei_gpu(MAX_NNEI, gpu_inlist);
-
       TensorShape min_nbor_dist_shape;
-      min_nbor_dist_shape.AddDim(nloc * MAX_NNEI);
+      min_nbor_dist_shape.AddDim(nloc * max_nbor_size_nlist);
       Tensor* min_nbor_dist_tensor = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
                                                        min_nbor_dist_shape,
@@ -180,7 +177,7 @@ class NeighborStatOp : public OpKernel {
 
       deepmd::neighbor_stat_gpu<FPTYPE>(coord, type, nloc, gpu_inlist,
                                         max_nbor_size, min_nbor_dist, ntypes,
-                                        MAX_NNEI);
+                                        max_nbor_size_nlist);
       deepmd::delete_device_memory(firstneigh);
 #endif
     } else {

From ce92bac18b25a06c06c281555d59beccd1c0db82 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 22:36:33 -0400
Subject: [PATCH 03/10] make neighbor atoms in parallel

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/src/gpu/neighbor_stat.cu | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
index 9604ef3f16..e793a0040e 100644
--- a/source/lib/src/gpu/neighbor_stat.cu
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -12,6 +12,9 @@ namespace cub = hipcub;
 #include "device.h"
 #include "neighbor_list.h"
 
+__device__ inline double _sqrt(double x) { return sqrt(x); }
+__device__ inline float _sqrt(float x) { return sqrtf(x); }
+
 template <typename FPTYPE>
 __global__ void neighbor_stat_g(const FPTYPE* coord,
                                 const int* type,
@@ -23,35 +26,34 @@ __global__ void neighbor_stat_g(const FPTYPE* coord,
                                 FPTYPE* min_nbor_dist,
                                 const int ntypes,
                                 const int MAX_NNEI) {
-  int ii = blockIdx.x * blockDim.x + threadIdx.x;
+  int ithread = blockIdx.x * blockDim.x + threadIdx.x;
+  int ii = ithread / MAX_NNEI;
+  int jj = ithread % MAX_NNEI;
   if (ii >= nloc) {
     return;
   }
   int idx_i = ilist[ii];
   if (type[idx_i] < 0) {
     // set all to 10000
-    for (int jj = 0; jj < MAX_NNEI; jj++) {
-      min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
-    }
+    min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
     return;  // virtual atom
   }
-  for (int jj = 0; jj < numneigh[ii]; jj++) {
+  if (jj < numneigh[ii]) {
     int idx_j = firstneigh[ii][jj];
     int type_j = type[idx_j];
     if (type_j < 0) {
       min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
-      continue;  // virtual atom
+      return;  // virtual atom
     }
-    ++max_nbor_size[ii * ntypes + type_j];
+    atomicAdd(max_nbor_size + ii * ntypes + type_j, 1);
     FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
                      coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
                      coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};
     // we do not need to use the real index
     min_nbor_dist[ii * MAX_NNEI + jj] =
-        sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
-  }
-  // set others to 10000
-  for (int jj = numneigh[ii]; jj < MAX_NNEI; ++jj) {
+        _sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+  } else {
+    // set others to 10000
     min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
   }
 }
@@ -71,7 +73,7 @@ void neighbor_stat_gpu(const FPTYPE* coord,
   DPErrcheck(gpuDeviceSynchronize());
 
   DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes));
-  const int nblock_loc = (nloc + TPB - 1) / TPB;
+  const int nblock_loc = (nloc * MAX_NNEI + TPB - 1) / TPB;
   neighbor_stat_g<<<nblock_loc, TPB>>>(
       coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh,
       gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI);

From 36fb319b231b6d354b55415e9da5ee1caf769374 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 22:41:39 -0400
Subject: [PATCH 04/10] use tf.reduce_min instead of np.min

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/neighbor_stat.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index a4ac190946..1ba9552ad4 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -82,6 +82,7 @@ def builder():
                 rcut=self.rcut,
             )
             place_holders["dir"] = tf.placeholder(tf.string)
+            _min_nbor_dist = tf.reduce_min(_min_nbor_dist)
             return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"])
 
         with sub_graph.as_default():
@@ -128,10 +129,7 @@ def feed():
                         }
 
         for mn, dt, jj in self.p.generate(self.sub_sess, feed()):
-            if dt.size != 0:
-                dt = np.min(dt)
-            else:
-                dt = self.rcut
+            if np.isinf(dt):
                 log.warning(
                     "Atoms with no neighbors found in %s. Please make sure it's what you expected."
                     % jj

From 1f44ebcf89f3ab5fa07c5782aa05c7fc05178011 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 23:42:54 -0400
Subject: [PATCH 05/10] use GPU to max

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/neighbor_stat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index 1ba9552ad4..afa75eb9fb 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -83,6 +83,7 @@ def builder():
             )
             place_holders["dir"] = tf.placeholder(tf.string)
             _min_nbor_dist = tf.reduce_min(_min_nbor_dist)
+            _max_nbor_size = tf.reduce_max(_max_nbor_size, axis=0)
             return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"])
 
         with sub_graph.as_default():
@@ -143,8 +144,7 @@ def feed():
                         " training data to remove duplicated atoms." % jj
                     )
                 self.min_nbor_dist = dt
-            var = np.max(mn, axis=0)
-            self.max_nbor_size = np.maximum(var, self.max_nbor_size)
+            self.max_nbor_size = np.maximum(mn, self.max_nbor_size)
 
         log.info("training data with min nbor dist: " + str(self.min_nbor_dist))
         log.info("training data with max nbor size: " + str(self.max_nbor_size))

From a730226ef4d4057f846ecf1623d88e043b956c2d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 3 Oct 2023 23:54:44 -0400
Subject: [PATCH 06/10] use mem_nnei; make several variable to be class attr

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/neighbor_stat.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc
index 8835f00261..890c683ffd 100644
--- a/source/op/neighbor_stat.cc
+++ b/source/op/neighbor_stat.cc
@@ -24,6 +24,11 @@ class NeighborStatOp : public OpKernel {
  public:
   explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut));
+    max_nbor_size_nlist = 1024;
+    max_cpy_trial = 100;
+    mem_cpy = 256;
+    max_nnei_trial = 100;
+    mem_nnei = 256;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -103,12 +108,6 @@ class NeighborStatOp : public OpKernel {
     int* max_nbor_size = max_nbor_size_tensor->flat<int>().data();
     if (device == "GPU") {
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-      int max_nbor_size_nlist = 1024;
-      int max_cpy_trial = 100;
-      int mem_cpy = 256;
-      int max_nnei_trial = 100;
-      int mem_nnei = 256;
-
       std::vector<Tensor> tensor_list(7);
       if (nei_mode == 1) {
         // Tensor FPTYPE_temp;
@@ -168,7 +167,7 @@ class NeighborStatOp : public OpKernel {
           rcut, max_cpy_trial, max_nnei_trial);
 
       TensorShape min_nbor_dist_shape;
-      min_nbor_dist_shape.AddDim(nloc * max_nbor_size_nlist);
+      min_nbor_dist_shape.AddDim(nloc * mem_nnei);
       Tensor* min_nbor_dist_tensor = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
                                                        min_nbor_dist_shape,
@@ -177,7 +176,7 @@ class NeighborStatOp : public OpKernel {
 
       deepmd::neighbor_stat_gpu<FPTYPE>(coord, type, nloc, gpu_inlist,
                                         max_nbor_size, min_nbor_dist, ntypes,
-                                        max_nbor_size_nlist);
+                                        mem_nnei);
       deepmd::delete_device_memory(firstneigh);
 #endif
     } else {
@@ -291,6 +290,7 @@ class NeighborStatOp : public OpKernel {
   int nnei;
   float rcut;
   std::string device;
+  int max_nbor_size_nlist, max_cpy_trial, mem_cpy, max_nnei_trial, mem_nnei;
 };
 
 #define REGISTER_CPU(T)                                               \

From b92f0b62dbe9455423d243159d0a41745995e5f8 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 4 Oct 2023 00:31:45 -0400
Subject: [PATCH 07/10] use shared memory

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/src/gpu/neighbor_stat.cu | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
index e793a0040e..c0ac265bf0 100644
--- a/source/lib/src/gpu/neighbor_stat.cu
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -29,6 +29,9 @@ __global__ void neighbor_stat_g(const FPTYPE* coord,
   int ithread = blockIdx.x * blockDim.x + threadIdx.x;
   int ii = ithread / MAX_NNEI;
   int jj = ithread % MAX_NNEI;
+  // assume the same block has the same ii
+  __shared__ int cache[TPB];
+  cache[threadIdx.x] = 0;
   if (ii >= nloc) {
     return;
   }
@@ -45,7 +48,15 @@ __global__ void neighbor_stat_g(const FPTYPE* coord,
       min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
       return;  // virtual atom
     }
-    atomicAdd(max_nbor_size + ii * ntypes + type_j, 1);
+    // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1);
+    // See https://www.cnblogs.com/neopenx/p/4705320.html
+    __syncthreads();
+    atomicAdd(&cache[type_j], 1);
+    __syncthreads();
+    if (threadIdx.x < ntypes) {
+      atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]);
+    }
+
     FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
                      coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
                      coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};

From 5bbc62e45136afc1e9263f9eac888bb7858524aa Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 4 Oct 2023 01:23:41 -0400
Subject: [PATCH 08/10] do not sqrt for every dist; instead do it in the final

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/neighbor_stat.py       |  2 ++
 source/lib/src/gpu/neighbor_stat.cu | 18 ++++++++++--------
 source/op/neighbor_stat.cc          |  4 +++-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py
index afa75eb9fb..fa9325937e 100644
--- a/deepmd/utils/neighbor_stat.py
+++ b/deepmd/utils/neighbor_stat.py
@@ -146,6 +146,8 @@ def feed():
                 self.min_nbor_dist = dt
             self.max_nbor_size = np.maximum(mn, self.max_nbor_size)
 
+        # do sqrt in the final
+        self.min_nbor_dist = math.sqrt(self.min_nbor_dist)
         log.info("training data with min nbor dist: " + str(self.min_nbor_dist))
         log.info("training data with max nbor size: " + str(self.max_nbor_size))
         return self.min_nbor_dist, self.max_nbor_size
diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
index c0ac265bf0..c9f87a51f0 100644
--- a/source/lib/src/gpu/neighbor_stat.cu
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -48,21 +48,23 @@ __global__ void neighbor_stat_g(const FPTYPE* coord,
       min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
       return;  // virtual atom
     }
+    __syncthreads();
+    FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
+                     coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
+                     coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};
+    // we do not need to use the real index
+    // we do not need to do slow sqrt for every dist; instead do sqrt in the
+    // final
+    min_nbor_dist[ii * MAX_NNEI + jj] =
+        rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2];
+
     // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1);
     // See https://www.cnblogs.com/neopenx/p/4705320.html
-    __syncthreads();
     atomicAdd(&cache[type_j], 1);
     __syncthreads();
     if (threadIdx.x < ntypes) {
       atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]);
     }
-
-    FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0],
-                     coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1],
-                     coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]};
-    // we do not need to use the real index
-    min_nbor_dist[ii * MAX_NNEI + jj] =
-        _sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
   } else {
     // set others to 10000
     min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY;
diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc
index 890c683ffd..d917c60a5f 100644
--- a/source/op/neighbor_stat.cc
+++ b/source/op/neighbor_stat.cc
@@ -279,8 +279,10 @@ class NeighborStatOp : public OpKernel {
               d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0],
               d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1],
               d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]};
+          // we do not need to do slow sqrt for every dist; instead do sqrt in
+          // the final step
           min_nbor_dist[ii * MAX_NNEI + jj] =
-              sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]);
+              rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2];
         }
       }
     }

From 685e34cda2c1bf597d1d4293a96dd19b5b4ea37f Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 4 Oct 2023 01:54:35 -0400
Subject: [PATCH 09/10] revert gpu_cuda/rocm.h

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h | 2 --
 source/lib/include/gpu_rocm.h | 2 --
 2 files changed, 4 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 375be14e85..fb467674cb 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -16,8 +16,6 @@
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemset cudaMemset
-#define gpuMalloc cudaMalloc
-#define gpuFree cudaFree
 
 #define GPU_MAX_NBOR_SIZE 4096
 #define DPErrcheck(res) \
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 86aa719673..fbd5e1ce3f 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -19,8 +19,6 @@
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemset hipMemset
-#define gpuMalloc hipMalloc
-#define gpuFree cudaFree
 
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }

From 0ce33766364110e984be210acd74b77e80046fac Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 4 Oct 2023 01:56:12 -0400
Subject: [PATCH 10/10] clean unused codes

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/src/gpu/neighbor_stat.cu | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu
index c9f87a51f0..ef7d3c5f8a 100644
--- a/source/lib/src/gpu/neighbor_stat.cu
+++ b/source/lib/src/gpu/neighbor_stat.cu
@@ -1,20 +1,8 @@
 #include <cmath>
 
-#if GOOGLE_CUDA
-#include <cub/cub.cuh>
-#elif TENSORFLOW_USE_ROCM
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#else
-#error "should not touch here"
-#endif
-
 #include "device.h"
 #include "neighbor_list.h"
 
-__device__ inline double _sqrt(double x) { return sqrt(x); }
-__device__ inline float _sqrt(float x) { return sqrtf(x); }
-
 template <typename FPTYPE>
 __global__ void neighbor_stat_g(const FPTYPE* coord,
                                 const int* type,