From 70f87edfa65c31a174a42b0359f02595d7a0fb5b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 22:11:43 -0400 Subject: [PATCH 01/10] add the GPU kernel for NeighborStat (first working version) Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 2 + source/lib/include/gpu_rocm.h | 2 + source/lib/include/neighbor_stat.h | 19 ++ source/lib/src/gpu/neighbor_stat.cu | 125 +++++++++++ source/op/custom_op.h | 28 +++ source/op/neighbor_stat.cc | 273 +++++++++++++++++-------- source/op/prod_env_mat_multi_device.cc | 100 ++++----- 7 files changed, 411 insertions(+), 138 deletions(-) create mode 100644 source/lib/include/neighbor_stat.h create mode 100644 source/lib/src/gpu/neighbor_stat.cu diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index fb467674cb..375be14e85 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -16,6 +16,8 @@ #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemset cudaMemset +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree #define GPU_MAX_NBOR_SIZE 4096 #define DPErrcheck(res) \ diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index fbd5e1ce3f..86aa719673 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -19,6 +19,8 @@ #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemset hipMemset +#define gpuMalloc hipMalloc +#define gpuFree cudaFree #define DPErrcheck(res) \ { DPAssert((res), __FILE__, __LINE__); } diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h new file mode 100644 index 0000000000..e95eccac96 --- /dev/null +++ b/source/lib/include/neighbor_stat.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +#include "neighbor_list.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace deepmd { +void get_largest_numnei_gpu(int& numnei, const deepmd::InputNlist& gpu_nlist); +template +void neighbor_stat_gpu(const FPTYPE* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); +} // namespace deepmd + +#endif diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu new file mode 100644 index 0000000000..092571361b --- /dev/null +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -0,0 +1,125 @@ +#include + +#if GOOGLE_CUDA +#include +#elif TENSORFLOW_USE_ROCM +#include +namespace cub = hipcub; +#else +#error "should not touch here" +#endif + +#include "device.h" +#include "neighbor_list.h" + +template +__global__ void neighbor_stat_g(const FPTYPE* coord, + const int* type, + const int nloc, + const int* ilist, + int** firstneigh, + const int* numneigh, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI) { + int ii = blockIdx.x * blockDim.x + threadIdx.x; + if (ii >= nloc) { + return; + } + int idx_i = ilist[ii]; + if (type[idx_i] < 0) { + // set all to 10000 + for (int jj = 0; jj < MAX_NNEI; jj++) { + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + } + return; // virtual atom + } + for (int jj = 0; jj < numneigh[ii]; jj++) { + int idx_j = firstneigh[ii][jj]; + int type_j = type[idx_j]; + if (type_j < 0) { + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + continue; // virtual atom + } + ++max_nbor_size[ii * ntypes + type_j]; + FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], + coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], + coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; + // we do not need to use the real index + min_nbor_dist[ii * MAX_NNEI + jj] = + sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } + // set others to 10000 + for (int jj = numneigh[ii]; jj < MAX_NNEI; ++jj) { + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + } +} + +namespace deepmd { + +void get_largest_numnei_gpu(int& largest_numnei, + const deepmd::InputNlist& gpu_nlist) { + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + + // TODO: maybe allocate memory using TensorFlow? + int* d_max = NULL; + gpuMalloc(&d_max, sizeof(int)); + + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh, + d_max, gpu_nlist.inum); + gpuMalloc(&d_temp_storage, temp_storage_bytes); + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh, + d_max, gpu_nlist.inum); + gpuFree(d_temp_storage); + // copy d_max to largest_numnei + gpuMemcpy(&largest_numnei, d_max, sizeof(int), gpuMemcpyDeviceToHost); + gpuFree(d_max); + + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); +} + +template +void neighbor_stat_gpu(const FPTYPE* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI) { + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + + DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes)); + const int nblock_loc = (nloc + TPB - 1) / TPB; + neighbor_stat_g<<>>( + coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh, + gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI); + + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); +} + +template void neighbor_stat_gpu(const float* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + float* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); + +template void neighbor_stat_gpu(const double* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + double* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); +} // namespace deepmd diff --git a/source/op/custom_op.h b/source/op/custom_op.h index b2fb290f63..baf95f3fa3 100644 --- a/source/op/custom_op.h +++ b/source/op/custom_op.h @@ -5,6 +5,7 @@ #include #include "device.h" +#include "neighbor_list.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" @@ -25,3 +26,30 @@ namespace deepmd { void safe_compute(OpKernelContext* context, std::function ff); }; + +template +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index bb368cf65e..ee095699e9 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -1,4 +1,6 @@ // SPDX-License-Identifier: LGPL-3.0-or-later +#include "neighbor_stat.h" + #include "custom_op.h" #include "errors.h" #include "neighbor_list.h" @@ -66,7 +68,7 @@ class NeighborStatOp : public OpKernel { errors::InvalidArgument("number of atoms should match")); OP_REQUIRES(context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument("number of box should be 9")); - + DeviceFunctor()(device, context->eigen_device()); int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 6 || mesh_tensor.shape().dim_size(0) == 7) { @@ -99,108 +101,191 @@ class NeighborStatOp : public OpKernel { const FPTYPE* box = box_tensor.flat().data(); const int* mesh = mesh_tensor.flat().data(); int* max_nbor_size = max_nbor_size_tensor->flat().data(); + if (device == "GPU") { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + int max_nbor_size_nlist = 1024; + int max_cpy_trial = 100; + int mem_cpy = 256; + int max_nnei_trial = 100; + int mem_nnei = 256; - for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); - ii++) { - max_nbor_size[ii] = 0; - } + std::vector tensor_list(7); + if (nei_mode == 1) { + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + FPTYPE_shape, &tensor_list[0])); - // set region - boxtensor_t boxt[9] = {0}; - for (int dd = 0; dd < 9; ++dd) { - boxt[dd] = box[dd]; - } - SimulationRegion region; - region.reinitBox(boxt); - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + // Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + double_shape, &tensor_list[1])); + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &tensor_list[3])); + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &tensor_list[4])); + } + + // Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, + &tensor_list[5])); + + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &tensor_list[6])); + + int* idx_mapping = NULL; + int *ilist = NULL, *numneigh = NULL; + int** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int* jlist = NULL; + FPTYPE* coord_cpy; + int* type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + deepmd::InputNlist gpu_inlist; + int* nbor_list_dev = NULL; + // prepare coord and nlist + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, + rcut, max_cpy_trial, max_nnei_trial); + + int MAX_NNEI; + deepmd::get_largest_numnei_gpu(MAX_NNEI, gpu_inlist); + + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); + + deepmd::neighbor_stat_gpu(coord, type, nloc, gpu_inlist, + max_nbor_size, min_nbor_dist, ntypes, + MAX_NNEI); + deepmd::delete_device_memory(firstneigh); +#endif + } else { + for (int ii = 0; + ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { + max_nbor_size[ii] = 0; } - if (b_norm_atom) { - compute_t inter[3]; - region.phys2Inter(inter, &d_coord3[3 * ii]); + + // set region + boxtensor_t boxt[9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box[dd]; + } + SimulationRegion region; + region.reinitBox(boxt); + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { for (int dd = 0; dd < 3; ++dd) { - if (inter[dd] < 0) { - inter[dd] += 1.; - } else if (inter[dd] >= 1) { - inter[dd] -= 1.; + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + if (b_norm_atom) { + compute_t inter[3]; + region.phys2Inter(inter, &d_coord3[3 * ii]); + for (int dd = 0; dd < 3; ++dd) { + if (inter[dd] < 0) { + inter[dd] += 1.; + } else if (inter[dd] >= 1) { + inter[dd] -= 1.; + } } + region.inter2Phys(&d_coord3[3 * ii], inter); } - region.inter2Phys(&d_coord3[3 * ii], inter); } - } - // set type - std::vector d_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_type[ii] = type[ii]; - } + // set type + std::vector d_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } - // build nlist - std::vector > d_nlist_a; - std::vector > d_nlist_r; - std::vector nlist_map; - bool b_nlist_map = false; - - if (nei_mode == 1) { - // std::cout << "I'm in nei_mode 1" << std::endl; - std::vector bk_d_coord3 = d_coord3; - std::vector bk_d_type = d_type; - std::vector ncell, ngcell; - copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, - bk_d_type, rcut, region); - b_nlist_map = true; - std::vector nat_stt(3, 0); - std::vector ext_stt(3), ext_end(3); - for (int dd = 0; dd < 3; ++dd) { - ext_stt[dd] = -ngcell[dd]; - ext_end[dd] = ncell[dd] + ngcell[dd]; + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool b_nlist_map = false; + + if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, + bk_d_type, rcut, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd) { + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, + ncell, ext_stt, ext_end, region, ncell); + } else if (nei_mode == -1) { + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); + } else { + throw deepmd::deepmd_exception("unknow neighbor mode"); } - ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, - ncell, ext_stt, ext_end, region, ncell); - } else if (nei_mode == -1) { - ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); - } else { - throw deepmd::deepmd_exception("unknow neighbor mode"); - } - int MAX_NNEI = 0; - for (int ii = 0; ii < nloc; ii++) { - MAX_NNEI = - MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; - } - // allocate output tensor for deepmd-kit - TensorShape min_nbor_dist_shape; - min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); - Tensor* min_nbor_dist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - min_nbor_dist_shape, - &min_nbor_dist_tensor)); - FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); - for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); - ii++) { - min_nbor_dist[ii] = 10000.0; - } + int MAX_NNEI = 0; + for (int ii = 0; ii < nloc; ii++) { + MAX_NNEI = + MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + } + // allocate output tensor for deepmd-kit + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); + for (int ii = 0; + ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { + min_nbor_dist[ii] = 10000.0; + } #pragma omp parallel for - for (int ii = 0; ii < nloc; ii++) { - if (d_type[ii] < 0) { - continue; // virtual atom - } - for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { - int type = d_type[d_nlist_r[ii][jj]]; - if (type < 0) { + for (int ii = 0; ii < nloc; ii++) { + if (d_type[ii] < 0) { continue; // virtual atom } - max_nbor_size[ii * ntypes + type] += 1; - compute_t rij[3] = { - d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], - d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], - d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; - min_nbor_dist[ii * MAX_NNEI + jj] = - sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + if (type < 0) { + continue; // virtual atom + } + max_nbor_size[ii * ntypes + type] += 1; + compute_t rij[3] = { + d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], + d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], + d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + min_nbor_dist[ii * MAX_NNEI + jj] = + sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } } } } @@ -208,6 +293,7 @@ class NeighborStatOp : public OpKernel { private: int nnei; float rcut; + std::string device; }; #define REGISTER_CPU(T) \ @@ -216,3 +302,14 @@ class NeighborStatOp : public OpKernel { NeighborStatOp); REGISTER_CPU(float); REGISTER_CPU(double); +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("NeighborStat") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("natoms") \ + .HostMemory("box"), \ + NeighborStatOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 47541bc69f..048237e042 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -291,31 +291,31 @@ static void _map_nei_info_gpu(int* nlist, const bool& b_nlist_map); template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1604,31 +1604,31 @@ static void _map_nei_info_gpu(int* nlist, } template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { if (nei_mode != 3 && nei_mode != 4) { inlist.inum = nloc; // build nlist by myself From fd4f62d38885d35307b89469ac49837e3d9c0c2b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 22:22:39 -0400 Subject: [PATCH 02/10] we don't need to calculate the maximum neighbor number of each frame; using a constant is fine Signed-off-by: Jinzhe Zeng --- source/lib/include/neighbor_stat.h | 1 - source/lib/src/gpu/neighbor_stat.cu | 25 ------------------------- source/op/neighbor_stat.cc | 7 ++----- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h index e95eccac96..79e241f783 100644 --- a/source/lib/include/neighbor_stat.h +++ b/source/lib/include/neighbor_stat.h @@ -4,7 +4,6 @@ #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace deepmd { -void get_largest_numnei_gpu(int& numnei, const deepmd::InputNlist& gpu_nlist); template void neighbor_stat_gpu(const FPTYPE* coord, const int* type, diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu index 092571361b..9604ef3f16 100644 --- a/source/lib/src/gpu/neighbor_stat.cu +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -58,31 +58,6 @@ __global__ void neighbor_stat_g(const FPTYPE* coord, namespace deepmd { -void get_largest_numnei_gpu(int& largest_numnei, - const deepmd::InputNlist& gpu_nlist) { - DPErrcheck(gpuGetLastError()); - DPErrcheck(gpuDeviceSynchronize()); - - // TODO: maybe allocate memory using TensorFlow? - int* d_max = NULL; - gpuMalloc(&d_max, sizeof(int)); - - void* d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh, - d_max, gpu_nlist.inum); - gpuMalloc(&d_temp_storage, temp_storage_bytes); - cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, gpu_nlist.numneigh, - d_max, gpu_nlist.inum); - gpuFree(d_temp_storage); - // copy d_max to largest_numnei - gpuMemcpy(&largest_numnei, d_max, sizeof(int), gpuMemcpyDeviceToHost); - gpuFree(d_max); - - DPErrcheck(gpuGetLastError()); - DPErrcheck(gpuDeviceSynchronize()); -} - template void neighbor_stat_gpu(const FPTYPE* coord, const int* type, diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index ee095699e9..8835f00261 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -167,11 +167,8 @@ class NeighborStatOp : public OpKernel { box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); - int MAX_NNEI; - deepmd::get_largest_numnei_gpu(MAX_NNEI, gpu_inlist); - TensorShape min_nbor_dist_shape; - min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); + min_nbor_dist_shape.AddDim(nloc * max_nbor_size_nlist); Tensor* min_nbor_dist_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, min_nbor_dist_shape, @@ -180,7 +177,7 @@ class NeighborStatOp : public OpKernel { deepmd::neighbor_stat_gpu(coord, type, nloc, gpu_inlist, max_nbor_size, min_nbor_dist, ntypes, - MAX_NNEI); + max_nbor_size_nlist); deepmd::delete_device_memory(firstneigh); #endif } else { From ce92bac18b25a06c06c281555d59beccd1c0db82 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 22:36:33 -0400 Subject: [PATCH 03/10] make neighbor atoms in parallel Signed-off-by: Jinzhe Zeng --- source/lib/src/gpu/neighbor_stat.cu | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu index 9604ef3f16..e793a0040e 100644 --- a/source/lib/src/gpu/neighbor_stat.cu +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -12,6 +12,9 @@ namespace cub = hipcub; #include "device.h" #include "neighbor_list.h" +__device__ inline double _sqrt(double x) { return sqrt(x); } +__device__ inline float _sqrt(float x) { return sqrtf(x); } + template __global__ void neighbor_stat_g(const FPTYPE* coord, const int* type, @@ -23,35 +26,34 @@ __global__ void neighbor_stat_g(const FPTYPE* coord, FPTYPE* min_nbor_dist, const int ntypes, const int MAX_NNEI) { - int ii = blockIdx.x * blockDim.x + threadIdx.x; + int ithread = blockIdx.x * blockDim.x + threadIdx.x; + int ii = ithread / MAX_NNEI; + int jj = ithread % MAX_NNEI; if (ii >= nloc) { return; } int idx_i = ilist[ii]; if (type[idx_i] < 0) { // set all to 10000 - for (int jj = 0; jj < MAX_NNEI; jj++) { - min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; - } + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; return; // virtual atom } - for (int jj = 0; jj < numneigh[ii]; jj++) { + if (jj < numneigh[ii]) { int idx_j = firstneigh[ii][jj]; int type_j = type[idx_j]; if (type_j < 0) { min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; - continue; // virtual atom + return; // virtual atom } - ++max_nbor_size[ii * ntypes + type_j]; + atomicAdd(max_nbor_size + ii * ntypes + type_j, 1); FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; // we do not need to use the real index min_nbor_dist[ii * MAX_NNEI + jj] = - sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); - } - // set others to 10000 - for (int jj = numneigh[ii]; jj < MAX_NNEI; ++jj) { + _sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } else { + // set others to 10000 min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; } } @@ -71,7 +73,7 @@ void neighbor_stat_gpu(const FPTYPE* coord, DPErrcheck(gpuDeviceSynchronize()); DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes)); - const int nblock_loc = (nloc + TPB - 1) / TPB; + const int nblock_loc = (nloc * MAX_NNEI + TPB - 1) / TPB; neighbor_stat_g<<>>( coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh, gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI); From 36fb319b231b6d354b55415e9da5ee1caf769374 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 22:41:39 -0400 Subject: [PATCH 04/10] use tf.reduce_min instead of np.min Signed-off-by: Jinzhe Zeng --- deepmd/utils/neighbor_stat.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index a4ac190946..1ba9552ad4 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -82,6 +82,7 @@ def builder(): rcut=self.rcut, ) place_holders["dir"] = tf.placeholder(tf.string) + _min_nbor_dist = tf.reduce_min(_min_nbor_dist) return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"]) with sub_graph.as_default(): @@ -128,10 +129,7 @@ def feed(): } for mn, dt, jj in self.p.generate(self.sub_sess, feed()): - if dt.size != 0: - dt = np.min(dt) - else: - dt = self.rcut + if np.isinf(dt): log.warning( "Atoms with no neighbors found in %s. Please make sure it's what you expected." % jj From 1f44ebcf89f3ab5fa07c5782aa05c7fc05178011 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 23:42:54 -0400 Subject: [PATCH 05/10] use GPU to max Signed-off-by: Jinzhe Zeng --- deepmd/utils/neighbor_stat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 1ba9552ad4..afa75eb9fb 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -83,6 +83,7 @@ def builder(): ) place_holders["dir"] = tf.placeholder(tf.string) _min_nbor_dist = tf.reduce_min(_min_nbor_dist) + _max_nbor_size = tf.reduce_max(_max_nbor_size, axis=0) return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"]) with sub_graph.as_default(): @@ -143,8 +144,7 @@ def feed(): " training data to remove duplicated atoms." % jj ) self.min_nbor_dist = dt - var = np.max(mn, axis=0) - self.max_nbor_size = np.maximum(var, self.max_nbor_size) + self.max_nbor_size = np.maximum(mn, self.max_nbor_size) log.info("training data with min nbor dist: " + str(self.min_nbor_dist)) log.info("training data with max nbor size: " + str(self.max_nbor_size)) From a730226ef4d4057f846ecf1623d88e043b956c2d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 3 Oct 2023 23:54:44 -0400 Subject: [PATCH 06/10] use mem_nnei; make several variable to be class attr Signed-off-by: Jinzhe Zeng --- source/op/neighbor_stat.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index 8835f00261..890c683ffd 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -24,6 +24,11 @@ class NeighborStatOp : public OpKernel { public: explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + max_nbor_size_nlist = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; } void Compute(OpKernelContext* context) override { @@ -103,12 +108,6 @@ class NeighborStatOp : public OpKernel { int* max_nbor_size = max_nbor_size_tensor->flat().data(); if (device == "GPU") { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - int max_nbor_size_nlist = 1024; - int max_cpy_trial = 100; - int mem_cpy = 256; - int max_nnei_trial = 100; - int mem_nnei = 256; - std::vector tensor_list(7); if (nei_mode == 1) { // Tensor FPTYPE_temp; @@ -168,7 +167,7 @@ class NeighborStatOp : public OpKernel { rcut, max_cpy_trial, max_nnei_trial); TensorShape min_nbor_dist_shape; - min_nbor_dist_shape.AddDim(nloc * max_nbor_size_nlist); + min_nbor_dist_shape.AddDim(nloc * mem_nnei); Tensor* min_nbor_dist_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, min_nbor_dist_shape, @@ -177,7 +176,7 @@ class NeighborStatOp : public OpKernel { deepmd::neighbor_stat_gpu(coord, type, nloc, gpu_inlist, max_nbor_size, min_nbor_dist, ntypes, - max_nbor_size_nlist); + mem_nnei); deepmd::delete_device_memory(firstneigh); #endif } else { @@ -291,6 +290,7 @@ class NeighborStatOp : public OpKernel { int nnei; float rcut; std::string device; + int max_nbor_size_nlist, max_cpy_trial, mem_cpy, max_nnei_trial, mem_nnei; }; #define REGISTER_CPU(T) \ From b92f0b62dbe9455423d243159d0a41745995e5f8 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 4 Oct 2023 00:31:45 -0400 Subject: [PATCH 07/10] use shared memory Signed-off-by: Jinzhe Zeng --- source/lib/src/gpu/neighbor_stat.cu | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu index e793a0040e..c0ac265bf0 100644 --- a/source/lib/src/gpu/neighbor_stat.cu +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -29,6 +29,9 @@ __global__ void neighbor_stat_g(const FPTYPE* coord, int ithread = blockIdx.x * blockDim.x + threadIdx.x; int ii = ithread / MAX_NNEI; int jj = ithread % MAX_NNEI; + // assume the same block has the same ii + __shared__ int cache[TPB]; + cache[threadIdx.x] = 0; if (ii >= nloc) { return; } @@ -45,7 +48,15 @@ __global__ void neighbor_stat_g(const FPTYPE* coord, min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; return; // virtual atom } - atomicAdd(max_nbor_size + ii * ntypes + type_j, 1); + // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1); + // See https://www.cnblogs.com/neopenx/p/4705320.html + __syncthreads(); + atomicAdd(&cache[type_j], 1); + __syncthreads(); + if (threadIdx.x < ntypes) { + atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]); + } + FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; From 5bbc62e45136afc1e9263f9eac888bb7858524aa Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 4 Oct 2023 01:23:41 -0400 Subject: [PATCH 08/10] do not sqrt for every dist; instead do it in the final Signed-off-by: Jinzhe Zeng --- deepmd/utils/neighbor_stat.py | 2 ++ source/lib/src/gpu/neighbor_stat.cu | 18 ++++++++++-------- source/op/neighbor_stat.cc | 4 +++- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index afa75eb9fb..fa9325937e 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -146,6 +146,8 @@ def feed(): self.min_nbor_dist = dt self.max_nbor_size = np.maximum(mn, self.max_nbor_size) + # do sqrt in the final + self.min_nbor_dist = math.sqrt(self.min_nbor_dist) log.info("training data with min nbor dist: " + str(self.min_nbor_dist)) log.info("training data with max nbor size: " + str(self.max_nbor_size)) return self.min_nbor_dist, self.max_nbor_size diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu index c0ac265bf0..c9f87a51f0 100644 --- a/source/lib/src/gpu/neighbor_stat.cu +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -48,21 +48,23 @@ __global__ void neighbor_stat_g(const FPTYPE* coord, min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; return; // virtual atom } + __syncthreads(); + FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], + coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], + coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; + // we do not need to use the real index + // we do not need to do slow sqrt for every dist; instead do sqrt in the + // final + min_nbor_dist[ii * MAX_NNEI + jj] = + rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]; + // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1); // See https://www.cnblogs.com/neopenx/p/4705320.html - __syncthreads(); atomicAdd(&cache[type_j], 1); __syncthreads(); if (threadIdx.x < ntypes) { atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]); } - - FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], - coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], - coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; - // we do not need to use the real index - min_nbor_dist[ii * MAX_NNEI + jj] = - _sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); } else { // set others to 10000 min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index 890c683ffd..d917c60a5f 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -279,8 +279,10 @@ class NeighborStatOp : public OpKernel { d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + // we do not need to do slow sqrt for every dist; instead do sqrt in + // the final step min_nbor_dist[ii * MAX_NNEI + jj] = - sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]; } } } From 685e34cda2c1bf597d1d4293a96dd19b5b4ea37f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 4 Oct 2023 01:54:35 -0400 Subject: [PATCH 09/10] revert gpu_cuda/rocm.h Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 2 -- source/lib/include/gpu_rocm.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 375be14e85..fb467674cb 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -16,8 +16,6 @@ #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemset cudaMemset -#define gpuMalloc cudaMalloc -#define gpuFree cudaFree #define GPU_MAX_NBOR_SIZE 4096 #define DPErrcheck(res) \ diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 86aa719673..fbd5e1ce3f 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -19,8 +19,6 @@ #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemset hipMemset -#define gpuMalloc hipMalloc -#define gpuFree cudaFree #define DPErrcheck(res) \ { DPAssert((res), __FILE__, __LINE__); } From 0ce33766364110e984be210acd74b77e80046fac Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 4 Oct 2023 01:56:12 -0400 Subject: [PATCH 10/10] clean unused codes Signed-off-by: Jinzhe Zeng --- source/lib/src/gpu/neighbor_stat.cu | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu index c9f87a51f0..ef7d3c5f8a 100644 --- a/source/lib/src/gpu/neighbor_stat.cu +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -1,20 +1,8 @@ #include -#if GOOGLE_CUDA -#include -#elif TENSORFLOW_USE_ROCM -#include -namespace cub = hipcub; -#else -#error "should not touch here" -#endif - #include "device.h" #include "neighbor_list.h" -__device__ inline double _sqrt(double x) { return sqrt(x); } -__device__ inline float _sqrt(float x) { return sqrtf(x); } - template __global__ void neighbor_stat_g(const FPTYPE* coord, const int* type,