diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index a4ac190946..fa9325937e 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -82,6 +82,8 @@ def builder(): rcut=self.rcut, ) place_holders["dir"] = tf.placeholder(tf.string) + _min_nbor_dist = tf.reduce_min(_min_nbor_dist) + _max_nbor_size = tf.reduce_max(_max_nbor_size, axis=0) return place_holders, (_max_nbor_size, _min_nbor_dist, place_holders["dir"]) with sub_graph.as_default(): @@ -128,10 +130,7 @@ def feed(): } for mn, dt, jj in self.p.generate(self.sub_sess, feed()): - if dt.size != 0: - dt = np.min(dt) - else: - dt = self.rcut + if np.isinf(dt): log.warning( "Atoms with no neighbors found in %s. Please make sure it's what you expected." % jj @@ -145,9 +144,10 @@ def feed(): " training data to remove duplicated atoms." % jj ) self.min_nbor_dist = dt - var = np.max(mn, axis=0) - self.max_nbor_size = np.maximum(var, self.max_nbor_size) + self.max_nbor_size = np.maximum(mn, self.max_nbor_size) + # do sqrt in the final + self.min_nbor_dist = math.sqrt(self.min_nbor_dist) log.info("training data with min nbor dist: " + str(self.min_nbor_dist)) log.info("training data with max nbor size: " + str(self.max_nbor_size)) return self.min_nbor_dist, self.max_nbor_size diff --git a/source/lib/include/neighbor_stat.h b/source/lib/include/neighbor_stat.h new file mode 100644 index 0000000000..79e241f783 --- /dev/null +++ b/source/lib/include/neighbor_stat.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +#include "neighbor_list.h" + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +namespace deepmd { +template +void neighbor_stat_gpu(const FPTYPE* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); +} // namespace deepmd + +#endif diff --git a/source/lib/src/gpu/neighbor_stat.cu b/source/lib/src/gpu/neighbor_stat.cu new file mode 100644 index 0000000000..ef7d3c5f8a --- /dev/null +++ b/source/lib/src/gpu/neighbor_stat.cu @@ -0,0 +1,103 @@ +#include + +#include "device.h" +#include "neighbor_list.h" + +template +__global__ void neighbor_stat_g(const FPTYPE* coord, + const int* type, + const int nloc, + const int* ilist, + int** firstneigh, + const int* numneigh, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI) { + int ithread = blockIdx.x * blockDim.x + threadIdx.x; + int ii = ithread / MAX_NNEI; + int jj = ithread % MAX_NNEI; + // assume the same block has the same ii + __shared__ int cache[TPB]; + cache[threadIdx.x] = 0; + if (ii >= nloc) { + return; + } + int idx_i = ilist[ii]; + if (type[idx_i] < 0) { + // set all to 10000 + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + return; // virtual atom + } + if (jj < numneigh[ii]) { + int idx_j = firstneigh[ii][jj]; + int type_j = type[idx_j]; + if (type_j < 0) { + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + return; // virtual atom + } + __syncthreads(); + FPTYPE rij[3] = {coord[idx_j * 3 + 0] - coord[idx_i * 3 + 0], + coord[idx_j * 3 + 1] - coord[idx_i * 3 + 1], + coord[idx_j * 3 + 2] - coord[idx_i * 3 + 2]}; + // we do not need to use the real index + // we do not need to do slow sqrt for every dist; instead do sqrt in the + // final + min_nbor_dist[ii * MAX_NNEI + jj] = + rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]; + + // atomicAdd(max_nbor_size + ii * ntypes + type_j, 1); + // See https://www.cnblogs.com/neopenx/p/4705320.html + atomicAdd(&cache[type_j], 1); + __syncthreads(); + if (threadIdx.x < ntypes) { + atomicAdd(&max_nbor_size[ii * ntypes + threadIdx.x], cache[threadIdx.x]); + } + } else { + // set others to 10000 + min_nbor_dist[ii * MAX_NNEI + jj] = INFINITY; + } +} + +namespace deepmd { + +template +void neighbor_stat_gpu(const FPTYPE* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + FPTYPE* min_nbor_dist, + const int ntypes, + const int MAX_NNEI) { + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + + DPErrcheck(gpuMemset(max_nbor_size, 0, sizeof(int) * int_64(nloc) * ntypes)); + const int nblock_loc = (nloc * MAX_NNEI + TPB - 1) / TPB; + neighbor_stat_g<<>>( + coord, type, nloc, gpu_nlist.ilist, gpu_nlist.firstneigh, + gpu_nlist.numneigh, max_nbor_size, min_nbor_dist, ntypes, MAX_NNEI); + + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); +} + +template void neighbor_stat_gpu(const float* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + float* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); + +template void neighbor_stat_gpu(const double* coord, + const int* type, + const int nloc, + const deepmd::InputNlist& gpu_nlist, + int* max_nbor_size, + double* min_nbor_dist, + const int ntypes, + const int MAX_NNEI); +} // namespace deepmd diff --git a/source/op/custom_op.h b/source/op/custom_op.h index b2fb290f63..baf95f3fa3 100644 --- a/source/op/custom_op.h +++ b/source/op/custom_op.h @@ -5,6 +5,7 @@ #include #include "device.h" +#include "neighbor_list.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" @@ -25,3 +26,30 @@ namespace deepmd { void safe_compute(OpKernelContext* context, std::function ff); }; + +template +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index bb368cf65e..d917c60a5f 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -1,4 +1,6 @@ // SPDX-License-Identifier: LGPL-3.0-or-later +#include "neighbor_stat.h" + #include "custom_op.h" #include "errors.h" #include "neighbor_list.h" @@ -22,6 +24,11 @@ class NeighborStatOp : public OpKernel { public: explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + max_nbor_size_nlist = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; } void Compute(OpKernelContext* context) override { @@ -66,7 +73,7 @@ class NeighborStatOp : public OpKernel { errors::InvalidArgument("number of atoms should match")); OP_REQUIRES(context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument("number of box should be 9")); - + DeviceFunctor()(device, context->eigen_device()); int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 6 || mesh_tensor.shape().dim_size(0) == 7) { @@ -99,108 +106,184 @@ class NeighborStatOp : public OpKernel { const FPTYPE* box = box_tensor.flat().data(); const int* mesh = mesh_tensor.flat().data(); int* max_nbor_size = max_nbor_size_tensor->flat().data(); + if (device == "GPU") { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + std::vector tensor_list(7); + if (nei_mode == 1) { + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + FPTYPE_shape, &tensor_list[0])); - for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); - ii++) { - max_nbor_size[ii] = 0; - } + // Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + double_shape, &tensor_list[1])); + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &tensor_list[3])); + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &tensor_list[4])); + } - // set region - boxtensor_t boxt[9] = {0}; - for (int dd = 0; dd < 9; ++dd) { - boxt[dd] = box[dd]; - } - SimulationRegion region; - region.reinitBox(boxt); - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + // Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, + &tensor_list[5])); + + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &tensor_list[6])); + + int* idx_mapping = NULL; + int *ilist = NULL, *numneigh = NULL; + int** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int* jlist = NULL; + FPTYPE* coord_cpy; + int* type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + deepmd::InputNlist gpu_inlist; + int* nbor_list_dev = NULL; + // prepare coord and nlist + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, + rcut, max_cpy_trial, max_nnei_trial); + + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim(nloc * mem_nnei); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); + + deepmd::neighbor_stat_gpu(coord, type, nloc, gpu_inlist, + max_nbor_size, min_nbor_dist, ntypes, + mem_nnei); + deepmd::delete_device_memory(firstneigh); +#endif + } else { + for (int ii = 0; + ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { + max_nbor_size[ii] = 0; + } + + // set region + boxtensor_t boxt[9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box[dd]; } - if (b_norm_atom) { - compute_t inter[3]; - region.phys2Inter(inter, &d_coord3[3 * ii]); + SimulationRegion region; + region.reinitBox(boxt); + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { for (int dd = 0; dd < 3; ++dd) { - if (inter[dd] < 0) { - inter[dd] += 1.; - } else if (inter[dd] >= 1) { - inter[dd] -= 1.; + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + if (b_norm_atom) { + compute_t inter[3]; + region.phys2Inter(inter, &d_coord3[3 * ii]); + for (int dd = 0; dd < 3; ++dd) { + if (inter[dd] < 0) { + inter[dd] += 1.; + } else if (inter[dd] >= 1) { + inter[dd] -= 1.; + } } + region.inter2Phys(&d_coord3[3 * ii], inter); } - region.inter2Phys(&d_coord3[3 * ii], inter); } - } - // set type - std::vector d_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_type[ii] = type[ii]; - } + // set type + std::vector d_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool b_nlist_map = false; - // build nlist - std::vector > d_nlist_a; - std::vector > d_nlist_r; - std::vector nlist_map; - bool b_nlist_map = false; - - if (nei_mode == 1) { - // std::cout << "I'm in nei_mode 1" << std::endl; - std::vector bk_d_coord3 = d_coord3; - std::vector bk_d_type = d_type; - std::vector ncell, ngcell; - copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, - bk_d_type, rcut, region); - b_nlist_map = true; - std::vector nat_stt(3, 0); - std::vector ext_stt(3), ext_end(3); - for (int dd = 0; dd < 3; ++dd) { - ext_stt[dd] = -ngcell[dd]; - ext_end[dd] = ncell[dd] + ngcell[dd]; + if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, + bk_d_type, rcut, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd) { + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, + ncell, ext_stt, ext_end, region, ncell); + } else if (nei_mode == -1) { + ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); + } else { + throw deepmd::deepmd_exception("unknow neighbor mode"); } - ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, - ncell, ext_stt, ext_end, region, ncell); - } else if (nei_mode == -1) { - ::build_nlist(d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); - } else { - throw deepmd::deepmd_exception("unknow neighbor mode"); - } - int MAX_NNEI = 0; - for (int ii = 0; ii < nloc; ii++) { - MAX_NNEI = - MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; - } - // allocate output tensor for deepmd-kit - TensorShape min_nbor_dist_shape; - min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); - Tensor* min_nbor_dist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - min_nbor_dist_shape, - &min_nbor_dist_tensor)); - FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); - for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); - ii++) { - min_nbor_dist[ii] = 10000.0; - } + int MAX_NNEI = 0; + for (int ii = 0; ii < nloc; ii++) { + MAX_NNEI = + MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + } + // allocate output tensor for deepmd-kit + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim(nloc * MAX_NNEI); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor->flat().data(); + for (int ii = 0; + ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { + min_nbor_dist[ii] = 10000.0; + } #pragma omp parallel for - for (int ii = 0; ii < nloc; ii++) { - if (d_type[ii] < 0) { - continue; // virtual atom - } - for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { - int type = d_type[d_nlist_r[ii][jj]]; - if (type < 0) { + for (int ii = 0; ii < nloc; ii++) { + if (d_type[ii] < 0) { continue; // virtual atom } - max_nbor_size[ii * ntypes + type] += 1; - compute_t rij[3] = { - d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], - d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], - d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; - min_nbor_dist[ii * MAX_NNEI + jj] = - sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + if (type < 0) { + continue; // virtual atom + } + max_nbor_size[ii * ntypes + type] += 1; + compute_t rij[3] = { + d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], + d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], + d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + // we do not need to do slow sqrt for every dist; instead do sqrt in + // the final step + min_nbor_dist[ii * MAX_NNEI + jj] = + rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]; + } } } } @@ -208,6 +291,8 @@ class NeighborStatOp : public OpKernel { private: int nnei; float rcut; + std::string device; + int max_nbor_size_nlist, max_cpy_trial, mem_cpy, max_nnei_trial, mem_nnei; }; #define REGISTER_CPU(T) \ @@ -216,3 +301,14 @@ class NeighborStatOp : public OpKernel { NeighborStatOp); REGISTER_CPU(float); REGISTER_CPU(double); +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("NeighborStat") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("natoms") \ + .HostMemory("box"), \ + NeighborStatOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 47541bc69f..048237e042 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -291,31 +291,31 @@ static void _map_nei_info_gpu(int* nlist, const bool& b_nlist_map); template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1604,31 +1604,31 @@ static void _map_nei_info_gpu(int* nlist, } template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { +void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { if (nei_mode != 3 && nei_mode != 4) { inlist.inum = nloc; // build nlist by myself