From 048376baa460d0082c3fbfef1a31c244d3f1a727 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 23 Jun 2026 20:38:39 +0800 Subject: [PATCH 1/5] feat(core): batch prod env mat over frames --- source/lib/include/fmt_nlist.h | 17 + source/lib/include/neighbor_list.h | 8 +- source/lib/include/prod_env_mat.h | 94 ++ source/lib/src/gpu/neighbor_list.cu | 54 +- source/lib/src/gpu/prod_env_mat.cu | 209 ++-- source/lib/src/neighbor_list.cc | 28 +- source/lib/src/prod_env_mat.cc | 290 +++-- source/lib/tests/test_env_mat_a.cc | 235 +++- source/lib/tests/test_env_mat_a_mix.cc | 251 +++- source/lib/tests/test_env_mat_r.cc | 134 ++- source/lib/tests/test_fmt_nlist.cc | 4 +- source/lib/tests/test_neighbor_list.cc | 113 ++ source/op/tf/custom_op.h | 50 +- source/op/tf/neighbor_stat.cc | 27 +- source/op/tf/prod_env_mat_multi_device.cc | 1293 +++++++++++++++++++-- 15 files changed, 2401 insertions(+), 406 deletions(-) diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 18cb319304..60138bf54a 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -29,9 +29,26 @@ void format_nbor_list_gpu(int* nlist, const int max_nbor_size, const int nloc, const int nall, + const int nframes, const float rcut, const std::vector sec); +template +inline void format_nbor_list_gpu(int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) { + format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, 1, rcut, sec); +} + template void test_encoding_decoding_nbor_info_gpu(uint_64* key, int* out_type, diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index 39682bcd9a..f7c1ddde10 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -159,7 +159,9 @@ int build_nlist_cpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes = 1, + const int* type = nullptr); void use_nei_info_cpu(int* nlist, int* ntype, @@ -224,7 +226,9 @@ int build_nlist_gpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes = 1, + const int* type = NULL); /** * @brief Filter the fake atom type. diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index d8ca4d1861..a64605b8e9 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -20,11 +20,34 @@ void prod_env_mat_a_cpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, const int* f_type = NULL); +template +inline void prod_env_mat_a_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type = NULL) { + prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, type, inlist, + max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth, + sec, f_type); +} + template void prod_env_mat_r_cpu(FPTYPE* em, FPTYPE* em_deriv, @@ -38,10 +61,32 @@ void prod_env_mat_r_cpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); +template +inline void prod_env_mat_r_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) { + prod_env_mat_r_cpu(em, em_deriv, rij, nlist, coord, type, inlist, + max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth, + sec); +} + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_env_mat_a_gpu(FPTYPE* em, @@ -58,11 +103,36 @@ void prod_env_mat_a_gpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, const int* f_type = NULL); +template +inline void prod_env_mat_a_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type = NULL) { + prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist, + array_int, array_longlong, max_nbor_size, avg, std, nloc, + nall, 1, rcut, rcut_smth, sec, f_type); +} + template void prod_env_mat_r_gpu(FPTYPE* em, FPTYPE* em_deriv, @@ -78,10 +148,34 @@ void prod_env_mat_r_gpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); +template +inline void prod_env_mat_r_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) { + prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist, + array_int, array_longlong, max_nbor_size, avg, std, nloc, + nall, 1, rcut, rcut_smth, sec); +} + void env_mat_nbor_update(InputNlist& inlist, InputNlist& gpu_inlist, int& max_nbor_size, diff --git a/source/lib/src/gpu/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu index 8f38017d4c..c9352a8c7b 100644 --- a/source/lib/src/gpu/neighbor_list.cu +++ b/source/lib/src/gpu/neighbor_list.cu @@ -78,19 +78,28 @@ template __global__ void build_nlist(int* ilist, int* temp_nlist, const FPTYPE* c_cpy, + const int* type, const FPTYPE rcut2, const int nloc, const int nall, const int mem_size) { - const unsigned int atom_idx = blockIdx.x; + const unsigned int row_idx = blockIdx.x; + const unsigned int atom_idx = row_idx % nloc; + const unsigned int frame_idx = row_idx / nloc; const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; if (neighbor_idx < nall) { - int* neighbor_row = temp_nlist + atom_idx * mem_size; + int* neighbor_row = temp_nlist + row_idx * mem_size; if (neighbor_idx == atom_idx) { - ilist[atom_idx] = atom_idx; + ilist[row_idx] = atom_idx; } else { - const FPTYPE* ccoord = c_cpy + atom_idx * 3; - const FPTYPE* ncoord = c_cpy + neighbor_idx * 3; + const int_64 frame_offset = int_64(frame_idx) * nall; + if (type != NULL && (type[frame_offset + atom_idx] < 0 || + type[frame_offset + neighbor_idx] < 0)) { + return; + } + const FPTYPE* frame_coord = c_cpy + int_64(frame_idx) * nall * 3; + const FPTYPE* ccoord = frame_coord + atom_idx * 3; + const FPTYPE* ncoord = frame_coord + neighbor_idx * 3; FPTYPE diff[3]; for (int kk = 0; kk < 3; kk++) { diff[kk] = ccoord[kk] - ncoord[kk]; @@ -193,41 +202,44 @@ int build_nlist_gpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut) { + const float& rcut, + const int& nframes, + const int* type) { if (mem_size < nall) { return 1; } DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int nblock = (nall + TPB - 1) / TPB; + const int_64 nrows = int_64(nframes) * nloc; int* ilist = nlist.ilist; int* numneigh = nlist.numneigh; int** firstneigh = nlist.firstneigh; - DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); - int* temp_nlist = nlist_data; // nloc*mem_size - int* nei_order = temp_nlist + nloc * mem_size; - nlist.inum = nloc; + DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nrows * mem_size)); + int* temp_nlist = nlist_data; // nrows*mem_size + int* nei_order = temp_nlist + nrows * mem_size; + nlist.inum = nrows; FPTYPE rcut2 = rcut * rcut; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nrows, nblock); dim3 thread_grid(1, TPB); - build_nlist<<>>(ilist, temp_nlist, c_cpy, rcut2, - nloc, nall, mem_size); + build_nlist<<>>(ilist, temp_nlist, c_cpy, type, + rcut2, nloc, nall, mem_size); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); parallel_prefix_scan - <<>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall); + <<>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); fill_nlist<<>>(firstneigh, temp_nlist, nei_order, mem_size, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); - int* numneigh_host = new int[nloc]; - DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, + int* numneigh_host = new int[nrows]; + DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nrows, gpuMemcpyDeviceToHost)); int max_nei = 0; - for (int ii = 0; ii < nloc; ii++) { + for (int_64 ii = 0; ii < nrows; ii++) { if (numneigh_host[ii] > max_nei) { max_nei = numneigh_host[ii]; } @@ -285,7 +297,9 @@ template int build_nlist_gpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes, + const int* type); template int build_nlist_gpu(InputNlist& nlist, int* max_list_size, int* nlist_data, @@ -293,7 +307,9 @@ template int build_nlist_gpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes, + const int* type); __global__ void map_filter_ftype(int* ftype_out, const int* ftype_in, diff --git a/source/lib/src/gpu/prod_env_mat.cu b/source/lib/src/gpu/prod_env_mat.cu index e8909edb44..6e8fc1df85 100644 --- a/source/lib/src/gpu/prod_env_mat.cu +++ b/source/lib/src/gpu/prod_env_mat.cu @@ -110,13 +110,16 @@ __device__ inline void decoding_nbor_info(int& type, index = key & 0xFFFFFF; } -template -__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) { +__global__ void get_i_idx(int* i_idx, + const int nloc, + const int nframes, + const int* ilist) { const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= nloc) { + if (idx >= nframes * nloc) { return; } - i_idx[ilist[idx]] = idx; + const int frame_idx = idx / nloc; + i_idx[frame_idx * nloc + ilist[idx]] = idx; } template @@ -127,30 +130,37 @@ __global__ void format_nlist_fill_a(uint_64* key, int** firstneigh, const float rcut, int* i_idx, - const int MAX_NBOR_SIZE) { - // <<>> + const int MAX_NBOR_SIZE, + const int nloc, + const int nall) { + // <<>> const int_64 idx = blockIdx.x; const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const int frame_idx = idx / nloc; + const int atom_idx = idx % nloc; + const int nlist_row = i_idx[idx]; - const int nsize = numneigh[i_idx[idx]]; + const int nsize = numneigh[nlist_row]; if (idy >= nsize) { return; } - const int* nei_idx = firstneigh[i_idx[idx]]; + const int* nei_idx = firstneigh[nlist_row]; // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); uint_64* key_in = key + idx * MAX_NBOR_SIZE; FPTYPE diff[3]; const int& j_idx = nei_idx[idy]; - if (type[j_idx] < 0) { + const int_64 j_idx_nall = int_64(frame_idx) * nall + j_idx; + if (type[j_idx_nall] < 0) { return; } + const int_64 i_idx_nall = int_64(frame_idx) * nall + atom_idx; for (int dd = 0; dd < 3; dd++) { - diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + diff[dd] = coord[j_idx_nall * 3 + dd] - coord[i_idx_nall * 3 + dd]; } FPTYPE rr = _sqrt(dev_dot(diff, diff)); if (rr <= rcut) { - key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); + key_in[idy] = encoding_nbor_info(type[j_idx_nall], rr, j_idx); } } @@ -225,16 +235,18 @@ void format_nbor_list_256(uint_64* key, const int* type, const deepmd::InputNlist& gpu_inlist, const int& nloc, + const int& nall, + const int& nframes, const float& rcut, int* i_idx) { const int LEN = 256; const int MAX_NBOR_SIZE = 256; const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, - MAX_NBOR_SIZE); + MAX_NBOR_SIZE, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 4; @@ -242,7 +254,8 @@ void format_nbor_list_256(uint_64* key, // BlockSortKernel<<>> ( BlockSortKernel - <<>>(key, key + nloc * MAX_NBOR_SIZE); + <<>>( + key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -253,16 +266,18 @@ void format_nbor_list_512(uint_64* key, const int* type, const deepmd::InputNlist& gpu_inlist, const int& nloc, + const int& nall, + const int& nframes, const float& rcut, int* i_idx) { const int LEN = 256; const int MAX_NBOR_SIZE = 512; const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, - MAX_NBOR_SIZE); + MAX_NBOR_SIZE, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 4; @@ -270,7 +285,8 @@ void format_nbor_list_512(uint_64* key, // BlockSortKernel<<>> ( BlockSortKernel - <<>>(key, key + nloc * MAX_NBOR_SIZE); + <<>>( + key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -281,16 +297,18 @@ void format_nbor_list_1024(uint_64* key, const int* type, const deepmd::InputNlist& gpu_inlist, const int& nloc, + const int& nall, + const int& nframes, const float& rcut, int* i_idx) { const int LEN = 256; const int MAX_NBOR_SIZE = 1024; const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, - MAX_NBOR_SIZE); + MAX_NBOR_SIZE, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; @@ -298,7 +316,8 @@ void format_nbor_list_1024(uint_64* key, // BlockSortKernel<<>> ( BlockSortKernel - <<>>(key, key + nloc * MAX_NBOR_SIZE); + <<>>( + key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -309,16 +328,18 @@ void format_nbor_list_2048(uint_64* key, const int* type, const deepmd::InputNlist& gpu_inlist, const int& nloc, + const int& nall, + const int& nframes, const float& rcut, int* i_idx) { const int LEN = 256; const int MAX_NBOR_SIZE = 2048; const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, - MAX_NBOR_SIZE); + MAX_NBOR_SIZE, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; @@ -326,7 +347,8 @@ void format_nbor_list_2048(uint_64* key, // BlockSortKernel<<>> ( BlockSortKernel - <<>>(key, key + nloc * MAX_NBOR_SIZE); + <<>>( + key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -337,16 +359,18 @@ void format_nbor_list_4096(uint_64* key, const int* type, const deepmd::InputNlist& gpu_inlist, const int& nloc, + const int& nall, + const int& nframes, const float& rcut, int* i_idx) { const int LEN = 256; const int MAX_NBOR_SIZE = 4096; const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, - MAX_NBOR_SIZE); + MAX_NBOR_SIZE, nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; @@ -354,7 +378,8 @@ void format_nbor_list_4096(uint_64* key, // BlockSortKernel<<>> ( BlockSortKernel - <<>>(key, key + nloc * MAX_NBOR_SIZE); + <<>>( + key, key + int_64(nframes) * nloc * MAX_NBOR_SIZE); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -370,11 +395,16 @@ __global__ void compute_env_mat_a(FPTYPE* em, const int* nlist, const int nnei, const float rmin, - const float rmax) { - // <<>> + const float rmax, + const int nloc, + const int nall) { + // <<>> const int_64 bid = blockIdx.x; const unsigned int tid = threadIdx.x; - if (type[bid] < 0) { + const int_64 frame_idx = bid / nloc; + const int_64 atom_idx = bid % nloc; + const int_64 i_idx_nall = frame_idx * nall + atom_idx; + if (type[i_idx_nall] < 0) { return; } if (tid >= nnei) { @@ -392,9 +422,9 @@ __global__ void compute_env_mat_a(FPTYPE* em, FPTYPE rr[3] = {(FPTYPE)0.}; FPTYPE dd[4] = {(FPTYPE)0.}; FPTYPE vv[12] = {(FPTYPE)0.}; - const int j_idx = row_nlist[ii]; + const int_64 j_idx = frame_idx * nall + row_nlist[ii]; for (int kk = 0; kk < 3; kk++) { - rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + rr[kk] = coord[j_idx * 3 + kk] - coord[i_idx_nall * 3 + kk]; row_rij[ii * 3 + kk] = rr[kk]; } // const FPTYPE * rr = &row_rij[ii * 3]; @@ -478,16 +508,19 @@ __global__ void compute_env_mat_a(FPTYPE* em, // idx_value + 3]; for (int ii = 0; ii < 12; ii++) { row_descript_deriv[idx_deriv + ii] = - vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + vv[ii] / std[type[i_idx_nall] * ndescrpt + idx_value + ii / 3]; } for (int ii = 0; ii < 4; ii++) { row_descript[idx_value + ii] = - (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / - std[type[bid] * ndescrpt + idx_value + ii]; + (dd[ii] - avg[type[i_idx_nall] * ndescrpt + idx_value + ii]) / + std[type[i_idx_nall] * ndescrpt + idx_value + ii]; } } else { - row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / - std[type[bid] * ndescrpt + idx_value]; + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] -= + avg[type[i_idx_nall] * ndescrpt + idx_value + ii] / + std[type[i_idx_nall] * ndescrpt + idx_value + ii]; + } } } } @@ -503,13 +536,21 @@ __global__ void compute_env_mat_r(FPTYPE* em, const int* nlist, const int nnei, const float rmin, - const float rmax) { - // <<>> + const float rmax, + const int nloc, + const int nall) { + // <<>> const int_64 bid = blockIdx.x; const unsigned int tid = threadIdx.x; if (tid >= nnei) { return; } + const int_64 frame_idx = bid / nloc; + const int_64 atom_idx = bid % nloc; + const int_64 i_idx_nall = frame_idx * nall + atom_idx; + if (type[i_idx_nall] < 0) { + return; + } const int ndescrpt = nnei; const int* row_nlist = nlist + bid * nnei; FPTYPE* row_rij = rij + bid * nnei * 3; @@ -522,9 +563,9 @@ __global__ void compute_env_mat_r(FPTYPE* em, FPTYPE rr[3] = {0}; FPTYPE vv[3] = {0}; FPTYPE dd = 0; - const int& j_idx = row_nlist[ii]; + const int_64 j_idx = frame_idx * nall + row_nlist[ii]; for (int kk = 0; kk < 3; kk++) { - rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + rr[kk] = coord[j_idx * 3 + kk] - coord[i_idx_nall * 3 + kk]; row_rij[ii * 3 + kk] = rr[kk]; } // const FPTYPE * rr = &row_rij[ii * 3]; @@ -556,13 +597,13 @@ __global__ void compute_env_mat_r(FPTYPE* em, // idx_value + 0]; for (int ii = 0; ii < 3; ii++) { row_em_deriv[idx_deriv + ii] = - vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + vv[ii] / std[type[i_idx_nall] * ndescrpt + idx_value + ii / 3]; } - row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / - std[type[bid] * ndescrpt + idx_value]; + row_em[idx_value] = (dd - avg[type[i_idx_nall] * ndescrpt + idx_value]) / + std[type[i_idx_nall] * ndescrpt + idx_value]; } else { - row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / - std[type[bid] * ndescrpt + idx_value]; + row_em[idx_value] -= avg[type[i_idx_nall] * ndescrpt + idx_value] / + std[type[i_idx_nall] * ndescrpt + idx_value]; } } } @@ -578,47 +619,55 @@ void format_nbor_list_gpu(int* nlist, const int max_nbor_size, const int nloc, const int nall, + const int nframes, const float rcut, const std::vector sec) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int LEN = 256; const int nnei = sec.back(); - const int nblock = (nloc + LEN - 1) / LEN; + const int nblock = (int_64(nframes) * nloc + LEN - 1) / LEN; int* sec_dev = array_int; int* nei_iter = array_int + sec.size(); // = new int[sec_size]; - int* i_idx = array_int + sec.size() + nloc * sec.size(); + int* i_idx = array_int + sec.size() + int_64(nframes) * nloc * sec.size(); uint_64* key = array_longlong; assert(max_nbor_size == 256 || max_nbor_size == 512 || max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei)); - DPErrcheck(gpuMemset(key, 0xffffffff, - sizeof(uint_64) * int_64(nloc) * max_nbor_size)); + DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nframes) * nloc * nnei)); + DPErrcheck( + gpuMemset(key, 0xffffffff, + sizeof(uint_64) * int_64(nframes) * nloc * max_nbor_size)); DPErrcheck(gpuMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), gpuMemcpyHostToDevice)); - get_i_idx<<>>(i_idx, nloc, gpu_inlist.ilist); + get_i_idx<<>>(i_idx, nloc, nframes, gpu_inlist.ilist); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); if (max_nbor_size == 256) { - format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + format_nbor_list_256(key, coord, type, gpu_inlist, nloc, nall, nframes, + rcut, i_idx); } else if (max_nbor_size == 512) { - format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + format_nbor_list_512(key, coord, type, gpu_inlist, nloc, nall, nframes, + rcut, i_idx); } else if (max_nbor_size == 1024) { - format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, nall, nframes, + rcut, i_idx); } else if (max_nbor_size == 2048) { - format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, nall, nframes, + rcut, i_idx); } else if (max_nbor_size == 4096) { - format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx); + format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, nall, nframes, + rcut, i_idx); } - fill_nei_iter<<>>( - nei_iter, key, nloc, max_nbor_size, sec.size()); + const int_64 nrows = int_64(nframes) * nloc; + fill_nei_iter<<>>( + nei_iter, key, nrows, max_nbor_size, sec.size()); - format_nlist_fill_b<<>>( - nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + format_nlist_fill_b<<>>( + nlist, nnei, nrows, key, sec_dev, sec.size(), nei_iter, max_nbor_size); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -638,6 +687,7 @@ void prod_env_mat_a_gpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -649,18 +699,22 @@ void prod_env_mat_a_gpu(FPTYPE* em, } const int nnei = sec.back(); const int ndescrpt = nnei * 4; - DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); DPErrcheck( - gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt)); + DPErrcheck(gpuMemset(em_deriv, 0, + sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt * 3)); + DPErrcheck( + gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * nnei * 3)); format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, sec); + array_longlong, max_nbor_size, nloc, nall, nframes, rcut, + sec); nborErrcheck(gpuGetLastError()); nborErrcheck(gpuDeviceSynchronize()); - compute_env_mat_a<<>>( - em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + compute_env_mat_a<<>>( + em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut, + nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -680,6 +734,7 @@ void prod_env_mat_r_gpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec) { @@ -687,18 +742,22 @@ void prod_env_mat_r_gpu(FPTYPE* em, DPErrcheck(gpuDeviceSynchronize()); const int nnei = sec.back(); const int ndescrpt = nnei * 1; - DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); DPErrcheck( - gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt)); + DPErrcheck(gpuMemset(em_deriv, 0, + sizeof(FPTYPE) * int_64(nframes) * nloc * ndescrpt * 3)); + DPErrcheck( + gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nframes) * nloc * nnei * 3)); format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, sec); + array_longlong, max_nbor_size, nloc, nall, nframes, rcut, + sec); nborErrcheck(gpuGetLastError()); nborErrcheck(gpuDeviceSynchronize()); - compute_env_mat_r<<>>( - em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + compute_env_mat_r<<>>( + em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut, + nloc, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -732,6 +791,7 @@ template void prod_env_mat_a_gpu(float* em, const float* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -750,6 +810,7 @@ template void prod_env_mat_a_gpu(double* em, const double* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -768,6 +829,7 @@ template void prod_env_mat_r_gpu(float* em, const float* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); @@ -785,6 +847,7 @@ template void prod_env_mat_r_gpu(double* em, const double* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); @@ -797,6 +860,7 @@ template void format_nbor_list_gpu(int* nlist, const int max_nbor_size, const int nloc, const int nall, + const int nframes, const float rcut, const std::vector sec); template void format_nbor_list_gpu(int* nlist, @@ -808,6 +872,7 @@ template void format_nbor_list_gpu(int* nlist, const int max_nbor_size, const int nloc, const int nall, + const int nframes, const float rcut, const std::vector sec); template void test_encoding_decoding_nbor_info_gpu(uint_64* key, diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index 3a0d8eb122..5f40ae6472 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -877,23 +877,33 @@ int deepmd::build_nlist_cpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size_, - const float& rcut) { + const float& rcut, + const int& nframes, + const int* type) { const int mem_size = mem_size_; *max_list_size = 0; - nlist.inum = nloc; + nlist.inum = nframes * nloc; FPTYPE rcut2 = rcut * rcut; std::vector jlist; jlist.reserve(mem_size); for (int ii = 0; ii < nlist.inum; ++ii) { - nlist.ilist[ii] = ii; + const int atom_idx = ii % nloc; + const int frame_idx = ii / nloc; + const int frame_offset = frame_idx * nall; + nlist.ilist[ii] = atom_idx; jlist.clear(); for (int jj = 0; jj < nall; ++jj) { - if (jj == ii) { + if (jj == atom_idx) { + continue; + } + if (type != nullptr && + (type[frame_offset + atom_idx] < 0 || type[frame_offset + jj] < 0)) { continue; } FPTYPE diff[3]; for (int dd = 0; dd < 3; ++dd) { - diff[dd] = c_cpy[ii * 3 + dd] - c_cpy[jj * 3 + dd]; + diff[dd] = c_cpy[(frame_offset + atom_idx) * 3 + dd] - + c_cpy[(frame_offset + jj) * 3 + dd]; } FPTYPE diff2 = deepmd::dot3(diff, diff); if (diff2 < rcut2) { @@ -963,7 +973,9 @@ template int deepmd::build_nlist_cpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes, + const int* type); template int deepmd::build_nlist_cpu(InputNlist& nlist, int* max_list_size, @@ -971,7 +983,9 @@ template int deepmd::build_nlist_cpu(InputNlist& nlist, const int& nloc, const int& nall, const int& mem_size, - const float& rcut); + const float& rcut, + const int& nframes, + const int* type); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void deepmd::convert_nlist_gpu_device(InputNlist& gpu_nlist, diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index 302fac4bc9..b01f1276b9 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -24,6 +24,7 @@ void deepmd::prod_env_mat_a_cpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -33,76 +34,99 @@ void deepmd::prod_env_mat_a_cpu(FPTYPE* em, } const int nnei = sec.back(); const int nem = nnei * 4; + assert(nframes * nloc == inlist.inum); - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } + std::vector > frame_coords(nframes); + std::vector > frame_f_types(nframes); + std::vector > > frame_nlists(nframes); - // set type - std::vector d_f_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_f_type[ii] = f_type[ii]; - } +#pragma omp parallel + { +#pragma omp for + for (int ff = 0; ff < nframes; ++ff) { + const FPTYPE* frame_coord = coord + static_cast(ff) * nall * 3; + const int* frame_f_type = f_type + static_cast(ff) * nall; + const int_64 row_offset = static_cast(ff) * nloc; + + frame_coords[ff].resize(static_cast(nall) * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + frame_coords[ff][ii * 3 + dd] = frame_coord[ii * 3 + dd]; + } + } - // build nlist - std::vector > d_nlist_a(nloc); + frame_f_types[ff].resize(nall); + for (int ii = 0; ii < nall; ++ii) { + frame_f_types[ff][ii] = frame_f_type[ii]; + } - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); + frame_nlists[ff].resize(nloc); + for (int ii = 0; ii < nloc; ++ii) { + frame_nlists[ff][ii].reserve(max_nbor_size); + } + for (int ii = 0; ii < nloc; ++ii) { + const int_64 row = row_offset + ii; + const int i_idx = inlist.ilist[row]; + for (int jj = 0; jj < inlist.numneigh[row]; ++jj) { + const int j_idx = inlist.firstneigh[row][jj]; + frame_nlists[ff][i_idx].push_back(j_idx); + } + } } - } -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); +#pragma omp for + for (int_64 row = 0; row < static_cast(nframes) * nloc; ++row) { + const int ff = row / nloc; + const int ii = row % nloc; + const int_64 row_offset = static_cast(ff) * nloc; + const int* frame_type = type + static_cast(ff) * nall; + FPTYPE* frame_em = em + static_cast(row_offset) * nem; + FPTYPE* frame_em_deriv = + em_deriv + static_cast(row_offset) * nem * 3; + FPTYPE* frame_rij = rij + static_cast(row_offset) * nnei * 3; + int* frame_nlist = nlist + static_cast(row_offset) * nnei; + const std::vector& d_coord3 = frame_coords[ff]; + const std::vector& d_f_type = frame_f_types[ff]; + std::vector fmt_nlist_a; + format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, + frame_nlists[ff][ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - if (type[ii] >= 0) { - em[ii * nem + jj] = - (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; - } else { - em[ii * nem + jj] = 0; + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + if (frame_type[ii] >= 0) { + frame_em[ii * nem + jj] = + (d_em_a[jj] - avg[frame_type[ii] * nem + jj]) / + std[frame_type[ii] * nem + jj]; + } else { + frame_em[ii * nem + jj] = 0; + } } - } - for (int jj = 0; jj < nem * 3; ++jj) { - if (type[ii] >= 0) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; - } else { - em_deriv[ii * nem * 3 + jj] = 0; + for (int jj = 0; jj < nem * 3; ++jj) { + if (frame_type[ii] >= 0) { + frame_em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[frame_type[ii] * nem + jj / 3]; + } else { + frame_em_deriv[ii * nem * 3 + jj] = 0; + } + } + for (int jj = 0; jj < nnei * 3; ++jj) { + frame_rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + frame_nlist[ii * nnei + jj] = fmt_nlist_a[jj]; } - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = fmt_nlist_a[jj]; } } } @@ -120,73 +144,103 @@ void deepmd::prod_env_mat_r_cpu(FPTYPE* em, const FPTYPE* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec) { const int nnei = sec.back(); const int nem = nnei * 1; + assert(nframes * nloc == inlist.inum); - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } + std::vector > frame_coords(nframes); + std::vector > frame_types(nframes); + std::vector > > frame_nlists(nframes); - // set type - std::vector d_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_type[ii] = type[ii]; - } +#pragma omp parallel + { +#pragma omp for + for (int ff = 0; ff < nframes; ++ff) { + const FPTYPE* frame_coord = coord + static_cast(ff) * nall * 3; + const int* frame_type = type + static_cast(ff) * nall; + const int_64 row_offset = static_cast(ff) * nloc; - // build nlist - std::vector > d_nlist_a(nloc); + frame_coords[ff].resize(static_cast(nall) * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + frame_coords[ff][ii * 3 + dd] = frame_coord[ii * 3 + dd]; + } + } - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); + frame_types[ff].resize(nall); + for (int ii = 0; ii < nall; ++ii) { + frame_types[ff][ii] = frame_type[ii]; + } + + frame_nlists[ff].resize(nloc); + for (int ii = 0; ii < nloc; ++ii) { + frame_nlists[ff][ii].reserve(max_nbor_size); + } + for (int ii = 0; ii < nloc; ++ii) { + const int_64 row = row_offset + ii; + const int i_idx = inlist.ilist[row]; + for (int jj = 0; jj < inlist.numneigh[row]; ++jj) { + const int j_idx = inlist.firstneigh[row][jj]; + frame_nlists[ff][i_idx].push_back(j_idx); + } + } } - } -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); +#pragma omp for + for (int_64 row = 0; row < static_cast(nframes) * nloc; ++row) { + const int ff = row / nloc; + const int ii = row % nloc; + const int_64 row_offset = static_cast(ff) * nloc; + FPTYPE* frame_em = em + static_cast(row_offset) * nem; + FPTYPE* frame_em_deriv = + em_deriv + static_cast(row_offset) * nem * 3; + FPTYPE* frame_rij = rij + static_cast(row_offset) * nnei * 3; + int* frame_nlist = nlist + static_cast(row_offset) * nnei; + const std::vector& d_coord3 = frame_coords[ff]; + const std::vector& d_type = frame_types[ff]; + std::vector fmt_nlist_a; + format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, + frame_nlists[ff][ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / - std[d_type[ii] * nem + jj]; - } - for (int jj = 0; jj < nem * 3; ++jj) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + if (d_type[ii] >= 0) { + frame_em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / + std[d_type[ii] * nem + jj]; + } else { + frame_em[ii * nem + jj] = 0; + } + } + for (int jj = 0; jj < nem * 3; ++jj) { + if (d_type[ii] >= 0) { + frame_em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; + } else { + frame_em_deriv[ii * nem * 3 + jj] = 0; + } + } + for (int jj = 0; jj < nnei * 3; ++jj) { + frame_rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + frame_nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } } } } @@ -203,6 +257,7 @@ template void deepmd::prod_env_mat_a_cpu(double* em, const double* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -220,6 +275,7 @@ template void deepmd::prod_env_mat_a_cpu(float* em, const float* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec, @@ -237,6 +293,7 @@ template void deepmd::prod_env_mat_r_cpu(double* em, const double* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); @@ -253,6 +310,7 @@ template void deepmd::prod_env_mat_r_cpu(float* em, const float* std, const int nloc, const int nall, + const int nframes, const float rcut, const float rcut_smth, const std::vector sec); diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index 3c309ca9ae..203291d58c 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include +#include #include #include "device.h" @@ -9,6 +10,17 @@ #include "neighbor_list.h" #include "prod_env_mat.h" +template +static std::vector repeat_vector(const std::vector& values, + const int repeats) { + std::vector result; + result.reserve(static_cast(repeats) * values.size()); + for (int ii = 0; ii < repeats; ++ii) { + result.insert(result.end(), values.begin(), values.end()); + } + return result; +} + class TestEnvMatA : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -592,6 +604,87 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) { // } } +TEST_F(TestEnvMatA, prod_cpu_multiple_frames) { + EXPECT_EQ(nlist_r_cpy.size(), nloc); + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + for (int ii = 0; ii < nall; ++ii) { + const size_t offset = (static_cast(nall) + ii) * 3; + posi_multi[offset] += 0.01 * (ii + 1); + posi_multi[offset + 1] -= 0.02 * (ii % 3); + posi_multi[offset + 2] += 0.015 * (ii % 5); + } + std::vector atype_multi = repeat_vector(atype_cpy, nframes); + + std::vector em(static_cast(nframes) * nloc * ndescrpt), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3), + rij(static_cast(nframes) * nloc * nnei * 3); + std::vector nlist(static_cast(nframes) * nloc * nnei); + std::vector avg(static_cast(ntypes) * ndescrpt, 0); + std::vector std(static_cast(ntypes) * ndescrpt, 1); + deepmd::prod_env_mat_a_cpu( + em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(), + atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc, + nall, nframes, rc, rc_smth, sec_a); + + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_a_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a); + + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + for (size_t jj = 0; jj < frame_em.size(); ++jj) { + EXPECT_LT(fabs(em[em_offset + jj] - frame_em[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_em_deriv.size(); ++jj) { + EXPECT_LT(fabs(em_deriv[deriv_offset + jj] - frame_em_deriv[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_rij.size(); ++jj) { + EXPECT_LT(fabs(rij[rij_offset + jj] - frame_rij[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_nlist.size(); ++jj) { + EXPECT_EQ(nlist[nlist_offset + jj], frame_nlist[jj]); + } + } +} + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST_F(TestEnvMatA, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); @@ -643,10 +736,10 @@ TEST_F(TestEnvMatA, prod_gpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, - posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu( + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, + gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, 1, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -671,6 +764,132 @@ TEST_F(TestEnvMatA, prod_gpu) { } } +TEST_F(TestEnvMatA, prod_gpu_multiple_frames) { + EXPECT_EQ(nlist_r_cpy.size(), nloc); + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()), + gpu_inlist; + + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + for (int ii = 0; ii < nall; ++ii) { + const size_t offset = (static_cast(nall) + ii) * 3; + posi_multi[offset] += 0.01 * (ii + 1); + posi_multi[offset + 1] -= 0.02 * (ii % 3); + posi_multi[offset + 2] += 0.015 * (ii % 5); + } + std::vector atype_multi = repeat_vector(atype_cpy, nframes); + std::vector em(static_cast(nframes) * nloc * ndescrpt, 0.0), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3, 0.0), + rij(static_cast(nframes) * nloc * nnei * 3, 0.0); + std::vector nlist(static_cast(nframes) * nloc * nnei, 0); + std::vector avg(static_cast(ntypes) * ndescrpt, 0); + std::vector std(static_cast(ntypes) * ndescrpt, 1); + std::vector expected_multi(static_cast(nframes) * nloc * + ndescrpt); + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_a_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a); + std::copy( + frame_em.begin(), frame_em.end(), + expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + } + + double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; + double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL; + int *atype_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, + *memory_dev = NULL; + uint_64* array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_dev, posi_multi); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(atype_dev, atype_multi); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory( + array_int_dev, sec_a.size() + + static_cast(nframes) * nloc * sec_a.size() + + static_cast(nframes) * nloc); + deepmd::malloc_device_memory( + array_longlong_dev, + static_cast(nframes) * nloc * max_nbor_size * 2); + deepmd::malloc_device_memory( + memory_dev, static_cast(nframes) * nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, + max_nbor_size); + + deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev, + atype_dev, gpu_inlist, array_int_dev, + array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, nframes, rc, rc_smth, sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + for (int jj = 0; jj < nnei; ++jj) { + for (int dd = 0; dd < 4; ++dd) { + const int_64 idx = + (static_cast(ff) * nloc + ii) * nnei * 4 + jj * 4 + dd; + EXPECT_LT(fabs(em[idx] - expected_multi[idx]), 1e-5); + } + } + } + } +} + TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; @@ -722,10 +941,10 @@ TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, - posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu( + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, + gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, 1, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc index e96311dafd..ae91256c1a 100644 --- a/source/lib/tests/test_env_mat_a_mix.cc +++ b/source/lib/tests/test_env_mat_a_mix.cc @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include +#include #include #include "device.h" @@ -9,6 +10,17 @@ #include "neighbor_list.h" #include "prod_env_mat.h" +template +static std::vector repeat_vector(const std::vector& values, + const int repeats) { + std::vector result; + result.reserve(static_cast(repeats) * values.size()); + for (int ii = 0; ii < repeats; ++ii) { + result.insert(result.end(), values.begin(), values.end()); + } + return result; +} + class TestEnvMatAMix : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -17,7 +29,7 @@ class TestEnvMatAMix : public ::testing::Test { std::vector atype = {0, 1, 1, 0, 1, 1}; std::vector f_atype = {0, 0, 0, 0, 0, 0}; std::vector posi_cpy; - // std::vector atype_cpy; + std::vector atype_cpy; std::vector f_atype_cpy; int nloc, nall; double rc = 6; @@ -125,6 +137,10 @@ class TestEnvMatAMix : public ::testing::Test { region.reinitBox(box); copy_coord(posi_cpy, f_atype_cpy, mapping, ncell, ngcell, posi, f_atype, rc, region); + atype_cpy.resize(mapping.size()); + for (int ii = 0; ii < mapping.size(); ++ii) { + atype_cpy[ii] = atype[mapping[ii]]; + } nloc = posi.size() / 3; nall = posi_cpy.size() / 3; nat_stt.resize(3); @@ -542,7 +558,7 @@ TEST_F(TestEnvMatAMix, prod_cpu) { std::vector avg(ntypes * ndescrpt, 0); std::vector std(ntypes * ndescrpt, 1); deepmd::prod_env_mat_a_cpu(&em[0], &em_deriv[0], &rij[0], &nlist[0], - &posi_cpy[0], &atype[0], inlist, max_nbor_size, + &posi_cpy[0], &atype_cpy[0], inlist, max_nbor_size, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a, &f_atype_cpy[0]); deepmd::use_nei_info_cpu(&nlist[0], &ntype[0], nmask, &atype[0], &mapping[0], @@ -583,7 +599,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) { std::vector avg(static_cast(ntypes) * ndescrpt, 0); std::vector std(static_cast(ntypes) * ndescrpt, 1); deepmd::prod_env_mat_a_cpu(&em[0], &em_deriv[0], &rij[0], &nlist[0], - &posi_cpy[0], &atype[0], inlist, max_nbor_size, + &posi_cpy[0], &atype_cpy[0], inlist, max_nbor_size, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a, &f_atype_cpy[0]); @@ -630,6 +646,83 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) { // } } +TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) { + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + std::vector atype_multi = repeat_vector(atype_cpy, nframes); + for (int ii = nloc; ii < nall; ++ii) { + atype_multi[ii] = 0; + } + for (int ii = 0; ii < nloc; ++ii) { + atype_multi[static_cast(nall) + ii] = 1; + } + std::vector f_atype_multi = repeat_vector(f_atype_cpy, nframes); + std::vector avg(static_cast(ntypes) * ndescrpt, 0.0); + std::vector std(static_cast(ntypes) * ndescrpt, 1.0); + for (int jj = 0; jj < ndescrpt; ++jj) { + avg[static_cast(ndescrpt) + jj] = 0.125 + 0.001 * (jj % 17); + } + + std::vector expected_multi(static_cast(nframes) * nloc * + ndescrpt); + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_a_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a, + f_atype_multi.data() + static_cast(ff) * nall); + std::copy( + frame_em.begin(), frame_em.end(), + expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + } + + std::vector em(static_cast(nframes) * nloc * ndescrpt), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3), + rij(static_cast(nframes) * nloc * nnei * 3); + std::vector nlist(static_cast(nframes) * nloc * nnei); + deepmd::prod_env_mat_a_cpu( + em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(), + atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc, + nall, nframes, rc, rc_smth, sec_a, f_atype_multi.data()); + + for (int ii = 0; ii < em.size(); ++ii) { + EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10); + } +} + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST_F(TestEnvMatAMix, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); @@ -667,9 +760,9 @@ TEST_F(TestEnvMatAMix, prod_gpu) { double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; bool* nmask_dev = NULL; double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; - int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL, - *ntype_dev = NULL, *mapping_dev = NULL, *array_int_dev = NULL, - *memory_dev = NULL; + int *f_atype_cpy_dev = NULL, *atype_cpy_dev = NULL, *atype_dev = NULL, + *nlist_dev = NULL, *ntype_dev = NULL, *mapping_dev = NULL, + *array_int_dev = NULL, *memory_dev = NULL; uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); @@ -678,6 +771,7 @@ TEST_F(TestEnvMatAMix, prod_gpu) { deepmd::malloc_device_memory_sync(avg_dev, avg); deepmd::malloc_device_memory_sync(std_dev, std); deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); deepmd::malloc_device_memory_sync(atype_dev, atype); deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory_sync(ntype_dev, ntype); @@ -692,9 +786,9 @@ TEST_F(TestEnvMatAMix, prod_gpu) { max_nbor_size); deepmd::prod_env_mat_a_gpu( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); + std_dev, nloc, nall, 1, rc, rc_smth, sec_a, f_atype_cpy_dev); deepmd::use_nei_info_gpu(nlist_dev, ntype_dev, nmask_dev, atype_dev, mapping_dev, nloc, nnei, ntypes, true); @@ -709,6 +803,7 @@ TEST_F(TestEnvMatAMix, prod_gpu) { deepmd::delete_device_memory(nmask_dev); deepmd::delete_device_memory(posi_cpy_dev); deepmd::delete_device_memory(f_atype_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); deepmd::delete_device_memory(atype_dev); deepmd::delete_device_memory(mapping_dev); deepmd::delete_device_memory(array_int_dev); @@ -763,7 +858,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; - int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL, + int *f_atype_cpy_dev = NULL, *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); @@ -774,7 +869,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { deepmd::malloc_device_memory_sync(std_dev, std); deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy); - deepmd::malloc_device_memory_sync(atype_dev, atype); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); @@ -785,9 +880,9 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { max_nbor_size); deepmd::prod_env_mat_a_gpu( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); + std_dev, nloc, nall, 1, rc, rc_smth, sec_a, f_atype_cpy_dev); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); @@ -797,7 +892,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); deepmd::delete_device_memory(f_atype_cpy_dev); - deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(atype_cpy_dev); deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(avg_dev); @@ -847,4 +942,134 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { } } } + +TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) { + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()), + gpu_inlist; + + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + std::vector atype_multi = repeat_vector(atype_cpy, nframes); + for (int ii = nloc; ii < nall; ++ii) { + atype_multi[ii] = 0; + } + for (int ii = 0; ii < nloc; ++ii) { + atype_multi[static_cast(nall) + ii] = 1; + } + std::vector f_atype_multi = repeat_vector(f_atype_cpy, nframes); + std::vector avg(static_cast(ntypes) * ndescrpt, 0.0); + std::vector std(static_cast(ntypes) * ndescrpt, 1.0); + for (int jj = 0; jj < ndescrpt; ++jj) { + avg[static_cast(ndescrpt) + jj] = 0.125 + 0.001 * (jj % 17); + } + + std::vector expected_multi(static_cast(nframes) * nloc * + ndescrpt); + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_a_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a, + f_atype_multi.data() + static_cast(ff) * nall); + std::copy( + frame_em.begin(), frame_em.end(), + expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + } + + std::vector em(static_cast(nframes) * nloc * ndescrpt, 0.0), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3, 0.0), + rij(static_cast(nframes) * nloc * nnei * 3, 0.0); + std::vector nlist(static_cast(nframes) * nloc * nnei, 0); + + double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; + double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL; + int *atype_dev = NULL, *f_atype_dev = NULL, *nlist_dev = NULL, + *array_int_dev = NULL, *memory_dev = NULL; + uint_64* array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_dev, posi_multi); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(atype_dev, atype_multi); + deepmd::malloc_device_memory_sync(f_atype_dev, f_atype_multi); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory( + array_int_dev, sec_a.size() + + static_cast(nframes) * nloc * sec_a.size() + + static_cast(nframes) * nloc); + deepmd::malloc_device_memory( + array_longlong_dev, + static_cast(nframes) * nloc * max_nbor_size * 2); + deepmd::malloc_device_memory( + memory_dev, static_cast(nframes) * nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, + max_nbor_size); + + deepmd::prod_env_mat_a_gpu( + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev, atype_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, + nall, nframes, rc, rc_smth, sec_a, f_atype_dev); + + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(f_atype_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + for (int ii = 0; ii < em.size(); ++ii) { + EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-5) + << "index " << ii << " em " << em[ii] << " expected " + << expected_multi[ii]; + } +} #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 96da7e6963..82d6b6c229 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -302,6 +302,41 @@ TEST_F(TestEnvMatR, prod_cpu) { } } +TEST_F(TestEnvMatR, prod_cpu_negative_center_type) { + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, ilist.data(), numneigh.data(), + firstneigh.data()); + convert_nlist(inlist, nlist_a_cpy); + + std::vector atype_neg = atype_cpy; + atype_neg[0] = -1; + std::vector em(static_cast(nloc) * ndescrpt, -1.0), + em_deriv(static_cast(nloc) * ndescrpt * 3, -1.0), + rij(static_cast(nloc) * nnei * 3); + std::vector nlist(static_cast(nloc) * nnei); + std::vector avg(static_cast(ntypes) * ndescrpt, 0); + std::vector std(static_cast(ntypes) * ndescrpt, 1); + deepmd::prod_env_mat_r_cpu(em.data(), em_deriv.data(), rij.data(), + nlist.data(), posi_cpy.data(), atype_neg.data(), + inlist, max_nbor_size, avg.data(), std.data(), + nloc, nall, rc, rc_smth, sec_a); + + for (int jj = 0; jj < ndescrpt; ++jj) { + EXPECT_EQ(em[jj], 0.0); + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + EXPECT_EQ(em_deriv[jj], 0.0); + } +} + TEST_F(TestEnvMatR, prod_cpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; @@ -358,6 +393,89 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) { } } +TEST_F(TestEnvMatR, prod_cpu_multiple_frames) { + EXPECT_EQ(nlist_r_cpy.size(), nloc); + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + + std::vector posi_multi = posi_cpy; + posi_multi.insert(posi_multi.end(), posi_cpy.begin(), posi_cpy.end()); + for (int ii = 0; ii < nall; ++ii) { + const size_t offset = (static_cast(nall) + ii) * 3; + posi_multi[offset] += 0.01 * (ii + 1); + posi_multi[offset + 1] -= 0.02 * (ii % 3); + posi_multi[offset + 2] += 0.015 * (ii % 5); + } + std::vector atype_multi = atype_cpy; + atype_multi.insert(atype_multi.end(), atype_cpy.begin(), atype_cpy.end()); + + std::vector em(static_cast(nframes) * nloc * ndescrpt), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3), + rij(static_cast(nframes) * nloc * nnei * 3); + std::vector nlist(static_cast(nframes) * nloc * nnei); + std::vector avg(static_cast(ntypes) * ndescrpt, 0); + std::vector std(static_cast(ntypes) * ndescrpt, 1); + deepmd::prod_env_mat_r_cpu( + em.data(), em_deriv.data(), rij.data(), nlist.data(), posi_multi.data(), + atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc, + nall, nframes, rc, rc_smth, sec_a); + + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_r_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a); + + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + for (size_t jj = 0; jj < frame_em.size(); ++jj) { + EXPECT_LT(fabs(em[em_offset + jj] - frame_em[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_em_deriv.size(); ++jj) { + EXPECT_LT(fabs(em_deriv[deriv_offset + jj] - frame_em_deriv[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_rij.size(); ++jj) { + EXPECT_LT(fabs(rij[rij_offset + jj] - frame_rij[jj]), 1e-10); + } + for (size_t jj = 0; jj < frame_nlist.size(); ++jj) { + EXPECT_EQ(nlist[nlist_offset + jj], frame_nlist[jj]); + } + } +} + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST_F(TestEnvMatR, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); @@ -410,10 +528,10 @@ TEST_F(TestEnvMatR, prod_gpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, - posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu( + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, + gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, 1, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -489,10 +607,10 @@ TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, - posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu( + em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, + gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, 1, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index 6cd24b556a..511a2be949 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -359,7 +359,7 @@ TEST_F(TestFormatNlist, gpu) { // format nlist format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, - nall, rc, sec_a); + nall, 1, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); @@ -419,7 +419,7 @@ TEST_F(TestFormatNlistShortSel, gpu) { // format nlist format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, - nall, rc, sec_a); + nall, 1, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc index a700df97b5..a643c8c321 100644 --- a/source/lib/tests/test_neighbor_list.cc +++ b/source/lib/tests/test_neighbor_list.cc @@ -1,10 +1,23 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include +#include + #include "device.h" #include "fmt_nlist.h" #include "neighbor_list.h" +template +static std::vector repeat_vector(const std::vector& values, + const int repeats) { + std::vector result; + result.reserve(static_cast(repeats) * values.size()); + for (int ii = 0; ii < repeats; ++ii) { + result.insert(result.end(), values.begin(), values.end()); + } + return result; +} + class TestNeighborList : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -110,6 +123,41 @@ TEST_F(TestNeighborList, cpu) { delete[] firstneigh; } +TEST_F(TestNeighborList, cpu_multiple_frames) { + constexpr int nframes = 2; + const int nrows = nframes * nloc; + int mem_size = 10; + std::vector ilist(nrows); + std::vector numneigh(nrows); + std::vector firstneigh(nrows); + std::vector jlist(static_cast(nrows) * mem_size); + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + for (int ii = 0; ii < nrows; ++ii) { + firstneigh[ii] = jlist.data() + ii * mem_size; + } + + deepmd::InputNlist nlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + int max_list_size; + int ret = build_nlist_cpu(nlist, &max_list_size, posi_multi.data(), nloc, + nall, mem_size, rc, nframes); + EXPECT_EQ(ret, 0); + EXPECT_EQ(nlist.inum, nrows); + EXPECT_EQ(max_list_size, 5); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + EXPECT_EQ(nlist.ilist[row], ii); + EXPECT_EQ(nlist.numneigh[row], expect_nlist_cpy[ii].size()); + std::sort(nlist.firstneigh[row], + nlist.firstneigh[row] + nlist.numneigh[row]); + for (int jj = 0; jj < nlist.numneigh[row]; ++jj) { + EXPECT_EQ(nlist.firstneigh[row][jj], expect_nlist_cpy[ii][jj]); + } + } + } +} + TEST_F(TestNeighborList, cpu_lessmem) { int mem_size = 2; int* ilist = new int[nloc]; @@ -196,6 +244,71 @@ TEST_F(TestNeighborList, gpu) { deepmd::delete_device_memory(c_cpy_dev); } +TEST_F(TestNeighborList, gpu_multiple_frames) { + constexpr int nframes = 2; + const int nrows = nframes * nloc; + int mem_size = 48; + + int *nlist_data_dev = NULL, *jlist_dev = NULL, *ilist_dev = NULL, + *numneigh_dev = NULL; + int** firstneigh_dev = NULL; + std::vector temp_firstneigh(nrows); + std::vector posi_multi = repeat_vector(posi_cpy, nframes); + double* c_cpy_dev = NULL; + + deepmd::malloc_device_memory(nlist_data_dev, 2 * nrows * mem_size); + deepmd::malloc_device_memory(jlist_dev, nrows * mem_size); + deepmd::malloc_device_memory(ilist_dev, nrows); + deepmd::malloc_device_memory(numneigh_dev, nrows); + for (int ii = 0; ii < nrows; ++ii) { + temp_firstneigh[ii] = jlist_dev + ii * mem_size; + } + deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh); + deepmd::malloc_device_memory_sync(c_cpy_dev, posi_multi); + deepmd::InputNlist nlist_dev(nrows, ilist_dev, numneigh_dev, firstneigh_dev); + + int max_list_size; + int ret = + deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev, + c_cpy_dev, nloc, nall, mem_size, rc, nframes); + + EXPECT_EQ(ret, 0); + std::vector ilist(nrows); + std::vector numneigh(nrows); + std::vector firstneigh(nrows); + std::vector jlist(nrows * mem_size); + deepmd::memcpy_device_to_host(jlist_dev, jlist.data(), nrows * mem_size); + deepmd::memcpy_device_to_host(ilist_dev, ilist.data(), nrows); + deepmd::memcpy_device_to_host(numneigh_dev, numneigh.data(), nrows); + for (int ii = 0; ii < nrows; ++ii) { + firstneigh[ii] = jlist.data() + ii * mem_size; + } + + deepmd::InputNlist nlist(nlist_dev.inum, ilist.data(), numneigh.data(), + firstneigh.data()); + EXPECT_EQ(nlist.inum, nrows); + EXPECT_EQ(max_list_size, 5); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + EXPECT_EQ(nlist.ilist[row], ii); + EXPECT_EQ(nlist.numneigh[row], expect_nlist_cpy[ii].size()); + std::sort(nlist.firstneigh[row], + nlist.firstneigh[row] + nlist.numneigh[row]); + for (int jj = 0; jj < nlist.numneigh[row]; ++jj) { + EXPECT_EQ(nlist.firstneigh[row][jj], expect_nlist_cpy[ii][jj]); + } + } + } + + deepmd::delete_device_memory(nlist_data_dev); + deepmd::delete_device_memory(jlist_dev); + deepmd::delete_device_memory(ilist_dev); + deepmd::delete_device_memory(numneigh_dev); + deepmd::delete_device_memory(firstneigh_dev); + deepmd::delete_device_memory(c_cpy_dev); +} + TEST_F(TestNeighborList, gpu_lessmem) { int mem_size = 47; diff --git a/source/op/tf/custom_op.h b/source/op/tf/custom_op.h index baf95f3fa3..1dfb680247 100644 --- a/source/op/tf/custom_op.h +++ b/source/op/tf/custom_op.h @@ -28,28 +28,28 @@ void safe_compute(OpKernelContext* context, }; template -void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); +tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); diff --git a/source/op/tf/neighbor_stat.cc b/source/op/tf/neighbor_stat.cc index 26f13b0c84..0e3ec0a942 100644 --- a/source/op/tf/neighbor_stat.cc +++ b/source/op/tf/neighbor_stat.cc @@ -149,8 +149,16 @@ class NeighborStatOp : public OpKernel { int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); + struct FirstneighGuard { + int** ptr = NULL; + ~FirstneighGuard() { + if (ptr != NULL) { + deepmd::delete_device_memory(ptr); + } + } + } firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc); + int** firstneigh = firstneigh_guard.ptr; int* jlist = NULL; FPTYPE* coord_cpy; int* type_cpy; @@ -159,12 +167,14 @@ class NeighborStatOp : public OpKernel { deepmd::InputNlist gpu_inlist; int* nbor_list_dev = NULL; // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist, - box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut, max_cpy_trial, max_nnei_trial); + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size_nlist, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, + nei_mode, rcut, max_cpy_trial, max_nnei_trial)); TensorShape min_nbor_dist_shape; min_nbor_dist_shape.AddDim(static_cast(nloc) * mem_nnei); @@ -177,7 +187,6 @@ class NeighborStatOp : public OpKernel { deepmd::neighbor_stat_gpu(coord, type, nloc, gpu_inlist, max_nbor_size, min_nbor_dist, ntypes, mem_nnei); - deepmd::delete_device_memory(firstneigh); #endif } else { for (int ii = 0; diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc index e374102224..d83df10977 100644 --- a/source/op/tf/prod_env_mat_multi_device.cc +++ b/source/op/tf/prod_env_mat_multi_device.cc @@ -1,4 +1,6 @@ // SPDX-License-Identifier: LGPL-3.0-or-later +#include + #include "coord.h" #include "custom_op.h" #include "device.h" @@ -192,6 +194,19 @@ static int _norm_copy_coord_cpu(std::vector& coord_cpy, const int& max_cpy_trial, const float& rcut_r); +template +static int _norm_copy_coord_cpu_frame(FPTYPE* coord_cpy, + int* type_cpy, + int* idx_mapping, + int& frame_nall, + const int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nall, + const int& nloc, + const float& rcut_r); + template static int _build_nlist_cpu(std::vector& ilist, std::vector& numneigh, @@ -203,7 +218,9 @@ static int _build_nlist_cpu(std::vector& ilist, const int& nloc, const int& new_nall, const int& max_nnei_trial, - const float& rcut_r); + const float& rcut_r, + const int& nframes = 1, + const int* type = NULL); static void _map_nlist_cpu(int* nlist, const int* idx_mapping, @@ -220,6 +237,18 @@ static void _map_nei_info_cpu(int* nlist, const int& ntypes, const bool& b_nlist_map); +static tensorflow::Status _prepare_mesh_nlist_cpu_batch( + deepmd::InputNlist& inlist, + std::vector& ilist, + std::vector& numneigh, + std::vector& firstneigh, + std::vector>& jlist, + int& max_nbor_size, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int nloc, + const int nframes); + template static void _prepare_coord_nlist_cpu(OpKernelContext* context, FPTYPE const** coord, @@ -244,6 +273,33 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context, const int& max_cpy_trial, const int& max_nnei_trial); +template +static tensorflow::Status _prepare_coord_nlist_cpu_batch( + FPTYPE const** coord, + std::vector& coord_cpy, + int const** type, + std::vector& type_cpy, + std::vector& idx_mapping, + deepmd::InputNlist& inlist, + std::vector& ilist, + std::vector& numneigh, + std::vector& firstneigh, + std::vector>& jlist, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nall, + const int& nframes, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template static int _norm_copy_coord_gpu(OpKernelContext* context, @@ -260,6 +316,19 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, const int& max_cpy_trial, const float& rcut_r); +template +static int _norm_copy_coord_gpu_frame(OpKernelContext* context, + FPTYPE* coord_cpy, + int* type_cpy, + int* idx_mapping, + int& frame_nall, + const int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const float& rcut_r); + template static int _build_nlist_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -273,7 +342,9 @@ static int _build_nlist_gpu(OpKernelContext* context, const int& nloc, const int& new_nall, const int& max_nnei_trial, - const float& rcut_r); + const float& rcut_r, + const int& nframes = 1, + const int* type = NULL); static void _map_nlist_gpu(int* nlist, const int* idx_mapping, @@ -290,32 +361,58 @@ static void _map_nei_info_gpu(int* nlist, const int& ntypes, const bool& b_nlist_map); +static tensorflow::Status _prepare_mesh_nlist_gpu_batch( + OpKernelContext* context, + Tensor* tensor_list, + deepmd::InputNlist& gpu_inlist, + int** firstneigh, + int& max_nbor_size, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int nloc, + const int nframes); + +static tensorflow::Status _round_built_gpu_nbor_size(int& max_nbor_size); + +template +struct DeviceMemoryGuard { + T* ptr = NULL; + DeviceMemoryGuard() = default; + DeviceMemoryGuard(const DeviceMemoryGuard&) = delete; + DeviceMemoryGuard& operator=(const DeviceMemoryGuard&) = delete; + ~DeviceMemoryGuard() { + if (ptr != NULL) { + deepmd::delete_device_memory(ptr); + } + } +}; + template -void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); +tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -485,6 +582,176 @@ class ProdEnvMatAOp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + if (device == "CPU" && nei_mode != 3) { + const FPTYPE* coord = p_coord; + const int* type = p_type; + int frame_nall = nall; + std::vector coord_cpy; + std::vector type_cpy; + std::vector idx_mapping; + std::vector ilist; + std::vector numneigh; + std::vector firstneigh; + std::vector> jlist; + deepmd::InputNlist batch_inlist; + int batch_max_nbor_size = max_nbor_size; + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_cpu_batch( + &coord, coord_cpy, &type, type_cpy, idx_mapping, batch_inlist, + ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei, + batch_max_nbor_size, p_box, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, nall, nsamples, + nei_mode, rcut_r, max_cpy_trial, max_nnei_trial)); + max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size); + + deepmd::prod_env_mat_a_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type, + batch_inlist, batch_max_nbor_size, avg, std, + nloc, frame_nall, nsamples, rcut_r, + rcut_r_smth, sec_a); + if (nei_mode == 1) { + for (int kk = 0; kk < nsamples; ++kk) { + _map_nlist_cpu( + p_nlist + static_cast(kk) * nloc * nnei, + idx_mapping.data() + static_cast(kk) * frame_nall, nloc, + nnei); + } + } + return; + } + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) { + const int frame_chunk = nsamples; + for (int ff = 0; ff < nsamples; ff += frame_chunk) { + const int chunk_nframes = std::min(frame_chunk, nsamples - ff); + const int nrows = chunk_nframes * nloc; + FPTYPE* em = p_em + static_cast(ff) * nloc * ndescrpt; + FPTYPE* em_deriv = + p_em_deriv + static_cast(ff) * nloc * ndescrpt * 3; + FPTYPE* rij = p_rij + static_cast(ff) * nloc * nnei * 3; + int* nlist = p_nlist + static_cast(ff) * nloc * nnei; + const FPTYPE* coord = p_coord + static_cast(ff) * nall * 3; + const int* type = p_type + static_cast(ff) * nall; + int* idx_mapping = NULL; + int frame_nall = nall; + Tensor coord_cpy_tensor; + Tensor type_cpy_tensor; + if (nei_mode == 1) { + int copy_ok = 0; + for (int tt = 0; tt < max_cpy_trial; ++tt) { + TensorShape cpy_shape; + cpy_shape.AddDim(static_cast(chunk_nframes) * mem_cpy * 3); + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &coord_cpy_tensor)); + TensorShape t_shape; + t_shape.AddDim(static_cast(chunk_nframes) * mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &type_cpy_tensor)); + FPTYPE* coord_cpy = coord_cpy_tensor.flat().data(); + int* type_cpy = type_cpy_tensor.flat().data(); + idx_mapping = type_cpy + int_64(chunk_nframes) * mem_cpy; + DPErrcheck( + gpuMemset(type_cpy, -1, + sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2)); + + copy_ok = 1; + for (int kk = 0; kk < chunk_nframes; ++kk) { + int frame_copied_nall = nall; + int ret = _norm_copy_coord_gpu_frame( + context, coord_cpy + int_64(kk) * mem_cpy * 3, + type_cpy + int_64(kk) * mem_cpy, + idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall, + mem_cpy, p_coord + static_cast(ff + kk) * nall * 3, + p_box + static_cast(ff + kk) * 9, + p_type + static_cast(ff + kk) * nall, nloc, rcut_r); + OP_REQUIRES( + context, ret >= 0, + errors::Aborted("cannot allocate mem for copied coords")); + if (ret != 0) { + copy_ok = 0; + break; + } + } + if (copy_ok) { + coord = coord_cpy; + type = type_cpy; + frame_nall = mem_cpy; + break; + } + mem_cpy *= 2; + } + OP_REQUIRES(context, copy_ok, + errors::Aborted("cannot allocate mem for copied coords")); + } + + std::vector nlist_tensors(2); + int *ilist = NULL, *numneigh = NULL, *jlist = NULL; + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows); + int** firstneigh = firstneigh_guard.ptr; + int chunk_max_nbor_size = max_nbor_size; + deepmd::InputNlist chunk_gpu_inlist; + if (nei_mode == 4) { + OP_REQUIRES_OK( + context, + _prepare_mesh_nlist_gpu_batch( + context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh, + chunk_max_nbor_size, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, + chunk_nframes)); + } else { + TensorShape ilist_shape; + ilist_shape.AddDim(static_cast(nrows) * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape, + &nlist_tensors[0])); + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &nlist_tensors[1])); + int build_ok = + _build_nlist_gpu(context, nlist_tensors.data(), ilist, numneigh, + firstneigh, jlist, chunk_max_nbor_size, mem_nnei, + coord, nloc, frame_nall, max_nnei_trial, rcut_r, + chunk_nframes, nei_mode == 1 ? type : NULL); + OP_REQUIRES(context, build_ok, + errors::Aborted("cannot allocate mem for nlist")); + OP_REQUIRES_OK(context, + _round_built_gpu_nbor_size(chunk_max_nbor_size)); + chunk_gpu_inlist = + deepmd::InputNlist(nrows, ilist, numneigh, firstneigh); + } + max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size); + + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nrows) * sec_a.size() + nrows); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + deepmd::prod_env_mat_a_gpu( + em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int, + array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall, + chunk_nframes, rcut_r, rcut_r_smth, sec_a); + if (nei_mode == 1) { + for (int kk = 0; kk < chunk_nframes; ++kk) { + _map_nlist_gpu(nlist + int_64(kk) * nloc * nnei, + idx_mapping + int_64(kk) * mem_cpy, nloc, nnei); + } + } + } + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + // must declare out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; @@ -548,7 +815,8 @@ class ProdEnvMatAOp : public OpKernel { array_longlong = uint64_temp.flat().data(); } - // loop over samples + // LAMMPS external nlists are updated outside this op, so keep their + // existing per-sample implementation. for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; @@ -562,20 +830,23 @@ class ProdEnvMatAOp : public OpKernel { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc); + int** firstneigh = firstneigh_guard.ptr; int* jlist = NULL; FPTYPE* coord_cpy; int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, + nei_mode, rcut_r, max_cpy_trial, max_nnei_trial)); // max_nbor_size may be changed after _prepare_coord_nlist_gpu // So we need to update the uint64_temp tensor if necessary @@ -589,12 +860,11 @@ class ProdEnvMatAOp : public OpKernel { // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, + max_nbor_size, avg, std, nloc, frame_nall, 1, rcut_r, rcut_r_smth, sec_a); if (b_nlist_map) { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } - deepmd::delete_device_memory(firstneigh); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -791,6 +1061,175 @@ class ProdEnvMatROp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + if (device == "CPU" && nei_mode != 3) { + const FPTYPE* coord = p_coord; + const int* type = p_type; + int frame_nall = nall; + std::vector coord_cpy; + std::vector type_cpy; + std::vector idx_mapping; + std::vector ilist; + std::vector numneigh; + std::vector firstneigh; + std::vector> jlist; + deepmd::InputNlist batch_inlist; + int batch_max_nbor_size = max_nbor_size; + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_cpu_batch( + &coord, coord_cpy, &type, type_cpy, idx_mapping, batch_inlist, + ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei, + batch_max_nbor_size, p_box, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, nall, nsamples, + nei_mode, rcut, max_cpy_trial, max_nnei_trial)); + max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size); + + deepmd::prod_env_mat_r_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type, + batch_inlist, batch_max_nbor_size, avg, std, + nloc, frame_nall, nsamples, rcut, rcut_smth, + sec); + if (nei_mode == 1) { + for (int kk = 0; kk < nsamples; ++kk) { + _map_nlist_cpu( + p_nlist + static_cast(kk) * nloc * nnei, + idx_mapping.data() + static_cast(kk) * frame_nall, nloc, + nnei); + } + } + return; + } + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) { + const int frame_chunk = nsamples; + for (int ff = 0; ff < nsamples; ff += frame_chunk) { + const int chunk_nframes = std::min(frame_chunk, nsamples - ff); + const int nrows = chunk_nframes * nloc; + FPTYPE* em = p_em + static_cast(ff) * nloc * ndescrpt; + FPTYPE* em_deriv = + p_em_deriv + static_cast(ff) * nloc * ndescrpt * 3; + FPTYPE* rij = p_rij + static_cast(ff) * nloc * nnei * 3; + int* nlist = p_nlist + static_cast(ff) * nloc * nnei; + const FPTYPE* coord = p_coord + static_cast(ff) * nall * 3; + const int* type = p_type + static_cast(ff) * nall; + int* idx_mapping = NULL; + int frame_nall = nall; + Tensor coord_cpy_tensor; + Tensor type_cpy_tensor; + if (nei_mode == 1) { + int copy_ok = 0; + for (int tt = 0; tt < max_cpy_trial; ++tt) { + TensorShape cpy_shape; + cpy_shape.AddDim(static_cast(chunk_nframes) * mem_cpy * 3); + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &coord_cpy_tensor)); + TensorShape t_shape; + t_shape.AddDim(static_cast(chunk_nframes) * mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &type_cpy_tensor)); + FPTYPE* coord_cpy = coord_cpy_tensor.flat().data(); + int* type_cpy = type_cpy_tensor.flat().data(); + idx_mapping = type_cpy + int_64(chunk_nframes) * mem_cpy; + DPErrcheck( + gpuMemset(type_cpy, -1, + sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2)); + + copy_ok = 1; + for (int kk = 0; kk < chunk_nframes; ++kk) { + int frame_copied_nall = nall; + int ret = _norm_copy_coord_gpu_frame( + context, coord_cpy + int_64(kk) * mem_cpy * 3, + type_cpy + int_64(kk) * mem_cpy, + idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall, + mem_cpy, p_coord + static_cast(ff + kk) * nall * 3, + p_box + static_cast(ff + kk) * 9, + p_type + static_cast(ff + kk) * nall, nloc, rcut); + OP_REQUIRES( + context, ret >= 0, + errors::Aborted("cannot allocate mem for copied coords")); + if (ret != 0) { + copy_ok = 0; + break; + } + } + if (copy_ok) { + coord = coord_cpy; + type = type_cpy; + frame_nall = mem_cpy; + break; + } + mem_cpy *= 2; + } + OP_REQUIRES(context, copy_ok, + errors::Aborted("cannot allocate mem for copied coords")); + } + + std::vector nlist_tensors(2); + int *ilist = NULL, *numneigh = NULL, *jlist = NULL; + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows); + int** firstneigh = firstneigh_guard.ptr; + int chunk_max_nbor_size = max_nbor_size; + deepmd::InputNlist chunk_gpu_inlist; + if (nei_mode == 4) { + OP_REQUIRES_OK( + context, + _prepare_mesh_nlist_gpu_batch( + context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh, + chunk_max_nbor_size, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, + chunk_nframes)); + } else { + TensorShape ilist_shape; + ilist_shape.AddDim(static_cast(nrows) * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape, + &nlist_tensors[0])); + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &nlist_tensors[1])); + int build_ok = _build_nlist_gpu( + context, nlist_tensors.data(), ilist, numneigh, firstneigh, jlist, + chunk_max_nbor_size, mem_nnei, coord, nloc, frame_nall, + max_nnei_trial, rcut, chunk_nframes, nei_mode == 1 ? type : NULL); + OP_REQUIRES(context, build_ok, + errors::Aborted("cannot allocate mem for nlist")); + OP_REQUIRES_OK(context, + _round_built_gpu_nbor_size(chunk_max_nbor_size)); + chunk_gpu_inlist = + deepmd::InputNlist(nrows, ilist, numneigh, firstneigh); + } + max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size); + + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + int_64(nrows) * sec.size() + nrows); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + deepmd::prod_env_mat_r_gpu( + em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int, + array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall, + chunk_nframes, rcut, rcut_smth, sec); + if (nei_mode == 1) { + for (int kk = 0; kk < chunk_nframes; ++kk) { + _map_nlist_gpu(nlist + int_64(kk) * nloc * nnei, + idx_mapping + int_64(kk) * mem_cpy, nloc, nnei); + } + } + } + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + // must declare out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; @@ -855,7 +1294,8 @@ class ProdEnvMatROp : public OpKernel { array_longlong = uint64_temp.flat().data(); } - // loop over samples + // LAMMPS external nlists are updated outside this op, so keep their + // existing per-sample implementation. for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; @@ -869,20 +1309,23 @@ class ProdEnvMatROp : public OpKernel { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc); + int** firstneigh = firstneigh_guard.ptr; int* jlist = NULL; FPTYPE* coord_cpy; int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut, max_cpy_trial, max_nnei_trial); + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, + nei_mode, rcut, max_cpy_trial, max_nnei_trial)); // max_nbor_size may be changed after _prepare_coord_nlist_gpu // So we need to update the uint64_temp tensor if necessary @@ -897,12 +1340,11 @@ class ProdEnvMatROp : public OpKernel { // launch the gpu(nv) compute function deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, + max_nbor_size, avg, std, nloc, frame_nall, 1, rcut, rcut_smth, sec); if (b_nlist_map) { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } - deepmd::delete_device_memory(firstneigh); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -1144,6 +1586,216 @@ class ProdEnvMatAMixOp : public OpKernel { } } + if (device == "CPU" && nei_mode != 3) { + const FPTYPE* coord = p_coord; + const int* type = p_type; + const int* f_type = p_f_type; + int frame_nall = nall; + std::vector coord_cpy; + std::vector f_type_cpy; + std::vector real_type_cpy; + std::vector idx_mapping; + std::vector ilist; + std::vector numneigh; + std::vector firstneigh; + std::vector> jlist; + deepmd::InputNlist batch_inlist; + int batch_max_nbor_size = max_nbor_size; + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_cpu_batch( + &coord, coord_cpy, &f_type, f_type_cpy, idx_mapping, batch_inlist, + ilist, numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei, + batch_max_nbor_size, p_box, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, nall, nsamples, + nei_mode, rcut_r, max_cpy_trial, max_nnei_trial)); + max_nbor_size = std::max(max_nbor_size, batch_max_nbor_size); + + if (nei_mode == 1) { + real_type_cpy.assign(static_cast(nsamples) * frame_nall, -1); + for (int kk = 0; kk < nsamples; ++kk) { + std::copy( + p_type + static_cast(kk) * nall, + p_type + static_cast(kk + 1) * nall, + real_type_cpy.begin() + static_cast(kk) * frame_nall); + } + type = real_type_cpy.data(); + } + + deepmd::prod_env_mat_a_cpu(p_em, p_em_deriv, p_rij, p_nlist, coord, type, + batch_inlist, batch_max_nbor_size, avg, std, + nloc, frame_nall, nsamples, rcut_r, + rcut_r_smth, sec_a, f_type); + for (int kk = 0; kk < nsamples; ++kk) { + _map_nei_info_cpu( + p_nlist + static_cast(kk) * nloc * nnei, + p_ntype + static_cast(kk) * nloc * nnei, + p_nmask + static_cast(kk) * nloc * nnei, + p_type + static_cast(kk) * nall, + nei_mode == 1 + ? idx_mapping.data() + static_cast(kk) * frame_nall + : NULL, + nloc, nnei, ntypes, nei_mode == 1); + } + return; + } + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device == "GPU" && (nei_mode == -1 || nei_mode == 1 || nei_mode == 4)) { + const int frame_chunk = nsamples; + for (int ff = 0; ff < nsamples; ff += frame_chunk) { + const int chunk_nframes = std::min(frame_chunk, nsamples - ff); + const int nrows = chunk_nframes * nloc; + FPTYPE* em = p_em + static_cast(ff) * nloc * ndescrpt; + FPTYPE* em_deriv = + p_em_deriv + static_cast(ff) * nloc * ndescrpt * 3; + FPTYPE* rij = p_rij + static_cast(ff) * nloc * nnei * 3; + int* nlist = p_nlist + static_cast(ff) * nloc * nnei; + int* ntype = p_ntype + static_cast(ff) * nloc * nnei; + bool* nmask = p_nmask + static_cast(ff) * nloc * nnei; + const FPTYPE* coord = p_coord + static_cast(ff) * nall * 3; + const int* type = p_type + static_cast(ff) * nall; + const int* f_type = p_f_type + static_cast(ff) * nall; + int* idx_mapping = NULL; + int frame_nall = nall; + Tensor coord_cpy_tensor; + Tensor f_type_cpy_tensor; + Tensor real_type_cpy_tensor; + if (nei_mode == 1) { + int copy_ok = 0; + for (int tt = 0; tt < max_cpy_trial; ++tt) { + TensorShape cpy_shape; + cpy_shape.AddDim(static_cast(chunk_nframes) * mem_cpy * 3); + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &coord_cpy_tensor)); + TensorShape fake_type_cpy_shape; + fake_type_cpy_shape.AddDim(static_cast(chunk_nframes) * + mem_cpy * 2); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, fake_type_cpy_shape, + &f_type_cpy_tensor)); + TensorShape real_type_shape; + real_type_shape.AddDim(static_cast(chunk_nframes) * + mem_cpy); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, real_type_shape, + &real_type_cpy_tensor)); + FPTYPE* coord_cpy = coord_cpy_tensor.flat().data(); + int* f_type_cpy = f_type_cpy_tensor.flat().data(); + int* real_type_cpy = real_type_cpy_tensor.flat().data(); + idx_mapping = f_type_cpy + int_64(chunk_nframes) * mem_cpy; + DPErrcheck( + gpuMemset(f_type_cpy, -1, + sizeof(int) * int_64(chunk_nframes) * mem_cpy * 2)); + DPErrcheck( + gpuMemset(real_type_cpy, -1, + sizeof(int) * int_64(chunk_nframes) * mem_cpy)); + + copy_ok = 1; + for (int kk = 0; kk < chunk_nframes; ++kk) { + int frame_copied_nall = nall; + int ret = _norm_copy_coord_gpu_frame( + context, coord_cpy + int_64(kk) * mem_cpy * 3, + f_type_cpy + int_64(kk) * mem_cpy, + idx_mapping + int_64(kk) * mem_cpy, frame_copied_nall, + mem_cpy, p_coord + static_cast(ff + kk) * nall * 3, + p_box + static_cast(ff + kk) * 9, + p_f_type + static_cast(ff + kk) * nall, nloc, rcut_r); + OP_REQUIRES( + context, ret >= 0, + errors::Aborted("cannot allocate mem for copied coords")); + if (ret != 0) { + copy_ok = 0; + break; + } + DPErrcheck(gpuMemcpy(real_type_cpy + int_64(kk) * mem_cpy, + p_type + static_cast(ff + kk) * nall, + sizeof(int) * nall, + gpuMemcpyDeviceToDevice)); + } + if (copy_ok) { + coord = coord_cpy; + type = real_type_cpy; + f_type = f_type_cpy; + frame_nall = mem_cpy; + break; + } + mem_cpy *= 2; + } + OP_REQUIRES(context, copy_ok, + errors::Aborted("cannot allocate mem for copied coords")); + } + + std::vector nlist_tensors(2); + int *ilist = NULL, *numneigh = NULL, *jlist = NULL; + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nrows); + int** firstneigh = firstneigh_guard.ptr; + int chunk_max_nbor_size = max_nbor_size; + deepmd::InputNlist chunk_gpu_inlist; + if (nei_mode == 4) { + OP_REQUIRES_OK( + context, + _prepare_mesh_nlist_gpu_batch( + context, nlist_tensors.data(), chunk_gpu_inlist, firstneigh, + chunk_max_nbor_size, mesh_tensor.flat().data(), + static_cast(mesh_tensor.NumElements()), nloc, + chunk_nframes)); + } else { + TensorShape ilist_shape; + ilist_shape.AddDim(static_cast(nrows) * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, ilist_shape, + &nlist_tensors[0])); + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &nlist_tensors[1])); + int build_ok = + _build_nlist_gpu(context, nlist_tensors.data(), ilist, numneigh, + firstneigh, jlist, chunk_max_nbor_size, mem_nnei, + coord, nloc, frame_nall, max_nnei_trial, rcut_r, + chunk_nframes, nei_mode == 1 ? f_type : NULL); + OP_REQUIRES(context, build_ok, + errors::Aborted("cannot allocate mem for nlist")); + OP_REQUIRES_OK(context, + _round_built_gpu_nbor_size(chunk_max_nbor_size)); + chunk_gpu_inlist = + deepmd::InputNlist(nrows, ilist, numneigh, firstneigh); + } + max_nbor_size = std::max(max_nbor_size, chunk_max_nbor_size); + + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nrows) * sec_a.size() + nrows); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nrows) * chunk_max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + deepmd::prod_env_mat_a_gpu( + em, em_deriv, rij, nlist, coord, type, chunk_gpu_inlist, array_int, + array_longlong, chunk_max_nbor_size, avg, std, nloc, frame_nall, + chunk_nframes, rcut_r, rcut_r_smth, sec_a, f_type); + for (int kk = 0; kk < chunk_nframes; ++kk) { + _map_nei_info_gpu( + nlist + int_64(kk) * nloc * nnei, + ntype + int_64(kk) * nloc * nnei, + nmask + int_64(kk) * nloc * nnei, + p_type + static_cast(ff + kk) * nall, + nei_mode == 1 ? idx_mapping + int_64(kk) * mem_cpy : NULL, nloc, + nnei, ntypes, nei_mode == 1); + } + } + return; + } +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + // must declare out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; @@ -1208,7 +1860,8 @@ class ProdEnvMatAMixOp : public OpKernel { array_longlong = uint64_temp.flat().data(); } - // loop over samples + // LAMMPS external nlists are updated outside this op, so keep their + // existing per-sample implementation. for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; FPTYPE* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; @@ -1225,20 +1878,23 @@ class ProdEnvMatAMixOp : public OpKernel { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); + DeviceMemoryGuard firstneigh_guard; + deepmd::malloc_device_memory(firstneigh_guard.ptr, nloc); + int** firstneigh = firstneigh_guard.ptr; int* jlist = NULL; FPTYPE* coord_cpy; int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); + OP_REQUIRES_OK( + context, + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, + idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, + nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, + nei_mode, rcut_r, max_cpy_trial, max_nnei_trial)); // max_nbor_size may be changed after _prepare_coord_nlist_gpu // So we need to update the uint64_temp tensor if necessary @@ -1253,11 +1909,10 @@ class ProdEnvMatAMixOp : public OpKernel { // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, + max_nbor_size, avg, std, nloc, frame_nall, 1, rcut_r, rcut_r_smth, sec_a, f_type); _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); - deepmd::delete_device_memory(firstneigh); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -1340,6 +1995,28 @@ static int _norm_copy_coord_cpu(std::vector& coord_cpy, return (tt != max_cpy_trial); } +template +static int _norm_copy_coord_cpu_frame(FPTYPE* coord_cpy, + int* type_cpy, + int* idx_mapping, + int& frame_nall, + const int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nall, + const int& nloc, + const float& rcut_r) { + std::vector tmp_coord(static_cast(nall) * 3); + std::copy(coord, coord + static_cast(nall) * 3, tmp_coord.begin()); + deepmd::Region region; + init_region_cpu(region, box); + normalize_coord_cpu(&tmp_coord[0], nall, region); + frame_nall = nall; + return copy_coord_cpu(coord_cpy, type_cpy, idx_mapping, &frame_nall, + &tmp_coord[0], type, nloc, mem_cpy, rcut_r, region); +} + template static int _build_nlist_cpu(std::vector& ilist, std::vector& numneigh, @@ -1351,16 +2028,19 @@ static int _build_nlist_cpu(std::vector& ilist, const int& nloc, const int& new_nall, const int& max_nnei_trial, - const float& rcut_r) { + const float& rcut_r, + const int& nframes, + const int* type) { + const int nrows = nframes * nloc; int tt; for (tt = 0; tt < max_nnei_trial; ++tt) { - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nrows; ++ii) { jlist[ii].resize(mem_nnei); firstneigh[ii] = &jlist[ii][0]; } - deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + deepmd::InputNlist inlist(nrows, &ilist[0], &numneigh[0], &firstneigh[0]); int ret = build_nlist_cpu(inlist, &max_nnei, coord, nloc, new_nall, - mem_nnei, rcut_r); + mem_nnei, rcut_r, nframes, type); if (ret == 0) { break; } else { @@ -1397,6 +2077,173 @@ static void _map_nei_info_cpu(int* nlist, ntypes, b_nlist_map); } +static tensorflow::Status _validate_mesh_neighbor_counts( + int& max_numneigh, + int_64& neighbor_count, + const int* numneigh_in, + const int mesh_tensor_size, + const int_64 header_size, + const int nloc) { + max_numneigh = 0; + neighbor_count = 0; + for (int ii = 0; ii < nloc; ++ii) { + const int_64 numneigh = numneigh_in[ii]; + if (numneigh < 0 || neighbor_count > static_cast(mesh_tensor_size) - + header_size - numneigh) { + return errors::InvalidArgument("invalid mesh tensor"); + } + max_numneigh = std::max(max_numneigh, static_cast(numneigh_in[ii])); + neighbor_count += numneigh; + } + return tensorflow::Status(); +} + +static tensorflow::Status _prepare_mesh_nlist_cpu_batch( + deepmd::InputNlist& inlist, + std::vector& ilist, + std::vector& numneigh, + std::vector& firstneigh, + std::vector>& jlist, + int& max_nbor_size, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int nloc, + const int nframes) { + const int_64 header_size = 16 + static_cast(2) * nloc; + if (static_cast(mesh_tensor_size) < header_size) { + return errors::InvalidArgument("invalid mesh tensor"); + } + + const int* ilist_in = mesh_tensor_data + 16; + const int* numneigh_in = mesh_tensor_data + 16 + nloc; + const int* neighbors_in = mesh_tensor_data + header_size; + + int max_numneigh = 0; + int_64 neighbor_count = 0; + tensorflow::Status count_status = + _validate_mesh_neighbor_counts(max_numneigh, neighbor_count, numneigh_in, + mesh_tensor_size, header_size, nloc); + if (!count_status.ok()) { + return count_status; + } + + const int nrows = nframes * nloc; + ilist.resize(nrows); + numneigh.resize(nrows); + firstneigh.resize(nrows); + jlist.resize(nrows); + max_nbor_size = std::max(max_nbor_size, max_numneigh); + + std::vector neighbor_offset(nloc + 1, 0); + for (int ii = 0; ii < nloc; ++ii) { + neighbor_offset[ii + 1] = neighbor_offset[ii] + numneigh_in[ii]; + } + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = ilist_in[ii]; + numneigh[row] = numneigh_in[ii]; + jlist[row].resize(numneigh_in[ii]); + std::copy(neighbors_in + neighbor_offset[ii], + neighbors_in + neighbor_offset[ii + 1], jlist[row].begin()); + firstneigh[row] = jlist[row].data(); + } + } + inlist = deepmd::InputNlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + return tensorflow::Status(); +} + +template +static tensorflow::Status _prepare_coord_nlist_cpu_batch( + FPTYPE const** coord, + std::vector& coord_cpy, + int const** type, + std::vector& type_cpy, + std::vector& idx_mapping, + deepmd::InputNlist& inlist, + std::vector& ilist, + std::vector& numneigh, + std::vector& firstneigh, + std::vector>& jlist, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nall, + const int& nframes, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { + const int nrows = nframes * nloc; + new_nall = nall; + if (nei_mode == 1) { + int copy_ok = 0; + for (int tt = 0; tt < max_cpy_trial; ++tt) { + coord_cpy.assign(static_cast(nframes) * mem_cpy * 3, + static_cast(0)); + type_cpy.assign(static_cast(nframes) * mem_cpy, -1); + idx_mapping.assign(static_cast(nframes) * mem_cpy, -1); + + copy_ok = 1; + for (int kk = 0; kk < nframes; ++kk) { + int frame_copied_nall = nall; + int ret = _norm_copy_coord_cpu_frame( + coord_cpy.data() + static_cast(kk) * mem_cpy * 3, + type_cpy.data() + static_cast(kk) * mem_cpy, + idx_mapping.data() + static_cast(kk) * mem_cpy, + frame_copied_nall, mem_cpy, + *coord + static_cast(kk) * nall * 3, + box + static_cast(kk) * 9, + *type + static_cast(kk) * nall, nall, nloc, rcut_r); + if (ret != 0) { + copy_ok = 0; + break; + } + } + if (copy_ok) { + *coord = coord_cpy.data(); + *type = type_cpy.data(); + new_nall = mem_cpy; + break; + } + mem_cpy *= 2; + } + if (!copy_ok) { + return errors::Aborted("cannot allocate mem for copied coords"); + } + } + + ilist.resize(nrows); + numneigh.resize(nrows); + firstneigh.resize(nrows); + jlist.resize(nrows); + if (nei_mode == 4) { + tensorflow::Status status = _prepare_mesh_nlist_cpu_batch( + inlist, ilist, numneigh, firstneigh, jlist, max_nbor_size, + mesh_tensor_data, mesh_tensor_size, nloc, nframes); + if (!status.ok()) { + return status; + } + } else { + int build_ok = + _build_nlist_cpu(ilist, numneigh, firstneigh, jlist, max_nbor_size, + mem_nnei, *coord, nloc, new_nall, max_nnei_trial, + rcut_r, nframes, nei_mode == 1 ? *type : NULL); + if (!build_ok) { + return errors::Aborted("cannot allocate mem for nlist"); + } + inlist = deepmd::InputNlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()); + } + return tensorflow::Status(); +} + /** * @param[in] nei_mode -1, 1, 3, or 4. * - -1: Build neighbor list without PBC. The size of mesh should @@ -1559,6 +2406,74 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, return (tt != max_cpy_trial); } +template +static int _norm_copy_coord_gpu_frame(OpKernelContext* context, + FPTYPE* coord_cpy, + int* type_cpy, + int* idx_mapping, + int& frame_nall, + const int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const float& rcut_r) { + Tensor tmp_coord_tensor; + TensorShape tmp_coord_shape; + tmp_coord_shape.AddDim(static_cast(nloc) * 3); + tensorflow::Status status = context->allocate_temp( + DataTypeToEnum::value, tmp_coord_shape, &tmp_coord_tensor); + if (!status.ok()) { + return -1; + } + FPTYPE* tmp_coord = tmp_coord_tensor.flat().data(); + DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nloc * 3, + gpuMemcpyDeviceToDevice)); + + deepmd::Region region; + init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt + 9, box_info); + std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum = cell_info[21]; + const int total_cellnum = cell_info[22]; + + Tensor box_info_tensor; + TensorShape box_info_shape; + box_info_shape.AddDim(18); + status = context->allocate_temp(DataTypeToEnum::value, box_info_shape, + &box_info_tensor); + if (!status.ok()) { + return -1; + } + + Tensor int_tensor; + TensorShape int_shape; + int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + + nloc); + status = context->allocate_temp(DT_INT32, int_shape, &int_tensor); + if (!status.ok()) { + return -1; + } + + FPTYPE* box_info_dev = box_info_tensor.flat().data(); + int* cell_info_dev = int_tensor.flat().data(); + int* int_data_dev = cell_info_dev + 23; + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); + deepmd::normalize_coord_gpu(tmp_coord, nloc, region_dev); + + frame_nall = nloc; + return deepmd::copy_coord_gpu(coord_cpy, type_cpy, idx_mapping, &frame_nall, + int_data_dev, tmp_coord, type, nloc, mem_cpy, + loc_cellnum, total_cellnum, cell_info_dev, + region_dev); +} + template static int _build_nlist_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -1572,30 +2487,34 @@ static int _build_nlist_gpu(OpKernelContext* context, const int& nloc, const int& new_nall, const int& max_nnei_trial, - const float& rcut_r) { + const float& rcut_r, + const int& nframes, + const int* type) { + const int nrows = nframes * nloc; ilist = (*tensor_list).flat().data(); - numneigh = ilist + nloc; + numneigh = ilist + nrows; // Tensor jlist_temp; int* ind_data = NULL; - std::vector firstneigh_host(nloc); + std::vector firstneigh_host(nrows); int tt; for (tt = 0; tt < max_nnei_trial; ++tt) { jlist = (*(tensor_list + 1)).flat().data(); - ind_data = jlist + nloc * mem_nnei; - for (int_64 ii = 0; ii < nloc; ++ii) { + ind_data = jlist + int_64(nrows) * mem_nnei; + for (int_64 ii = 0; ii < nrows; ++ii) { firstneigh_host[ii] = jlist + ii * mem_nnei; } deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); - deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, - new_nall, mem_nnei, rcut_r); + deepmd::InputNlist inlist(nrows, ilist, numneigh, firstneigh); + int ret = + deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, + new_nall, mem_nnei, rcut_r, nframes, type); if (ret == 0) { break; } else { mem_nnei *= 2; TensorShape jlist_shape; - jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + jlist_shape.AddDim(3 * int_64(nrows) * mem_nnei); tensorflow::Status status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); if (!status.ok()) { @@ -1626,32 +2545,155 @@ static void _map_nei_info_gpu(int* nlist, ntypes, b_nlist_map); } +static tensorflow::Status _prepare_mesh_nlist_gpu_batch( + OpKernelContext* context, + Tensor* tensor_list, + deepmd::InputNlist& gpu_inlist, + int** firstneigh, + int& max_nbor_size, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int nloc, + const int nframes) { + const int_64 header_size = 16 + static_cast(2) * nloc; + if (static_cast(mesh_tensor_size) < header_size) { + return errors::InvalidArgument("invalid mesh tensor"); + } + + // Decode the external mesh on host first so malformed neighbor counts are + // rejected before allocating the flattened GPU list. + std::vector mesh_tensor_data_host(mesh_tensor_size); + deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host); + const int* ilist_in = mesh_tensor_data_host.data() + 16; + const int* numneigh_in = mesh_tensor_data_host.data() + 16 + nloc; + const int* neighbors_in = mesh_tensor_data_host.data() + header_size; + + int max_numneigh = 0; + int_64 neighbor_count = 0; + tensorflow::Status count_status = + _validate_mesh_neighbor_counts(max_numneigh, neighbor_count, numneigh_in, + mesh_tensor_size, header_size, nloc); + if (!count_status.ok()) { + return count_status; + } + if (max_numneigh > GPU_MAX_NBOR_SIZE) { + return errors::InvalidArgument( + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_numneigh) + " is larger than " + + std::to_string(GPU_MAX_NBOR_SIZE) + + ", which currently is not supported by deepmd-kit."); + } + + if (max_numneigh <= 256) { + max_nbor_size = 256; + } else if (max_numneigh <= 512) { + max_nbor_size = 512; + } else if (max_numneigh <= 1024) { + max_nbor_size = 1024; + } else if (max_numneigh <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + + const int nrows = nframes * nloc; + TensorShape ilist_shape; + ilist_shape.AddDim(static_cast(nrows) * 2); + tensorflow::Status status = + context->allocate_temp(DT_INT32, ilist_shape, tensor_list); + if (!status.ok()) { + return status; + } + TensorShape jlist_shape; + jlist_shape.AddDim(int_64(nrows) * max_nbor_size); + status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); + if (!status.ok()) { + return status; + } + + // Repeat the single-frame external mesh layout for each frame in the batch. + std::vector ilist_host(nrows); + std::vector numneigh_host(nrows); + std::vector nbor_list_host(static_cast(nrows) * max_nbor_size, + 0); + std::vector neighbor_offset(nloc + 1, 0); + for (int ii = 0; ii < nloc; ++ii) { + neighbor_offset[ii + 1] = neighbor_offset[ii] + numneigh_in[ii]; + } + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist_host[row] = ilist_in[ii]; + numneigh_host[row] = numneigh_in[ii]; + for (int jj = 0; jj < numneigh_in[ii]; ++jj) { + nbor_list_host[static_cast(row) * max_nbor_size + jj] = + neighbors_in[neighbor_offset[ii] + jj]; + } + } + } + + int* ilist = (*tensor_list).flat().data(); + int* numneigh = ilist + nrows; + int* nbor_list = (*(tensor_list + 1)).flat().data(); + deepmd::memcpy_host_to_device(ilist, ilist_host); + deepmd::memcpy_host_to_device(numneigh, numneigh_host); + deepmd::memcpy_host_to_device(nbor_list, nbor_list_host); + + // Store device-side row pointers separately; InputNlist expects firstneigh + // to point to each row in the contiguous neighbor buffer. + std::vector firstneigh_host(nrows); + for (int ii = 0; ii < nrows; ++ii) { + firstneigh_host[ii] = nbor_list + static_cast(ii) * max_nbor_size; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + gpu_inlist = deepmd::InputNlist(nrows, ilist, numneigh, firstneigh); + return tensorflow::Status(); +} + +static tensorflow::Status _round_built_gpu_nbor_size(int& max_nbor_size) { + if (max_nbor_size > GPU_MAX_NBOR_SIZE) { + return errors::InvalidArgument( + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_nbor_size) + " is larger than " + + std::to_string(GPU_MAX_NBOR_SIZE) + + ", which currently is not supported by deepmd-kit."); + } + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + return tensorflow::Status(); +} + template -void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { +tensorflow::Status _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { if (nei_mode != 3 && nei_mode != 4) { inlist.inum = nloc; // build nlist by myself @@ -1660,8 +2702,9 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context, int copy_ok = _norm_copy_coord_gpu( context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r); - OP_REQUIRES(context, copy_ok, - errors::Aborted("cannot allocate mem for copied coords")); + if (!copy_ok) { + return errors::Aborted("cannot allocate mem for copied coords"); + } *coord = coord_cpy; *type = type_cpy; } @@ -1670,14 +2713,12 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context, _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall, max_nnei_trial, rcut_r); - OP_REQUIRES(context, build_ok, - errors::Aborted("cannot allocate mem for nlist")); - if (max_nbor_size <= 1024) { - max_nbor_size = 1024; - } else if (max_nbor_size <= 2048) { - max_nbor_size = 2048; - } else { - max_nbor_size = 4096; + if (!build_ok) { + return errors::Aborted("cannot allocate mem for nlist"); + } + tensorflow::Status status = _round_built_gpu_nbor_size(max_nbor_size); + if (!status.ok()) { + return status; } inlist.ilist = ilist; inlist.numneigh = numneigh; @@ -1706,22 +2747,22 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context, std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*)); std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**)); // copy from cpu to gpu - int* fake_mesh_dev = NULL; - deepmd::malloc_device_memory(fake_mesh_dev, 16); - deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh); + DeviceMemoryGuard fake_mesh_guard; + deepmd::malloc_device_memory(fake_mesh_guard.ptr, 16); + deepmd::memcpy_host_to_device(fake_mesh_guard.ptr, fake_mesh); deepmd::InputNlist inlist_temp; inlist_temp.inum = nloc; // everything should be copied to GPU... deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, - nbor_list_dev, fake_mesh_dev, 16); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); - deepmd::delete_device_memory(fake_mesh_dev); + nbor_list_dev, fake_mesh_guard.ptr, 16); + if (max_numneigh(inlist_temp) > max_nbor_size) { + return errors::InvalidArgument( + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + + std::to_string(max_nbor_size) + + ", which currently is not supported by deepmd-kit."); + } } else { // update nbor list deepmd::InputNlist inlist_temp; @@ -1729,13 +2770,15 @@ void _prepare_coord_nlist_gpu(OpKernelContext* context, deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, nbor_list_dev, mesh_tensor_data, mesh_tensor_size); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); + if (max_numneigh(inlist_temp) > max_nbor_size) { + return errors::InvalidArgument( + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + + std::to_string(max_nbor_size) + + ", which currently is not supported by deepmd-kit."); + } } + return tensorflow::Status(); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From 0c4e30d69c17aef51f45d2a62cd6eea2e7069531 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 26 Jun 2026 18:27:01 +0800 Subject: [PATCH 2/5] test(core): cover batched GPU env mat outputs --- source/lib/tests/test_env_mat_a.cc | 52 ++++++--- source/lib/tests/test_env_mat_a_mix.cc | 73 ++++++++++-- source/lib/tests/test_env_mat_r.cc | 156 +++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 25 deletions(-) diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index 203291d58c..f67ac096ac 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -818,7 +818,11 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) { std::vector avg(static_cast(ntypes) * ndescrpt, 0); std::vector std(static_cast(ntypes) * ndescrpt, 1); std::vector expected_multi(static_cast(nframes) * nloc * - ndescrpt); + ndescrpt), + expected_deriv_multi(static_cast(nframes) * nloc * ndescrpt * 3), + expected_rij_multi(static_cast(nframes) * nloc * nnei * 3); + std::vector expected_nlist_multi(static_cast(nframes) * nloc * + nnei); for (int ff = 0; ff < nframes; ++ff) { std::vector frame_em(static_cast(nloc) * ndescrpt), frame_em_deriv(static_cast(nloc) * ndescrpt * 3), @@ -830,9 +834,18 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) { posi_multi.data() + static_cast(ff) * nall * 3, atype_multi.data() + static_cast(ff) * nall, base_inlist, max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a); - std::copy( - frame_em.begin(), frame_em.end(), - expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + std::copy(frame_em.begin(), frame_em.end(), + expected_multi.begin() + em_offset); + std::copy(frame_em_deriv.begin(), frame_em_deriv.end(), + expected_deriv_multi.begin() + deriv_offset); + std::copy(frame_rij.begin(), frame_rij.end(), + expected_rij_multi.begin() + rij_offset); + std::copy(frame_nlist.begin(), frame_nlist.end(), + expected_nlist_multi.begin() + nlist_offset); } double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; @@ -865,8 +878,12 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) { array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, nall, nframes, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(rij_dev); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_dev); deepmd::delete_device_memory(atype_dev); @@ -877,16 +894,23 @@ TEST_F(TestEnvMatA, prod_gpu_multiple_frames) { deepmd::delete_device_memory(memory_dev); deepmd::free_nlist_gpu_device(gpu_inlist); - for (int ff = 0; ff < nframes; ++ff) { - for (int ii = 0; ii < nloc; ++ii) { - for (int jj = 0; jj < nnei; ++jj) { - for (int dd = 0; dd < 4; ++dd) { - const int_64 idx = - (static_cast(ff) * nloc + ii) * nnei * 4 + jj * 4 + dd; - EXPECT_LT(fabs(em[idx] - expected_multi[idx]), 1e-5); - } - } - } + for (size_t ii = 0; ii < em.size(); ++ii) { + EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10) + << "index " << ii << " em " << em[ii] << " expected " + << expected_multi[ii]; + } + for (size_t ii = 0; ii < em_deriv.size(); ++ii) { + EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10) + << "index " << ii << " em_deriv " << em_deriv[ii] << " expected " + << expected_deriv_multi[ii]; + } + for (size_t ii = 0; ii < rij.size(); ++ii) { + EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10) + << "index " << ii << " rij " << rij[ii] << " expected " + << expected_rij_multi[ii]; + } + for (size_t ii = 0; ii < nlist.size(); ++ii) { + EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii; } } diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc index ae91256c1a..fdb4abe9c2 100644 --- a/source/lib/tests/test_env_mat_a_mix.cc +++ b/source/lib/tests/test_env_mat_a_mix.cc @@ -691,7 +691,11 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) { } std::vector expected_multi(static_cast(nframes) * nloc * - ndescrpt); + ndescrpt), + expected_deriv_multi(static_cast(nframes) * nloc * ndescrpt * 3), + expected_rij_multi(static_cast(nframes) * nloc * nnei * 3); + std::vector expected_nlist_multi(static_cast(nframes) * nloc * + nnei); for (int ff = 0; ff < nframes; ++ff) { std::vector frame_em(static_cast(nloc) * ndescrpt), frame_em_deriv(static_cast(nloc) * ndescrpt * 3), @@ -704,9 +708,18 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) { atype_multi.data() + static_cast(ff) * nall, base_inlist, max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a, f_atype_multi.data() + static_cast(ff) * nall); - std::copy( - frame_em.begin(), frame_em.end(), - expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + std::copy(frame_em.begin(), frame_em.end(), + expected_multi.begin() + em_offset); + std::copy(frame_em_deriv.begin(), frame_em_deriv.end(), + expected_deriv_multi.begin() + deriv_offset); + std::copy(frame_rij.begin(), frame_rij.end(), + expected_rij_multi.begin() + rij_offset); + std::copy(frame_nlist.begin(), frame_nlist.end(), + expected_nlist_multi.begin() + nlist_offset); } std::vector em(static_cast(nframes) * nloc * ndescrpt), @@ -718,9 +731,18 @@ TEST_F(TestEnvMatAMix, prod_cpu_multiple_frames) { atype_multi.data(), inlist, max_nbor_size, avg.data(), std.data(), nloc, nall, nframes, rc, rc_smth, sec_a, f_atype_multi.data()); - for (int ii = 0; ii < em.size(); ++ii) { + for (size_t ii = 0; ii < em.size(); ++ii) { EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10); } + for (size_t ii = 0; ii < em_deriv.size(); ++ii) { + EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10); + } + for (size_t ii = 0; ii < rij.size(); ++ii) { + EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10); + } + for (size_t ii = 0; ii < nlist.size(); ++ii) { + EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]); + } } #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -997,7 +1019,11 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) { } std::vector expected_multi(static_cast(nframes) * nloc * - ndescrpt); + ndescrpt), + expected_deriv_multi(static_cast(nframes) * nloc * ndescrpt * 3), + expected_rij_multi(static_cast(nframes) * nloc * nnei * 3); + std::vector expected_nlist_multi(static_cast(nframes) * nloc * + nnei); for (int ff = 0; ff < nframes; ++ff) { std::vector frame_em(static_cast(nloc) * ndescrpt), frame_em_deriv(static_cast(nloc) * ndescrpt * 3), @@ -1010,9 +1036,18 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) { atype_multi.data() + static_cast(ff) * nall, base_inlist, max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a, f_atype_multi.data() + static_cast(ff) * nall); - std::copy( - frame_em.begin(), frame_em.end(), - expected_multi.begin() + static_cast(ff) * nloc * ndescrpt); + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + std::copy(frame_em.begin(), frame_em.end(), + expected_multi.begin() + em_offset); + std::copy(frame_em_deriv.begin(), frame_em_deriv.end(), + expected_deriv_multi.begin() + deriv_offset); + std::copy(frame_rij.begin(), frame_rij.end(), + expected_rij_multi.begin() + rij_offset); + std::copy(frame_nlist.begin(), frame_nlist.end(), + expected_nlist_multi.begin() + nlist_offset); } std::vector em(static_cast(nframes) * nloc * ndescrpt, 0.0), @@ -1052,6 +1087,9 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) { nall, nframes, rc, rc_smth, sec_a, f_atype_dev); deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); deepmd::delete_device_memory(rij_dev); @@ -1066,10 +1104,23 @@ TEST_F(TestEnvMatAMix, prod_gpu_multiple_frames) { deepmd::delete_device_memory(memory_dev); deepmd::free_nlist_gpu_device(gpu_inlist); - for (int ii = 0; ii < em.size(); ++ii) { - EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-5) + for (size_t ii = 0; ii < em.size(); ++ii) { + EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10) << "index " << ii << " em " << em[ii] << " expected " << expected_multi[ii]; } + for (size_t ii = 0; ii < em_deriv.size(); ++ii) { + EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10) + << "index " << ii << " em_deriv " << em_deriv[ii] << " expected " + << expected_deriv_multi[ii]; + } + for (size_t ii = 0; ii < rij.size(); ++ii) { + EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10) + << "index " << ii << " rij " << rij[ii] << " expected " + << expected_rij_multi[ii]; + } + for (size_t ii = 0; ii < nlist.size(); ++ii) { + EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii; + } } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 82d6b6c229..f3659db074 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include +#include #include #include "env_mat.h" @@ -556,6 +557,161 @@ TEST_F(TestEnvMatR, prod_gpu) { } } +TEST_F(TestEnvMatR, prod_gpu_multiple_frames) { + EXPECT_EQ(nlist_r_cpy.size(), nloc); + constexpr int nframes = 2; + int max_nbor_size = 0; + for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) { + if (nlist_a_cpy[ii].size() > max_nbor_size) { + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + + std::vector base_ilist(nloc), base_numneigh(nloc); + std::vector base_firstneigh(nloc); + deepmd::InputNlist base_inlist(nloc, base_ilist.data(), base_numneigh.data(), + base_firstneigh.data()); + convert_nlist(base_inlist, nlist_a_cpy); + + const int nrows = nframes * nloc; + std::vector ilist(nrows), numneigh(nrows); + std::vector firstneigh(nrows); + for (int ff = 0; ff < nframes; ++ff) { + for (int ii = 0; ii < nloc; ++ii) { + const int row = ff * nloc + ii; + ilist[row] = base_ilist[ii]; + numneigh[row] = base_numneigh[ii]; + firstneigh[row] = base_firstneigh[ii]; + } + } + deepmd::InputNlist inlist(nrows, ilist.data(), numneigh.data(), + firstneigh.data()), + gpu_inlist; + + std::vector posi_multi = posi_cpy; + posi_multi.insert(posi_multi.end(), posi_cpy.begin(), posi_cpy.end()); + for (int ii = 0; ii < nall; ++ii) { + const size_t offset = (static_cast(nall) + ii) * 3; + posi_multi[offset] += 0.01 * (ii + 1); + posi_multi[offset + 1] -= 0.02 * (ii % 3); + posi_multi[offset + 2] += 0.015 * (ii % 5); + } + std::vector atype_multi = atype_cpy; + atype_multi.insert(atype_multi.end(), atype_cpy.begin(), atype_cpy.end()); + std::vector avg(static_cast(ntypes) * ndescrpt, 0); + std::vector std(static_cast(ntypes) * ndescrpt, 1); + + std::vector expected_multi(static_cast(nframes) * nloc * + ndescrpt), + expected_deriv_multi(static_cast(nframes) * nloc * ndescrpt * 3), + expected_rij_multi(static_cast(nframes) * nloc * nnei * 3); + std::vector expected_nlist_multi(static_cast(nframes) * nloc * + nnei); + for (int ff = 0; ff < nframes; ++ff) { + std::vector frame_em(static_cast(nloc) * ndescrpt), + frame_em_deriv(static_cast(nloc) * ndescrpt * 3), + frame_rij(static_cast(nloc) * nnei * 3); + std::vector frame_nlist(static_cast(nloc) * nnei); + deepmd::prod_env_mat_r_cpu( + frame_em.data(), frame_em_deriv.data(), frame_rij.data(), + frame_nlist.data(), + posi_multi.data() + static_cast(ff) * nall * 3, + atype_multi.data() + static_cast(ff) * nall, base_inlist, + max_nbor_size, avg.data(), std.data(), nloc, nall, rc, rc_smth, sec_a); + + const size_t em_offset = static_cast(ff) * nloc * ndescrpt; + const size_t deriv_offset = em_offset * 3; + const size_t rij_offset = static_cast(ff) * nloc * nnei * 3; + const size_t nlist_offset = static_cast(ff) * nloc * nnei; + std::copy(frame_em.begin(), frame_em.end(), + expected_multi.begin() + em_offset); + std::copy(frame_em_deriv.begin(), frame_em_deriv.end(), + expected_deriv_multi.begin() + deriv_offset); + std::copy(frame_rij.begin(), frame_rij.end(), + expected_rij_multi.begin() + rij_offset); + std::copy(frame_nlist.begin(), frame_nlist.end(), + expected_nlist_multi.begin() + nlist_offset); + } + + std::vector em(static_cast(nframes) * nloc * ndescrpt, 0.0), + em_deriv(static_cast(nframes) * nloc * ndescrpt * 3, 0.0), + rij(static_cast(nframes) * nloc * nnei * 3, 0.0); + std::vector nlist(static_cast(nframes) * nloc * nnei, 0); + + double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; + double *posi_dev = NULL, *avg_dev = NULL, *std_dev = NULL; + int *atype_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, + *memory_dev = NULL; + uint_64* array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_dev, posi_multi); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(atype_dev, atype_multi); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory( + array_int_dev, sec_a.size() + + static_cast(nframes) * nloc * sec_a.size() + + static_cast(nframes) * nloc); + deepmd::malloc_device_memory( + array_longlong_dev, + static_cast(nframes) * nloc * max_nbor_size * 2); + deepmd::malloc_device_memory( + memory_dev, static_cast(nframes) * nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, + max_nbor_size); + + deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_dev, + atype_dev, gpu_inlist, array_int_dev, + array_longlong_dev, max_nbor_size, avg_dev, + std_dev, nloc, nall, nframes, rc, rc_smth, sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + for (size_t ii = 0; ii < em.size(); ++ii) { + EXPECT_LT(fabs(em[ii] - expected_multi[ii]), 1e-10) + << "index " << ii << " em " << em[ii] << " expected " + << expected_multi[ii]; + } + for (size_t ii = 0; ii < em_deriv.size(); ++ii) { + EXPECT_LT(fabs(em_deriv[ii] - expected_deriv_multi[ii]), 1e-10) + << "index " << ii << " em_deriv " << em_deriv[ii] << " expected " + << expected_deriv_multi[ii]; + } + for (size_t ii = 0; ii < rij.size(); ++ii) { + EXPECT_LT(fabs(rij[ii] - expected_rij_multi[ii]), 1e-10) + << "index " << ii << " rij " << rij[ii] << " expected " + << expected_rij_multi[ii]; + } + for (size_t ii = 0; ii < nlist.size(); ++ii) { + EXPECT_EQ(nlist[ii], expected_nlist_multi[ii]) << "index " << ii; + } +} + TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; From d6ea9eb5d9639ae971c2e07d3eabbe031879973c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 28 Jun 2026 02:07:44 +0800 Subject: [PATCH 3/5] docs(lib): document batched neighbor list inputs --- source/lib/include/neighbor_list.h | 38 ++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index f7c1ddde10..7eb3f86361 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -144,14 +144,25 @@ int max_numneigh(const InputNlist& to_nlist); // build neighbor list. // outputs // nlist, max_list_size +// nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx +// stores the neighbors of local atom atom_idx in that frame. // max_list_size is the maximal size of jlist. // inputs -// c_cpy, nloc, nall, mem_size, rcut, region -// mem_size is the size of allocated memory for jlist. +// c_cpy, nloc, nall, mem_size, rcut, nframes, type +// c_cpy stores nframes coordinate blocks with nall atoms each, including +// ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim]. +// nloc is the number of local atoms per frame; nall is the total number of +// local and ghost atoms per frame. +// mem_size is the size of allocated memory for each jlist row. +// nframes is the number of coordinate frames. +// type may be nullptr/NULL, or a per-atom type array. +// The type array uses the same frame-major nall stride as c_cpy: +// type[frame_idx * nall + atom]. +// When the center or neighbor atom has type < 0, that pair is excluded. // returns // 0: successful // 1: the memory is not large enough to hold all neighbors. -// i.e. max_list_size > mem_nall +// i.e. max_list_size > mem_size template int build_nlist_cpu(InputNlist& nlist, int* max_list_size, @@ -210,14 +221,27 @@ void use_nlist_map(int* nlist, // build neighbor list. // outputs // nlist, max_list_size +// nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx +// stores the neighbors of local atom atom_idx in that frame. // max_list_size is the maximal size of jlist. // inputs -// c_cpy, nloc, nall, mem_size, rcut, region -// mem_size is the size of allocated memory for jlist. +// nlist_data, c_cpy, nloc, nall, mem_size, rcut, nframes, type +// nlist_data is temporary GPU workspace of at least +// 2 * nframes * nloc * mem_size ints. +// c_cpy stores nframes coordinate blocks with nall atoms each, including +// ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim]. +// nloc is the number of local atoms per frame; nall is the total number of +// local and ghost atoms per frame. +// mem_size is the size of allocated memory for each jlist row. +// nframes is the number of coordinate frames. +// type may be nullptr/NULL, or a per-atom type array. +// The type array uses the same frame-major nall stride as c_cpy: +// type[frame_idx * nall + atom]. +// When the center or neighbor atom has type < 0, that pair is excluded. // returns // 0: successful -// 1: the memory is not large enough to hold all neighbors. -// i.e. max_list_size > mem_nall +// 1: the temporary row storage is not large enough. +// i.e. mem_size < nall template int build_nlist_gpu(InputNlist& nlist, int* max_list_size, From c0accc2aa53d187aba4a58884720645acd774445 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 1 Jul 2026 14:45:35 +0800 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: A bot of @njzjz <48687836+njzjz-bot@users.noreply.github.com> Signed-off-by: Jinzhe Zeng --- source/lib/tests/test_neighbor_list.cc | 1 + source/op/tf/prod_env_mat_multi_device.cc | 40 +++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc index a643c8c321..a229a3b746 100644 --- a/source/lib/tests/test_neighbor_list.cc +++ b/source/lib/tests/test_neighbor_list.cc @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include +#include #include #include "device.h" diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc index 799f9709f3..7422d73745 100644 --- a/source/op/tf/prod_env_mat_multi_device.cc +++ b/source/op/tf/prod_env_mat_multi_device.cc @@ -2128,6 +2128,34 @@ static tensorflow::Status _validate_mesh_neighbor_counts( return tensorflow::Status(); } +static tensorflow::Status _validate_mesh_indices( + const int* ilist_in, + const int* numneigh_in, + const int* neighbors_in, + const int nloc, + const int_64 neighbor_count) { + std::vector seen(static_cast(nloc), 0); + int_64 neighbor_offset = 0; + for (int ii = 0; ii < nloc; ++ii) { + const int i_idx = ilist_in[ii]; + if (i_idx < 0 || i_idx >= nloc || seen[static_cast(i_idx)]) { + return errors::InvalidArgument("invalid mesh tensor"); + } + seen[static_cast(i_idx)] = 1; + + for (int jj = 0; jj < numneigh_in[ii]; ++jj) { + if (neighbors_in[neighbor_offset + jj] < 0) { + return errors::InvalidArgument("invalid mesh tensor"); + } + } + neighbor_offset += numneigh_in[ii]; + } + if (neighbor_offset != neighbor_count) { + return errors::InvalidArgument("invalid mesh tensor"); + } + return tensorflow::Status(); +} + static tensorflow::Status _prepare_mesh_nlist_cpu_batch( deepmd::InputNlist& inlist, std::vector& ilist, @@ -2157,6 +2185,12 @@ static tensorflow::Status _prepare_mesh_nlist_cpu_batch( return count_status; } + tensorflow::Status index_status = _validate_mesh_indices( + ilist_in, numneigh_in, neighbors_in, nloc, neighbor_count); + if (!index_status.ok()) { + return index_status; + } + const int nrows = nframes * nloc; ilist.resize(nrows); numneigh.resize(nrows); @@ -2614,6 +2648,12 @@ static tensorflow::Status _prepare_mesh_nlist_gpu_batch( ", which currently is not supported by deepmd-kit."); } + tensorflow::Status index_status = _validate_mesh_indices( + ilist_in, numneigh_in, neighbors_in, nloc, neighbor_count); + if (!index_status.ok()) { + return index_status; + } + if (max_numneigh <= 256) { max_nbor_size = 256; } else if (max_numneigh <= 512) { From c98dc29adec4af241c50bcb7e0f7ab02656c0616 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Jul 2026 06:46:21 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/op/tf/prod_env_mat_multi_device.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/source/op/tf/prod_env_mat_multi_device.cc b/source/op/tf/prod_env_mat_multi_device.cc index 7422d73745..66a0a45c88 100644 --- a/source/op/tf/prod_env_mat_multi_device.cc +++ b/source/op/tf/prod_env_mat_multi_device.cc @@ -2128,12 +2128,11 @@ static tensorflow::Status _validate_mesh_neighbor_counts( return tensorflow::Status(); } -static tensorflow::Status _validate_mesh_indices( - const int* ilist_in, - const int* numneigh_in, - const int* neighbors_in, - const int nloc, - const int_64 neighbor_count) { +static tensorflow::Status _validate_mesh_indices(const int* ilist_in, + const int* numneigh_in, + const int* neighbors_in, + const int nloc, + const int_64 neighbor_count) { std::vector seen(static_cast(nloc), 0); int_64 neighbor_offset = 0; for (int ii = 0; ii < nloc; ++ii) {