From b3a130dc31d992dc5a0b4612415babe2820935bb Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 10 Jun 2023 01:58:26 -0400 Subject: [PATCH] prod_force_grad: support multiple frames in parallel Signed-off-by: Jinzhe Zeng (cherry picked from commit 91b98b4b664f0184d6213dfcf55634a7bc5e34ab) --- source/lib/include/prod_force_grad.h | 18 ++++--- source/lib/src/cuda/prod_force_grad.cu | 57 +++++++++++++--------- source/lib/src/prod_force_grad.cc | 32 +++++++----- source/lib/src/rocm/prod_force_grad.hip.cu | 57 +++++++++++++--------- source/lib/tests/test_prod_force_grad_a.cc | 32 +++++++----- source/lib/tests/test_prod_force_grad_r.cc | 32 +++++++----- source/op/prod_force_grad_multi_device.cc | 45 ++++++----------- 7 files changed, 160 insertions(+), 113 deletions(-) diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h index b0b5a59a18..56f4975222 100644 --- a/source/lib/include/prod_force_grad.h +++ b/source/lib/include/prod_force_grad.h @@ -8,7 +8,8 @@ void prod_force_grad_a_cpu(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_cpu(FPTYPE* grad_net, @@ -16,7 +17,8 @@ void prod_force_grad_r_cpu(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); #if GOOGLE_CUDA template @@ -25,7 +27,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, @@ -33,7 +36,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM @@ -43,7 +47,8 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, @@ -51,6 +56,7 @@ void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu index b44f191ffb..b0429d1492 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/cuda/prod_force_grad.cu @@ -31,12 +31,13 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { // idy -> nnei const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; const unsigned int idw = threadIdx.y; - if (idx >= nloc) { + if (idx >= nframes * nloc) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -44,8 +45,10 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, return; } if (j_idx >= nloc) j_idx = j_idx % nloc; - grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot( - grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); + const int kk = idx / nloc; // frame index + grad_net[idx * nnei * 4 + idy * 4 + idw] += + dev_dot(grad + kk * nloc * 3 + j_idx * 3, + env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); } template @@ -54,11 +57,12 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { // idy -> nnei const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; - if (idx >= nloc) { + if (idx >= nframes * nloc) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -66,8 +70,9 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, return; } if (j_idx >= nloc) j_idx = j_idx % nloc; - grad_net[idx * nnei + idy] += - dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3); + const int kk = idx / nloc; // frame index + grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3, + env_deriv + idx * nnei * 3 + idy * 3); } namespace deepmd { @@ -77,11 +82,13 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck( + cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); force_grad_wrt_center_atom<<>>(grad_net, grad, env_deriv, ndescrpt); @@ -89,11 +96,11 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; - const int nblock_ = (nloc + LEN - 1) / LEN; + const int nblock_ = (nframes * nloc + LEN - 1) / LEN; dim3 block_grid_(nblock_, nnei); dim3 thread_grid_(LEN, 4); force_grad_wrt_neighbors_a<<>>( - grad_net, grad, env_deriv, nlist, nloc, nnei); + grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -104,11 +111,13 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck( + cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); force_grad_wrt_center_atom<<>>(grad_net, grad, env_deriv, ndescrpt); @@ -116,11 +125,11 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; - const int nblock_ = (nloc + LEN - 1) / LEN; + const int nblock_ = (nframes * nloc + LEN - 1) / LEN; dim3 block_grid_(nblock_, nnei); dim3 thread_grid_(LEN, 1); force_grad_wrt_neighbors_r<<>>( - grad_net, grad, env_deriv, nlist, nloc, nnei); + grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -130,23 +139,27 @@ template void prod_force_grad_a_gpu_cuda(float* grad_net, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_a_gpu_cuda(double* grad_net, const double* grad, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_cuda(float* grad_net, const float* grad, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_cuda(double* grad_net, const double* grad, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc index adc52baaf0..920601c113 100644 --- a/source/lib/src/prod_force_grad.cc +++ b/source/lib/src/prod_force_grad.cc @@ -24,11 +24,12 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nframes * nloc; ++ii) { for (int aa = 0; aa < ndescrpt; ++aa) { grad_net[ii * ndescrpt + aa] = (FPTYPE)0.; } @@ -36,7 +37,7 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net, // compute grad of one frame #pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nframes * nloc; ++ii) { int i_idx = ii; // deriv wrt center atom @@ -55,10 +56,11 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net, if (j_idx < 0) continue; int aa_start, aa_end; make_index_range(aa_start, aa_end, jj, nnei); + const int kk = i_idx / nloc; // frame index for (int aa = aa_start; aa < aa_end; ++aa) { for (int dd = 0; dd < 3; ++dd) { grad_net[i_idx * ndescrpt + aa] += - grad[j_idx * 3 + dd] * + grad[kk * nloc * 3 + j_idx * 3 + dd] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd]; } } @@ -71,14 +73,16 @@ template void deepmd::prod_force_grad_a_cpu(double* grad_net, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void deepmd::prod_force_grad_a_cpu(float* grad_net, const float* grad, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net, @@ -86,7 +90,8 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) + const int nnei, + const int nframes) // // grad_net: nloc x ndescrpt // grad: nloc x 3 @@ -97,7 +102,7 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net, const int ndescrpt = nnei * 1; // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nframes * nloc; ++ii) { for (int aa = 0; aa < ndescrpt; ++aa) { grad_net[ii * ndescrpt + aa] = (FPTYPE)0.; } @@ -105,7 +110,7 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net, // compute grad of one frame #pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nframes * nloc; ++ii) { int i_idx = ii; // deriv wrt center atom @@ -122,9 +127,10 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net, int j_idx = nlist[i_idx * nnei + jj]; if (j_idx >= nloc) j_idx = j_idx % nloc; if (j_idx < 0) continue; + int kk = i_idx / nloc; // frame index for (int dd = 0; dd < 3; ++dd) { grad_net[i_idx * ndescrpt + jj] += - grad[j_idx * 3 + dd] * + grad[kk * nloc * 3 + j_idx * 3 + dd] * env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd]; } } @@ -136,11 +142,13 @@ template void deepmd::prod_force_grad_r_cpu(double* grad_net, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void deepmd::prod_force_grad_r_cpu(float* grad_net, const float* grad, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu index e266389eb6..bb7fe7792f 100644 --- a/source/lib/src/rocm/prod_force_grad.hip.cu +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -31,12 +31,13 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { // idy -> nnei const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; const unsigned int idw = threadIdx.y; - if (idx >= nloc) { + if (idx >= nframes * nloc) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -44,8 +45,10 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, return; } if (j_idx >= nloc) j_idx = j_idx % nloc; - grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot( - grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); + const int kk = idx / nloc; // frame index + grad_net[idx * nnei * 4 + idy * 4 + idw] += + dev_dot(grad + kk * nloc * 3 + j_idx * 3, + env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); } template @@ -54,11 +57,12 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { // idy -> nnei const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; - if (idx >= nloc) { + if (idx >= nframes * nloc) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -66,8 +70,9 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, return; } if (j_idx >= nloc) j_idx = j_idx % nloc; - grad_net[idx * nnei + idy] += - dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3); + const int kk = idx / nloc; // frame index + grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3, + env_deriv + idx * nnei * 3 + idy * 3); } namespace deepmd { @@ -77,22 +82,24 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; - DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck( + hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; - const int nblock_ = (nloc + LEN - 1) / LEN; + const int nblock_ = (nframes * nloc + LEN - 1) / LEN; dim3 block_grid_(nblock_, nnei); dim3 thread_grid_(LEN, 4); hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, - 0, grad_net, grad, env_deriv, nlist, nloc, nnei); + 0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -103,11 +110,13 @@ void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, const FPTYPE* env_deriv, const int* nlist, const int nloc, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; - DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck( + hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); @@ -115,11 +124,11 @@ void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; - const int nblock_ = (nloc + LEN - 1) / LEN; + const int nblock_ = (nframes * nloc + LEN - 1) / LEN; dim3 block_grid_(nblock_, nnei); dim3 thread_grid_(LEN, 1); hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, - 0, grad_net, grad, env_deriv, nlist, nloc, nnei); + 0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -129,23 +138,27 @@ template void prod_force_grad_a_gpu_rocm(float* grad_net, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_a_gpu_rocm(double* grad_net, const double* grad, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_rocm(float* grad_net, const float* grad, const float* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); template void prod_force_grad_r_gpu_rocm(double* grad_net, const double* grad, const double* env_deriv, const int* nlist, const int nloc, - const int nnei); + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc index bd88a86c77..68baf85571 100644 --- a/source/lib/tests/test_prod_force_grad_a.cc +++ b/source/lib/tests/test_prod_force_grad_a.cc @@ -8,6 +8,11 @@ #include "neighbor_list.h" #include "prod_force_grad.h" +template +inline void double_vec(std::vector& v) { + v.insert(std::end(v), std::begin(v), std::end(v)); +} + class TestProdForceGradA : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -16,6 +21,7 @@ class TestProdForceGradA : public ::testing::Test { std::vector atype = {0, 1, 1, 0, 1, 1}; std::vector posi_cpy; std::vector atype_cpy; + int nframes = 2; int ntypes = 2; int nloc, nall, nnei, ndescrpt; double rc = 6; @@ -113,15 +119,19 @@ class TestProdForceGradA : public ::testing::Test { for (int ii = 0; ii < nloc * 3; ++ii) { grad[ii] = 10 - ii * 0.1; } + double_vec(grad); + double_vec(nlist); + double_vec(env_deriv); + double_vec(expected_grad_net); } void TearDown() override {} }; TEST_F(TestProdForceGradA, cpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); deepmd::prod_force_grad_a_cpu(&grad_net[0], &grad[0], &env_deriv[0], - &nlist[0], nloc, nnei); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + &nlist[0], nloc, nnei, nframes); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); @@ -134,23 +144,23 @@ TEST_F(TestProdForceGradA, cpu) { #if GOOGLE_CUDA TEST_F(TestProdForceGradA, gpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); int* nlist_dev = NULL; double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL; deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); deepmd::prod_force_grad_a_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); deepmd::delete_device_memory(env_deriv_dev); deepmd::delete_device_memory(grad_net_dev); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); @@ -164,23 +174,23 @@ TEST_F(TestProdForceGradA, gpu) { #if TENSORFLOW_USE_ROCM TEST_F(TestProdForceGradA, gpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); int* nlist_dev = NULL; double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL; deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); deepmd::prod_force_grad_a_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); deepmd::delete_device_memory(env_deriv_dev); deepmd::delete_device_memory(grad_net_dev); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc index b0c062902f..2c4e088f3d 100644 --- a/source/lib/tests/test_prod_force_grad_r.cc +++ b/source/lib/tests/test_prod_force_grad_r.cc @@ -8,6 +8,11 @@ #include "neighbor_list.h" #include "prod_force_grad.h" +template +inline void double_vec(std::vector& v) { + v.insert(std::end(v), std::begin(v), std::end(v)); +} + class TestProdForceGradR : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -16,6 +21,7 @@ class TestProdForceGradR : public ::testing::Test { std::vector atype = {0, 1, 1, 0, 1, 1}; std::vector posi_cpy; std::vector atype_cpy; + int nframes = 2; int ntypes = 2; int nloc, nall, nnei, ndescrpt; double rc = 6; @@ -87,15 +93,19 @@ class TestProdForceGradR : public ::testing::Test { for (int ii = 0; ii < nloc * 3; ++ii) { grad[ii] = 10 - ii * 0.1; } + double_vec(grad); + double_vec(nlist); + double_vec(env_deriv); + double_vec(expected_grad_net); } void TearDown() override {} }; TEST_F(TestProdForceGradR, cpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); deepmd::prod_force_grad_r_cpu(&grad_net[0], &grad[0], &env_deriv[0], - &nlist[0], nloc, nnei); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + &nlist[0], nloc, nnei, nframes); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); @@ -108,23 +118,23 @@ TEST_F(TestProdForceGradR, cpu) { #if GOOGLE_CUDA TEST_F(TestProdForceGradR, gpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); int* nlist_dev = NULL; double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL; deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); deepmd::prod_force_grad_r_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); deepmd::delete_device_memory(env_deriv_dev); deepmd::delete_device_memory(grad_net_dev); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); @@ -138,23 +148,23 @@ TEST_F(TestProdForceGradR, gpu) { #if TENSORFLOW_USE_ROCM TEST_F(TestProdForceGradR, gpu) { - std::vector grad_net(nloc * ndescrpt); + std::vector grad_net(nframes * nloc * ndescrpt); int* nlist_dev = NULL; double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL; deepmd::malloc_device_memory_sync(nlist_dev, nlist); deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); deepmd::prod_force_grad_r_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); deepmd::delete_device_memory(env_deriv_dev); deepmd::delete_device_memory(grad_net_dev); - EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt); EXPECT_EQ(grad_net.size(), expected_grad_net.size()); for (int jj = 0; jj < grad_net.size(); ++jj) { EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5); diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 3383eb9f6c..b2ea76902b 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -119,25 +119,19 @@ class ProdForceSeAGradOp : public OpKernel { const FPTYPE* p_in_deriv = in_deriv_tensor.flat().data(); const int* p_nlist = nlist_tensor.flat().data(); - for (int_64 kk = 0; kk < nframes; ++kk) { - FPTYPE* grad_net = p_grad_net + kk * nloc * ndescrpt; - const FPTYPE* grad = p_grad + kk * nloc * 3; - const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - if (device == "GPU") { + if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_grad_a_gpu_cuda(grad_net, grad, in_deriv, nlist, - nloc, nnei); + deepmd::prod_force_grad_a_gpu_cuda(p_grad_net, p_grad, p_in_deriv, + p_nlist, nloc, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_grad_a_gpu_rocm(grad_net, grad, in_deriv, nlist, - nloc, nnei); + deepmd::prod_force_grad_a_gpu_rocm(p_grad_net, p_grad, p_in_deriv, + p_nlist, nloc, nnei, nframes); #endif // TENSORFLOW_USE_ROCM - } else if (device == "CPU") { - deepmd::prod_force_grad_a_cpu(grad_net, grad, in_deriv, nlist, nloc, - nnei); - } + } else if (device == "CPU") { + deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); } } @@ -238,26 +232,19 @@ class ProdForceSeRGradOp : public OpKernel { const FPTYPE* p_in_deriv = in_deriv_tensor.flat().data(); const int* p_nlist = nlist_tensor.flat().data(); - // loop over frames - for (int_64 kk = 0; kk < nframes; ++kk) { - FPTYPE* grad_net = p_grad_net + kk * nloc * ndescrpt; - const FPTYPE* grad = p_grad + kk * nloc * 3; - const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - if (device == "GPU") { + if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_grad_r_gpu_cuda(grad_net, grad, in_deriv, nlist, - nloc, nnei); + deepmd::prod_force_grad_r_gpu_cuda(p_grad_net, p_grad, p_in_deriv, + p_nlist, nloc, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_grad_r_gpu_rocm(grad_net, grad, in_deriv, nlist, - nloc, nnei); + deepmd::prod_force_grad_r_gpu_rocm(p_grad_net, p_grad, p_in_deriv, + p_nlist, nloc, nnei, nframes); #endif // TENSORFLOW_USE_ROCM - } else if (device == "CPU") { - deepmd::prod_force_grad_r_cpu(grad_net, grad, in_deriv, nlist, nloc, - nnei); - } + } else if (device == "CPU") { + deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); } }