Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions source/lib/include/prod_force_grad.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ void prod_force_grad_a_cpu(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_cpu(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

#if GOOGLE_CUDA
template <typename FPTYPE>
Expand All @@ -25,15 +27,17 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
#endif // GOOGLE_CUDA

#if TENSORFLOW_USE_ROCM
Expand All @@ -43,14 +47,16 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
#endif // TENSORFLOW_USE_ROCM
} // namespace deepmd
57 changes: 35 additions & 22 deletions source/lib/src/cuda/prod_force_grad.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,24 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
// idy -> nnei
const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y;
const unsigned int idw = threadIdx.y;
if (idx >= nloc) {
if (idx >= nframes * nloc) {
return;
}
int j_idx = nlist[idx * nnei + idy];
if (j_idx < 0) {
return;
}
if (j_idx >= nloc) j_idx = j_idx % nloc;
grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot(
grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
const int kk = idx / nloc; // frame index
grad_net[idx * nnei * 4 + idy * 4 + idw] +=
dev_dot(grad + kk * nloc * 3 + j_idx * 3,
env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
}

template <typename FPTYPE>
Expand All @@ -54,20 +57,22 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
// idy -> nnei
const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y;
if (idx >= nloc) {
if (idx >= nframes * nloc) {
return;
}
int j_idx = nlist[idx * nnei + idy];
if (j_idx < 0) {
return;
}
if (j_idx >= nloc) j_idx = j_idx % nloc;
grad_net[idx * nnei + idy] +=
dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3);
const int kk = idx / nloc; // frame index
grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3,
env_deriv + idx * nnei * 3 + idy * 3);
}

namespace deepmd {
Expand All @@ -77,23 +82,25 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
const int nblock = (ndescrpt + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 block_grid(nframes * nloc, nblock);
dim3 thread_grid(TPB, 1);
force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
env_deriv, ndescrpt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());

const int LEN = 128;
const int nblock_ = (nloc + LEN - 1) / LEN;
const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
dim3 block_grid_(nblock_, nnei);
dim3 thread_grid_(LEN, 4);
force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
grad_net, grad, env_deriv, nlist, nloc, nnei);
grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}
Expand All @@ -104,23 +111,25 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
const int nblock = (ndescrpt + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 block_grid(nframes * nloc, nblock);
dim3 thread_grid(TPB, 1);
force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
env_deriv, ndescrpt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());

const int LEN = 128;
const int nblock_ = (nloc + LEN - 1) / LEN;
const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
dim3 block_grid_(nblock_, nnei);
dim3 thread_grid_(LEN, 1);
force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
grad_net, grad, env_deriv, nlist, nloc, nnei);
grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}
Expand All @@ -130,23 +139,27 @@ template void prod_force_grad_a_gpu_cuda<float>(float* grad_net,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_a_gpu_cuda<double>(double* grad_net,
const double* grad,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_r_gpu_cuda<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_r_gpu_cuda<double>(double* grad_net,
const double* grad,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
} // namespace deepmd
32 changes: 20 additions & 12 deletions source/lib/src/prod_force_grad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,20 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 4;

// reset the frame to 0
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
for (int aa = 0; aa < ndescrpt; ++aa) {
grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
}
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
int i_idx = ii;

// deriv wrt center atom
Expand All @@ -55,10 +56,11 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net,
if (j_idx < 0) continue;
int aa_start, aa_end;
make_index_range(aa_start, aa_end, jj, nnei);
const int kk = i_idx / nloc; // frame index
for (int aa = aa_start; aa < aa_end; ++aa) {
for (int dd = 0; dd < 3; ++dd) {
grad_net[i_idx * ndescrpt + aa] +=
grad[j_idx * 3 + dd] *
grad[kk * nloc * 3 + j_idx * 3 + dd] *
env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd];
}
}
Expand All @@ -71,22 +73,25 @@ template void deepmd::prod_force_grad_a_cpu<double>(double* grad_net,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template void deepmd::prod_force_grad_a_cpu<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei)
const int nnei,
const int nframes)
//
// grad_net: nloc x ndescrpt
// grad: nloc x 3
Expand All @@ -97,15 +102,15 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
const int ndescrpt = nnei * 1;

// reset the frame to 0
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
for (int aa = 0; aa < ndescrpt; ++aa) {
grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
}
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
int i_idx = ii;

// deriv wrt center atom
Expand All @@ -122,9 +127,10 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
int j_idx = nlist[i_idx * nnei + jj];
if (j_idx >= nloc) j_idx = j_idx % nloc;
if (j_idx < 0) continue;
int kk = i_idx / nloc; // frame index
for (int dd = 0; dd < 3; ++dd) {
grad_net[i_idx * ndescrpt + jj] +=
grad[j_idx * 3 + dd] *
grad[kk * nloc * 3 + j_idx * 3 + dd] *
env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd];
}
}
Expand All @@ -136,11 +142,13 @@ template void deepmd::prod_force_grad_r_cpu<double>(double* grad_net,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template void deepmd::prod_force_grad_r_cpu<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
Loading