Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions source/lib/include/fmt_nlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,26 @@ void format_nbor_list_gpu(int* nlist,
const int max_nbor_size,
const int nloc,
const int nall,
const int nframes,
const float rcut,
const std::vector<int> sec);

template <typename FPTYPE>
inline void format_nbor_list_gpu(int* nlist,
const FPTYPE* coord,
const int* type,
const deepmd::InputNlist& gpu_inlist,
int* array_int,
uint_64* array_longlong,
const int max_nbor_size,
const int nloc,
const int nall,
const float rcut,
const std::vector<int> sec) {
format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
array_longlong, max_nbor_size, nloc, nall, 1, rcut, sec);
}

template <typename FPTYPE>
void test_encoding_decoding_nbor_info_gpu(uint_64* key,
int* out_type,
Expand Down
46 changes: 37 additions & 9 deletions source/lib/include/neighbor_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,22 +144,35 @@ int max_numneigh(const InputNlist& to_nlist);
// build neighbor list.
// outputs
// nlist, max_list_size
// nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
// stores the neighbors of local atom atom_idx in that frame.
// max_list_size is the maximal size of jlist.
// inputs
// c_cpy, nloc, nall, mem_size, rcut, region
// mem_size is the size of allocated memory for jlist.
// c_cpy, nloc, nall, mem_size, rcut, nframes, type
// c_cpy stores nframes coordinate blocks with nall atoms each, including
// ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
// nloc is the number of local atoms per frame; nall is the total number of
// local and ghost atoms per frame.
// mem_size is the size of allocated memory for each jlist row.
// nframes is the number of coordinate frames.
// type may be nullptr/NULL, or a per-atom type array.
// The type array uses the same frame-major nall stride as c_cpy:
// type[frame_idx * nall + atom].
// When the center or neighbor atom has type < 0, that pair is excluded.
// returns
// 0: successful
// 1: the memory is not large enough to hold all neighbors.
// i.e. max_list_size > mem_nall
// i.e. max_list_size > mem_size
template <typename FPTYPE>
int build_nlist_cpu(InputNlist& nlist,
int* max_list_size,
const FPTYPE* c_cpy,
const int& nloc,
const int& nall,
const int& mem_size,
const float& rcut);
const float& rcut,
const int& nframes = 1,
const int* type = nullptr);
Comment thread
njzjz marked this conversation as resolved.

void use_nei_info_cpu(int* nlist,
int* ntype,
Expand Down Expand Up @@ -208,14 +221,27 @@ void use_nlist_map(int* nlist,
// build neighbor list.
// outputs
// nlist, max_list_size
// nlist contains nframes * nloc rows. Row frame_idx * nloc + atom_idx
// stores the neighbors of local atom atom_idx in that frame.
// max_list_size is the maximal size of jlist.
// inputs
// c_cpy, nloc, nall, mem_size, rcut, region
// mem_size is the size of allocated memory for jlist.
// nlist_data, c_cpy, nloc, nall, mem_size, rcut, nframes, type
// nlist_data is temporary GPU workspace of at least
// 2 * nframes * nloc * mem_size ints.
// c_cpy stores nframes coordinate blocks with nall atoms each, including
// ghost atoms, laid out as c_cpy[(frame_idx * nall + atom) * 3 + dim].
// nloc is the number of local atoms per frame; nall is the total number of
// local and ghost atoms per frame.
// mem_size is the size of allocated memory for each jlist row.
// nframes is the number of coordinate frames.
// type may be nullptr/NULL, or a per-atom type array.
// The type array uses the same frame-major nall stride as c_cpy:
// type[frame_idx * nall + atom].
// When the center or neighbor atom has type < 0, that pair is excluded.
// returns
// 0: successful
// 1: the memory is not large enough to hold all neighbors.
// i.e. max_list_size > mem_nall
// 1: the temporary row storage is not large enough.
// i.e. mem_size < nall
template <typename FPTYPE>
int build_nlist_gpu(InputNlist& nlist,
int* max_list_size,
Expand All @@ -224,7 +250,9 @@ int build_nlist_gpu(InputNlist& nlist,
const int& nloc,
const int& nall,
const int& mem_size,
const float& rcut);
const float& rcut,
const int& nframes = 1,
const int* type = NULL);

/**
* @brief Filter the fake atom type.
Expand Down
94 changes: 94 additions & 0 deletions source/lib/include/prod_env_mat.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,34 @@ void prod_env_mat_a_cpu(FPTYPE* em,
const FPTYPE* std,
const int nloc,
const int nall,
const int nframes,
const float rcut,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type = NULL);

template <typename FPTYPE>
inline void prod_env_mat_a_cpu(FPTYPE* em,
FPTYPE* em_deriv,
FPTYPE* rij,
int* nlist,
const FPTYPE* coord,
const int* type,
const InputNlist& inlist,
const int max_nbor_size,
const FPTYPE* avg,
const FPTYPE* std,
const int nloc,
const int nall,
const float rcut,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type = NULL) {
prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
sec, f_type);
}

template <typename FPTYPE>
void prod_env_mat_r_cpu(FPTYPE* em,
FPTYPE* em_deriv,
Expand All @@ -38,10 +61,32 @@ void prod_env_mat_r_cpu(FPTYPE* em,
const FPTYPE* std,
const int nloc,
const int nall,
const int nframes,
const float rcut,
const float rcut_smth,
const std::vector<int> sec);

template <typename FPTYPE>
inline void prod_env_mat_r_cpu(FPTYPE* em,
FPTYPE* em_deriv,
FPTYPE* rij,
int* nlist,
const FPTYPE* coord,
const int* type,
const InputNlist& inlist,
const int max_nbor_size,
const FPTYPE* avg,
const FPTYPE* std,
const int nloc,
const int nall,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
prod_env_mat_r_cpu(em, em_deriv, rij, nlist, coord, type, inlist,
max_nbor_size, avg, std, nloc, nall, 1, rcut, rcut_smth,
sec);
}

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename FPTYPE>
void prod_env_mat_a_gpu(FPTYPE* em,
Expand All @@ -58,11 +103,36 @@ void prod_env_mat_a_gpu(FPTYPE* em,
const FPTYPE* std,
const int nloc,
const int nall,
const int nframes,
const float rcut,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type = NULL);

template <typename FPTYPE>
inline void prod_env_mat_a_gpu(FPTYPE* em,
FPTYPE* em_deriv,
FPTYPE* rij,
int* nlist,
const FPTYPE* coord,
const int* type,
const InputNlist& gpu_inlist,
int* array_int,
unsigned long long* array_longlong,
const int max_nbor_size,
const FPTYPE* avg,
const FPTYPE* std,
const int nloc,
const int nall,
const float rcut,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type = NULL) {
prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
array_int, array_longlong, max_nbor_size, avg, std, nloc,
nall, 1, rcut, rcut_smth, sec, f_type);
}

template <typename FPTYPE>
void prod_env_mat_r_gpu(FPTYPE* em,
FPTYPE* em_deriv,
Expand All @@ -78,10 +148,34 @@ void prod_env_mat_r_gpu(FPTYPE* em,
const FPTYPE* std,
const int nloc,
const int nall,
const int nframes,
const float rcut,
const float rcut_smth,
const std::vector<int> sec);

template <typename FPTYPE>
inline void prod_env_mat_r_gpu(FPTYPE* em,
FPTYPE* em_deriv,
FPTYPE* rij,
int* nlist,
const FPTYPE* coord,
const int* type,
const InputNlist& gpu_inlist,
int* array_int,
unsigned long long* array_longlong,
const int max_nbor_size,
const FPTYPE* avg,
const FPTYPE* std,
const int nloc,
const int nall,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, gpu_inlist,
array_int, array_longlong, max_nbor_size, avg, std, nloc,
nall, 1, rcut, rcut_smth, sec);
}

void env_mat_nbor_update(InputNlist& inlist,
InputNlist& gpu_inlist,
int& max_nbor_size,
Expand Down
54 changes: 35 additions & 19 deletions source/lib/src/gpu/neighbor_list.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,28 @@ template <typename FPTYPE>
__global__ void build_nlist(int* ilist,
int* temp_nlist,
const FPTYPE* c_cpy,
const int* type,
const FPTYPE rcut2,
const int nloc,
const int nall,
const int mem_size) {
const unsigned int atom_idx = blockIdx.x;
const unsigned int row_idx = blockIdx.x;
const unsigned int atom_idx = row_idx % nloc;
const unsigned int frame_idx = row_idx / nloc;
const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (neighbor_idx < nall) {
int* neighbor_row = temp_nlist + atom_idx * mem_size;
int* neighbor_row = temp_nlist + row_idx * mem_size;
if (neighbor_idx == atom_idx) {
ilist[atom_idx] = atom_idx;
ilist[row_idx] = atom_idx;
} else {
const FPTYPE* ccoord = c_cpy + atom_idx * 3;
const FPTYPE* ncoord = c_cpy + neighbor_idx * 3;
const int_64 frame_offset = int_64(frame_idx) * nall;
if (type != NULL && (type[frame_offset + atom_idx] < 0 ||
type[frame_offset + neighbor_idx] < 0)) {
return;
}
const FPTYPE* frame_coord = c_cpy + int_64(frame_idx) * nall * 3;
const FPTYPE* ccoord = frame_coord + atom_idx * 3;
const FPTYPE* ncoord = frame_coord + neighbor_idx * 3;
FPTYPE diff[3];
for (int kk = 0; kk < 3; kk++) {
diff[kk] = ccoord[kk] - ncoord[kk];
Expand Down Expand Up @@ -193,41 +202,44 @@ int build_nlist_gpu(InputNlist& nlist,
const int& nloc,
const int& nall,
const int& mem_size,
const float& rcut) {
const float& rcut,
const int& nframes,
const int* type) {
if (mem_size < nall) {
return 1;
}
DPErrcheck(gpuGetLastError());
DPErrcheck(gpuDeviceSynchronize());
const int nblock = (nall + TPB - 1) / TPB;
const int_64 nrows = int_64(nframes) * nloc;
int* ilist = nlist.ilist;
int* numneigh = nlist.numneigh;
int** firstneigh = nlist.firstneigh;
DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
int* temp_nlist = nlist_data; // nloc*mem_size
int* nei_order = temp_nlist + nloc * mem_size;
nlist.inum = nloc;
DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nrows * mem_size));
int* temp_nlist = nlist_data; // nrows*mem_size
int* nei_order = temp_nlist + nrows * mem_size;
nlist.inum = nrows;
FPTYPE rcut2 = rcut * rcut;

dim3 block_grid(nloc, nblock);
dim3 block_grid(nrows, nblock);
dim3 thread_grid(1, TPB);
build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, rcut2,
nloc, nall, mem_size);
build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, type,
rcut2, nloc, nall, mem_size);
DPErrcheck(gpuGetLastError());
DPErrcheck(gpuDeviceSynchronize());
parallel_prefix_scan<TPB>
<<<nloc, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
<<<nrows, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
DPErrcheck(gpuGetLastError());
DPErrcheck(gpuDeviceSynchronize());
fill_nlist<<<block_grid, thread_grid>>>(firstneigh, temp_nlist, nei_order,
mem_size, nall);
DPErrcheck(gpuGetLastError());
DPErrcheck(gpuDeviceSynchronize());
int* numneigh_host = new int[nloc];
DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
int* numneigh_host = new int[nrows];
DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nrows,
gpuMemcpyDeviceToHost));
int max_nei = 0;
for (int ii = 0; ii < nloc; ii++) {
for (int_64 ii = 0; ii < nrows; ii++) {
if (numneigh_host[ii] > max_nei) {
max_nei = numneigh_host[ii];
}
Expand Down Expand Up @@ -285,15 +297,19 @@ template int build_nlist_gpu<float>(InputNlist& nlist,
const int& nloc,
const int& nall,
const int& mem_size,
const float& rcut);
const float& rcut,
const int& nframes,
const int* type);
template int build_nlist_gpu<double>(InputNlist& nlist,
int* max_list_size,
int* nlist_data,
const double* c_cpy,
const int& nloc,
const int& nall,
const int& mem_size,
const float& rcut);
const float& rcut,
const int& nframes,
const int* type);

__global__ void map_filter_ftype(int* ftype_out,
const int* ftype_in,
Expand Down
Loading
Loading