diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py index d0c1b38134..d1097f3fcf 100644 --- a/deepmd/descriptor/se_atten.py +++ b/deepmd/descriptor/se_atten.py @@ -1122,6 +1122,7 @@ def _filter_lower( tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), two_embd, last_layer_size=outputs_size[-1], + is_sorted=len(self.exclude_types) == 0, ) if (not self.uniform_seed) and (self.seed is not None): diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index 395621d97e..2e2c021d9c 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -12,7 +12,8 @@ void tabulate_fusion_se_a_cpu(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, @@ -25,7 +26,8 @@ void tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, @@ -37,7 +39,8 @@ void tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_t_cpu(FPTYPE* out, @@ -115,7 +118,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, @@ -128,7 +132,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, @@ -140,7 +145,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out, @@ -219,7 +225,8 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, @@ -232,7 +239,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, @@ -244,7 +252,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted = true); template void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out, diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index 06d1d49057..bb05fd59a4 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -128,7 +128,8 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size @@ -141,7 +142,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( FPTYPE var[6]; for (int ii = 0; ii < nnei; ii++) { FPTYPE xx = em_x[block_idx * nnei + ii]; - if (xx == ago) { + if (xx == ago && is_sorted) { unloop = true; breakpoint = ii; } @@ -191,7 +192,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; extern __shared__ int _data[]; const int_64 block_idx = blockIdx.x; // nloc @@ -211,7 +213,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); for (int ii = warp_idx; ii < nnei; ii += KTILE) { FPTYPE xx = em_x[block_idx * nnei + ii]; - if (ago == xx) { + if (ago == xx && is_sorted) { unloop = true; breakpoint = ii; } @@ -286,7 +288,8 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { extern __shared__ int _data[]; const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size @@ -304,7 +307,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( for (int ii = 0; ii < nnei; ii++) { FPTYPE xx = em_x[block_idx * nnei + ii]; FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; - if (xx == ago) { + if (xx == ago && is_sorted) { unloop = true; breakpoint = ii; } @@ -626,16 +629,18 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); tabulate_fusion_se_a_fifth_order_polynomial - <<>>( - out, table, em_x, em, two_embed, table_info[0], table_info[1], - table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + <<>>(out, table, em_x, em, two_embed, + table_info[0], table_info[1], table_info[2], + table_info[3], table_info[4], nnei, + last_layer_size, is_sorted); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -651,7 +656,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -664,7 +670,7 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, <<>>( dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, - last_layer_size); + last_layer_size, is_sorted); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -679,7 +685,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -690,7 +697,7 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, <<>>( dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, - last_layer_size); + last_layer_size, is_sorted); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -852,7 +859,8 @@ template void tabulate_fusion_se_a_gpu_cuda(float* out, const float* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_gpu_cuda(double* out, const double* table, const double* table_info, @@ -861,7 +869,8 @@ template void tabulate_fusion_se_a_gpu_cuda(double* out, const double* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu_cuda( float* dy_dem_x, float* dy_dem, @@ -873,7 +882,8 @@ template void tabulate_fusion_se_a_grad_gpu_cuda( const float* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu_cuda( double* dy_dem_x, double* dy_dem, @@ -885,7 +895,8 @@ template void tabulate_fusion_se_a_grad_gpu_cuda( const double* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_grad_gpu_cuda( float* dz_dy, const float* table, @@ -896,7 +907,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda( const float* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_grad_gpu_cuda( double* dz_dy, const double* table, @@ -907,7 +919,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda( const double* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_t_gpu_cuda(float* out, const float* table, diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index 5aaf023262..6ae21d4e0b 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -88,7 +88,8 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; HIP_DYNAMIC_SHARED(int, _data) const int_64 block_idx = blockIdx.x; // nloc @@ -105,7 +106,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( for (int ii = 0; ii < nnei; ii++) { FPTYPE var[6]; FPTYPE xx = em_x[block_idx * nnei + ii]; - if (xx == ago) { + if (xx == ago && is_sorted) { unloop = true; breakpoint = ii; } @@ -157,7 +158,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; HIP_DYNAMIC_SHARED(int, _data) const int_64 block_idx = blockIdx.x; // nloc @@ -177,7 +179,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); for (int ii = 0; ii < nnei - warp_idx; ii += KTILE) { FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx]; - if (ago == xx) { + if (ago == xx && is_sorted) { unloop = true; breakpoint = ii + warp_idx; } @@ -260,7 +262,8 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( const FPTYPE stride0, const FPTYPE stride1, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { extern __shared__ int _data[]; const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size @@ -277,7 +280,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( FPTYPE var[6]; FPTYPE xx = em_x[block_idx * nnei + ii]; FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; - if (xx == ago) { + if (xx == ago && is_sorted) { unloop = true; breakpoint = ii; } @@ -626,7 +629,8 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -635,7 +639,7 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, tabulate_fusion_se_a_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, table, em_x, em, two_embed, table_info[0], table_info[1], table_info[2], - table_info[3], table_info[4], nnei, last_layer_size); + table_info[3], table_info[4], nnei, last_layer_size, is_sorted); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -651,7 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -663,7 +668,8 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, tabulate_fusion_se_a_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1], - table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + table_info[2], table_info[3], table_info[4], nnei, last_layer_size, + is_sorted); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -678,7 +684,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -689,7 +696,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, KK>), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], - table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + table_info[2], table_info[3], table_info[4], nnei, last_layer_size, + is_sorted); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -850,7 +858,8 @@ template void tabulate_fusion_se_a_gpu_rocm(float* out, const float* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_gpu_rocm(double* out, const double* table, const double* table_info, @@ -859,7 +868,8 @@ template void tabulate_fusion_se_a_gpu_rocm(double* out, const double* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu_rocm( float* dy_dem_x, float* dy_dem, @@ -871,7 +881,8 @@ template void tabulate_fusion_se_a_grad_gpu_rocm( const float* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu_rocm( double* dy_dem_x, double* dy_dem, @@ -883,7 +894,8 @@ template void tabulate_fusion_se_a_grad_gpu_rocm( const double* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_grad_gpu_rocm( float* dz_dy, const float* table, @@ -894,7 +906,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm( const float* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_a_grad_grad_gpu_rocm( double* dz_dy, const double* table, @@ -905,7 +918,8 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm( const double* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void tabulate_fusion_se_t_gpu_rocm(float* out, const float* table, diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index cd23991cf8..377e6d06db 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -86,7 +86,8 @@ void deepmd::tabulate_fusion_se_a_cpu(FPTYPE* out, const FPTYPE* two_embed, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; memset(out, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); const FPTYPE lower = table_info[0]; @@ -107,7 +108,7 @@ void deepmd::tabulate_fusion_se_a_cpu(FPTYPE* out, ll[2] = em[ii * nnei * 4 + jj * 4 + 2]; ll[3] = em[ii * nnei * 4 + jj * 4 + 3]; FPTYPE xx = em_x[ii * nnei + jj]; - if (ago == xx) { + if (ago == xx && is_sorted) { unloop = true; } int table_idx = 0; @@ -165,7 +166,8 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, const FPTYPE* dy, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; memset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei); memset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4); @@ -189,7 +191,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, ll[2] = em[ii * nnei * 4 + jj * 4 + 2]; ll[3] = em[ii * nnei * 4 + jj * 4 + 3]; FPTYPE xx = em_x[ii * nnei + jj]; - if (ago == xx) { + if (ago == xx && is_sorted) { unloop = true; } int table_idx = 0; @@ -249,7 +251,8 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, const FPTYPE* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size) { + const int last_layer_size, + const bool is_sorted) { memset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); const FPTYPE lower = table_info[0]; const FPTYPE upper = table_info[1]; @@ -275,7 +278,7 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, hh[3] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 3]; FPTYPE xx = em_x[ii * nnei + jj]; FPTYPE dz_xx = dz_dy_dem_x[ii * nnei + jj]; - if (ago == xx) { + if (ago == xx && is_sorted) { unloop = true; } int table_idx = 0; @@ -604,16 +607,16 @@ void deepmd::tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy, } } -template void deepmd::tabulate_fusion_se_a_cpu( - float* out, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* two_embed, - const int nloc, - const int nnei, - const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_cpu(float* out, + const float* table, + const float* table_info, + const float* em_x, + const float* em, + const float* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_a_cpu( double* out, const double* table, @@ -623,7 +626,8 @@ template void deepmd::tabulate_fusion_se_a_cpu( const double* two_embed, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_a_grad_cpu( float* dy_dem_x, float* dy_dem, @@ -635,7 +639,8 @@ template void deepmd::tabulate_fusion_se_a_grad_cpu( const float* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_a_grad_cpu( double* dy_dem_x, double* dy_dem, @@ -647,7 +652,8 @@ template void deepmd::tabulate_fusion_se_a_grad_cpu( const double* dy, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( float* dz_dy, const float* table, @@ -658,7 +664,8 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( const float* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( double* dz_dy, const double* table, @@ -669,7 +676,8 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( const double* dz_dy_dem, const int nloc, const int nnei, - const int last_layer_size); + const int last_layer_size, + const bool is_sorted); template void deepmd::tabulate_fusion_se_t_cpu( float* out, diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py index ac4d9df731..e91aa5fd2f 100644 --- a/source/op/_tabulate_grad.py +++ b/source/op/_tabulate_grad.py @@ -31,7 +31,14 @@ def _tabulate_fusion_se_a_grad_cc(op, dy): @ops.RegisterGradient("TabulateFusionSeAGrad") def _tabulate_fusion_se_a_grad_grad_cc(op, dy, dy_): dz_dy = op_module.tabulate_fusion_se_a_grad_grad( - op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5] + op.inputs[0], + op.inputs[1], + op.inputs[2], + op.inputs[3], + dy, + dy_, + op.inputs[5], + is_sorted=True, ) return [None, None, None, None, dz_dy, None] @@ -46,6 +53,7 @@ def _tabulate_fusion_se_atten_grad_cc(op, dy): op.inputs[4], dy, op.outputs[0], + is_sorted=op.get_attr("is_sorted"), ) return [None, None, dy_dx, dy_df, dy_dtwo] @@ -53,7 +61,14 @@ def _tabulate_fusion_se_atten_grad_cc(op, dy): @ops.RegisterGradient("TabulateFusionSeAttenGrad") def _tabulate_fusion_se_atten_grad_grad_cc(op, dy, dy_, dy_dtwo): dz_dy = op_module.tabulate_fusion_se_a_grad_grad( - op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[6] + op.inputs[0], + op.inputs[1], + op.inputs[2], + op.inputs[3], + dy, + dy_, + op.inputs[6], + is_sorted=op.get_attr("is_sorted"), ) return [None, None, None, None, None, dz_dy, None] diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index a3ec2dbd82..0ac8745f64 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -63,7 +63,8 @@ REGISTER_OP("TabulateFusionSeAGradGrad") .Input("dz_dy_dem_x: T") .Input("dz_dy_dem: T") .Input("descriptor: T") - .Output("dz_dy: T"); + .Output("dz_dy: T") + .Attr("is_sorted: bool = true"); REGISTER_OP("TabulateFusionSeAtten") .Attr("T: {float, double} = DT_DOUBLE") @@ -73,6 +74,7 @@ REGISTER_OP("TabulateFusionSeAtten") .Input("em: T") .Input("two_embed: T") .Attr("last_layer_size: int") + .Attr("is_sorted: bool = true") .Output("descriptor: T"); REGISTER_OP("TabulateFusionSeAttenGrad") @@ -86,7 +88,8 @@ REGISTER_OP("TabulateFusionSeAttenGrad") .Input("descriptor: T") .Output("dy_dem_x: T") .Output("dy_dem: T") - .Output("dy_dtwo: T"); + .Output("dy_dtwo: T") + .Attr("is_sorted: bool = true"); REGISTER_OP("TabulateFusionSeT") .Attr("T: {float, double} = DT_DOUBLE") @@ -291,7 +294,9 @@ template class TabulateFusionSeAGradGradOp : public OpKernel { public: explicit TabulateFusionSeAGradGradOp(OpKernelConstruction* context) - : OpKernel(context) {} + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted)); + } void Compute(OpKernelContext* context) override { // Grab the input tensor int context_input_index = 0; @@ -330,25 +335,26 @@ class TabulateFusionSeAGradGradOp : public OpKernel { #if GOOGLE_CUDA deepmd::tabulate_fusion_se_a_grad_grad_gpu_cuda( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size); + nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu_rocm( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size); + nnei, last_layer_size, is_sorted); #endif // TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " "last layer of embedding net must be less than 1024!")); } else if (device == "CPU") { - deepmd::tabulate_fusion_se_a_grad_grad_cpu(dz_dy, table, table_info, em_x, - em, dz_dy_dem_x, dz_dy_dem, - nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_grad_cpu( + dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, + nnei, last_layer_size, is_sorted); } } private: + bool is_sorted; std::string device; }; @@ -359,6 +365,7 @@ class TabulateFusionSeAttenOp : public OpKernel { : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted)); } void Compute(OpKernelContext* context) override { deepmd::safe_compute( @@ -406,22 +413,24 @@ class TabulateFusionSeAttenOp : public OpKernel { #if GOOGLE_CUDA deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, - last_layer_size); + last_layer_size, is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, - last_layer_size); + last_layer_size, is_sorted); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, - two_embed, nloc, nnei, last_layer_size); + two_embed, nloc, nnei, last_layer_size, + is_sorted); } } private: int last_layer_size; + bool is_sorted; std::string device; }; @@ -429,7 +438,9 @@ template class TabulateFusionSeAttenGradOp : public OpKernel { public: explicit TabulateFusionSeAttenGradOp(OpKernelConstruction* context) - : OpKernel(context) {} + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("is_sorted", &is_sorted)); + } void Compute(OpKernelContext* context) override { deepmd::safe_compute( context, [this](OpKernelContext* context) { this->_Compute(context); }); @@ -483,22 +494,23 @@ class TabulateFusionSeAttenGradOp : public OpKernel { #if GOOGLE_CUDA deepmd::tabulate_fusion_se_a_grad_gpu_cuda( dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size); + nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_gpu_rocm( dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size); + nnei, last_layer_size, is_sorted); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, - last_layer_size); + last_layer_size, is_sorted); } } private: + bool is_sorted; std::string device; }; diff --git a/source/tests/test_model_compression_se_atten.py b/source/tests/test_model_compression_se_atten.py index 8ec5a6c75f..f047447975 100644 --- a/source/tests/test_model_compression_se_atten.py +++ b/source/tests/test_model_compression_se_atten.py @@ -94,7 +94,65 @@ def _init_models(): return inputs, frozen_models, compressed_models +@unittest.skipIf( + parse_version(tf.__version__) < parse_version("2"), + f"The current tf version {tf.__version__} is too low to run the new testing model.", +) +def _init_models_exclude_types(): + data_file = str(tests_path / os.path.join("model_compression", "data")) + inputs, frozen_models, compressed_models = [], [], [] + # 4 tests: + # - type embedding FP64, se_atten FP64 + # - type embedding FP64, se_atten FP32 + # - type embedding FP32, se_atten FP64 + # - type embedding FP32, se_atten FP32 + tests = [ + {"se_atten precision": "float64", "type embedding precision": "float64"}, + {"se_atten precision": "float64", "type embedding precision": "float32"}, + {"se_atten precision": "float32", "type embedding precision": "float64"}, + {"se_atten precision": "float32", "type embedding precision": "float32"}, + ] + for i in range(4): + INPUT = str(tests_path / f"input{i}.json") + frozen_model = str(tests_path / f"dp-original-se-atten{i}-exclude-types.pb") + compressed_model = str( + tests_path / f"dp-compressed-se-atten{i}-exclude-types.pb" + ) + jdata = j_loader( + str(tests_path / os.path.join("model_compression", "input.json")) + ) + jdata["model"]["descriptor"] = {} + jdata["model"]["descriptor"]["type"] = "se_atten" + jdata["model"]["descriptor"]["exclude_types"] = [[0, 1]] + jdata["model"]["descriptor"]["precision"] = tests[i]["se_atten precision"] + jdata["model"]["descriptor"]["stripped_type_embedding"] = True + jdata["model"]["descriptor"]["sel"] = 120 + jdata["model"]["descriptor"]["attn_layer"] = 0 + jdata["model"]["type_embedding"] = {} + jdata["model"]["type_embedding"]["precision"] = tests[i][ + "type embedding precision" + ] + jdata["training"]["training_data"]["systems"] = data_file + jdata["training"]["validation_data"]["systems"] = data_file + with open(INPUT, "w") as fp: + json.dump(jdata, fp, indent=4) + + ret = run_dp("dp train " + INPUT) + np.testing.assert_equal(ret, 0, "DP train failed!") + ret = run_dp("dp freeze -o " + frozen_model) + np.testing.assert_equal(ret, 0, "DP freeze failed!") + ret = run_dp("dp compress " + " -i " + frozen_model + " -o " + compressed_model) + np.testing.assert_equal(ret, 0, "DP model compression failed!") + + inputs.append(INPUT) + frozen_models.append(frozen_model) + compressed_models.append(compressed_model) + + return inputs, frozen_models, compressed_models + + INPUTS, FROZEN_MODELS, COMPRESSED_MODELS = _init_models() +INPUTS_ET, FROZEN_MODELS_ET, COMPRESSED_MODELS_ET = _init_models_exclude_types() def _get_default_places(nth_test): @@ -505,8 +563,8 @@ def test_ase(self): class TestDeepPotAPBCExcludeTypes(unittest.TestCase): @classmethod def setUpClass(self): - self.dp_originals = [DeepPot(FROZEN_MODELS[i]) for i in range(4)] - self.dp_compresseds = [DeepPot(COMPRESSED_MODELS[i]) for i in range(4)] + self.dp_originals = [DeepPot(FROZEN_MODELS_ET[i]) for i in range(4)] + self.dp_compresseds = [DeepPot(COMPRESSED_MODELS_ET[i]) for i in range(4)] self.coords = np.array( [ 12.83, @@ -535,9 +593,9 @@ def setUpClass(self): @classmethod def tearDownClass(self): for i in range(4): - _file_delete(INPUTS[i]) - _file_delete(FROZEN_MODELS[i]) - _file_delete(COMPRESSED_MODELS[i]) + _file_delete(INPUTS_ET[i]) + _file_delete(FROZEN_MODELS_ET[i]) + _file_delete(COMPRESSED_MODELS_ET[i]) _file_delete("out.json") _file_delete("compress.json") _file_delete("checkpoint")