diff --git a/source/api_cc/src/DataModifier.cc b/source/api_cc/src/DataModifier.cc index c6c009b0d8..2fbd58584b 100644 --- a/source/api_cc/src/DataModifier.cc +++ b/source/api_cc/src/DataModifier.cc @@ -142,8 +142,8 @@ compute (std::vector & dfcorr_, if (nloc_real == 0){ dfcorr_.resize(nall * 3); dvcorr_.resize(9); - fill(dfcorr_.begin(), dfcorr_.end(), 0.0); - fill(dvcorr_.begin(), dvcorr_.end(), 0.0); + fill(dfcorr_.begin(), dfcorr_.end(), (VALUETYPE)0.0); + fill(dvcorr_.begin(), dvcorr_.end(), (VALUETYPE)0.0); return; } // resize to nall_real @@ -223,7 +223,7 @@ compute (std::vector & dfcorr_, assert(dfcorr_1.size() == nall_real * 3); // resize to all and clear std::vector dfcorr_2(nall*3); - fill(dfcorr_2.begin(), dfcorr_2.end(), 0.0); + fill(dfcorr_2.begin(), dfcorr_2.end(), (VALUETYPE)0.0); // back map to original position for (int ii = 0; ii < nall_real; ++ii){ for (int dd = 0; dd < 3; ++dd){ diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index 047c665e8d..7b763d1b7e 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -34,10 +34,10 @@ run_model (ENERGYTYPE & dener, // no backward map needed // dforce of size nall * 3 dforce_.resize(nall * 3); - fill(dforce_.begin(), dforce_.end(), 0.0); + fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0); // dvirial of size 9 dvirial.resize(9); - fill(dvirial.begin(), dvirial.end(), 0.0); + fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0); return; } @@ -62,17 +62,17 @@ run_model (ENERGYTYPE & dener, dforce[ii] = of(ii); } // set dvirial to zero, prevent input vector is not zero (#1123) - std::fill(dvirial.begin(), dvirial.end(), 0.); + std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.); for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * oav(9*ii+0); - dvirial[1] += 1.0 * oav(9*ii+1); - dvirial[2] += 1.0 * oav(9*ii+2); - dvirial[3] += 1.0 * oav(9*ii+3); - dvirial[4] += 1.0 * oav(9*ii+4); - dvirial[5] += 1.0 * oav(9*ii+5); - dvirial[6] += 1.0 * 
oav(9*ii+6); - dvirial[7] += 1.0 * oav(9*ii+7); - dvirial[8] += 1.0 * oav(9*ii+8); + dvirial[0] += (VALUETYPE)1.0 * oav(9*ii+0); + dvirial[1] += (VALUETYPE)1.0 * oav(9*ii+1); + dvirial[2] += (VALUETYPE)1.0 * oav(9*ii+2); + dvirial[3] += (VALUETYPE)1.0 * oav(9*ii+3); + dvirial[4] += (VALUETYPE)1.0 * oav(9*ii+4); + dvirial[5] += (VALUETYPE)1.0 * oav(9*ii+5); + dvirial[6] += (VALUETYPE)1.0 * oav(9*ii+6); + dvirial[7] += (VALUETYPE)1.0 * oav(9*ii+7); + dvirial[8] += (VALUETYPE)1.0 * oav(9*ii+8); } dforce_ = dforce; atommap.backward (dforce_.begin(), dforce.begin(), 3); @@ -95,16 +95,16 @@ static void run_model (ENERGYTYPE & dener, // no backward map needed // dforce of size nall * 3 dforce_.resize(nall * 3); - fill(dforce_.begin(), dforce_.end(), 0.0); + fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0); // dvirial of size 9 dvirial.resize(9); - fill(dvirial.begin(), dvirial.end(), 0.0); + fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0); // datom_energy_ of size nall datom_energy_.resize(nall); - fill(datom_energy_.begin(), datom_energy_.end(), 0.0); + fill(datom_energy_.begin(), datom_energy_.end(), (VALUETYPE)0.0); // datom_virial_ of size nall * 9 datom_virial_.resize(nall * 9); - fill(datom_virial_.begin(), datom_virial_.end(), 0.0); + fill(datom_virial_.begin(), datom_virial_.end(), (VALUETYPE)0.0); return; } std::vector output_tensors; @@ -139,17 +139,17 @@ static void run_model (ENERGYTYPE & dener, datom_virial[ii] = oav(ii); } // set dvirial to zero, prevent input vector is not zero (#1123) - std::fill(dvirial.begin(), dvirial.end(), 0.); + std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.); for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * datom_virial[9*ii+0]; - dvirial[1] += 1.0 * datom_virial[9*ii+1]; - dvirial[2] += 1.0 * datom_virial[9*ii+2]; - dvirial[3] += 1.0 * datom_virial[9*ii+3]; - dvirial[4] += 1.0 * datom_virial[9*ii+4]; - dvirial[5] += 1.0 * datom_virial[9*ii+5]; - dvirial[6] += 1.0 * datom_virial[9*ii+6]; - dvirial[7] += 
1.0 * datom_virial[9*ii+7]; - dvirial[8] += 1.0 * datom_virial[9*ii+8]; + dvirial[0] += (VALUETYPE)1.0 * datom_virial[9*ii+0]; + dvirial[1] += (VALUETYPE)1.0 * datom_virial[9*ii+1]; + dvirial[2] += (VALUETYPE)1.0 * datom_virial[9*ii+2]; + dvirial[3] += (VALUETYPE)1.0 * datom_virial[9*ii+3]; + dvirial[4] += (VALUETYPE)1.0 * datom_virial[9*ii+4]; + dvirial[5] += (VALUETYPE)1.0 * datom_virial[9*ii+5]; + dvirial[6] += (VALUETYPE)1.0 * datom_virial[9*ii+6]; + dvirial[7] += (VALUETYPE)1.0 * datom_virial[9*ii+7]; + dvirial[8] += (VALUETYPE)1.0 * datom_virial[9*ii+8]; } dforce_ = dforce; datom_energy_ = datom_energy; diff --git a/source/lib/include/utilities.h b/source/lib/include/utilities.h index e95ca3e684..06e6498ed6 100644 --- a/source/lib/include/utilities.h +++ b/source/lib/include/utilities.h @@ -73,7 +73,7 @@ template <> inline float invsqrt (const float x) { - return 1./sqrtf (x); + return 1.f/sqrtf (x); } } diff --git a/source/lib/src/coord.cc b/source/lib/src/coord.cc index ecaec9b34c..7e40286023 100644 --- a/source/lib/src/coord.cc +++ b/source/lib/src/coord.cc @@ -18,8 +18,8 @@ normalize_coord_cpu( FPTYPE ri[3]; convert_to_inter_cpu(ri, region, coord+3*ii); for(int dd = 0; dd < 3; ++dd){ - ri[dd] = fmod(ri[dd], 1.); - if (ri[dd] < 0.) ri[dd] += 1.; + ri[dd] = fmod(ri[dd], (FPTYPE)1.); + if (ri[dd] < (FPTYPE)0.) 
ri[dd] += (FPTYPE)1.; } convert_to_phys_cpu(coord+3*ii, region, ri); } diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu index 76d1d08ca9..660619cbad 100644 --- a/source/lib/src/cuda/coord.cu +++ b/source/lib/src/cuda/coord.cu @@ -51,6 +51,10 @@ __device__ inline int compute_pbc_shift( return shift; } +__device__ inline double _fmod(double x, double y) {return fmod(x, y);} +__device__ inline float _fmod(float x, float y) {return fmodf(x, y);} + + template __global__ void normalize_one( FPTYPE *out_c, @@ -64,8 +68,8 @@ __global__ void normalize_one( FPTYPE inter[3]; phys2Inter(inter,out_c+idy*3,rec_boxt); for (int dd = 0; dd < 3; ++dd) { - inter[dd]=(FPTYPE)fmod((double)inter[dd], 1.); - if (inter[dd] < 0.) inter[dd] += 1.; + inter[dd]=_fmod(inter[dd], (FPTYPE)1.); + if (inter[dd] < (FPTYPE)0.) inter[dd] += (FPTYPE)1.; } inter2Phys(out_c+idy*3,inter,boxt); } @@ -93,7 +97,7 @@ __global__ void _fill_idx_cellmap( ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; global_grid[dd] = nat_end[dd] - nat_stt[dd]; idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; - cell_size[dd] = 1./global_grid[dd]; + cell_size[dd] = (FPTYPE)1./global_grid[dd]; nat_orig[dd] = nat_stt[dd] * cell_size[dd]; } if (idy __global__ void gelu( FPTYPE * out, @@ -11,7 +14,7 @@ __global__ void gelu( if (idx >= size) { return; } - out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + out[idx] = xx[idx] * (FPTYPE)0.5 * ((FPTYPE)1.0 + _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]))); } template @@ -26,8 +29,8 @@ __global__ void gelu_grad( return; } // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); - out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5); + const FPTYPE var = 
_tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])); + out[idx] = dy[idx] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5); } template @@ -43,9 +46,9 @@ __global__ void gelu_grad_grad( return; } // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1); - out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2); + const FPTYPE var1 = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])); + const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.); + out[idx] = dy[idx] * dy_2[idx] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[idx] * var2 * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 + var2); } namespace deepmd { diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index b2cd4dcaf7..21a3b60647 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -5,6 +5,9 @@ #include #include +__device__ inline double _sqrt(double x) {return sqrt(x);} +__device__ inline float _sqrt(float x) {return sqrtf(x);} + // common part of prod_env_mat template < typename Key, @@ -57,18 +60,18 @@ __device__ inline void spline5_switch( const float & rmax) { if (xx < rmin) { - dd = 0; - vv = 1; + dd = (FPTYPE)0.; + vv = (FPTYPE)1.; } else if (xx < rmax) { FPTYPE uu = (xx - rmin) / (rmax - rmin) ; - FPTYPE du = 1. 
/ (rmax - rmin) ; - vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; - dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + FPTYPE du = (FPTYPE)1. / (rmax - rmin) ; + vv = uu*uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + (FPTYPE)1.; + dd = ( (FPTYPE)3. * uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + uu*uu*uu * ((FPTYPE)-12. * uu + (FPTYPE)15.) ) * du; } else { - dd = 0; - vv = 0; + dd = (FPTYPE)0.; + vv = (FPTYPE)0.; } } @@ -82,7 +85,7 @@ __device__ inline uint_64 encoding_nbor_info( // the type of nbor atom must be smaller than 128 // the distance of center atom between nbor atom must be smaller than 128 // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24) - if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) { + if(type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) { asm("trap;"); } return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index; @@ -138,7 +141,7 @@ __global__ void format_nlist_fill_a( for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - FPTYPE rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = _sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); } @@ -345,14 +348,14 @@ __global__ void compute_env_mat_a( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./_sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; FPTYPE inr3 = inr4 * nr; FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); - dd[0] = (1./nr) ;//* sw; + dd[0] = ((FPTYPE)1./nr) ;//* sw; dd[1] = (rr[0] / nr2) ;//* sw; dd[2] = (rr[1] / nr2) ;//* sw; dd[3] = (rr[2] / nr2) ;//* sw; @@ -360,17 +363,17 @@ __global__ void compute_env_mat_a( vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 
3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; // ****deriv of component x/r2 - vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; - vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; - vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; // ***deriv of component y/r2 - vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; - vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; - vv[8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + vv[6] = (((FPTYPE)2. 
* rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; // ***deriv of component z/r2 - vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; - vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; - vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= (((FPTYPE)2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= (((FPTYPE)2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; // 4 value components dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; @@ -431,14 +434,14 @@ __global__ void compute_env_mat_r( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./_sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; FPTYPE inr3 = inr4 * nr; FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); - dd = (1./nr) ;//* sw; + dd = ((FPTYPE)1./nr) ;//* sw; vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index e806af4e57..c305fb76f9 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -12,7 +12,7 @@ __global__ void atom_virial_reduction( unsigned int bid = blockIdx.x; unsigned int tid = threadIdx.x; __shared__ FPTYPE data[THREADS_PER_BLOCK]; - data[tid] = 0.f; + data[tid] = (FPTYPE)0.; for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { data[tid] += atom_virial[ii * 9 + bid]; } @@ -58,7 +58,7 @@ __global__ void virial_deriv_wrt_neighbors_a( // atomicAdd( // virial + idz, // net_deriv[idx * ndescrpt + idy * 4 
+ idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); - FPTYPE virial_tmp = 0.f; + FPTYPE virial_tmp = (FPTYPE)0.; for (int idw = 0; idw < 4; ++idw) { virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; } diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu index 0209ba933a..2eea51875a 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/cuda/prod_virial_grad.cu @@ -6,7 +6,7 @@ __device__ inline FPTYPE dev_dot9( const FPTYPE * arr1, const FPTYPE * arr2) { - FPTYPE result = 0.0; + FPTYPE result = (FPTYPE)0.0; for(int ii=0; ii<9; ii++){ result += arr1[ii] * arr2[ii]; } @@ -47,7 +47,7 @@ __global__ void virial_grad_wrt_neighbors_a( tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; } } - grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp); + grad_net[idx * ndescrpt + idy * 4 + idw] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); } template @@ -83,7 +83,7 @@ __global__ void virial_grad_wrt_neighbors_r( tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0]; } } - grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp); + grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); } namespace deepmd { diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index b9b1f80a0e..06ebeef22d 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -20,7 +20,7 @@ void locate_xx_se_a( { if (xx < lower) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < upper) { table_idx = (int)((xx - lower) / stride0); @@ -33,7 +33,7 @@ void locate_xx_se_a( } else { table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - 
xx = 0; + xx = (FPTYPE)0.; } } @@ -51,7 +51,7 @@ void locate_xx_se_t( { if (xx < min) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < lower) { table_idx = (int)((xx - min) / stride1); @@ -69,7 +69,7 @@ void locate_xx_se_t( } else { table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -86,7 +86,7 @@ void locate_xx_se_r( { if (xx < lower) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < upper) { table_idx = (int)((xx - lower) / stride0); @@ -99,7 +99,7 @@ void locate_xx_se_r( } else { table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -163,7 +163,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( bool unloop = false; int breakpoint = nnei - 1; - FPTYPE sum[MTILE] = {0.f}; + FPTYPE sum[MTILE] = {(FPTYPE)0.}; int mark_table_idx = -1; FPTYPE var[6]; for (int ii = 0; ii < nnei; ii++) { @@ -238,8 +238,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( em[block_idx * nnei * MTILE + ii * 4 + 2], em[block_idx * nnei * MTILE + ii * 4 + 3] }; - FPTYPE Csub = 0.f; - FPTYPE sum[MTILE] = {0.f}; + FPTYPE Csub = (FPTYPE)0.; + FPTYPE sum[MTILE] = {(FPTYPE)0.}; locate_xx_se_a(xx, table_idx, lower, upper, max, stride0, stride1); FPTYPE var[6]; @@ -298,7 +298,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( int breakpoint = nnei - 1; FPTYPE * iteratorC = (FPTYPE*) &_data[0]; for (int kk = 0; kk < MTILE; kk++) - iteratorC[kk * last_layer_size + thread_idx] = 0.f; + iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; __syncthreads(); int mark_table_idx = -1; @@ -352,7 +352,7 @@ __global__ void tabulate_fusion_se_t_fifth_order_polynomial( const int block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE sum = 0.f; + FPTYPE sum = (FPTYPE)0.; for (int ii = 0; ii < nnei_i; ii++) { FPTYPE ago = 
__shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); int breakpoint = nnei_j - 1; @@ -423,15 +423,15 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( } int table_idx = 0; locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); - FPTYPE sum = 0.f; - FPTYPE Csub = 0.f; + FPTYPE sum = (FPTYPE)0.; + FPTYPE Csub = (FPTYPE)0.; for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) { FPTYPE var[6]; load_polynomial_params(var, table, table_idx, kk, last_layer_size); FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; sum += iteratorA[kk] * res; - Csub += iteratorA[kk] * tmp * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx); + Csub += iteratorA[kk] * tmp * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx); } __syncwarp(); warp_reduce(sum); @@ -468,7 +468,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( const int block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE sum = 0.f; + FPTYPE sum = (FPTYPE)0.; for (int ii = 0; ii < nnei_i; ii++) { FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); bool unloop = false; @@ -559,7 +559,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial( FPTYPE xx = em[block_idx * nnei + ii]; int table_idx = 0; - FPTYPE Csub = 0.f; + FPTYPE Csub = (FPTYPE)0.; locate_xx_se_r(xx, table_idx, lower, upper, max, stride0, stride1); FPTYPE var[6]; diff --git a/source/lib/src/env_mat.cc b/source/lib/src/env_mat.cc index 7b0d3e4140..f269056cbb 100644 --- a/source/lib/src/env_mat.cc +++ b/source/lib/src/env_mat.cc @@ -108,7 +108,7 @@ env_mat_a_cpu ( { // compute the diff of the neighbors rij_a.resize (sec_a.back() * 3); - fill (rij_a.begin(), rij_a.end(), 0.0); + fill (rij_a.begin(), 
rij_a.end(), (FPTYPE)0.0); for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { if (fmt_nlist_a[jj] < 0) break; @@ -120,17 +120,17 @@ env_mat_a_cpu ( } // 1./rr, cos(theta), cos(phi), sin(phi) descrpt_a.resize (sec_a.back() * 4); - fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + fill (descrpt_a.begin(), descrpt_a.end(), (FPTYPE)0.0); // deriv wrt center: 3 descrpt_a_deriv.resize (sec_a.back() * 4 * 3); - fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), (FPTYPE)0.0); for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { if (fmt_nlist_a[nei_iter] < 0) break; const FPTYPE * rr = &rij_a[nei_iter * 3]; FPTYPE nr2 = deepmd::dot3(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; @@ -140,7 +140,7 @@ env_mat_a_cpu ( int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions int idx_value = nei_iter * 4; // 4 components // 4 value components - descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 0] = (FPTYPE)1./nr; descrpt_a[idx_value + 1] = rr[0] / nr2; descrpt_a[idx_value + 2] = rr[1] / nr2; descrpt_a[idx_value + 3] = rr[2] / nr2; @@ -149,17 +149,17 @@ env_mat_a_cpu ( descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; // deriv of component x/r2 - descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; - descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; - descrpt_a_deriv[idx_deriv + 5] = (2. 
* rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; + descrpt_a_deriv[idx_deriv + 3] = ((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 4] = ((FPTYPE)2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 5] = ((FPTYPE)2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; // deriv of component y/r2 - descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; - descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; - descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; + descrpt_a_deriv[idx_deriv + 6] = ((FPTYPE)2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 7] = ((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 8] = ((FPTYPE)2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; // deriv of component z/r2 - descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; - descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; - descrpt_a_deriv[idx_deriv +11] = (2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; + descrpt_a_deriv[idx_deriv + 9] = ((FPTYPE)2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv +10] = ((FPTYPE)2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv +11] = ((FPTYPE)2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; // 4 value components descrpt_a[idx_value + 0] *= sw; descrpt_a[idx_value + 1] *= sw; @@ -256,7 +256,7 @@ env_mat_r_cpu ( { // compute the diff of the neighbors rij_a.resize (sec.back() * 3); - fill (rij_a.begin(), rij_a.end(), 0.0); + fill (rij_a.begin(), rij_a.end(), (FPTYPE)0.0); for (int ii = 0; ii < int(sec.size()) - 1; ++ii) { for (int jj = sec[ii]; jj < sec[ii + 1]; ++jj) { if (fmt_nlist[jj] < 0) break; @@ -269,17 +269,17 @@ env_mat_r_cpu ( } // 1./rr, cos(theta), cos(phi), sin(phi) descrpt_a.resize (sec.back()); - fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + fill (descrpt_a.begin(), descrpt_a.end(), (FPTYPE)0.0); // deriv wrt center: 3 descrpt_a_deriv.resize (sec.back() * 3); - fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), (FPTYPE)0.0); for (int sec_iter = 0; sec_iter < int(sec.size()) - 1; ++sec_iter) { for (int nei_iter = sec[sec_iter]; nei_iter < sec[sec_iter+1]; ++nei_iter) { if (fmt_nlist[nei_iter] < 0) break; const FPTYPE * rr = &rij_a[nei_iter * 3]; FPTYPE nr2 = deepmd::dot3(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; @@ -289,7 +289,7 @@ env_mat_r_cpu ( int idx_deriv = nei_iter * 3; // 1 components time 3 directions int idx_value = nei_iter; // 1 components // 4 value components - descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 0] = (FPTYPE)1./nr; // deriv of component 1/r descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; diff --git a/source/lib/src/ewald.cc b/source/lib/src/ewald.cc index 486d2cbb73..08d0354023 100644 --- a/source/lib/src/ewald.cc +++ b/source/lib/src/ewald.cc @@ -13,7 +13,7 @@ dir_err_esti(const VALUETYPE & test_q, 
const VALUETYPE & rcut = param.rcut; const VALUETYPE & beta = param.beta; const VALUETYPE rho_q2 = c2/nn; - VALUETYPE sum = 2 * test_q + VALUETYPE sum = (VALUETYPE)2. * test_q * sqrt (rho_q2 / rcut) * exp (- beta*beta*rcut*rcut) * ElectrostaticConvertion; return sum; @@ -215,7 +215,7 @@ ewald_recp( VALUETYPE eincr = expnmm2 * (sqr[mc] * sqr[mc] + sqi[mc] * sqi[mc]); thread_ener[thread_id] += eincr; // virial - VALUETYPE vpref = -2. * (1. + M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2; + VALUETYPE vpref = (VALUETYPE)-2. * ((VALUETYPE)1. + M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ VALUETYPE tmp = vpref * rm[dd0] * rm[dd1]; @@ -225,10 +225,10 @@ ewald_recp( } // force for (int ii = 0; ii < natoms; ++ii){ - VALUETYPE mdotr = - 2. * M_PI * (coord[ii*3+0]*rm[0] + coord[ii*3+1]*rm[1] + coord[ii*3+2]*rm[2]); + VALUETYPE mdotr = (VALUETYPE)-2. * M_PI * (coord[ii*3+0]*rm[0] + coord[ii*3+1]*rm[1] + coord[ii*3+2]*rm[2]); VALUETYPE tmpr = charge[ii] * cos(mdotr); VALUETYPE tmpi = charge[ii] * sin(mdotr); - VALUETYPE cc = 4. * M_PI * (tmpr * sqi[mc] + tmpi * sqr[mc]) * expnmm2; + VALUETYPE cc = (VALUETYPE)4. * M_PI * (tmpr * sqi[mc] + tmpi * sqr[mc]) * expnmm2; thread_force[thread_id][ii*3+0] -= rm[0] * cc; thread_force[thread_id][ii*3+1] -= rm[1] * cc; thread_force[thread_id][ii*3+2] -= rm[2] * cc; @@ -252,14 +252,14 @@ ewald_recp( } VALUETYPE vol = volume_cpu(region); - ener /= 2 * M_PI * vol; + ener /= (VALUETYPE)2. * M_PI * vol; ener *= ElectrostaticConvertion; for (int ii = 0; ii < 3*natoms; ++ii){ - force[ii] /= 2 * M_PI * vol; + force[ii] /= (VALUETYPE)2. * M_PI * vol; force[ii] *= ElectrostaticConvertion; } for (int ii = 0; ii < 3*3; ++ii){ - virial[ii] /= 2 * M_PI * vol; + virial[ii] /= (VALUETYPE)2. 
* M_PI * vol; virial[ii] *= ElectrostaticConvertion; } delete[]sqr; diff --git a/source/lib/src/gelu.cc b/source/lib/src/gelu.cc index e86faa882b..cfeff17a27 100644 --- a/source/lib/src/gelu.cc +++ b/source/lib/src/gelu.cc @@ -1,5 +1,5 @@ #include "gelu.h" -#include "math.h" +#include #include "device.h" template @@ -9,7 +9,7 @@ void deepmd::gelu_cpu( const int size) { for (int ii = 0; ii < size; ii++) { - out[ii] = xx[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] *xx[ii]))); + out[ii] = xx[ii] * (FPTYPE)0.5 * ((FPTYPE)1.0 + tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] *xx[ii]))); } } @@ -21,8 +21,8 @@ void deepmd::gelu_grad_cpu( const int size) { for (int ii = 0; ii < size; ii++) { - const FPTYPE var = tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] * xx[ii])); - out[ii] = dy[ii] * (0.5 * SQRT_2_PI * xx[ii] * (1 - var * var) * (0.134145 * xx[ii] * xx[ii] + 1) + 0.5 * var + 0.5); + const FPTYPE var = tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] * xx[ii])); + out[ii] = dy[ii] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[ii] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5); } } @@ -35,9 +35,9 @@ void deepmd::gelu_grad_grad_cpu( const int size) { for (int ii = 0; ii < size; ii++) { - const FPTYPE var1 = tanh(SQRT_2_PI * (xx[ii] + 0.044715 * xx[ii] * xx[ii] *xx[ii])); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[ii] * xx[ii] + 1); - out[ii] = dy[ii] * dy_2[ii] * (0.134145 * SQRT_2_PI * xx[ii] * xx[ii] * (1 - var1 * var1) - SQRT_2_PI * xx[ii] * var2 * (0.134145 * xx[ii] * xx[ii] + 1) * var1 + var2); + const FPTYPE var1 = tanh((FPTYPE)SQRT_2_PI * (xx[ii] + (FPTYPE)0.044715 * xx[ii] * xx[ii] *xx[ii])); + const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. 
- var1 * var1) * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.); + out[ii] = dy[ii] * dy_2[ii] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[ii] * xx[ii] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[ii] * var2 * ((FPTYPE)0.134145 * xx[ii] * xx[ii] + (FPTYPE)1.) * var1 + var2); } } diff --git a/source/lib/src/pair_tab.cc b/source/lib/src/pair_tab.cc index 2c48ce957a..22c9bd5390 100644 --- a/source/lib/src/pair_tab.cc +++ b/source/lib/src/pair_tab.cc @@ -157,11 +157,11 @@ deepmd::pair_tab_cpu( } for (int ii = 0; ii < nall; ++ii){ int i_idx = ii; - force[i_idx * 3 + 0] = 0; - force[i_idx * 3 + 1] = 0; - force[i_idx * 3 + 2] = 0; + force[i_idx * 3 + 0] = (FPTYPE)0.; + force[i_idx * 3 + 1] = (FPTYPE)0.; + force[i_idx * 3 + 2] = (FPTYPE)0.; for (int dd = 0; dd < 9; ++dd) { - virial[i_idx * 9 + dd] = 0; + virial[i_idx * 9 + dd] = (FPTYPE)0.; } } // compute force of a frame diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index 6859a5bae3..626b572b00 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -105,9 +105,9 @@ prod_force_r_cpu( for (int ii = 0; ii < nall; ++ii){ int i_idx = ii; - force[i_idx * 3 + 0] = 0; - force[i_idx * 3 + 1] = 0; - force[i_idx * 3 + 2] = 0; + force[i_idx * 3 + 0] = (FPTYPE)0.; + force[i_idx * 3 + 1] = (FPTYPE)0.; + force[i_idx * 3 + 2] = (FPTYPE)0.; } // compute force of a frame diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc index 78bad3c9ca..88d61a4436 100644 --- a/source/lib/src/prod_force_grad.cc +++ b/source/lib/src/prod_force_grad.cc @@ -37,7 +37,7 @@ prod_force_grad_a_cpu( // reset the frame to 0 for (int ii = 0; ii < nloc; ++ii){ for (int aa = 0; aa < ndescrpt; ++aa){ - grad_net[ii * ndescrpt + aa] = 0; + grad_net[ii * ndescrpt + aa] = (FPTYPE)0.; } } @@ -116,7 +116,7 @@ prod_force_grad_r_cpu( // reset the frame to 0 for (int ii = 0; ii < nloc; ++ii){ for (int aa = 0; aa < ndescrpt; ++aa){ - grad_net[ii * ndescrpt + aa] = 0; + grad_net[ii * 
ndescrpt + aa] = (FPTYPE)0.; } } diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc index d715cf9e5b..29b343ba0b 100644 --- a/source/lib/src/prod_virial.cc +++ b/source/lib/src/prod_virial.cc @@ -37,10 +37,10 @@ prod_virial_a_cpu( const int ndescrpt = 4 * nnei; for (int ii = 0; ii < 9; ++ ii){ - virial[ii] = 0.; + virial[ii] = (FPTYPE)0.; } for (int ii = 0; ii < 9 * nall; ++ ii){ - atom_virial[ii] = 0.; + atom_virial[ii] = (FPTYPE)0.; } // compute virial of a frame @@ -55,7 +55,7 @@ prod_virial_a_cpu( int aa_start, aa_end; make_index_range (aa_start, aa_end, jj, nnei); for (int aa = aa_start; aa < aa_end; ++aa) { - FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + FPTYPE pref = (FPTYPE)-1.0 * net_deriv[i_idx * ndescrpt + aa]; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; @@ -116,10 +116,10 @@ prod_virial_r_cpu( const int ndescrpt = nnei; for (int ii = 0; ii < 9; ++ ii){ - virial[ii] = 0.; + virial[ii] = (FPTYPE)0.; } for (int ii = 0; ii < 9 * nall; ++ ii){ - atom_virial[ii] = 0.; + atom_virial[ii] = (FPTYPE)0.; } // compute virial of a frame diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu index 73d85d2111..ab75e7f7a0 100644 --- a/source/lib/src/rocm/coord.hip.cu +++ b/source/lib/src/rocm/coord.hip.cu @@ -51,6 +51,9 @@ __device__ inline int compute_pbc_shift( return shift; } +__device__ inline double _fmod(double x, double y) {return fmod(x, y);} +__device__ inline float _fmod(float x, float y) {return fmodf(x, y);} + template __global__ void normalize_one( FPTYPE *out_c, @@ -64,8 +67,8 @@ __global__ void normalize_one( FPTYPE inter[3]; phys2Inter(inter,out_c+idy*3,rec_boxt); for (int dd = 0; dd < 3; ++dd) { - inter[dd]=(FPTYPE)fmod((double)inter[dd], 1.); - if (inter[dd] < 0.) inter[dd] += 1.; + inter[dd]=_fmod(inter[dd], (FPTYPE)1.); + if (inter[dd] < (FPTYPE)0.) 
inter[dd] += (FPTYPE)1.; } inter2Phys(out_c+idy*3,inter,boxt); } @@ -93,7 +96,7 @@ __global__ void _fill_idx_cellmap( ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; global_grid[dd] = nat_end[dd] - nat_stt[dd]; idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; - cell_size[dd] = 1./global_grid[dd]; + cell_size[dd] = (FPTYPE)1./global_grid[dd]; nat_orig[dd] = nat_stt[dd] * cell_size[dd]; } if (idy __global__ void gelu( FPTYPE * out, @@ -11,7 +14,7 @@ __global__ void gelu( if (idx >= size) { return; } - out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + out[idx] = xx[idx] * (FPTYPE)0.5 * ((FPTYPE)1.0 + _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx]))); } template @@ -26,8 +29,8 @@ __global__ void gelu_grad( return; } // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); - out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5); + const FPTYPE var = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])); + out[idx] = dy[idx] * ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) 
+ (FPTYPE)0.5 * var + (FPTYPE)0.5); } template @@ -43,9 +46,9 @@ __global__ void gelu_grad_grad( return; } // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1); - out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2); + const FPTYPE var1 = _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] *xx[idx])); + const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.); + out[idx] = dy[idx] * dy_2[idx] * ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] * ((FPTYPE)1. - var1 * var1) - (FPTYPE)SQRT_2_PI * xx[idx] * var2 * ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 + var2); } namespace deepmd { diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index 45fa0deb41..5f88fac4f4 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -3,6 +3,9 @@ #include "device.h" #include "hipcub/hipcub.hpp" +__device__ inline double _sqrt(double x) {return sqrt(x);} +__device__ inline float _sqrt(float x) {return sqrtf(x);} + // common part of prod_env_mat template < typename Key, @@ -55,18 +58,18 @@ __device__ inline void spline5_switch( const float & rmax) { if (xx < rmin) { - dd = 0; - vv = 1; + dd = (FPTYPE)0.; + vv = (FPTYPE)1.; } else if (xx < rmax) { FPTYPE uu = (xx - rmin) / (rmax - rmin) ; - FPTYPE du = 1. / (rmax - rmin) ; - vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; - dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + FPTYPE du = (FPTYPE)1. / (rmax - rmin) ; + vv = uu*uu*uu * ((FPTYPE)-6. 
* uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + (FPTYPE)1.; + dd = ( (FPTYPE)3. * uu*uu * ((FPTYPE)-6. * uu*uu + (FPTYPE)15. * uu - (FPTYPE)10.) + uu*uu*uu * ((FPTYPE)-12. * uu + (FPTYPE)15.) ) * du; } else { - dd = 0; - vv = 0; + dd = (FPTYPE)0.; + vv = (FPTYPE)0.; } } @@ -80,7 +83,7 @@ __device__ inline uint_64 encoding_nbor_info( // the type of nbor atom must be smaller than 128 // the distance of center atom between nbor atom must be smaller than 128 // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24) - if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) { + if(type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) { __builtin_trap(); } return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index; @@ -136,7 +139,7 @@ __global__ void format_nlist_fill_a( for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - FPTYPE rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = _sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); } @@ -343,14 +346,14 @@ __global__ void compute_env_mat_a( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./_sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; FPTYPE inr3 = inr4 * nr; FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); - dd[0] = (1./nr) ;//* sw; + dd[0] = ((FPTYPE)1./nr) ;//* sw; dd[1] = (rr[0] / nr2) ;//* sw; dd[2] = (rr[1] / nr2) ;//* sw; dd[3] = (rr[2] / nr2) ;//* sw; @@ -358,17 +361,17 @@ __global__ void compute_env_mat_a( vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; // 
****deriv of component x/r2 - vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; - vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; - vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; // ***deriv of component y/r2 - vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; - vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; - vv[8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = (((FPTYPE)2. 
* rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; // ***deriv of component z/r2 - vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; - vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; - vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= (((FPTYPE)2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= (((FPTYPE)2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; // 4 value components dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; @@ -419,9 +422,9 @@ __global__ void compute_env_mat_r( const int idx_value = ii; // 4 components const int idx_deriv = ii * 3; // 4 components time 3 directions if (row_nlist[ii] >= 0) { - FPTYPE rr[3] = {0}; - FPTYPE vv[3] = {0}; - FPTYPE dd = 0; + FPTYPE rr[3] = {(FPTYPE)0.}; + FPTYPE vv[3] = {(FPTYPE)0.}; + FPTYPE dd = (FPTYPE)0.; const int & j_idx = row_nlist[ii]; for (int kk = 0; kk < 3; kk++) { rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; @@ -429,14 +432,14 @@ __global__ void compute_env_mat_r( } // const FPTYPE * rr = &row_rij[ii * 3]; FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); + FPTYPE inr = (FPTYPE)1./_sqrt(nr2); FPTYPE nr = nr2 * inr; FPTYPE inr2 = inr * inr; FPTYPE inr4 = inr2 * inr2; FPTYPE inr3 = inr4 * nr; FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); - dd = (1./nr) ;//* sw; + dd = ((FPTYPE)1./nr) ;//* sw; vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu index 19ca5f0b89..d2136bf12f 100644 --- a/source/lib/src/rocm/prod_force.hip.cu +++ 
b/source/lib/src/rocm/prod_force.hip.cu @@ -14,7 +14,7 @@ __global__ void force_deriv_wrt_center_atom( unsigned int bid = blockIdx.x; unsigned int tid = threadIdx.x; for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { - data[ii] = 0.f; + data[ii] = (FPTYPE)0.; } for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { for (int jj = 0; jj < 3; jj++) { @@ -61,7 +61,7 @@ __global__ void force_deriv_wrt_neighbors_a( if (j_idx < 0) { return; } - FPTYPE force_tmp = 0.f; + FPTYPE force_tmp = (FPTYPE)0.; for (int idw = 0; idw < 4; ++idw) { force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; } diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu index 5c7cc05721..d1304766cd 100644 --- a/source/lib/src/rocm/prod_virial.hip.cu +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -12,7 +12,7 @@ __global__ void atom_virial_reduction( unsigned int bid = blockIdx.x; unsigned int tid = threadIdx.x; __shared__ FPTYPE data[THREADS_PER_BLOCK]; - data[tid] = 0.f; + data[tid] = (FPTYPE)0.; for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { data[tid] += atom_virial[ii * 9 + bid]; } @@ -55,7 +55,7 @@ __global__ void virial_deriv_wrt_neighbors_a( if (j_idx < 0) { return; } - FPTYPE virial_tmp = 0.f; + FPTYPE virial_tmp = (FPTYPE)0.; for (int idw = 0; idw < 4; ++idw) { virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; } diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu index 18c009f0c8..9875b35f24 100644 --- a/source/lib/src/rocm/prod_virial_grad.hip.cu +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -6,7 +6,7 @@ __device__ inline FPTYPE dev_dot9( const FPTYPE * arr1, const FPTYPE * arr2) { - FPTYPE result = 0.0; + FPTYPE result = (FPTYPE)0.0; for(int ii=0; ii<9; ii++){ result += arr1[ii] * 
arr2[ii]; } @@ -83,7 +83,7 @@ __global__ void virial_grad_wrt_neighbors_r( tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0]; } } - grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp); + grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); } namespace deepmd { diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index 0354bc68b3..f213cb7d28 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -20,7 +20,7 @@ void locate_xx( { if (xx < lower) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < upper) { table_idx = (int)((xx - lower) / stride0); @@ -33,7 +33,7 @@ void locate_xx( } else { table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -51,7 +51,7 @@ void locate_xx_se_t( { if (xx < min) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < lower) { table_idx = (int)((xx - min) / stride1); @@ -69,7 +69,7 @@ void locate_xx_se_t( } else { table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -117,7 +117,7 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( int breakpoint = nnei - 1; FPTYPE * iteratorC = (FPTYPE*) &_data[0]; for (int kk = 0; kk < MTILE; kk++) - iteratorC[kk * last_layer_size + thread_idx] = 0.f; + iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; __syncthreads(); for (int ii = 0; ii < nnei; ii++) { @@ -190,8 +190,8 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( int table_idx = 0; locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - FPTYPE sum[KTILE] = {0.f}; - FPTYPE Csub = 0.f; + FPTYPE sum[KTILE] = {(FPTYPE)0.}; + FPTYPE Csub = (FPTYPE)0.; for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { FPTYPE var[6]; // load iteratorB through table @@ -210,7 
+210,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; - Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res; + Csub += (nnei - breakpoint) * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx) * res; } //__syncwarp();->syncwrap __syncthreads(); @@ -255,7 +255,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( int breakpoint = nnei - 1; FPTYPE * iteratorC = (FPTYPE*) &_data[0]; for (int kk = 0; kk < MTILE; kk++) - iteratorC[kk * last_layer_size + thread_idx] = 0.f; + iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; __syncthreads(); for (int ii = 0; ii < nnei; ii++) { @@ -275,7 +275,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; - FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. 
* var[5] * xx) * xx) * xx) * xx; for (int kk = 0; kk < MTILE; kk++) { int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; @@ -310,7 +310,7 @@ __global__ void tabulate_fusion_se_t_fifth_order_polynomial( const int block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE sum = 0.f; + FPTYPE sum = (FPTYPE)0.; for (int ii = 0; ii < nnei_i; ii++) { FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); int breakpoint = nnei_j - 1; @@ -382,8 +382,8 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( } int table_idx = 0; locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); - FPTYPE sum = 0.f; - FPTYPE Csub = 0.f; + FPTYPE sum = (FPTYPE)0.; + FPTYPE Csub = (FPTYPE)0.; for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) { FPTYPE var[6]; // load iteratorB through table @@ -396,7 +396,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; sum += iteratorA[kk] * res; - Csub += iteratorA[kk] * tmp * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx); + Csub += iteratorA[kk] * tmp * (var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. 
* var[5] * xx) * xx) * xx) * xx); } __syncthreads(); warp_reduce(sum); @@ -433,7 +433,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( const int block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE sum = 0.f; + FPTYPE sum = (FPTYPE)0.; for (int ii = 0; ii < nnei_i; ii++) { FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); bool unloop = false; @@ -456,7 +456,7 @@ __global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; - FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx; sum += (tmp * res_grad * dz_xx + dz_em * res); if (unloop) break; @@ -539,7 +539,7 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial( var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; - Csub +=(var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj]; + Csub +=(var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. 
* var[5] * xx) * xx) * xx) * xx) * dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj]; } //__syncwarp();->syncwrap __syncthreads(); @@ -585,7 +585,7 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + ((FPTYPE)2. * var[2] + ((FPTYPE)3. * var[3] + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx; dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size + thread_idx] = dz_dy_dem[block_idx * nnei + ii]*res_grad; } diff --git a/source/lib/src/soft_min_switch.cc b/source/lib/src/soft_min_switch.cc index 88471a3d4b..9b37b29cde 100644 --- a/source/lib/src/soft_min_switch.cc +++ b/source/lib/src/soft_min_switch.cc @@ -17,12 +17,12 @@ void deepmd::soft_min_switch_cpu( { // fill results with 0 for (int ii = 0; ii < nloc; ++ii){ - sw_value[ii] = 0; + sw_value[ii] = (FPTYPE)0.; } for (int ii = 0; ii < nloc * nnei; ++ii){ - sw_deriv[ii * 3 + 0] = 0; - sw_deriv[ii * 3 + 1] = 0; - sw_deriv[ii * 3 + 2] = 0; + sw_deriv[ii * 3 + 0] = (FPTYPE)0.; + sw_deriv[ii * 3 + 1] = (FPTYPE)0.; + sw_deriv[ii * 3 + 2] = (FPTYPE)0.; } // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ @@ -62,8 +62,8 @@ void deepmd::soft_min_switch_cpu( FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; FPTYPE rr = sqrt(rr2); FPTYPE ee = exp(-rr / alpha); - FPTYPE pref_c = (1./rr - 1./alpha) * ee ; - FPTYPE pref_d = 1./(rr * alpha) * ee; + FPTYPE pref_c = ((FPTYPE)1./rr - (FPTYPE)1./alpha) * ee ; + FPTYPE pref_d = (FPTYPE)1./(rr * alpha) * ee; FPTYPE ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv[rij_idx_shift + 0] += ts * dr[0]; diff --git 
a/source/lib/src/soft_min_switch_force.cc b/source/lib/src/soft_min_switch_force.cc index 724952493d..9a6633daa0 100644 --- a/source/lib/src/soft_min_switch_force.cc +++ b/source/lib/src/soft_min_switch_force.cc @@ -19,9 +19,9 @@ void deepmd::soft_min_switch_force_cpu( // set zeros for (int ii = 0; ii < nall; ++ii){ int i_idx = ii; - force[i_idx * 3 + 0] = 0; - force[i_idx * 3 + 1] = 0; - force[i_idx * 3 + 2] = 0; + force[i_idx * 3 + 0] = (FPTYPE)0.; + force[i_idx * 3 + 1] = (FPTYPE)0.; + force[i_idx * 3 + 2] = (FPTYPE)0.; } // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ diff --git a/source/lib/src/soft_min_switch_force_grad.cc b/source/lib/src/soft_min_switch_force_grad.cc index 31e46e9d6d..138d20d93c 100644 --- a/source/lib/src/soft_min_switch_force_grad.cc +++ b/source/lib/src/soft_min_switch_force_grad.cc @@ -18,7 +18,7 @@ void deepmd::soft_min_switch_force_grad_cpu( { // reset the frame to 0 for (int ii = 0; ii < nloc; ++ii){ - grad_net[ii] = 0; + grad_net[ii] = (FPTYPE)0.; } // compute grad of one frame diff --git a/source/lib/src/soft_min_switch_virial.cc b/source/lib/src/soft_min_switch_virial.cc index a93ab3c1fb..634f6b6d49 100644 --- a/source/lib/src/soft_min_switch_virial.cc +++ b/source/lib/src/soft_min_switch_virial.cc @@ -20,10 +20,10 @@ void deepmd::soft_min_switch_virial_cpu( // { for (int ii = 0; ii < 9; ++ ii){ - virial[ii] = 0.; + virial[ii] = (FPTYPE)0.; } for (int ii = 0; ii < 9 * nall; ++ ii){ - atom_virial[ii] = 0.; + atom_virial[ii] = (FPTYPE)0.; } // compute virial of a frame diff --git a/source/lib/src/soft_min_switch_virial_grad.cc b/source/lib/src/soft_min_switch_virial_grad.cc index 1bb28a7c63..5c86376737 100644 --- a/source/lib/src/soft_min_switch_virial_grad.cc +++ b/source/lib/src/soft_min_switch_virial_grad.cc @@ -19,7 +19,7 @@ void deepmd::soft_min_switch_virial_grad_cpu( { // reset the frame to 0 for (int ii = 0; ii < nloc; ++ii){ - grad_net[ii] = 0; + grad_net[ii] = (FPTYPE)0.; } // compute grad of one frame diff 
--git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index e116711466..b75e9155f9 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -25,7 +25,7 @@ inline void locate_xx( { if (xx < lower) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < upper) { table_idx = (int)((xx - lower) / stride0); @@ -38,7 +38,7 @@ inline void locate_xx( } else { table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -56,7 +56,7 @@ inline void locate_xx_se_t( { if (xx < min) { table_idx = 0; - xx = 0; + xx = (FPTYPE)0.; } else if (xx < lower) { table_idx = (int)((xx - min) / stride1); @@ -74,7 +74,7 @@ inline void locate_xx_se_t( } else { table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = 0; + xx = (FPTYPE)0.; } } @@ -187,7 +187,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu( } int table_idx = 0; locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); - FPTYPE grad = 0.0; + FPTYPE grad = (FPTYPE)0.0; for (int kk = 0; kk < last_layer_size; kk++) { rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk]; rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk]; @@ -273,7 +273,7 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu( FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; - FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx; + FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. 
* a5 * xx) * xx) * xx) * xx; if (unloop) { dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * (var * hh[0] + dz_xx * var_grad * ll[0]); dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * (var * hh[1] + dz_xx * var_grad * ll[1]); @@ -371,8 +371,8 @@ void deepmd::tabulate_fusion_se_t_grad_cpu( // FPTYPE * res = new FPTYPE[4 * last_layer_size]; #pragma omp parallel for for (int ii = 0; ii < nloc; ii++) { - FPTYPE ll = 0; - FPTYPE rr = 0; + FPTYPE ll = (FPTYPE)0.; + FPTYPE rr = (FPTYPE)0.; for (int jj = 0; jj < nnei_i; jj++) { FPTYPE ago = em_x[ii * nnei_i * nnei_j + jj * nnei_j + nnei_j - 1]; bool unloop = false; @@ -385,7 +385,7 @@ void deepmd::tabulate_fusion_se_t_grad_cpu( } int table_idx = 0; locate_xx_se_t(lower, upper, -_max, _max, stride0, stride1, xx, table_idx); - FPTYPE grad = 0.0; + FPTYPE grad = (FPTYPE)0.0; for (int mm = 0; mm < last_layer_size; mm++) { rr = dy[ii * last_layer_size + mm]; FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * mm + 0]; @@ -397,11 +397,11 @@ void deepmd::tabulate_fusion_se_t_grad_cpu( FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; if (unloop) { - grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr * (nnei_j - kk); + grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx) * ll * rr * (nnei_j - kk); dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr * (nnei_j - kk); } else { - grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr; + grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. 
* a5 * xx) * xx) * xx) * xx) * ll * rr; dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr; } } @@ -458,7 +458,7 @@ void deepmd::tabulate_fusion_se_t_grad_grad_cpu( FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * mm + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * mm + 5]; FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; - FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx; + FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx; dz_dy[ii * last_layer_size + mm] += var * dz_em + dz_xx * var_grad * tmp; } @@ -531,7 +531,7 @@ void deepmd::tabulate_fusion_se_r_grad_cpu( FPTYPE xx = em[ii * nnei + jj]; int table_idx = 0; locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); - FPTYPE grad = 0.0; + FPTYPE grad = (FPTYPE)0.0; for (int kk = 0; kk < last_layer_size; kk++) { FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; @@ -539,7 +539,7 @@ void deepmd::tabulate_fusion_se_r_grad_cpu( FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dy[ii * last_layer_size * nnei + jj * last_layer_size + kk]; + grad += (a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. 
* a5 * xx) * xx) * xx) * xx) * dy[ii * last_layer_size * nnei + jj * last_layer_size + kk]; } dy_dem[ii * nnei + jj] = grad; } @@ -578,7 +578,7 @@ void deepmd::tabulate_fusion_se_r_grad_grad_cpu( FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx; + FPTYPE var_grad = a1 + ((FPTYPE)2. * a2 + ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx; dz_dy[ii * last_layer_size * nnei + jj * last_layer_size + kk] = dz_dy_dem[ii * nnei + jj] * var_grad; } } diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 8c3a3e4c16..394c9730c7 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -72,13 +72,22 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) + install(TARGETS deepmd_op_cuda DESTINATION lib/) + elseif (USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES}) + install(TARGETS deepmd_op_rocm DESTINATION lib/) else() target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) endif() add_test( runUnitTests runUnitTests ) +set_target_properties( + runUnitTests + PROPERTIES + INSTALL_RPATH "$ORIGIN/../lib" +) + # include(GoogleTest) # add_executable(FooTest tests/test_simulation_region.cc) # gtest_add_tests(TARGET FooTest @@ -108,4 +117,4 @@ else () endif () install(TARGETS runUnitTests DESTINATION bin/) -install(TARGETS runUnitTests DESTINATION lib/) +