From d5679950388e7a0ebd912c0cd77f44c4b167e40c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 9 Oct 2023 17:01:20 +0800 Subject: [PATCH 01/14] fix the smoothness issue of se_attn_v2 --- deepmd/descriptor/se_atten.py | 23 ++++- source/tests/common.py | 72 ++++++++++++++ source/tests/test_model_se_atten.py | 139 ++++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 4 deletions(-) diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py index 90cc86c4d9..102bc5ec6f 100644 --- a/deepmd/descriptor/se_atten.py +++ b/deepmd/descriptor/se_atten.py @@ -564,6 +564,8 @@ def build( self.filter_precision, ) self.negative_mask = -(2 << 32) * (1.0 - self.nmask) + # hard coding the magnitude of attention weight shift + self.smth_attn_w_shift = 20.0 # only used when tensorboard was set as true tf.summary.histogram("descrpt", self.descrpt) tf.summary.histogram("rij", self.rij) @@ -599,7 +601,7 @@ def build( ) self.recovered_r = ( tf.reshape( - tf.slice(tf.reshape(self.descrpt, [-1, 4]), [0, 0], [-1, 1]), + tf.slice(tf.reshape(self.descrpt_reshape, [-1, 4]), [0, 0], [-1, 1]), [-1, natoms[0], self.sel_all_a[0]], ) * self.std_looked_up @@ -865,10 +867,23 @@ def _scaled_dot_attn( save_weights=True, ): attn = tf.matmul(Q / temperature, K, transpose_b=True) - attn *= self.nmask - attn += self.negative_mask + if self.smooth: + # (nb x nloc) x nsel + nsel = self.sel_all_a[0] + attn = ((attn + self.smth_attn_w_shift) * + tf.reshape(self.recovered_switch, [-1,1,nsel]) * + tf.reshape(self.recovered_switch, [-1,nsel,1]) - + self.smth_attn_w_shift) + else: + attn *= self.nmask + attn += self.negative_mask attn = tf.nn.softmax(attn, axis=-1) - attn *= tf.reshape(self.nmask, [-1, attn.shape[-1], 1]) + if self.smooth: + attn = (attn * + tf.reshape(self.recovered_switch, [-1,1,nsel]) * + tf.reshape(self.recovered_switch, [-1,nsel,1])) + else: + attn *= tf.reshape(self.nmask, [-1, attn.shape[-1], 1]) if save_weights: self.attn_weight[layer] = attn[0] # atom 0 if dotr: diff --git a/source/tests/common.py b/source/tests/common.py index f8ed23df03..40724dc406 100644 --- a/source/tests/common.py +++ b/source/tests/common.py @@ -530,6 +530,78 @@ def strerch_box(old_coord, old_box, new_box): return ncoord.reshape(old_coord.shape) +def finite_difference_fv(sess, energy, feed_dict, t_coord, t_box, delta=1e-6): + """for energy models, compute f, v by finite difference + """ + base_dict = feed_dict.copy() + coord0 = base_dict.pop(t_coord) + box0 = base_dict.pop(t_box) + fdf = -finite_difference( + lambda coord: sess.run( + energy, feed_dict={**base_dict, t_coord: coord, t_box: box0} + ).reshape(-1), + coord0, + delta=delta, + ).reshape(-1) + fdv = -( + finite_difference( + lambda box: sess.run( + energy, + feed_dict={ + **base_dict, + t_coord: strerch_box(coord0, box0, box), + t_box: box, + }, + ).reshape(-1), + box0, + delta=delta, + ) + .reshape([-1, 3, 3]) + .transpose(0, 2, 1) + @ box0.reshape(3, 3) + ).reshape(-1) + return fdf, fdv + + +def check_continuity(f, cc, rcut, delta): + """coord[0:2] to [[0, 0, 0], [rcut+-.5*delta, 0, 0]] + """ + cc = cc.reshape([-1,3]) + cc0 = np.copy(cc) + cc1 = np.copy(cc) + cc0[:2,:] = np.array([ + 0.0, 0.0, 0.0, + rcut-0.5*delta, 0.0, 0.0, + ]).reshape([-1,3]) + cc1[:2,:] = np.array([ + 0.0, 0.0, 0.0, + rcut+0.5*delta, 0.0, 0.0, + ]).reshape([-1,3]) + return f(cc0.reshape(-1)), f(cc1.reshape(-1)) + + +def check_smooth_efv(sess, energy, force, virial, feed_dict, t_coord, rcut, delta=1e-5): + """check the smoothness of e, f and v + the returned values are de, df, dv + de[0] are supposed to be closed to de[1] + df[0] are supposed to be closed to df[1] + dv[0] are supposed to be closed to dv[1] + """ + base_dict = feed_dict.copy() + coord0 = base_dict.pop(t_coord) + [fe, ff, fv] = [ + lambda coord: sess.run( + ii, feed_dict={**base_dict, t_coord: coord} + ).reshape(-1) + for ii in [energy, force, virial] + ] + [de, df, dv] = [ + check_continuity(ii, coord0, rcut, delta=delta) + for ii in [fe, ff, fv] + ] + return de, df, dv + + def run_dp(cmd: str) -> int: """Run DP directly from the entry point instead of the subprocess. diff --git a/source/tests/test_model_se_atten.py b/source/tests/test_model_se_atten.py index 445959ceb2..e674c88998 100644 --- a/source/tests/test_model_se_atten.py +++ b/source/tests/test_model_se_atten.py @@ -28,6 +28,7 @@ from deepmd.utils.type_embed import ( TypeEmbedNet, ) +from common import finite_difference_fv, check_smooth_efv GLOBAL_ENER_FLOAT_PRECISION = tf.float64 GLOBAL_TF_FLOAT_PRECISION = tf.float64 @@ -726,3 +727,141 @@ def test_stripped_type_embedding_exclude_types(self): np.testing.assert_almost_equal(des[:, 0:2], 0.0, 10) with self.assertRaises(AssertionError): np.testing.assert_almost_equal(des[:, 2:6], 0.0, 10) + + + def test_smoothness_of_stripped_type_embedding_smooth_model(self): + """test: auto-diff, continuity of e,f,v + """ + jfile = "water_se_atten.json" + jdata = j_loader(jfile) + + systems = j_must_have(jdata, "systems") + set_pfx = j_must_have(jdata, "set_prefix") + batch_size = j_must_have(jdata, "batch_size") + test_size = j_must_have(jdata, "numb_test") + batch_size = 1 + test_size = 1 + stop_batch = j_must_have(jdata, "stop_batch") + rcut = j_must_have(jdata["model"]["descriptor"], "rcut") + + data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None) + + test_data = data.get_test() + numb_test = 1 + + jdata["model"]["descriptor"].pop("type", None) + jdata["model"]["descriptor"]["ntypes"] = 2 + jdata["model"]["descriptor"]["stripped_type_embedding"] = True + jdata["model"]["descriptor"]["smooth_type_embdding"] = True + jdata["model"]["descriptor"]["attn_layer"] = 1 + descrpt = DescrptSeAtten(**jdata["model"]["descriptor"], uniform_seed=True) + jdata["model"]["fitting_net"]["descrpt"] = descrpt + fitting = EnerFitting(**jdata["model"]["fitting_net"], uniform_seed=True) + typeebd_param = jdata["model"]["type_embedding"] + typeebd = TypeEmbedNet( + neuron=typeebd_param["neuron"], + activation_function=None, + resnet_dt=typeebd_param["resnet_dt"], + seed=typeebd_param["seed"], + uniform_seed=True, + padding=True, + ) + model = EnerModel(descrpt, fitting, typeebd) + + # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = { + "coord": [test_data["coord"]], + "box": [test_data["box"]], + "type": [test_data["type"]], + "natoms_vec": [test_data["natoms_vec"]], + "default_mesh": [test_data["default_mesh"]], + } + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() + + t_prop_c = tf.placeholder(tf.float32, [5], name="t_prop_c") + t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name="t_energy") + t_force = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="t_force") + t_virial = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="t_virial") + t_atom_ener = tf.placeholder( + GLOBAL_TF_FLOAT_PRECISION, [None], name="t_atom_ener" + ) + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord") + t_type = tf.placeholder(tf.int32, [None], name="i_type") + t_natoms = tf.placeholder(tf.int32, [model.ntypes + 2], name="i_natoms") + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name="i_box") + t_mesh = tf.placeholder(tf.int32, [None], name="i_mesh") + is_training = tf.placeholder(tf.bool) + t_fparam = None + inputs_dict = {} + + model_pred = model.build( + t_coord, + t_type, + t_natoms, + t_box, + t_mesh, + inputs_dict, + suffix=self.filename + + "-" + + inspect.stack()[0][3] + + "test_model_se_atten_model_compressible", + reuse=False, + ) + energy = model_pred["energy"] + force = model_pred["force"] + virial = model_pred["virial"] + atom_ener = model_pred["atom_ener"] + + feed_dict_test = { + t_prop_c: test_data["prop_c"], + t_energy: test_data["energy"][:numb_test], + t_force: np.reshape(test_data["force"][:numb_test, :], [-1]), + t_virial: np.reshape(test_data["virial"][:numb_test, :], [-1]), + t_atom_ener: np.reshape(test_data["atom_ener"][:numb_test, :], [-1]), + t_coord: np.reshape(test_data["coord"][:numb_test, :], [-1]), + t_box: test_data["box"][:numb_test, :], + t_type: np.reshape(test_data["type"][:numb_test, :], [-1]), + t_natoms: test_data["natoms_vec"], + t_mesh: test_data["default_mesh"], + is_training: False, + } + sess = self.cached_session().__enter__() + sess.run(tf.global_variables_initializer()) + [pe, pf, pv] = sess.run([energy, force, virial], feed_dict=feed_dict_test) + pf, pv = pf.reshape(-1), pv.reshape(-1) + + eps = 1e-4 + delta = 1e-5 + fdf, fdv = finite_difference_fv( + sess, energy, feed_dict_test, t_coord, t_box, delta=eps) + np.testing.assert_allclose(pf, fdf, delta) + np.testing.assert_allclose(pv, fdv, delta) + + tested_eps = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7] + for eps in tested_eps: + deltae = eps + deltad = eps + de, df, dv = check_smooth_efv( + sess, energy, force, virial, + feed_dict_test, t_coord, + jdata["model"]["descriptor"]["rcut"], + delta=eps, + ) + np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) + np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) + np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) + + for eps in tested_eps: + deltae = 5.*eps + deltad = 5.*eps + de, df, dv = check_smooth_efv( + sess, energy, force, virial, + feed_dict_test, t_coord, + jdata["model"]["descriptor"]["rcut_smth"], + delta=eps, + ) + np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) + np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) + np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) + From 7667a4bd09bcaa9d6ebe58a809898efc1c620503 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:32:25 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/descriptor/se_atten.py | 23 ++++--- source/tests/common.py | 93 ++++++++++++++++------------- source/tests/test_model_se_atten.py | 67 +++++++++++---------- 3 files changed, 101 insertions(+), 82 deletions(-) diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py index 102bc5ec6f..8f3be40596 100644 --- a/deepmd/descriptor/se_atten.py +++ b/deepmd/descriptor/se_atten.py @@ -564,7 +564,7 @@ def build( self.filter_precision, ) self.negative_mask = -(2 << 32) * (1.0 - self.nmask) - # hard coding the magnitude of attention weight shift + # hard coding the magnitude of attention weight shift self.smth_attn_w_shift = 20.0 # only used when tensorboard was set as true tf.summary.histogram("descrpt", self.descrpt) @@ -601,7 +601,9 @@ def build( ) self.recovered_r = ( tf.reshape( - tf.slice(tf.reshape(self.descrpt_reshape, [-1, 4]), [0, 0], [-1, 1]), + tf.slice( + tf.reshape(self.descrpt_reshape, [-1, 4]), [0, 0], [-1, 1] + ), [-1, natoms[0], self.sel_all_a[0]], ) * self.std_looked_up @@ -870,18 +872,21 @@ def _scaled_dot_attn( if self.smooth: # (nb x nloc) x nsel nsel = self.sel_all_a[0] - attn = ((attn + self.smth_attn_w_shift) * - tf.reshape(self.recovered_switch, [-1,1,nsel]) * - tf.reshape(self.recovered_switch, [-1,nsel,1]) - - self.smth_attn_w_shift) + attn = (attn + self.smth_attn_w_shift) * tf.reshape( + self.recovered_switch, [-1, 1, nsel] + ) * tf.reshape( + self.recovered_switch, [-1, nsel, 1] + ) - self.smth_attn_w_shift else: attn *= self.nmask attn += self.negative_mask attn = tf.nn.softmax(attn, axis=-1) if self.smooth: - attn = (attn * - tf.reshape(self.recovered_switch, [-1,1,nsel]) * - tf.reshape(self.recovered_switch, [-1,nsel,1])) + attn = ( + attn + * tf.reshape(self.recovered_switch, [-1, 1, nsel]) + * tf.reshape(self.recovered_switch, [-1, nsel, 1]) + ) else: attn *= tf.reshape(self.nmask, [-1, attn.shape[-1], 1]) if save_weights: diff --git a/source/tests/common.py b/source/tests/common.py index 40724dc406..9af324896f 100644 --- a/source/tests/common.py +++ b/source/tests/common.py @@ -530,77 +530,84 @@ def strerch_box(old_coord, old_box, new_box): return ncoord.reshape(old_coord.shape) -def finite_difference_fv(sess, energy, feed_dict, t_coord, t_box, delta=1e-6): - """for energy models, compute f, v by finite difference - """ +def finite_difference_fv(sess, energy, feed_dict, t_coord, t_box, delta=1e-6): + """For energy models, compute f, v by finite difference.""" base_dict = feed_dict.copy() coord0 = base_dict.pop(t_coord) box0 = base_dict.pop(t_box) fdf = -finite_difference( - lambda coord: sess.run( - energy, feed_dict={**base_dict, t_coord: coord, t_box: box0} - ).reshape(-1), - coord0, - delta=delta, - ).reshape(-1) - fdv = -( - finite_difference( - lambda box: sess.run( - energy, - feed_dict={ - **base_dict, - t_coord: strerch_box(coord0, box0, box), - t_box: box, - }, + lambda coord: sess.run( + energy, feed_dict={**base_dict, t_coord: coord, t_box: box0} ).reshape(-1), - box0, + coord0, delta=delta, - ) - .reshape([-1, 3, 3]) - .transpose(0, 2, 1) - @ box0.reshape(3, 3) + ).reshape(-1) + fdv = -( + finite_difference( + lambda box: sess.run( + energy, + feed_dict={ + **base_dict, + t_coord: strerch_box(coord0, box0, box), + t_box: box, + }, + ).reshape(-1), + box0, + delta=delta, + ) + .reshape([-1, 3, 3]) + .transpose(0, 2, 1) + @ box0.reshape(3, 3) ).reshape(-1) return fdf, fdv def check_continuity(f, cc, rcut, delta): - """coord[0:2] to [[0, 0, 0], [rcut+-.5*delta, 0, 0]] - """ - cc = cc.reshape([-1,3]) + """coord[0:2] to [[0, 0, 0], [rcut+-.5*delta, 0, 0]].""" + cc = cc.reshape([-1, 3]) cc0 = np.copy(cc) cc1 = np.copy(cc) - cc0[:2,:] = np.array([ - 0.0, 0.0, 0.0, - rcut-0.5*delta, 0.0, 0.0, - ]).reshape([-1,3]) - cc1[:2,:] = np.array([ - 0.0, 0.0, 0.0, - rcut+0.5*delta, 0.0, 0.0, - ]).reshape([-1,3]) + cc0[:2, :] = np.array( + [ + 0.0, + 0.0, + 0.0, + rcut - 0.5 * delta, + 0.0, + 0.0, + ] + ).reshape([-1, 3]) + cc1[:2, :] = np.array( + [ + 0.0, + 0.0, + 0.0, + rcut + 0.5 * delta, + 0.0, + 0.0, + ] + ).reshape([-1, 3]) return f(cc0.reshape(-1)), f(cc1.reshape(-1)) def check_smooth_efv(sess, energy, force, virial, feed_dict, t_coord, rcut, delta=1e-5): - """check the smoothness of e, f and v + """Check the smoothness of e, f and v the returned values are de, df, dv de[0] are supposed to be closed to de[1] df[0] are supposed to be closed to df[1] - dv[0] are supposed to be closed to dv[1] + dv[0] are supposed to be closed to dv[1]. """ base_dict = feed_dict.copy() coord0 = base_dict.pop(t_coord) [fe, ff, fv] = [ - lambda coord: sess.run( - ii, feed_dict={**base_dict, t_coord: coord} - ).reshape(-1) - for ii in [energy, force, virial] + lambda coord: sess.run(ii, feed_dict={**base_dict, t_coord: coord}).reshape(-1) + for ii in [energy, force, virial] ] [de, df, dv] = [ - check_continuity(ii, coord0, rcut, delta=delta) - for ii in [fe, ff, fv] + check_continuity(ii, coord0, rcut, delta=delta) for ii in [fe, ff, fv] ] return de, df, dv - + def run_dp(cmd: str) -> int: """Run DP directly from the entry point instead of the subprocess. diff --git a/source/tests/test_model_se_atten.py b/source/tests/test_model_se_atten.py index e674c88998..ad037e2931 100644 --- a/source/tests/test_model_se_atten.py +++ b/source/tests/test_model_se_atten.py @@ -5,6 +5,8 @@ import numpy as np from common import ( DataSystem, + check_smooth_efv, + finite_difference_fv, gen_data, j_loader, ) @@ -28,7 +30,6 @@ from deepmd.utils.type_embed import ( TypeEmbedNet, ) -from common import finite_difference_fv, check_smooth_efv GLOBAL_ENER_FLOAT_PRECISION = tf.float64 GLOBAL_TF_FLOAT_PRECISION = tf.float64 @@ -728,10 +729,8 @@ def test_stripped_type_embedding_exclude_types(self): with self.assertRaises(AssertionError): np.testing.assert_almost_equal(des[:, 2:6], 0.0, 10) - def test_smoothness_of_stripped_type_embedding_smooth_model(self): - """test: auto-diff, continuity of e,f,v - """ + """test: auto-diff, continuity of e,f,v.""" jfile = "water_se_atten.json" jdata = j_loader(jfile) @@ -834,34 +833,42 @@ def test_smoothness_of_stripped_type_embedding_smooth_model(self): eps = 1e-4 delta = 1e-5 fdf, fdv = finite_difference_fv( - sess, energy, feed_dict_test, t_coord, t_box, delta=eps) + sess, energy, feed_dict_test, t_coord, t_box, delta=eps + ) np.testing.assert_allclose(pf, fdf, delta) np.testing.assert_allclose(pv, fdv, delta) - + tested_eps = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7] for eps in tested_eps: - deltae = eps - deltad = eps - de, df, dv = check_smooth_efv( - sess, energy, force, virial, - feed_dict_test, t_coord, - jdata["model"]["descriptor"]["rcut"], - delta=eps, - ) - np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) - np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) - np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) - + deltae = eps + deltad = eps + de, df, dv = check_smooth_efv( + sess, + energy, + force, + virial, + feed_dict_test, + t_coord, + jdata["model"]["descriptor"]["rcut"], + delta=eps, + ) + np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) + np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) + np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) + for eps in tested_eps: - deltae = 5.*eps - deltad = 5.*eps - de, df, dv = check_smooth_efv( - sess, energy, force, virial, - feed_dict_test, t_coord, - jdata["model"]["descriptor"]["rcut_smth"], - delta=eps, - ) - np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) - np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) - np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) - + deltae = 5.0 * eps + deltad = 5.0 * eps + de, df, dv = check_smooth_efv( + sess, + energy, + force, + virial, + feed_dict_test, + t_coord, + jdata["model"]["descriptor"]["rcut_smth"], + delta=eps, + ) + np.testing.assert_allclose(de[0], de[1], rtol=0, atol=deltae) + np.testing.assert_allclose(df[0], df[1], rtol=0, atol=deltad) + np.testing.assert_allclose(dv[0], dv[1], rtol=0, atol=deltad) From 39bc5c7661de3c8fbbcbb957e21123cd7b17cf58 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 15:41:00 -0400 Subject: [PATCH 03/14] se_atten_grad --- deepmd/op/_tabulate_grad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/op/_tabulate_grad.py b/deepmd/op/_tabulate_grad.py index 9076ee3213..7cc7532a39 100644 --- a/deepmd/op/_tabulate_grad.py +++ b/deepmd/op/_tabulate_grad.py @@ -55,7 +55,7 @@ def _tabulate_fusion_se_atten_grad_cc(op, dy): op.outputs[0], is_sorted=op.get_attr("is_sorted"), ) - return [None, None, dy_dx, dy_df, None] + return [None, None, dy_dx, dy_df, dy_dtwo] @ops.RegisterGradient("TabulateFusionSeAttenGrad") From 63a4aee915b259124e55489e0253476a4eb22d2d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 17:55:07 -0400 Subject: [PATCH 04/14] implement dy_dtwo Signed-off-by: Jinzhe Zeng --- source/lib/include/tabulate.h | 2 ++ source/lib/src/gpu/tabulate.cu | 21 ++++++++++++++++++--- source/lib/src/tabulate.cc | 23 +++++++++++++++++++++-- source/lib/tests/test_tabulate_se_a.cc | 15 ++++++++------- source/op/tabulate_multi_device.cc | 26 ++++++++++++++------------ 5 files changed, 63 insertions(+), 24 deletions(-) diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index 93992cea5b..52bc06e2d4 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -18,6 +18,7 @@ void tabulate_fusion_se_a_cpu(FPTYPE* out, template void tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, FPTYPE* dy_dem, + FPTYPE* dy_dtwo, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, @@ -125,6 +126,7 @@ void tabulate_fusion_se_a_gpu(FPTYPE* out, template void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, FPTYPE* dy_dem, + FPTYPE* dy_dtwo, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, diff --git a/source/lib/src/gpu/tabulate.cu b/source/lib/src/gpu/tabulate.cu index 09d02bdf2c..323f89a26b 100644 --- a/source/lib/src/gpu/tabulate.cu +++ b/source/lib/src/gpu/tabulate.cu @@ -253,6 +253,7 @@ template __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( FPTYPE* dy_dem_x, FPTYPE* dy_dem, + FPTYPE* dy_dtwo, const FPTYPE* table, const FPTYPE* em_x, const FPTYPE* em, @@ -307,6 +308,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + FPTYPE oldres = res; FPTYPE t; if (enable_se_atten) { t = two_embed[block_idx * nnei * last_layer_size + @@ -330,6 +332,13 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( xx) * xx) * (enable_se_atten ? res * t + res : res); + if (enable_se_atten) { + // from ii to ii + (nnei - breakpoint) + for (int ii2 = ii; ii2 < ii + nnei - breakpoint; ii2++) { + dy_dtwo[block_idx * nnei * last_layer_size + ii2 * last_layer_size + + jj] = oldres * res; + } + } } GpuSyncThreads(); for (int kk = 0; kk < MTILE; kk++) { @@ -764,6 +773,7 @@ void tabulate_fusion_se_a_gpu(FPTYPE* out, template void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, FPTYPE* dy_dem, + FPTYPE* dy_dtwo, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, @@ -781,12 +791,15 @@ void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, DPErrcheck(gpuDeviceSynchronize()); DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); + if (two_embed != nullptr) { + DPErrcheck(gpuMemset(dy_dtwo, 0, sizeof(FPTYPE) * nloc * nnei)); + } tabulate_fusion_se_a_grad_fifth_order_polynomial <<>>( - dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0], - table_info[1], table_info[2], table_info[3], table_info[4], nnei, - last_layer_size, is_sorted); + dy_dem_x, dy_dem, dy_dtwo, table, em_x, em, two_embed, dy, + table_info[0], table_info[1], table_info[2], table_info[3], + table_info[4], nnei, last_layer_size, is_sorted); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } @@ -990,6 +1003,7 @@ template void tabulate_fusion_se_a_gpu(double* out, const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu(float* dy_dem_x, float* dy_dem, + float* dy_dtwo, const float* table, const float* table_info, const float* em_x, @@ -1002,6 +1016,7 @@ template void tabulate_fusion_se_a_grad_gpu(float* dy_dem_x, const bool is_sorted); template void tabulate_fusion_se_a_grad_gpu(double* dy_dem_x, double* dy_dem, + double* dy_dtwo, const double* table, const double* table_info, const double* em_x, diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index 1f49cf0daa..6c2068b262 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -158,6 +158,7 @@ void deepmd::tabulate_fusion_se_a_cpu(FPTYPE* out, template void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, FPTYPE* dy_dem, + FPTYPE* dy_dtwo, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, @@ -171,6 +172,9 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, bool enable_se_atten = two_embed != nullptr; memset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei); memset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4); + if (enable_se_atten) { + memset(dy_dtwo, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size); + } FPTYPE const lower = table_info[0]; FPTYPE const upper = table_info[1]; FPTYPE const _max = table_info[2]; @@ -212,6 +216,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; FPTYPE g = (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx); + FPTYPE resold = res; if (enable_se_atten) { FPTYPE t = two_embed[ii * nnei * last_layer_size + jj * last_layer_size + kk]; @@ -219,18 +224,30 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, g += t * g; } + FPTYPE dotllrr = dot(ll, rr); if (unloop) { - grad += g * dot(ll, rr) * (nnei - jj); + grad += g * dotllrr * (nnei - jj); dy_dem[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); dy_dem[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); dy_dem[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); dy_dem[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); + if (enable_se_atten) { + // fill from jj to nnei + for (int jj2 = jj; jj2 < nnei; jj2++) { + dy_dtwo[ii * nnei * last_layer_size + jj2 * last_layer_size + + kk] += res * dotllrr; + } + } } else { - grad += g * dot(ll, rr); + grad += g * dotllrr; dy_dem[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; dy_dem[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; dy_dem[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; dy_dem[ii * nnei * 4 + jj * 4 + 3] += res * rr[3]; + if (enable_se_atten) { + dy_dtwo[ii * nnei * last_layer_size + jj * last_layer_size + kk] += + resold * dotllrr; + } } } dy_dem_x[ii * nnei + jj] = grad; @@ -660,6 +677,7 @@ template void deepmd::tabulate_fusion_se_a_cpu( template void deepmd::tabulate_fusion_se_a_grad_cpu( float* dy_dem_x, float* dy_dem, + float* dy_dtwo, const float* table, const float* table_info, const float* em_x, @@ -673,6 +691,7 @@ template void deepmd::tabulate_fusion_se_a_grad_cpu( template void deepmd::tabulate_fusion_se_a_grad_cpu( double* dy_dem_x, double* dy_dem, + double* dy_dtwo, const double* table, const double* table_info, const double* em_x, diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index fc0fd04980..b272740584 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -726,9 +726,10 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dem_x(em_x.size()); std::vector dy_dem(em.size()); std::vector dy(nloc * nnei * last_layer_size, 1.0); + FPTYPE *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], nullptr, - &dy[0], nloc, nnei, last_layer_size); + &dy_dem_x[0], &dy_dem[0], dy_dtwo, table[0], &info[0], &em_x[0], &em[0], + nullptr, &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); @@ -741,7 +742,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { } deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], + &dy_dem_x[0], &dy_dem[0], dy_dtwo, &table[0], &info[0], &em_x[0], &em[0], &two_embed[0], &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); @@ -804,7 +805,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { std::vector dy(nloc * nnei * last_layer_size, 1.0); double *dy_dem_x_dev = NULL, *dy_dem_dev = NULL, *table_dev = NULL, - *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL; + *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL, *dy_dtwo = nullptr; deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); deepmd::malloc_device_memory_sync(table_dev, table); @@ -812,8 +813,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, - dy_dev, nloc, nnei, last_layer_size); + dy_dem_x_dev, dy_dem_dev, dy_dtwo, table_dev, &info[0], em_x_dev, em_dev, + nullptr, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); @@ -833,7 +834,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, + dy_dem_x_dev, dy_dem_dev, dy_dtwo, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 488a99bd7d..ceac954d0d 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -261,6 +261,7 @@ class TabulateFusionSeAGradOp : public OpKernel { // flat the tensors FPTYPE* dy_dem_x = dy_dem_x_tensor->flat().data(); FPTYPE* dy_dem = dy_dem_tensor->flat().data(); + FPTYPE* dy_dtwo = nullptr; const FPTYPE* descriptor = descriptor_tensor.flat().data(); const FPTYPE* table = table_tensor.flat().data(); @@ -275,14 +276,14 @@ class TabulateFusionSeAGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, dy_dtwo, table, + table_info, em_x, em, two_embed, dy, + nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { - deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, dy_dtwo, table, + table_info, em_x, em, two_embed, dy, + nloc, nnei, last_layer_size); } } @@ -468,6 +469,7 @@ class TabulateFusionSeAttenGradOp : public OpKernel { // flat the tensors FPTYPE* dy_dem_x = dy_dem_x_tensor->flat().data(); FPTYPE* dy_dem = dy_dem_tensor->flat().data(); + FPTYPE* dy_dtwo = dy_dtwo_tensor->flat().data(); const FPTYPE* descriptor = descriptor_tensor.flat().data(); const FPTYPE* table = table_tensor.flat().data(); @@ -482,14 +484,14 @@ class TabulateFusionSeAttenGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_grad_gpu( + dy_dem_x, dy_dem, dy_dtwo, table, table_info, em_x, em, two_embed, dy, + nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { - deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_grad_cpu( + dy_dem_x, dy_dem, dy_dtwo, table, table_info, em_x, em, two_embed, dy, + nloc, nnei, last_layer_size, is_sorted); } } From 2cfcb155930f64c5bf84a054a8b427a72c42ab61 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 18:30:45 -0400 Subject: [PATCH 05/14] FPTYPE -> double --- source/lib/tests/test_tabulate_se_a.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index b272740584..655897c29b 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -726,7 +726,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dem_x(em_x.size()); std::vector dy_dem(em.size()); std::vector dy(nloc * nnei * last_layer_size, 1.0); - FPTYPE *dy_dtwo = nullptr; + double *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( &dy_dem_x[0], &dy_dem[0], dy_dtwo, table[0], &info[0], &em_x[0], &em[0], nullptr, &dy[0], nloc, nnei, last_layer_size); From 0a3b01159c8c487c534b19824766020977539cc3 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 18:45:58 -0400 Subject: [PATCH 06/14] typo --- source/lib/tests/test_tabulate_se_a.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 655897c29b..41a3b09734 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -728,7 +728,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy(nloc * nnei * last_layer_size, 1.0); double *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], dy_dtwo, table[0], &info[0], &em_x[0], &em[0], + &dy_dem_x[0], &dy_dem[0], dy_dtwo, &table[0], &info[0], &em_x[0], &em[0], nullptr, &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); From 1bc9a2ee64ede39ad6ec8c01375efb7a276f7058 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 19:04:32 -0400 Subject: [PATCH 07/14] fix tests --- source/lib/tests/test_tabulate_se_a.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 41a3b09734..6829177f08 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -726,9 +726,10 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dem_x(em_x.size()); std::vector dy_dem(em.size()); std::vector dy(nloc * nnei * last_layer_size, 1.0); + std::vector dy_dtwo(nloc * nnei * last_layer_size); double *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], dy_dtwo, &table[0], &info[0], &em_x[0], &em[0], + &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], &em[0], nullptr, &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); @@ -742,7 +743,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { } deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], dy_dtwo, &table[0], &info[0], &em_x[0], &em[0], + &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], &em[0], &two_embed[0], &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); @@ -803,9 +804,10 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); std::vector dy(nloc * nnei * last_layer_size, 1.0); + std::vector dy_dtwo(nloc * nnei * last_layer_size, 0.0); double *dy_dem_x_dev = NULL, *dy_dem_dev = NULL, *table_dev = NULL, - *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL, *dy_dtwo = nullptr; + *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL, *dy_dtwo_dev = nullptr; deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); deepmd::malloc_device_memory_sync(table_dev, table); @@ -833,8 +835,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); + deepmd::malloc_device_memory_sync(dy_dtwo_dev, dy_dtwo); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, dy_dtwo, table_dev, &info[0], em_x_dev, em_dev, + dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); From 068aa33735c51046c17c9c23e889c302cff74a6e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 23:06:44 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/lib/tests/test_tabulate_se_a.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 6829177f08..266b90458c 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -729,8 +729,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dtwo(nloc * nnei * last_layer_size); double *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], &em[0], - nullptr, &dy[0], nloc, nnei, last_layer_size); + &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], + &em[0], nullptr, &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); @@ -743,8 +743,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { } deepmd::tabulate_fusion_se_a_grad_cpu( - &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], &em[0], - &two_embed[0], &dy[0], nloc, nnei, last_layer_size); + &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], + &em[0], &two_embed[0], &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); @@ -807,7 +807,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { std::vector dy_dtwo(nloc * nnei * last_layer_size, 0.0); double *dy_dem_x_dev = NULL, *dy_dem_dev = NULL, *table_dev = NULL, - *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL, *dy_dtwo_dev = nullptr; + *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL, + *dy_dtwo_dev = nullptr; deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); deepmd::malloc_device_memory_sync(table_dev, table); @@ -837,8 +838,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); deepmd::malloc_device_memory_sync(dy_dtwo_dev, dy_dtwo); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, em_dev, - two_embed_dev, dy_dev, nloc, nnei, last_layer_size); + dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, + em_dev, two_embed_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); for (int jj = 0; jj < dy_dem_x.size(); ++jj) { From bc20144316ed58f7bd9a155c524a65361622e3bc Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 19:13:14 -0400 Subject: [PATCH 09/14] fix --- source/lib/tests/test_tabulate_se_a.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 266b90458c..e300c437d4 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -727,7 +727,6 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dem(em.size()); std::vector dy(nloc * nnei * last_layer_size, 1.0); std::vector dy_dtwo(nloc * nnei * last_layer_size); - double *dy_dtwo = nullptr; deepmd::tabulate_fusion_se_a_grad_cpu( &dy_dem_x[0], &dy_dem[0], &dy_dtwo[0], &table[0], &info[0], &em_x[0], &em[0], nullptr, &dy[0], nloc, nnei, last_layer_size); From 83f2aa7e2f006b46fab65f07a2220c9987d4f82b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 19:52:23 -0400 Subject: [PATCH 10/14] fix a typo: res->resold --- source/lib/src/tabulate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index 6c2068b262..eb96d044e4 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -235,7 +235,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, // fill from jj to nnei for (int jj2 = jj; jj2 < nnei; jj2++) { dy_dtwo[ii * nnei * last_layer_size + jj2 * last_layer_size + - kk] += res * dotllrr; + kk] += resold * dotllrr; } } } else { From 99fe74df9db6e00643921cd6037f71c9843910da Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 20:13:00 -0400 Subject: [PATCH 11/14] Update source/lib/tests/test_tabulate_se_a.cc Signed-off-by: Jinzhe Zeng --- source/lib/tests/test_tabulate_se_a.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index e300c437d4..9342082fc6 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -815,7 +815,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, dy_dtwo, table_dev, &info[0], em_x_dev, em_dev, + dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); From f2bf0cdace387a827ba839ea409092d8ba13bb09 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Oct 2023 00:13:25 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/lib/tests/test_tabulate_se_a.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 9342082fc6..ce2defb22c 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -815,8 +815,8 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); deepmd::tabulate_fusion_se_a_grad_gpu( - dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, em_dev, - nullptr, dy_dev, nloc, nnei, last_layer_size); + dy_dem_x_dev, dy_dem_dev, dy_dtwo_dev, table_dev, &info[0], em_x_dev, + em_dev, nullptr, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); From aad7830da7342c39b63a4d55545e5c304080ffda Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Oct 2023 23:47:54 -0400 Subject: [PATCH 13/14] implement dy_dtwo for gradgrad Signed-off-by: Jinzhe Zeng --- deepmd/op/_tabulate_grad.py | 1 + source/lib/include/tabulate.h | 2 ++ source/lib/src/gpu/tabulate.cu | 16 ++++++++++++--- source/lib/src/tabulate.cc | 31 +++++++++++++++++++++--------- source/op/tabulate_multi_device.cc | 12 ++++++++---- 5 files changed, 46 insertions(+), 16 deletions(-) diff --git a/deepmd/op/_tabulate_grad.py b/deepmd/op/_tabulate_grad.py index 7cc7532a39..8ad8908d7e 100644 --- a/deepmd/op/_tabulate_grad.py +++ b/deepmd/op/_tabulate_grad.py @@ -68,6 +68,7 @@ def _tabulate_fusion_se_atten_grad_grad_cc(op, dy, dy_, dy_dtwo): op.inputs[4], dy, dy_, + dy_dtwo, op.inputs[6], is_sorted=op.get_attr("is_sorted"), ) diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index 52bc06e2d4..47c3062449 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -39,6 +39,7 @@ void tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, const FPTYPE* two_embed, const FPTYPE* dz_dy_dem_x, const FPTYPE* dz_dy_dem, + const FPTYPE* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, @@ -147,6 +148,7 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* two_embed, const FPTYPE* dz_dy_dem_x, const FPTYPE* dz_dy_dem, + const FPTYPE* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, diff --git a/source/lib/src/gpu/tabulate.cu b/source/lib/src/gpu/tabulate.cu index 323f89a26b..57e889e716 100644 --- a/source/lib/src/gpu/tabulate.cu +++ b/source/lib/src/gpu/tabulate.cu @@ -366,6 +366,7 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( const FPTYPE* two_embed, const FPTYPE* dz_dy_dem_x, const FPTYPE* dz_dy_dem, + const FPTYPE* dz_dy_dtwo, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, @@ -413,9 +414,15 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * xx) * xx; + FPTYPE two_grad = 0.; if (enable_se_atten) { FPTYPE t = two_embed[block_idx * nnei * last_layer_size + ii * last_layer_size + thread_idx]; + // dz_dy_dtwo * res * em + // res above should be used instead of res + res * t below + two_grad = dz_dy_dtwo[block_idx * nnei * last_layer_size + + ii * last_layer_size + thread_idx] * + res; res += res * t; res_grad += res_grad * t; } @@ -443,8 +450,8 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( for (int kk = 0; kk < MTILE; kk++) { int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; iteratorC[kk * last_layer_size + thread_idx] += - (nnei - breakpoint) * - (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); + (nnei - breakpoint) * (em[em_index] * (res_grad * dz_xx + two_grad) + + dz_dy_dem[em_index] * res); } mark_table_idx = table_idx; if (unloop) { @@ -813,6 +820,7 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* two_embed, const FPTYPE* dz_dy_dem_x, const FPTYPE* dz_dy_dem, + const FPTYPE* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, @@ -825,7 +833,7 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); tabulate_fusion_se_a_grad_grad_fifth_order_polynomial <<>>( - dz_dy, table, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + dz_dy, table, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, dz_dy_dtwo, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size, is_sorted); DPErrcheck(gpuGetLastError()); @@ -1036,6 +1044,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu( const float* two_embed, const float* dz_dy_dem_x, const float* dz_dy_dem, + const float* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, @@ -1049,6 +1058,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu( const double* two_embed, const double* dz_dy_dem_x, const double* dz_dy_dem, + const double* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index 6c2068b262..3e2a1bec62 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -235,7 +235,7 @@ void deepmd::tabulate_fusion_se_a_grad_cpu(FPTYPE* dy_dem_x, // fill from jj to nnei for (int jj2 = jj; jj2 < nnei; jj2++) { dy_dtwo[ii * nnei * last_layer_size + jj2 * last_layer_size + - kk] += res * dotllrr; + kk] += resold * dotllrr; } } } else { @@ -267,6 +267,7 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, const FPTYPE* two_embed, const FPTYPE* dz_dy_dem_x, const FPTYPE* dz_dy_dem, + const FPTYPE* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, @@ -317,9 +318,15 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, ((FPTYPE)3. * a3 + ((FPTYPE)4. * a4 + (FPTYPE)5. * a5 * xx) * xx) * xx) * xx; + FPTYPE two_grad = 0.; if (enable_se_atten) { FPTYPE t = two_embed[ii * nnei * last_layer_size + jj * last_layer_size + kk]; + // dz_dy_dtwo * var * ll + // var above should be used instead of var + var * t below + two_grad = dz_dy_dtwo[ii * nnei * last_layer_size + + jj * last_layer_size + kk] * + var; var += var * t; var_grad += var_grad * t; } @@ -346,22 +353,26 @@ void deepmd::tabulate_fusion_se_a_grad_grad_cpu(FPTYPE* dz_dy, */ if (unloop) { dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += - (nnei - jj) * (var * hh[0] + dz_xx * var_grad * ll[0]); + (nnei - jj) * + (var * hh[0] + (dz_xx * var_grad + two_grad) * ll[0]); dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += - (nnei - jj) * (var * hh[1] + dz_xx * var_grad * ll[1]); + (nnei - jj) * + (var * hh[1] + (dz_xx * var_grad + two_grad) * ll[1]); dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += - (nnei - jj) * (var * hh[2] + dz_xx * var_grad * ll[2]); + (nnei - jj) * + (var * hh[2] + (dz_xx * var_grad + two_grad) * ll[2]); dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += - (nnei - jj) * (var * hh[3] + dz_xx * var_grad * ll[3]); + (nnei - jj) * + (var * hh[3] + (dz_xx * var_grad + two_grad) * ll[3]); } else { dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += - var * hh[0] + dz_xx * var_grad * ll[0]; + var * hh[0] + (dz_xx * var_grad + two_grad) * ll[0]; dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += - var * hh[1] + dz_xx * var_grad * ll[1]; + var * hh[1] + (dz_xx * var_grad + two_grad) * ll[1]; dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += - var * hh[2] + dz_xx * var_grad * ll[2]; + var * hh[2] + (dz_xx * var_grad + two_grad) * ll[2]; dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += - var * hh[3] + dz_xx * var_grad * ll[3]; + var * hh[3] + (dz_xx * var_grad + two_grad) * ll[3]; } } if (unloop) { @@ -711,6 +722,7 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( const float* two_embed, const float* dz_dy_dem_x, const float* dz_dy_dem, + const float* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, @@ -724,6 +736,7 @@ template void deepmd::tabulate_fusion_se_a_grad_grad_cpu( const double* two_embed, const double* dz_dy_dem_x, const double* dz_dy_dem, + const double* dz_dy_dtwo, const int nloc, const int nnei, const int last_layer_size, diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index ceac954d0d..6a70f60a96 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -100,6 +100,7 @@ REGISTER_OP("TabulateFusionSeAttenGradGrad") .Input("two_embed: T") .Input("dz_dy_dem_x: T") .Input("dz_dy_dem: T") + .Input("dz_dy_dtwo: T") .Input("descriptor: T") .Output("dz_dy: T") .Attr("is_sorted: bool = true"); @@ -329,6 +330,7 @@ class TabulateFusionSeAGradGradOp : public OpKernel { const FPTYPE* two_embed = nullptr; const FPTYPE* dz_dy_dem_x = dz_dy_dem_x_tensor.flat().data(); const FPTYPE* dz_dy_dem = dz_dy_dem_tensor.flat().data(); + const FPTYPE* dz_dy_dtwo = nullptr; const int nloc = em_tensor.shape().dim_size(0); const int nnei = em_tensor.shape().dim_size(1); const int last_layer_size = descriptor_tensor.shape().dim_size(2); @@ -337,7 +339,7 @@ class TabulateFusionSeAGradGradOp : public OpKernel { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, - nloc, nnei, last_layer_size, is_sorted); + dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( @@ -346,7 +348,7 @@ class TabulateFusionSeAGradGradOp : public OpKernel { } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_grad_cpu( dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, - nloc, nnei, last_layer_size, is_sorted); + dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted); } } @@ -522,6 +524,7 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { const Tensor& two_embed_tensor = context->input(context_input_index++); const Tensor& dz_dy_dem_x_tensor = context->input(context_input_index++); const Tensor& dz_dy_dem_tensor = context->input(context_input_index++); + const Tensor& dz_dy_dtwo_tensor = context->input(context_input_index++); const Tensor& descriptor_tensor = context->input(context_input_index++); // set size of the sample OP_REQUIRES(context, (dz_dy_dem_x_tensor.shape().dims() == 2), @@ -544,6 +547,7 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { const FPTYPE* two_embed = two_embed_tensor.flat().data(); const FPTYPE* dz_dy_dem_x = dz_dy_dem_x_tensor.flat().data(); const FPTYPE* dz_dy_dem = dz_dy_dem_tensor.flat().data(); + const FPTYPE* dz_dy_dtwo = dz_dy_dtwo_tensor.flat().data(); const int nloc = em_tensor.shape().dim_size(0); const int nnei = em_tensor.shape().dim_size(1); const int last_layer_size = descriptor_tensor.shape().dim_size(2); @@ -552,7 +556,7 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, - nloc, nnei, last_layer_size, is_sorted); + dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( @@ -561,7 +565,7 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_grad_cpu( dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, - nloc, nnei, last_layer_size, is_sorted); + dz_dy_dtwo, nloc, nnei, last_layer_size, is_sorted); } } From 5559d2a2e5f434cf48ae9f0daf701a28b67aa13e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 10 Oct 2023 21:35:50 -0400 Subject: [PATCH 14/14] no need to memset dy_dtwo --- source/lib/src/gpu/tabulate.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/source/lib/src/gpu/tabulate.cu b/source/lib/src/gpu/tabulate.cu index 57e889e716..a22742ae19 100644 --- a/source/lib/src/gpu/tabulate.cu +++ b/source/lib/src/gpu/tabulate.cu @@ -798,9 +798,6 @@ void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, DPErrcheck(gpuDeviceSynchronize()); DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); - if (two_embed != nullptr) { - DPErrcheck(gpuMemset(dy_dtwo, 0, sizeof(FPTYPE) * nloc * nnei)); - } tabulate_fusion_se_a_grad_fifth_order_polynomial <<>>(