deepmodeling · wanghan-iapcm · Sep 9, 2022 · Jun 1, 2021 · Jun 3, 2021 · Jun 14, 2021
diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
@@ -164,10 +164,7 @@ def __init__ (self,
         self.embedding_net_variables = None
         self.mixed_prec = None
         self.place_holders = {}
-        nei_type = np.array([])
-        for ii in range(self.ntypes):
-            nei_type = np.append(nei_type, ii * np.ones(self.sel_a[ii])) # like a mask 
-        self.nei_type = tf.constant(nei_type, dtype = tf.int32)
+        self.nei_type = np.repeat(np.arange(self.ntypes), self.sel_a)  # like a mask
 
         avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION)
         std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION)
@@ -673,8 +670,9 @@ def _concat_type_embedding(
             embedding:
                 environment of each atom represented by embedding.
         '''
-        te_out_dim = type_embedding.get_shape().as_list()[-1]        
-        nei_embed = tf.nn.embedding_lookup(type_embedding,tf.cast(self.nei_type,dtype=tf.int32))  # shape is [self.nnei, 1+te_out_dim]
+        te_out_dim = type_embedding.get_shape().as_list()[-1]
+        self.t_nei_type = tf.constant(self.nei_type, dtype=tf.int32)
+        nei_embed = tf.nn.embedding_lookup(type_embedding,tf.cast(self.t_nei_type,dtype=tf.int32))  # shape is [self.nnei, 1+te_out_dim]
         nei_embed = tf.tile(nei_embed,(nframes*natoms[0],1))  # shape is [nframes*natoms[0]*self.nnei, te_out_dim]
         nei_embed = tf.reshape(nei_embed,[-1,te_out_dim])
         embedding_input = tf.concat([xyz_scatter,nei_embed],1)  # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim]

diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py
@@ -15,6 +15,7 @@
 from deepmd.utils.type_embed import embed_atom_type
 from deepmd.utils.sess import run_sess
 from deepmd.utils.graph import load_graph_def, get_tensor_by_name_from_graph, get_tensor_by_name
+from deepmd.utils.graph import get_attention_layer_variables_from_graph_def
 from deepmd.utils.errors import GraphWithoutTensorError
 from .descriptor import Descriptor
 from .se_a import DescrptSeA
@@ -117,6 +118,9 @@ def __init__(self,
         self.sel_all_r = [0]
         avg_zero = np.zeros([self.ntypes, self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION)
         std_ones = np.ones([self.ntypes, self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION)
+        self.beta = np.zeros([self.attn_layer, self.filter_neuron[-1]]).astype(GLOBAL_NP_FLOAT_PRECISION)
+        self.gamma = np.ones([self.attn_layer, self.filter_neuron[-1]]).astype(GLOBAL_NP_FLOAT_PRECISION)
+        self.attention_layer_variables = None
         sub_graph = tf.Graph()
         with sub_graph.as_default():
             name_pfx = 'd_sea_'
@@ -305,10 +309,6 @@ def build(self,
         self.attn_weight = [None for i in range(self.attn_layer)]
         self.angular_weight = [None for i in range(self.attn_layer)]
         self.attn_weight_final = [None for i in range(self.attn_layer)]
-        self.G = None
-        self.qs = [None for i in range(self.attn_layer)]
-        self.ks = [None for i in range(self.attn_layer)]
-        self.vs = [None for i in range(self.attn_layer)]
 
         self.descrpt, self.descrpt_deriv, self.rij, self.nlist, self.nei_type_vec, self.nmask \
             = op_module.prod_env_mat_a_mix(coord,
@@ -365,8 +365,8 @@ def _pass_filter(self,
         inputs_i = inputs
         inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt])
         type_i = -1
-        layer, qmat = self._filter(inputs_i, type_i, natoms, name='filter_type_all' + suffix, reuse=reuse,
-                                   trainable=trainable, activation_fn=self.filter_activation_fn,
+        layer, qmat = self._filter(inputs_i, type_i, natoms, name='filter_type_all' + suffix, suffix=suffix,
+                                   reuse=reuse, trainable=trainable, activation_fn=self.filter_activation_fn,
                                    type_embedding=type_embedding, atype=atype)
         layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()])
         qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3])
@@ -508,7 +508,8 @@ def _feedforward(self, input_xyz, d_in, d_mid):
             activation_fn=None,
             precision=self.filter_precision,
             trainable=True,
-            uniform_seed=self.uniform_seed))
+            uniform_seed=self.uniform_seed,
+            initial_variables=self.attention_layer_variables))
         input_xyz = one_layer(
             input_xyz,
             d_in,
@@ -518,7 +519,8 @@ def _feedforward(self, input_xyz, d_in, d_mid):
             activation_fn=None,
             precision=self.filter_precision,
             trainable=True,
-            uniform_seed=self.uniform_seed)
+            uniform_seed=self.uniform_seed,
+            initial_variables=self.attention_layer_variables)
         input_xyz += residual
         input_xyz = tf.keras.layers.LayerNormalization()(input_xyz)
         return input_xyz
@@ -553,75 +555,75 @@ def _attention_layers(
             input_r,
             dotr=False,
             do_mask=False,
-            trainable=True
+            trainable=True,
+            suffix=''
     ):
         sd_k = tf.sqrt(tf.cast(1., dtype=self.filter_precision))
-        self.G = tf.reshape(input_xyz, (-1, shape_i[1] // 4, outputs_size[-1]))[0]
         for i in range(layer_num):
-            with tf.variable_scope('attention_layer{}_'.format(i), reuse=tf.AUTO_REUSE):
+            name = 'attention_layer_{}{}'.format(i, suffix)
+            with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
                 # input_xyz_in = tf.nn.l2_normalize(input_xyz, -1)
                 Q_c = one_layer(
                     input_xyz,
                     self.att_n,
                     name='c_query',
+                    scope=name+'/',
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     activation_fn=None,
                     precision=self.filter_precision,
                     trainable=trainable,
-                    uniform_seed=self.uniform_seed)
+                    uniform_seed=self.uniform_seed,
+                    initial_variables=self.attention_layer_variables)
                 K_c = one_layer(
                     input_xyz,
                     self.att_n,
                     name='c_key',
+                    scope=name+'/',
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     activation_fn=None,
                     precision=self.filter_precision,
                     trainable=trainable,
-                    uniform_seed=self.uniform_seed)
+                    uniform_seed=self.uniform_seed,
+                    initial_variables=self.attention_layer_variables)
                 V_c = one_layer(
                     input_xyz,
                     self.att_n,
                     name='c_value',
+                    scope=name+'/',
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     activation_fn=None,
                     precision=self.filter_precision,
                     trainable=trainable,
-                    uniform_seed=self.uniform_seed)
+                    uniform_seed=self.uniform_seed,
+                    initial_variables=self.attention_layer_variables)
                 # # natom x nei_type_i x out_size
                 # xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1]))
                 # natom x nei_type_i x att_n
                 Q_c = tf.nn.l2_normalize(tf.reshape(Q_c, (-1, shape_i[1] // 4, self.att_n)), -1)
                 K_c = tf.nn.l2_normalize(tf.reshape(K_c, (-1, shape_i[1] // 4, self.att_n)), -1)
                 V_c = tf.nn.l2_normalize(tf.reshape(V_c, (-1, shape_i[1] // 4, self.att_n)), -1)
-                # Q_c = tf.reshape(Q_c, (-1, shape_i[1] // 4, self.att_n))
-                # K_c = tf.reshape(K_c, (-1, shape_i[1] // 4, self.att_n))
-                # V_c = tf.reshape(V_c, (-1, shape_i[1] // 4, self.att_n))
-                self.qs[i] = Q_c[0]
-                self.ks[i] = K_c[0]
-                self.vs[i] = V_c[0]
 
                 input_att = self._scaled_dot_attn(Q_c, K_c, V_c, sd_k, input_r, dotr=dotr, do_mask=do_mask, layer=i)
                 input_att = tf.reshape(input_att, (-1, self.att_n))
 
-                # A_c = tf.nn.softmax(tf.matmul(Q_c, K_c, transpose_b=True)/sd_k)
-                # # (natom x nei_type_i) x att_n
-                # input_att = tf.reshape(tf.matmul(A_c, V_c), (-1, self.att_n))
-
                 # (natom x nei_type_i) x out_size
                 input_xyz += one_layer(
                     input_att,
                     outputs_size[-1],
                     name='c_out',
+                    scope=name+'/',
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     activation_fn=None,
                     precision=self.filter_precision,
                     trainable=trainable,
-                    uniform_seed=self.uniform_seed)
-                input_xyz = tf.keras.layers.LayerNormalization()(input_xyz)
+                    uniform_seed=self.uniform_seed,
+                    initial_variables=self.attention_layer_variables)
+                input_xyz = tf.keras.layers.LayerNormalization(beta_initializer=tf.constant_initializer(self.beta[i]),
+                                                gamma_initializer=tf.constant_initializer(self.gamma[i]))(input_xyz)
                 # input_xyz = self._feedforward(input_xyz, outputs_size[-1], self.att_n)
         return input_xyz
 
@@ -688,7 +690,7 @@ def _filter_lower(
             # natom x nei_type_i x out_size
             xyz_scatter_att = tf.reshape(
                 self._attention_layers(xyz_scatter, self.attn_layer, shape_i, outputs_size, input_r,
-                                       dotr=self.attn_dotr, do_mask=self.attn_mask, trainable=trainable),
+                                       dotr=self.attn_dotr, do_mask=self.attn_mask, trainable=trainable, suffix=suffix),
                 (-1, shape_i[1] // 4, outputs_size[-1]))
             # xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1]))
         else:
@@ -712,6 +714,7 @@ def _filter(
             activation_fn=tf.nn.tanh,
             stddev=1.0,
             bavg=0.0,
+            suffix='',
             name='linear',
             reuse=None,
             trainable=True):
@@ -745,6 +748,7 @@ def _filter(
             stddev=stddev,
             bavg=bavg,
             trainable=trainable,
+            suffix=suffix,
             name=name,
             reuse=reuse,
             atype=atype)
@@ -775,3 +779,31 @@ def _filter(
         result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]])
 
         return result, qmat
+
+    def init_variables(self,
+                       graph: tf.Graph,
+                       graph_def: tf.GraphDef,
+                       suffix: str = "",
+                       ) -> None:
+        """
+        Init the embedding net and attention layer variables from a frozen model
+
+        Parameters
+        ----------
+        graph : tf.Graph
+            The input frozen model graph
+        graph_def : tf.GraphDef
+            The input frozen model graph_def
+        suffix : str, optional
+            The suffix of the scope
+        """
+        super().init_variables(graph=graph, graph_def=graph_def, suffix=suffix)
+        self.attention_layer_variables = get_attention_layer_variables_from_graph_def(graph_def, suffix=suffix)
+        # Restore each layer's LayerNormalization beta/gamma; _attention_layers
+        # passes self.beta[i] / self.gamma[i] to the layer's constant initializers.
+        for i in range(self.attn_layer):
+            # layer 0 is stored as "layer_normalization", layer i > 0 as "layer_normalization_<i>"
+            norm = 'layer_normalization' if i == 0 else 'layer_normalization_{}'.format(i)
+            key = 'attention_layer_{}{}/{}/'.format(i, suffix, norm)
+            self.beta[i] = self.attention_layer_variables[key + 'beta']
+            self.gamma[i] = self.attention_layer_variables[key + 'gamma']
diff --git a/deepmd/env.py b/deepmd/env.py
@@ -44,6 +44,8 @@
     "TRANSFER_PATTERN",
     "FITTING_NET_PATTERN",
     "EMBEDDING_NET_PATTERN",
+    "TYPE_EMBEDDING_PATTERN",
+    "ATTENTION_LAYER_PATTERN",
     "TF_VERSION"
 ]
 
@@ -59,18 +61,26 @@
     r"filter_type_\d+/matrix_\d+_\d+|"
     r"filter_type_\d+/bias_\d+_\d+|"
     r"filter_type_\d+/idt_\d+_\d+|"
+    r"filter_type_all/matrix_\d+|"
     r"filter_type_all/matrix_\d+_\d+|"
     r"filter_type_all/matrix_\d+_\d+_\d+|"
+    r"filter_type_all/bias_\d+|"
     r"filter_type_all/bias_\d+_\d+|"
     r"filter_type_all/bias_\d+_\d+_\d+|"
+    r"filter_type_all/idt_\d+|"
     r"filter_type_all/idt_\d+_\d+|"
 )
 
 FITTING_NET_PATTERN = str(
+    r"layer_\d+/matrix|"
     r"layer_\d+_type_\d+/matrix|"
+    r"layer_\d+/bias|"
     r"layer_\d+_type_\d+/bias|"
+    r"layer_\d+/idt|"
     r"layer_\d+_type_\d+/idt|"
+    r"final_layer/matrix|"
     r"final_layer_type_\d+/matrix|"
+    r"final_layer/bias|"
     r"final_layer_type_\d+/bias|"
 )
 
@@ -80,6 +90,21 @@
     r"type_embed_net+/idt_\d+|"
 )
 
+ATTENTION_LAYER_PATTERN = str(  # regex alternatives for se_atten attention-layer variable names
+    r"attention_layer_\d+/c_query/matrix|"  # c_query/c_key/c_value/c_out: one_layer names in _attention_layers
+    r"attention_layer_\d+/c_query/bias|"
+    r"attention_layer_\d+/c_key/matrix|"
+    r"attention_layer_\d+/c_key/bias|"
+    r"attention_layer_\d+/c_value/matrix|"
+    r"attention_layer_\d+/c_value/bias|"
+    r"attention_layer_\d+/c_out/matrix|"
+    r"attention_layer_\d+/c_out/bias|"
+    r"attention_layer_\d+/layer_normalization/beta|"  # layer norm name without a numeric suffix
+    r"attention_layer_\d+/layer_normalization/gamma|"
+    r"attention_layer_\d+/layer_normalization_\d+/beta|"  # layer norm name with a numeric suffix
+    r"attention_layer_\d+/layer_normalization_\d+/gamma|"  # NOTE(review): se_atten scopes append a suffix after the index; confirm the consumer handles it
+)  # each alternative ends with "|", same as the sibling *_PATTERN constants
+
 TRANSFER_PATTERN = \
     EMBEDDING_NET_PATTERN + \
     FITTING_NET_PATTERN + \

diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
@@ -9,6 +9,7 @@
 from deepmd.utils.network import one_layer as one_layer_deepmd
 from deepmd.utils.type_embed import embed_atom_type
 from deepmd.utils.graph import get_fitting_net_variables_from_graph_def, load_graph_def, get_tensor_by_name_from_graph
+from deepmd.utils.errors import GraphWithoutTensorError
 from deepmd.fit.fitting import Fitting
 
 from deepmd.env import global_cvt_2_tf_float
@@ -400,6 +401,8 @@ def build (self,
         if input_dict is None:
             input_dict = {}
         bias_atom_e = self.bias_atom_e
+        type_embedding = input_dict.get('type_embedding', None)
+        atype = input_dict.get('atype', None)
         if self.numb_fparam > 0:
             if self.fparam_avg is None:
                 self.fparam_avg = 0.
@@ -418,9 +421,10 @@ def build (self,
             t_daparam = tf.constant(self.numb_aparam, 
                                     name = 'daparam', 
                                     dtype = tf.int32)
-            self.t_bias_atom_e = tf.get_variable('t_bias_atom_e',
+            if type_embedding is not None:
+                self.t_bias_atom_e = tf.get_variable('t_bias_atom_e',
                                             self.bias_atom_e.shape,
-                                            dtype=GLOBAL_TF_FLOAT_PRECISION,
+                                            dtype=self.fitting_precision,
                                             trainable=False,
                                             initializer=tf.constant_initializer(self.bias_atom_e))
             if self.numb_fparam > 0: 
@@ -471,9 +475,7 @@ def build (self,
             aparam = tf.reshape(aparam, [-1, self.numb_aparam])
             aparam = (aparam - t_aparam_avg) * t_aparam_istd
             aparam = tf.reshape(aparam, [-1, self.numb_aparam * natoms[0]])
-
-        type_embedding = input_dict.get('type_embedding', None)
-        atype = input_dict.get('atype', None)
+
         if type_embedding is not None:
             atype_nall = tf.reshape(atype, [-1, natoms[1]])
             self.atype_nloc = tf.reshape(tf.slice(atype_nall, [0, 0], [-1, natoms[0]]), [-1])  ## lammps will make error
@@ -570,6 +572,11 @@ def init_variables(self,
         if self.numb_aparam > 0:
             self.aparam_avg = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_aparam_avg' % suffix)
             self.aparam_inv_std = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_aparam_istd' % suffix)
+        try:
+            self.bias_atom_e = get_tensor_by_name_from_graph(graph, 'fitting_attr%s/t_bias_atom_e' % suffix)
+        except GraphWithoutTensorError:
+            # model without type_embedding has no t_bias_atom_e
+            pass
 
     def enable_compression(self,
                            model_file: str,

diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
@@ -288,7 +288,8 @@ def _init_param(self, jdata):
 
     def build (self, 
                data = None, 
-               stop_batch = 0) :
+               stop_batch = 0,
+               suffix = "") :
         self.ntypes = self.model.get_ntypes()
         self.stop_batch = stop_batch
 
@@ -348,7 +349,7 @@ def build (self,
             self.fitting.enable_mixed_precision(self.mixed_prec)
 
         self._build_lr()
-        self._build_network(data)
+        self._build_network(data, suffix)
         self._build_training()
 
 
@@ -358,7 +359,7 @@ def _build_lr(self):
         self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
         log.info("built lr")
 
-    def _build_network(self, data):        
+    def _build_network(self, data, suffix=""):
         self.place_holders = {}
         if self.is_compress :
             for kk in ['coord', 'box']:
@@ -379,7 +380,7 @@ def _build_network(self, data):
                                 self.place_holders['default_mesh'],
                                 self.place_holders,
                                 self.frz_model,
-                                suffix = "", 
+                                suffix = suffix,
                                 reuse = False)
 
         self.l2_l, self.l2_more\

diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
@@ -276,9 +276,9 @@ def descrpt_se_atten_args():
         Argument("seed", [int, None], optional=True, doc=doc_seed),
         Argument("exclude_types", list, optional=True, default=[], doc=doc_exclude_types),
         Argument("set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero),
-        Argument("attn", int, optional=True, default=100, doc=doc_attn),
-        Argument("attn_layer", int, optional=True, default=4, doc=doc_attn_layer),
-        Argument("attn_dotr", bool, optional=True, default=False, doc=doc_attn_dotr),
+        Argument("attn", int, optional=True, default=128, doc=doc_attn),
+        Argument("attn_layer", int, optional=True, default=2, doc=doc_attn_layer),
+        Argument("attn_dotr", bool, optional=True, default=True, doc=doc_attn_dotr),
         Argument("attn_mask", bool, optional=True, default=False, doc=doc_attn_mask)
     ]