From d8897a9eb48e984b173c8647adbec64dc07f3ed1 Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Tue, 28 Jun 2022 23:03:14 +0800
Subject: [PATCH 1/4] support optional custom gelu implementation

---
 deepmd/common.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/deepmd/common.py b/deepmd/common.py
index 6a18cda677..3185f2da2f 100644
--- a/deepmd/common.py
+++ b/deepmd/common.py
@@ -34,7 +34,7 @@
         from typing import Literal  # python >3.6
     except ImportError:
         from typing_extensions import Literal  # type: ignore
-    _ACTIVATION = Literal["relu", "relu6", "softplus", "sigmoid", "tanh", "gelu"]
+    _ACTIVATION = Literal["relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf"]
     _PRECISION = Literal["default", "float16", "float32", "float64"]
 
 # define constants
@@ -49,7 +49,29 @@
 def gelu(x: tf.Tensor) -> tf.Tensor:
     """Gaussian Error Linear Unit.
 
-    This is a smoother version of the RELU.
+    This is a smoother version of the RELU, implemented by a custom operator.
+
+    Parameters
+    ----------
+    x : tf.Tensor
+        float Tensor to perform activation
+
+    Returns
+    -------
+    `x` with the GELU activation applied
+
+    References
+    ----------
+    Original paper
+    https://arxiv.org/abs/1606.08415
+    """
+    return op_module.gelu(x)
+
+
+def gelu_tf(x: tf.Tensor) -> tf.Tensor:
+    """Gaussian Error Linear Unit.
+
+    This is a smoother version of the RELU, implemented by TF.
 
     Parameters
     ----------
@@ -69,10 +91,10 @@ def gelu_wrapper(x):
         try:
             return tensorflow.nn.gelu(x, approximate=True)
         except AttributeError:
+            warnings.warn("TensorFlow does not provide an implementation of gelu, please upgrade your TensorFlow version. Fallback to the custom gelu operator.")
             return op_module.gelu(x)
     return (lambda x: gelu_wrapper(x))(x)
 
-
 # TODO this is not a good way to do things. This is some global variable to which
 # TODO anyone can write and there is no good way to keep track of the changes
 data_requirement = {}
@@ -84,6 +106,7 @@ def gelu_wrapper(x):
     "sigmoid": tf.sigmoid,
     "tanh": tf.nn.tanh,
     "gelu": gelu,
+    "gelu_tf": gelu_tf,
 }
 
 

From d6dfdfbc40973e78fd900bfa792c09ad3988aa20 Mon Sep 17 00:00:00 2001
From: Denghui Lu <denghuilu@pku.edu.cn>
Date: Tue, 28 Jun 2022 23:18:23 +0800
Subject: [PATCH 2/4] add doc for gelu_tf

---
 doc/train-input-auto.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst
index 9201809549..2eb0ed6caa 100644
--- a/doc/train-input-auto.rst
+++ b/doc/train-input-auto.rst
@@ -268,7 +268,7 @@ model:
             | type: ``str``, optional, default: ``tanh``
             | argument path: ``model/descriptor[se_e2_a]/activation_function``
 
-            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf". Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF statdard version.
 
         .. _`model/descriptor[se_e2_a]/resnet_dt`: 
 
@@ -373,7 +373,7 @@ model:
             | type: ``str``, optional, default: ``tanh``
             | argument path: ``model/descriptor[se_e2_r]/activation_function``
 
-            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf". Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF statdard version.
 
         .. _`model/descriptor[se_e2_r]/resnet_dt`: 
 

From e9592c6f245a307e239e4a7d418dcb37cb7e4a34 Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Wed, 29 Jun 2022 10:38:19 +0800
Subject: [PATCH 3/4] address doc issue

---
 deepmd/common.py         |  6 ++++--
 deepmd/utils/argcheck.py | 14 +++++++-------
 doc/train-input-auto.rst |  4 ++--
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/deepmd/common.py b/deepmd/common.py
index 3185f2da2f..1146f291d5 100644
--- a/deepmd/common.py
+++ b/deepmd/common.py
@@ -58,7 +58,8 @@ def gelu(x: tf.Tensor) -> tf.Tensor:
 
     Returns
     -------
-    `x` with the GELU activation applied
+    tf.Tensor
+        `x` with the GELU activation applied
 
     References
     ----------
@@ -80,7 +81,8 @@ def gelu_tf(x: tf.Tensor) -> tf.Tensor:
 
     Returns
     -------
-    `x` with the GELU activation applied
+    tf.Tensor
+        `x` with the GELU activation applied
 
     References
     ----------
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 9784e1cf1a..e7c7edb170 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -28,7 +28,7 @@ def type_embedding_args():
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_seed = 'Random seed for parameter initialization'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
     
@@ -128,7 +128,7 @@ def descrpt_se_a_args():
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
     doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
@@ -162,7 +162,7 @@ def descrpt_se_t_args():
     doc_rcut = 'The cut-off radius.'
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_trainable = 'If the parameters in the embedding net are trainable'
@@ -205,7 +205,7 @@ def descrpt_se_r_args():
     doc_rcut = 'The cut-off radius.'
     doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
     doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
-    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets'
     doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
@@ -262,7 +262,7 @@ def fitting_ener():
     doc_numb_fparam = 'The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams.'
     doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.'
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\
@@ -288,7 +288,7 @@ def fitting_ener():
 
 def fitting_polar():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``'
@@ -320,7 +320,7 @@ def fitting_polar():
 
 def fitting_dipole():
     doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
-    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
     doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
     doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.'
diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst
index 2eb0ed6caa..f1de54f2f4 100644
--- a/doc/train-input-auto.rst
+++ b/doc/train-input-auto.rst
@@ -268,7 +268,7 @@ model:
             | type: ``str``, optional, default: ``tanh``
             | argument path: ``model/descriptor[se_e2_a]/activation_function``
 
-            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf". Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF statdard version.
+            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
 
         .. _`model/descriptor[se_e2_a]/resnet_dt`: 
 
@@ -373,7 +373,7 @@ model:
             | type: ``str``, optional, default: ``tanh``
             | argument path: ``model/descriptor[se_e2_r]/activation_function``
 
-            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf". Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF statdard version.
+            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". 
 
         .. _`model/descriptor[se_e2_r]/resnet_dt`: 
 

From e28ddbe9e36d8b36797c6a331d80fff8b457deef Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Wed, 29 Jun 2022 10:39:54 +0800
Subject: [PATCH 4/4] Update train-input-auto.rst

---
 doc/train-input-auto.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst
index f1de54f2f4..9201809549 100644
--- a/doc/train-input-auto.rst
+++ b/doc/train-input-auto.rst
@@ -373,7 +373,7 @@ model:
             | type: ``str``, optional, default: ``tanh``
             | argument path: ``model/descriptor[se_e2_r]/activation_function``
 
-            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". 
+            The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
 
         .. _`model/descriptor[se_e2_r]/resnet_dt`: