diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
index c85e67a362..4e5fea081f 100644
--- a/deepmd/pd/train/training.py
+++ b/deepmd/pd/train/training.py
@@ -133,6 +133,14 @@ def __init__(
 
         # Iteration config
         self.num_steps = training_params["numb_steps"]
+        # Gradient accumulation: number of steps between optimizer updates.
+        self.acc_freq: int = training_params.get("acc_freq", 1)
+        if self.acc_freq < 1:
+            # `step` computes `(_step_id + 1) % self.acc_freq`, so 0 would raise
+            # ZeroDivisionError mid-training; fail early with a clear message.
+            raise ValueError(
+                f"acc_freq must be a positive integer, got {self.acc_freq}"
+            )
         self.disp_file = training_params.get("disp_file", "lcurve.out")
         self.disp_freq = training_params.get("disp_freq", 1000)
         self.save_ckpt = training_params.get("save_ckpt", "model.ckpt")
@@ -744,7 +752,6 @@ def step(_step_id, task_key="Default") -> None:
                 _lr = self.lr_exp
             cur_lr = _lr.value(_step_id)
             pref_lr = cur_lr
-            self.optimizer.clear_grad(set_to_zero=False)
 
             with nvprof_context(enable_profiling, "Fetching data"):
                 input_dict, label_dict, log_dict = self.get_data(
@@ -780,22 +787,29 @@ def step(_step_id, task_key="Default") -> None:
                     with nvprof_context(enable_profiling, "Backward pass"):
                         loss.backward()
 
-                # fuse + allreduce manually before optimization if use DDP + no_sync
-                # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
-                if self.world_size > 1:
-                    hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
-
-                if self.gradient_max_norm > 0.0:
-                    with nvprof_context(enable_profiling, "Gradient clip"):
-                        paddle.nn.utils.clip_grad_norm_(
-                            self.wrapper.parameters(),
-                            self.gradient_max_norm,
-                            error_if_nonfinite=True,
+                # gradient accumulation: gradients are no longer cleared at the start
+                # of every step; the update below runs once every `acc_freq` steps
+                if (_step_id + 1) % self.acc_freq == 0:
+                    # fuse + allreduce manually before optimization if use DDP + no_sync
+                    # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
+                    if self.world_size > 1:
+                        hpu.fused_allreduce_gradients(
+                            list(self.wrapper.parameters()), None
                         )
 
-                with nvprof_context(enable_profiling, "Adam update"):
-                    self.optimizer.step()
-                self.scheduler.step()
+                    if self.gradient_max_norm > 0.0:
+                        with nvprof_context(enable_profiling, "Gradient clip"):
+                            paddle.nn.utils.clip_grad_norm_(
+                                self.wrapper.parameters(),
+                                self.gradient_max_norm,
+                                error_if_nonfinite=True,
+                            )
+
+                    with nvprof_context(enable_profiling, "Adam update"):
+                        self.optimizer.step()
+                    # clear gradients only after an update has consumed them
+                    self.optimizer.clear_grad(set_to_zero=False)
+                    self.scheduler.step()
             else:
                 raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
 
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index e446674db7..799e806bb2 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -40,6 +40,7 @@
 
 doc_only_tf_supported = "(Supported Backend: TensorFlow) "
 doc_only_pt_supported = "(Supported Backend: PyTorch) "
+doc_only_pd_supported = "(Supported Backend: Paddle) "
 # descriptors
 doc_loc_frame = "Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame."
 doc_se_e2_a = "Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor."
@@ -3167,6 +3168,7 @@ def training_args(
     doc_kf_blocksize = "The blocksize for the Kalman filter."
     doc_model_prob = "The visiting probability of each model for each training step in the multi-task mode."
     doc_data_dict = "The multiple definition of the data, used in the multi-task mode."
+    doc_acc_freq = "Gradient accumulation steps (number of steps to accumulate gradients before performing an update)."
 
     arg_training_data = training_data_args()
     arg_validation_data = validation_data_args()
@@ -3269,6 +3271,13 @@ def training_args(
             optional=True,
             doc=doc_only_pt_supported + doc_gradient_max_norm,
         ),
+        Argument(
+            "acc_freq",
+            int,
+            optional=True,
+            default=1,
+            doc=doc_only_pd_supported + doc_acc_freq,
+        ),
     ]
     variants = [
         Variant(
diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py
index 8958dcb165..0dc36fa314 100644
--- a/source/tests/pd/test_training.py
+++ b/source/tests/pd/test_training.py
@@ -150,9 +150,26 @@ def setUp(self) -> None:
         self.config["model"] = deepcopy(model_se_e2_a)
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
-        # import paddle
         enable_prim(True)
-        # assert paddle.framework.core._is_eager_prim_enabled()
+
+    def tearDown(self) -> None:
+        DPTrainTest.tearDown(self)
+
+
+class TestEnergyModelGradientAccumulation(unittest.TestCase, DPTrainTest):
+    def setUp(self) -> None:
+        input_json = str(Path(__file__).parent / "water/se_atten.json")
+        with open(input_json) as f:
+            self.config = json.load(f)
+        data_file = [str(Path(__file__).parent / "water/data/data_0")]
+        self.config["training"]["training_data"]["systems"] = data_file
+        self.config["training"]["validation_data"]["systems"] = data_file
+        self.config["model"] = deepcopy(model_se_e2_a)
+        # run at least `acc_freq` steps so that one accumulated update is performed
+        self.config["training"]["numb_steps"] = 4
+        self.config["training"]["save_freq"] = 1
+        self.config["training"]["acc_freq"] = 4
+        enable_prim(True)
 
     def tearDown(self) -> None:
         DPTrainTest.tearDown(self)