From 6ad353f5dc89459032388aa368b582677770c841 Mon Sep 17 00:00:00 2001 From: caic99 Date: Mon, 16 Jun 2025 10:52:38 +0000 Subject: [PATCH 1/5] fix: training speed might be incorrect --- deepmd/pt/train/training.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 7a6ff0ebde..75b127141f 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -935,10 +935,9 @@ def log_loss_valid(_task_key="Default"): eta=eta, ) ) - # the first training time is not accurate if ( - (_step_id + 1 - self.start_step) > self.disp_freq - or self.num_steps - self.start_step < 2 * self.disp_freq + (self.num_steps - self.start_step) <= 2 * self.disp_freq # not enough steps + or (_step_id - self.start_step) >= self.disp_freq # skip first disp_freq steps ): self.total_train_time += train_time @@ -951,11 +950,14 @@ def log_loss_valid(_task_key="Default"): ) if ( - ((_step_id + 1) % self.save_freq == 0 and _step_id != self.start_step) - or (_step_id + 1) == self.num_steps + ( + (display_step_id) % self.save_freq == 0 + and _step_id != self.start_step + ) + or (display_step_id) == self.num_steps ) and (self.rank == 0 or dist.get_rank() == 0): # Handle the case if rank 0 aborted and re-assigned - self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt") + self.latest_model = Path(self.save_ckpt + f"-{display_step_id}.pt") module = ( self.wrapper.module @@ -1021,23 +1023,15 @@ def log_loss_valid(_task_key="Default"): with open("checkpoint", "w") as f: f.write(str(self.latest_model)) - elapsed_batch = self.num_steps - self.start_step - if self.timing_in_training and elapsed_batch // self.disp_freq > 0: - if self.start_step >= 2 * self.disp_freq: + elapsed_steps = self.num_steps - self.start_step + if self.timing_in_training: + if elapsed_steps <= 2 * self.disp_freq: log.info( - "average training time: %.4f s/batch (exclude first %d batches)", - self.total_train_time - / ( - elapsed_batch // self.disp_freq * self.disp_freq - - self.disp_freq - ), - self.disp_freq, + f"average training time: {self.total_train_time / elapsed_steps:.4f} s/batch" ) else: log.info( - "average training time: %.4f s/batch", - self.total_train_time - / (elapsed_batch // self.disp_freq * self.disp_freq), + f"average training time: {self.total_train_time / (elapsed_steps - self.disp_freq - elapsed_steps % self.disp_freq):.4f} s/batch (first {self.disp_freq} batches excluded)", ) if JIT: From 5665c0fa7b2836644cc6aa7889e20278cad5591b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 10:56:11 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/train/training.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 75b127141f..08011d2d5e 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -936,8 +936,10 @@ def log_loss_valid(_task_key="Default"): ) ) if ( - (self.num_steps - self.start_step) <= 2 * self.disp_freq # not enough steps - or (_step_id - self.start_step) >= self.disp_freq # skip first disp_freq steps + (self.num_steps - self.start_step) + <= 2 * self.disp_freq # not enough steps + or (_step_id - self.start_step) + >= self.disp_freq # skip first disp_freq steps ): self.total_train_time += train_time From 4e2c8a139a4caa6d83fa33ecd52e8cb418bd1a0c Mon Sep 17 00:00:00 2001 From: caic99 Date: Wed, 18 Jun 2025 09:19:13 +0000 Subject: [PATCH 3/5] refactor for a more explicit logic --- deepmd/pt/train/training.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 08011d2d5e..9ca095e095 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -942,6 +942,13 @@ def log_loss_valid(_task_key="Default"): >= self.disp_freq # skip first disp_freq steps ): self.total_train_time += train_time + if display_step_id == 1: + self.timed_steps += 1 + else: + self.timed_steps += min( + self.disp_freq, _step_id - self.start_step + ) + print(f"{self.timed_steps=}") if fout: if self.lcurve_should_print_header: @@ -986,6 +993,7 @@ def log_loss_valid(_task_key="Default"): self.wrapper.train() self.t0 = time.time() self.total_train_time = 0.0 + self.timed_steps = 0 for step_id in range(self.start_step, self.num_steps): step(step_id) if JIT: @@ -1025,16 +1033,12 @@ def log_loss_valid(_task_key="Default"): with open("checkpoint", "w") as f: f.write(str(self.latest_model)) - elapsed_steps = self.num_steps - self.start_step if self.timing_in_training: - if elapsed_steps <= 2 * self.disp_freq: - log.info( - f"average training time: {self.total_train_time / elapsed_steps:.4f} s/batch" - ) - else: - log.info( - f"average training time: {self.total_train_time / (elapsed_steps - self.disp_freq - elapsed_steps % self.disp_freq):.4f} s/batch (first {self.disp_freq} batches excluded)", - ) + msg = f"average training time: {self.total_train_time / self.timed_steps:.4f} s/batch" + excluded_steps = self.num_steps - self.start_step - self.timed_steps + if excluded_steps > 0: + msg += f" ({excluded_steps} batches excluded)" + log.info(msg) if JIT: pth_model_path = ( From 15ea1c0f9025ec2a4904b59df6428c9b5adf7ede Mon Sep 17 00:00:00 2001 From: caic99 Date: Wed, 18 Jun 2025 09:49:04 +0000 Subject: [PATCH 4/5] remove debug statement --- deepmd/pt/train/training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 9ca095e095..55ccd9ef41 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -948,7 +948,6 @@ def log_loss_valid(_task_key="Default"): self.timed_steps += min( self.disp_freq, _step_id - self.start_step ) - print(f"{self.timed_steps=}") if fout: if self.lcurve_should_print_header: From beca0be8ee3f04e3adb30fa8d35b75f87e08fc8d Mon Sep 17 00:00:00 2001 From: caic99 Date: Thu, 19 Jun 2025 04:48:02 +0000 Subject: [PATCH 5/5] fix: ensure average training time is calculated only when timed steps are available --- deepmd/pt/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 55ccd9ef41..193dcd8cb9 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -1032,7 +1032,7 @@ def log_loss_valid(_task_key="Default"): with open("checkpoint", "w") as f: f.write(str(self.latest_model)) - if self.timing_in_training: + if self.timing_in_training and self.timed_steps: msg = f"average training time: {self.total_train_time / self.timed_steps:.4f} s/batch" excluded_steps = self.num_steps - self.start_step - self.timed_steps if excluded_steps > 0: