From 154c5c13af89df234912b59a61e8d748cea7e2a5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 29 May 2024 04:12:23 -0400 Subject: [PATCH 1/2] fix(pt): improve out-of-memory handling Signed-off-by: Jinzhe Zeng --- deepmd/pt/utils/auto_batch_size.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index 13264a336c..a9d5dc3f7c 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -52,7 +52,14 @@ def is_oom_error(self, e: Exception) -> bool: e : Exception Exception """ - return isinstance(e, RuntimeError) and "CUDA out of memory." in e.args[0] + # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, + # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 + # (the meaningless error message should be considered as a bug in cusolver) + if isinstance(e, RuntimeError) and ("CUDA out of memory." in e.args[0] or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]): + # Release all unoccupied cached memory + torch.cuda.empty_cache() + return True + return False def execute_all( self, callable: Callable, total_size: int, natoms: int, *args, **kwargs From 07d9044038107e23152df8a3291405570d6f9136 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 08:13:52 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/utils/auto_batch_size.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index a9d5dc3f7c..0af7cdcc47 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -55,7 +55,10 @@ def is_oom_error(self, e: Exception) -> bool: # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 # (the meaningless error message should be considered as a bug in cusolver) - if isinstance(e, RuntimeError) and ("CUDA out of memory." in e.args[0] or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]): + if isinstance(e, RuntimeError) and ( + "CUDA out of memory." in e.args[0] + or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] + ): # Release all unoccupied cached memory torch.cuda.empty_cache() return True