From 154c5c13af89df234912b59a61e8d748cea7e2a5 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 29 May 2024 04:12:23 -0400
Subject: [PATCH 1/2] fix(pt): improve out-of-memory handling

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/pt/utils/auto_batch_size.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
index 13264a336c..a9d5dc3f7c 100644
--- a/deepmd/pt/utils/auto_batch_size.py
+++ b/deepmd/pt/utils/auto_batch_size.py
@@ -52,7 +52,14 @@ def is_oom_error(self, e: Exception) -> bool:
         e : Exception
             Exception
         """
-        return isinstance(e, RuntimeError) and "CUDA out of memory." in e.args[0]
+        # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
+        # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
+        # (the meaningless error message should be considered as a bug in cusolver)
+        if isinstance(e, RuntimeError) and ("CUDA out of memory." in e.args[0] or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]):
+            # Release all unoccupied cached memory
+            torch.cuda.empty_cache()
+            return True
+        return False
 
     def execute_all(
         self, callable: Callable, total_size: int, natoms: int, *args, **kwargs

From 07d9044038107e23152df8a3291405570d6f9136 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 29 May 2024 08:13:52 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/utils/auto_batch_size.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
index a9d5dc3f7c..0af7cdcc47 100644
--- a/deepmd/pt/utils/auto_batch_size.py
+++ b/deepmd/pt/utils/auto_batch_size.py
@@ -55,7 +55,10 @@ def is_oom_error(self, e: Exception) -> bool:
         # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
         # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
         # (the meaningless error message should be considered as a bug in cusolver)
-        if isinstance(e, RuntimeError) and ("CUDA out of memory." in e.args[0] or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]):
+        if isinstance(e, RuntimeError) and (
+            "CUDA out of memory." in e.args[0]
+            or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
+        ):
             # Release all unoccupied cached memory
             torch.cuda.empty_cache()
             return True