From ee340497a090b08fe4c91ac96d9f050a8caf7704 Mon Sep 17 00:00:00 2001
From: OutisLi
Date: Fri, 24 Apr 2026 10:06:45 +0800
Subject: [PATCH 1/3] fix(pt): recognize AOTInductor-wrapped CUDA OOM in AutoBatchSize

When running `dp --pt-expt test` (or any path that goes through
`deepmd.pt_expt.infer.deep_eval`) against a `.pt2` AOTInductor package,
`AutoBatchSize` doubles the batch on every success. For models with a
large `sel` the exploration eventually saturates GPU memory, and the CUDA
caching allocator raises the usual `CUDA out of memory` error from inside
the AOTInductor runtime. AOTInductor then rewraps that error as a generic
RuntimeError:

    run_func_(...) API call failed at
    .../aoti_runner/model_container_runner.cpp, line 144

The original "CUDA out of memory" text is printed only to stderr, so the
old `is_oom_error` -- which keyed on a short list of substrings in
`e.args[0]` -- never matched. `execute()` therefore did not shrink the
batch; the exception propagated and the run crashed on a GPU that was
otherwise completely idle (as confirmed by monitoring
`nvidia-smi --query-compute-apps`, which showed dp itself as the sole
consumer holding tens of GiB just before the failure).

Widen `is_oom_error` to:

* walk the exception chain via `__cause__` / `__context__`, so that a
  future PyTorch preserving the original OOM text is handled for free;
* keep matching the four plain OOM markers (the three CUDA out-of-memory
  messages plus the cuSOLVER internal error) on every message in the
  chain;
* additionally treat the AOTInductor wrapper signature (`run_func_(` plus
  `model_container_runner`) as an OOM candidate.

If the AOTInductor wrapper ever hides a non-OOM failure, the batch
shrinker will halve down to 1 and then raise `OutOfMemoryError`, so the
fallback is bounded -- non-OOM bugs still surface with a clear terminal
error rather than being silently retried forever.
--- deepmd/pt/utils/auto_batch_size.py | 61 +++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index 5f8e0930d3..8ac7fba828 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -49,20 +49,53 @@ def is_oom_error(self, e: Exception) -> bool: e : Exception Exception """ - # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, - # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 - # (the meaningless error message should be considered as a bug in cusolver) - if ( - isinstance(e, RuntimeError) - and ( - "CUDA out of memory." in e.args[0] - or "CUDA driver error: out of memory" in e.args[0] - or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] - # https://github.com/deepmodeling/deepmd-kit/issues/4594 - or "CUDA error: out of memory" in e.args[0] - ) - ) or isinstance(e, torch.cuda.OutOfMemoryError): - # Release all unoccupied cached memory + if isinstance(e, torch.cuda.OutOfMemoryError): torch.cuda.empty_cache() return True + + if not isinstance(e, RuntimeError) or not e.args: + return False + + # Gather messages from the exception itself and its chain. AOTInductor + # (.pt2) sometimes strips the underlying OOM message when rewrapping, + # but not always; checking ``__cause__`` / ``__context__`` catches the + # remaining cases when the original error is preserved. + msgs: list[str] = [] + cur: BaseException | None = e + seen: set[int] = set() + while cur is not None and id(cur) not in seen: + seen.add(id(cur)) + if cur.args: + first = cur.args[0] + if isinstance(first, str): + msgs.append(first) + cur = cur.__cause__ or cur.__context__ + + # Several sources treat CUSOLVER_STATUS_INTERNAL_ERROR as an OOM, e.g. 
+ # https://github.com/JuliaGPU/CUDA.jl/issues/1924 + plain_oom_markers = ( + "CUDA out of memory.", + "CUDA driver error: out of memory", + "CUDA error: out of memory", + "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR", + ) + if any(m in msg for msg in msgs for m in plain_oom_markers): + torch.cuda.empty_cache() + return True + + # AOTInductor (.pt2) wraps the underlying CUDA OOM as a generic + # ``run_func_(...) API call failed at .../model_container_runner.cpp``. + # The original "CUDA out of memory" text is printed to stderr only and + # is absent from the Python-level RuntimeError, so we match on the + # wrapper signature. If the root cause turns out to be something + # other than OOM, ``execute()`` will keep shrinking the batch and + # eventually raise ``OutOfMemoryError`` at batch size 1, which is a + # clean failure rather than an uncaught exception. + aoti_wrapped = any( + "run_func_(" in msg and "model_container_runner" in msg for msg in msgs + ) + if aoti_wrapped: + torch.cuda.empty_cache() + return True + return False From a00b10f797b00490caa01cd611521332949e9490 Mon Sep 17 00:00:00 2001 From: OutisLi Date: Sat, 25 Apr 2026 10:35:57 +0800 Subject: [PATCH 2/3] fixup --- deepmd/pt/utils/auto_batch_size.py | 3 ++- source/tests/pt/test_auto_batch_size.py | 33 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index 8ac7fba828..ba9fe8598e 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -53,7 +53,7 @@ def is_oom_error(self, e: Exception) -> bool: torch.cuda.empty_cache() return True - if not isinstance(e, RuntimeError) or not e.args: + if not isinstance(e, RuntimeError): return False # Gather messages from the exception itself and its chain. AOTInductor @@ -85,6 +85,7 @@ def is_oom_error(self, e: Exception) -> bool: # AOTInductor (.pt2) wraps the underlying CUDA OOM as a generic # ``run_func_(...) 
API call failed at .../model_container_runner.cpp``. + # https://github.com/deepmodeling/deepmd-kit/issues/4594 # The original "CUDA out of memory" text is printed to stderr only and # is absent from the Python-level RuntimeError, so we match on the # wrapper signature. If the root cause turns out to be something diff --git a/source/tests/pt/test_auto_batch_size.py b/source/tests/pt/test_auto_batch_size.py index c67a23df52..e7bb69b62e 100644 --- a/source/tests/pt/test_auto_batch_size.py +++ b/source/tests/pt/test_auto_batch_size.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import unittest +from unittest import ( + mock, +) import numpy as np @@ -9,6 +12,36 @@ class TestAutoBatchSize(unittest.TestCase): + @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache") + def test_is_oom_error_cuda_message(self, empty_cache) -> None: + auto_batch_size = AutoBatchSize(256, 2.0) + + self.assertTrue( + auto_batch_size.is_oom_error(RuntimeError("CUDA out of memory.")) + ) + empty_cache.assert_called_once() + + @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache") + def test_is_oom_error_empty_runtime_error_from_cuda_oom(self, empty_cache) -> None: + auto_batch_size = AutoBatchSize(256, 2.0) + cause = RuntimeError("CUDA driver error: out of memory") + error = RuntimeError() + error.__cause__ = cause + + self.assertTrue(auto_batch_size.is_oom_error(error)) + empty_cache.assert_called_once() + + @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache") + def test_is_oom_error_aoti_wrapper(self, empty_cache) -> None: + auto_batch_size = AutoBatchSize(256, 2.0) + error = RuntimeError( + "run_func_(...) 
API call failed at " + "/tmp/torchinductor/model_container_runner.cpp" + ) + + self.assertTrue(auto_batch_size.is_oom_error(error)) + empty_cache.assert_called_once() + def test_execute_all(self) -> None: dd0 = np.zeros((10000, 2, 1, 3, 4)) dd1 = np.ones((10000, 2, 1, 3, 4)) From 33a3bd8275d0650be010d353f3429df5cbc928bc Mon Sep 17 00:00:00 2001 From: OutisLi Date: Sun, 26 Apr 2026 14:30:04 +0800 Subject: [PATCH 3/3] fixup --- deepmd/pt/utils/auto_batch_size.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index ba9fe8598e..306d722fad 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -73,6 +73,7 @@ def is_oom_error(self, e: Exception) -> bool: # Several sources treat CUSOLVER_STATUS_INTERNAL_ERROR as an OOM, e.g. # https://github.com/JuliaGPU/CUDA.jl/issues/1924 + # https://github.com/deepmodeling/deepmd-kit/issues/4594 plain_oom_markers = ( "CUDA out of memory.", "CUDA driver error: out of memory", @@ -85,7 +86,6 @@ def is_oom_error(self, e: Exception) -> bool: # AOTInductor (.pt2) wraps the underlying CUDA OOM as a generic # ``run_func_(...) API call failed at .../model_container_runner.cpp``. - # https://github.com/deepmodeling/deepmd-kit/issues/4594 # The original "CUDA out of memory" text is printed to stderr only and # is absent from the Python-level RuntimeError, so we match on the # wrapper signature. If the root cause turns out to be something