change comment

kuafou · kuafou · commit 8028fe106558 · 2025-12-31T09:33:12.000+08:00
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -156,7 +156,6 @@
 from sglang.srt.managers.utils import GenerationBatchResult, validate_input_length
 from sglang.srt.mem_cache.cache_init_params import CacheInitParams
 from sglang.srt.mem_cache.common import release_kv_cache
-from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
 from sglang.srt.multiplex.multiplexing_mixin import SchedulerMultiplexMixin
@@ -1991,16 +1990,17 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
             )
 
             if res != AddReqResult.CONTINUE:
-                # Release mamba slot allocated via COW if scheduling fails
-                if (
-                    self.is_hybrid_ssm
-                    and isinstance(self.tree_cache, MambaRadixCache)
-                    and req.mamba_pool_idx is not None
-                ):
-                    self.req_to_token_pool.mamba_pool.free(
-                        req.mamba_pool_idx.unsqueeze(-1)
-                    )
+                # Release mamba slot if allocated via COW but scheduling failed.
+                #
+                # Without this, the slot remains held by a waiting request, causing
+                # check_memory() to detect a "memory leak" and crash the server.
+                # The next schedule round will re-allocate safely via match_prefix().
+                #
+                # See: https://github.com/sgl-project/sglang/issues/15840
+                if req.mamba_pool_idx is not None:
+                    self.req_to_token_pool.mamba_pool.free(req.mamba_pool_idx.unsqueeze(-1))
                     req.mamba_pool_idx = None
+
                 if res == AddReqResult.NO_TOKEN:
                     if self.enable_hierarchical_cache:
                         # Set batch_is_full after making sure there are requests that can be served