[bugfix] fix mamba slot leak when scheduling fails with radix cache (sgl-project#15840) (sgl-project#16067)

kuafou · yizhang2077 · Johnsonms · commit 7d0de214d73d · 2026-02-14T01:44:01.000Z
Co-authored-by: yizhang2077 <1109276519@qq.com>
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -2095,6 +2095,8 @@ def _get_new_batch_prefill_raw(
                 running_loras.add(req.lora_id)
 
             if res != AddReqResult.CONTINUE:
+                self.maybe_release_mamba_cache(req)
+
                 if res == AddReqResult.NO_TOKEN:
                     if self.enable_hierarchical_cache:
                         # Set batch_is_full after making sure there are requests that can be served
@@ -2192,6 +2194,25 @@ def _get_new_batch_prefill_raw(
 
         return new_batch
 
+    def maybe_release_mamba_cache(self, req: Req) -> None:
+        """Release mamba slot if allocated via COW but scheduling failed.
+
+        No-op when ``req.mamba_pool_idx`` is None; otherwise the slot is freed and
+        the index reset to None. Without this, the slot remains held by a waiting
+        request, causing check_memory() to detect a "memory leak" and crash the
+        server. The next schedule round will re-allocate safely via match_prefix().
+
+        Note: In disaggregation DECODE mode the slot is kept: its mamba state comes from
+        PREFILL and is not recoverable if freed. To avoid false-positive leak checks,
+        self_check_during_idle skips memory checks when the waiting queue is not empty.
+        """
+        if (
+            req.mamba_pool_idx is not None
+            and self.disaggregation_mode != DisaggregationMode.DECODE
+        ):
+            self.req_to_token_pool.mamba_pool.free(req.mamba_pool_idx.unsqueeze(-1))
+            req.mamba_pool_idx = None  # the request no longer owns a slot
+
     def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
         """Update the current running decoding batch."""
         initial_bs = batch.batch_size()