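Refactor: extract the mamba slot release in _get_new_batch_prefill_raw into a maybe_release_mamba_cache helper (no behavior change)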

kuafou · kuafou · commit 8dca3fdfcceb · 2026-01-07T12:24:14.000+08:00
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -2003,24 +2003,7 @@ def _get_new_batch_prefill_raw(
             )
 
             if res != AddReqResult.CONTINUE:
-                # Release mamba slot if allocated via COW but scheduling failed.
-                #
-                # Without this, the slot remains held by a waiting request, causing
-                # check_memory() to detect a "memory leak" and crash the server.
-                # The next schedule round will re-allocate safely via match_prefix().
-                #
-                # Note: In disaggregation DECODE mode, mamba state is transferred from PREFILL and
-                # is not recoverable if freed, so we do not free it here. To avoid false-positive
-                # leak checks in this situation, self_check_during_idle skips memory checking when
-                # the waiting queue is not empty.
-                if (
-                    req.mamba_pool_idx is not None
-                    and self.disaggregation_mode != DisaggregationMode.DECODE
-                ):
-                    self.req_to_token_pool.mamba_pool.free(
-                        req.mamba_pool_idx.unsqueeze(-1)
-                    )
-                    req.mamba_pool_idx = None
+                self.maybe_release_mamba_cache(req)
 
                 if res == AddReqResult.NO_TOKEN:
                     if self.enable_hierarchical_cache:
@@ -2115,6 +2098,25 @@ def _get_new_batch_prefill_raw(
 
         return new_batch
 
+    def maybe_release_mamba_cache(self, req: Req) -> None:
+        """Release mamba slot if allocated via COW but scheduling failed.
+
+        Without this, the slot remains held by a waiting request, causing
+        check_memory() to detect a "memory leak" and crash the server.
+        The next schedule round will re-allocate safely via match_prefix().
+
+        Note: In disaggregation DECODE mode, mamba state is transferred from PREFILL and
+        is not recoverable if freed, so we do not free it here. To avoid false-positive
+        leak checks in this situation, self_check_during_idle skips memory checking when
+        the waiting queue is not empty.
+        """
+        if (
+            req.mamba_pool_idx is not None
+            and self.disaggregation_mode != DisaggregationMode.DECODE
+        ):
+            self.req_to_token_pool.mamba_pool.free(req.mamba_pool_idx.unsqueeze(-1))
+            req.mamba_pool_idx = None
+
     def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
         """Update the current running decoding batch."""
         initial_bs = batch.batch_size()