Skip to content

Commit 8dca3fd

Browse files
committed
refactor
1 parent bc8622f commit 8dca3fd

File tree

1 file changed

+20
-18
lines changed

1 file changed

+20
-18
lines changed

python/sglang/srt/managers/scheduler.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,24 +2003,7 @@ def _get_new_batch_prefill_raw(
20032003
)
20042004

20052005
if res != AddReqResult.CONTINUE:
2006-
# Release mamba slot if allocated via COW but scheduling failed.
2007-
#
2008-
# Without this, the slot remains held by a waiting request, causing
2009-
# check_memory() to detect a "memory leak" and crash the server.
2010-
# The next schedule round will re-allocate safely via match_prefix().
2011-
#
2012-
# Note: In disaggregation DECODE mode, mamba state is transferred from PREFILL and
2013-
# is not recoverable if freed, so we do not free it here. To avoid false-positive
2014-
# leak checks in this situation, self_check_during_idle skips memory checking when
2015-
# the waiting queue is not empty.
2016-
if (
2017-
req.mamba_pool_idx is not None
2018-
and self.disaggregation_mode != DisaggregationMode.DECODE
2019-
):
2020-
self.req_to_token_pool.mamba_pool.free(
2021-
req.mamba_pool_idx.unsqueeze(-1)
2022-
)
2023-
req.mamba_pool_idx = None
2006+
self.maybe_release_mamba_cache(req)
20242007

20252008
if res == AddReqResult.NO_TOKEN:
20262009
if self.enable_hierarchical_cache:
@@ -2115,6 +2098,25 @@ def _get_new_batch_prefill_raw(
21152098

21162099
return new_batch
21172100

2101+
def maybe_release_mamba_cache(self, req: Req) -> None:
    """Give back the mamba slot of a request that could not be scheduled.

    A request may have obtained a mamba slot via COW before scheduling
    failed. If that slot stays attached to the waiting request,
    check_memory() reports a "memory leak" and crashes the server. Freeing
    it here is safe because the next schedule round re-allocates through
    match_prefix().

    Note: In disaggregation DECODE mode the mamba state was transferred from
    PREFILL and cannot be recovered once freed, so the slot is left
    untouched. To avoid false-positive leak checks in that situation,
    self_check_during_idle skips memory checking while the waiting queue is
    not empty.

    Args:
        req: The request whose ``mamba_pool_idx`` should be released, if set.
    """
    slot_idx = req.mamba_pool_idx
    if slot_idx is None:
        # Nothing was allocated for this request.
        return

    if self.disaggregation_mode != DisaggregationMode.DECODE:
        mamba_pool = self.req_to_token_pool.mamba_pool
        mamba_pool.free(slot_idx.unsqueeze(-1))
        # Clear the reference so the request no longer claims the freed slot.
        req.mamba_pool_idx = None
2119+
21182120
def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
21192121
"""Update the current running decoding batch."""
21202122
initial_bs = batch.batch_size()

0 commit comments

Comments
 (0)