@@ -2003,24 +2003,7 @@ def _get_new_batch_prefill_raw(
20032003 )
20042004
20052005 if res != AddReqResult .CONTINUE :
2006- # Release mamba slot if allocated via COW but scheduling failed.
2007- #
2008- # Without this, the slot remains held by a waiting request, causing
2009- # check_memory() to detect a "memory leak" and crash the server.
2010- # The next schedule round will re-allocate safely via match_prefix().
2011- #
2012- # Note: In disaggregation DECODE mode, mamba state is transferred from PREFILL and
2013- # is not recoverable if freed, so we do not free it here. To avoid false-positive
2014- # leak checks in this situation, self_check_during_idle skips memory checking when
2015- # the waiting queue is not empty.
2016- if (
2017- req .mamba_pool_idx is not None
2018- and self .disaggregation_mode != DisaggregationMode .DECODE
2019- ):
2020- self .req_to_token_pool .mamba_pool .free (
2021- req .mamba_pool_idx .unsqueeze (- 1 )
2022- )
2023- req .mamba_pool_idx = None
2006+ self .maybe_release_mamba_cache (req )
20242007
20252008 if res == AddReqResult .NO_TOKEN :
20262009 if self .enable_hierarchical_cache :
@@ -2115,6 +2098,25 @@ def _get_new_batch_prefill_raw(
21152098
21162099 return new_batch
21172100
def maybe_release_mamba_cache(self, req: Req) -> None:
    """Give back the mamba slot of a request that could not be scheduled.

    A slot may have been allocated (via COW) for ``req`` before the
    scheduling attempt failed. If it stayed attached to a request that is
    still waiting, check_memory() would report a "memory leak" and crash
    the server. Freeing is safe because the next schedule round
    re-allocates through match_prefix().

    Disaggregation DECODE mode is exempt: there the mamba state was
    transferred from PREFILL and cannot be recovered once freed, so the
    slot is kept. False-positive leak reports in that situation are avoided
    because self_check_during_idle skips memory checking while the waiting
    queue is not empty.

    Args:
        req: The request whose ``mamba_pool_idx`` is freed and reset to
            ``None`` when it currently holds a slot.
    """
    # Nothing to release when the request holds no mamba slot.
    if req.mamba_pool_idx is None:
        return

    # In DECODE mode the slot is deliberately left allocated (see docstring).
    if self.disaggregation_mode != DisaggregationMode.DECODE:
        slot_to_free = req.mamba_pool_idx.unsqueeze(-1)
        self.req_to_token_pool.mamba_pool.free(slot_to_free)
        # Clear the index so the freed slot is not released a second time.
        req.mamba_pool_idx = None
2119+
21182120 def update_running_batch (self , batch : ScheduleBatch ) -> Optional [ScheduleBatch ]:
21192121 """Update the current running decoding batch."""
21202122 initial_bs = batch .batch_size ()
0 commit comments