@@ -2095,6 +2095,8 @@ def _get_new_batch_prefill_raw(
20952095 running_loras .add (req .lora_id )
20962096
20972097 if res != AddReqResult .CONTINUE :
2098+ self .maybe_release_mamba_cache (req )
2099+
20982100 if res == AddReqResult .NO_TOKEN :
20992101 if self .enable_hierarchical_cache :
21002102 # Set batch_is_full after making sure there are requests that can be served
@@ -2192,6 +2194,25 @@ def _get_new_batch_prefill_raw(
21922194
21932195 return new_batch
21942196
def maybe_release_mamba_cache(self, req: Req) -> None:
    """Give back the mamba slot of a request that was not scheduled.

    A request can already hold a mamba slot (allocated via COW during
    prefix matching) by the time the scheduler decides not to add it to
    the batch. If the slot stayed attached to the still-waiting request,
    check_memory() would report it as a "memory leak" and crash the
    server. Freeing it here is safe because the next scheduling round
    re-allocates through match_prefix().

    Exception: in disaggregation DECODE mode the mamba state was
    transferred from the PREFILL side and cannot be rebuilt once freed,
    so the slot is kept. To avoid false-positive leak reports in that
    case, self_check_during_idle skips memory checking while the waiting
    queue is non-empty.

    Args:
        req: The request whose scheduling attempt did not succeed. Its
            ``mamba_pool_idx`` is reset to ``None`` when the slot is
            released.
    """
    slot_idx = req.mamba_pool_idx

    # Nothing was allocated for this request, so there is nothing to free.
    if slot_idx is None:
        return

    # Transferred state is not recoverable in DECODE mode; keep the slot.
    if self.disaggregation_mode == DisaggregationMode.DECODE:
        return

    # free() is handed the index with one extra trailing dimension.
    mamba_pool = self.req_to_token_pool.mamba_pool
    mamba_pool.free(slot_idx.unsqueeze(-1))
    req.mamba_pool_idx = None
2215+
21952216 def update_running_batch (self , batch : ScheduleBatch ) -> Optional [ScheduleBatch ]:
21962217 """Update the current running decoding batch."""
21972218 initial_bs = batch .batch_size ()
0 commit comments