|
156 | 156 | from sglang.srt.managers.utils import GenerationBatchResult, validate_input_length |
157 | 157 | from sglang.srt.mem_cache.cache_init_params import CacheInitParams |
158 | 158 | from sglang.srt.mem_cache.common import release_kv_cache |
159 | | -from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache |
160 | 159 | from sglang.srt.mem_cache.radix_cache import RadixCache |
161 | 160 | from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors |
162 | 161 | from sglang.srt.multiplex.multiplexing_mixin import SchedulerMultiplexMixin |
@@ -1991,16 +1990,17 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]: |
1991 | 1990 | ) |
1992 | 1991 |
|
1993 | 1992 | if res != AddReqResult.CONTINUE: |
1994 | | - # Release mamba slot allocated via COW if scheduling fails |
1995 | | - if ( |
1996 | | - self.is_hybrid_ssm |
1997 | | - and isinstance(self.tree_cache, MambaRadixCache) |
1998 | | - and req.mamba_pool_idx is not None |
1999 | | - ): |
2000 | | - self.req_to_token_pool.mamba_pool.free( |
2001 | | - req.mamba_pool_idx.unsqueeze(-1) |
2002 | | - ) |
| 1993 | + # Release mamba slot if allocated via COW but scheduling failed. |
| 1994 | + # |
| 1995 | + # Without this, the slot remains held by a waiting request, causing |
| 1996 | + # check_memory() to detect a "memory leak" and crash the server. |
| 1997 | + # The next schedule round will re-allocate safely via match_prefix(). |
| 1998 | + # |
| 1999 | + # See: https://github.com/sgl-project/sglang/issues/15840 |
| 2000 | + if req.mamba_pool_idx is not None: |
| 2001 | + self.req_to_token_pool.mamba_pool.free(req.mamba_pool_idx.unsqueeze(-1)) |
2003 | 2002 | req.mamba_pool_idx = None |
| 2003 | + |
2004 | 2004 | if res == AddReqResult.NO_TOKEN: |
2005 | 2005 | if self.enable_hierarchical_cache: |
2006 | 2006 | # Set batch_is_full after making sure there are requests that can be served |
|
0 commit comments