Skip to content

Commit 91e2022

Browse files
committed
Fix mamba slot leak when scheduling fails with radix cache (#15840)
When add_one_req fails after init_next_round_input has allocated a mamba slot via COW (copy-on-write) during match_prefix, the slot was not released, causing a memory leak. This fix releases the mamba slot when scheduling fails.
1 parent f3d73b0 commit 91e2022

File tree

2 files changed

+110
-0
lines changed

2 files changed

+110
-0
lines changed

python/sglang/srt/managers/scheduler.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,6 +1982,12 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
19821982
)
19831983

19841984
if res != AddReqResult.CONTINUE:
1985+
# Release mamba slot allocated via COW if scheduling fails
1986+
if self.is_hybrid_ssm and req.mamba_pool_idx is not None:
1987+
self.req_to_token_pool.mamba_pool.free(
1988+
req.mamba_pool_idx.unsqueeze(-1)
1989+
)
1990+
req.mamba_pool_idx = None
19851991
if res == AddReqResult.NO_TOKEN:
19861992
if self.enable_hierarchical_cache:
19871993
# Set batch_is_full after making sure there are requests that can be served

test/srt/test_mamba_unittest.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,110 @@ def make_dummy_req():
336336
== mamba_pool.mamba_cache.temporal[:, last_node.mamba_value]
337337
)
338338

339+
def test_mamba_slot_release_after_match_prefix_cow(self):
340+
num_layers, global_interval = 48, 4
341+
full_attention_layer_ids = list(
342+
range(global_interval - 1, num_layers, global_interval)
343+
)
344+
mamba_layers = [
345+
i for i in range(num_layers) if i not in full_attention_layer_ids
346+
]
347+
os.environ["SGLANG_MAMBA_SSM_DTYPE"] = "bfloat16"
348+
349+
mamba2_cache_params = Mamba2CacheParams(
350+
shape=Mamba2StateShape.create(
351+
tp_world_size=1,
352+
intermediate_size=4096,
353+
n_groups=16,
354+
num_heads=32,
355+
head_dim=128,
356+
state_size=128,
357+
conv_kernel=4,
358+
),
359+
layers=mamba_layers,
360+
)
361+
req_to_token_pool = HybridReqToTokenPool(
362+
size=10,
363+
mamba_size=20,
364+
mamba_spec_state_size=10,
365+
max_context_len=128,
366+
device="cuda",
367+
enable_memory_saver=False,
368+
cache_params=mamba2_cache_params,
369+
enable_mamba_extra_buffer=False,
370+
speculative_num_draft_tokens=3,
371+
)
372+
pool = HybridLinearKVPool(
373+
size=128,
374+
dtype=torch.bfloat16,
375+
page_size=1,
376+
head_num=2,
377+
head_dim=256,
378+
full_attention_layer_ids=full_attention_layer_ids,
379+
enable_kvcache_transpose=False,
380+
device="cuda",
381+
enable_memory_saver=False,
382+
mamba_pool=req_to_token_pool.mamba_pool,
383+
)
384+
allocator = TokenToKVPoolAllocator(
385+
size=128,
386+
dtype=torch.bfloat16,
387+
device="cuda",
388+
kvcache=pool,
389+
need_sort=False,
390+
)
391+
tree = MambaRadixCache(
392+
params=CacheInitParams(
393+
req_to_token_pool=req_to_token_pool,
394+
token_to_kv_pool_allocator=allocator,
395+
page_size=1,
396+
disable=False,
397+
)
398+
)
399+
mamba_pool = req_to_token_pool.mamba_pool
400+
401+
# Insert req1 to create cached mamba state
402+
sampling_params = SamplingParams(temperature=0, max_new_tokens=1)
403+
req1 = Req(
404+
rid=0,
405+
origin_input_text="",
406+
origin_input_ids=[],
407+
sampling_params=sampling_params,
408+
)
409+
req_to_token_pool.alloc(1, reqs=[req1])
410+
token_ids = [1, 2, 3, 4, 5]
411+
tree.insert(
412+
RadixKey(token_ids),
413+
allocator.alloc(len(token_ids)),
414+
req1.mamba_pool_idx.unsqueeze(0),
415+
)
416+
417+
initial_available = mamba_pool.available_size()
418+
419+
# req2 matches prefix with COW - this allocates a new mamba slot
420+
req2 = Req(
421+
rid=1,
422+
origin_input_text="",
423+
origin_input_ids=[],
424+
sampling_params=sampling_params,
425+
)
426+
tree.match_prefix(RadixKey(token_ids), req=req2, cow_mamba=True)
427+
428+
# Verify COW allocated a mamba slot
429+
assert req2.mamba_pool_idx is not None, "COW should allocate mamba slot"
430+
assert (
431+
mamba_pool.available_size() < initial_available
432+
), "Pool size should decrease"
433+
434+
# Simulate scheduling failure cleanup
435+
mamba_pool.free(req2.mamba_pool_idx.unsqueeze(-1))
436+
req2.mamba_pool_idx = None
437+
438+
# Verify slot is released
439+
assert (
440+
mamba_pool.available_size() == initial_available
441+
), "Slot should be released"
442+
339443

340444
if __name__ == "__main__":
341445
unittest.main()

0 commit comments

Comments
 (0)