Fix NPU bug on dsv3.2 (sgl-project#18553)

BourneSun0527 · web-flow · commit 2cc235e7952e · 2026-02-11T14:39:01.000+08:00
This PR affects only the NPU. If any issues arise, please contact iforgetmyname.
diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
@@ -7,6 +7,7 @@
 import torch
 from einops import rearrange
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.layernorm import LayerNorm
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.utils import MultiPlatformOp
@@ -1190,13 +1191,17 @@ def forward_npu(
             )  # [bs, n, d]
             q = torch.cat([q_pe, q_nope], dim=-1)
 
-        indexer_weight_stream = get_indexer_weight_stream()
-        indexer_weight_stream.wait_stream(torch.npu.current_stream())
-        with torch.npu.stream(indexer_weight_stream):
+        if envs.SGLANG_NPU_USE_MULTI_STREAM.get():
+            indexer_weight_stream = get_indexer_weight_stream()
+            indexer_weight_stream.wait_stream(torch.npu.current_stream())
+            with torch.npu.stream(indexer_weight_stream):
+                x = x.view(-1, self.hidden_size)
+                weights = self.weights_proj(x.float())[0].to(torch.bfloat16)
+                weights.record_stream(indexer_weight_stream)
+                weights_event = indexer_weight_stream.record_event()
+        else:
             x = x.view(-1, self.hidden_size)
             weights = self.weights_proj(x.float())[0].to(torch.bfloat16)
-            weights.record_stream(indexer_weight_stream)
-            weights_event = indexer_weight_stream.record_event()
 
         k_proj = self.wk(x)[0]  # [b, s, 7168] @ [7168, 128] = [b, s, 128]
         k = self.k_norm(k_proj)
@@ -1278,7 +1283,8 @@ def forward_npu(
 
         if self.alt_stream is not None:
             torch.npu.current_stream().wait_event(q_rope_event)
-        torch.npu.current_stream().wait_event(weights_event)
+        if envs.SGLANG_NPU_USE_MULTI_STREAM.get():
+            torch.npu.current_stream().wait_event(weights_event)
 
         block_table = forward_batch.attn_backend.forward_metadata.block_tables
         if (
diff --git a/python/sglang/srt/managers/overlap_utils.py b/python/sglang/srt/managers/overlap_utils.py
@@ -6,16 +6,18 @@
 import torch
 
 from sglang.srt.speculative.spec_utils import spec_need_hidden_states
-from sglang.srt.utils import get_compiler_backend
+from sglang.srt.utils import get_compiler_backend, is_npu
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ModelWorkerBatch
     from sglang.srt.managers.scheduler import GenerationBatchResult
     from sglang.srt.speculative.eagle_info import EagleDraftInput
     from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 
+_is_npu = is_npu()
 
-@torch.compile(dynamic=True, backend=get_compiler_backend())
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
 def _resolve_future_token_ids(input_ids, future_token_ids_map):
     input_ids[:] = torch.where(
         input_ids < 0,