Skip to content

Commit 1edc69b

Browse files
Authored commit
[Ascend]Support qwen3.5 (sgl-project#18544)
This PR affects only the NPU. If any issues arise, please contact iforgetmyname.
1 parent 0305d12 commit 1edc69b

File tree

3 files changed

+23
-4
lines changed

3 files changed

+23
-4
lines changed

python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_npu
3636
from sglang.srt.utils.common import rank0_log
3737

38-
if not is_cpu():
38+
if not is_cpu() and not is_npu():
3939
# fix import error on CPU device, no impacts when non-CPU path
4040
from sglang.jit_kernel.cutedsl_gdn import (
4141
cutedsl_fused_sigmoid_gating_delta_rule_update,
@@ -814,7 +814,7 @@ def __init__(self, model_runner: ModelRunner):
814814
self.conv_states_shape = (
815815
model_runner.req_to_token_pool.mamba_pool.mamba_cache.conv[0].shape
816816
)
817-
if not is_cpu():
817+
if not is_cpu() and not is_npu():
818818
assert (
819819
self.conv_states_shape[-1] < FLA_CHUNK_SIZE
820820
), f"{self.conv_states_shape[-1]=} should be less than {FLA_CHUNK_SIZE}"

python/sglang/srt/layers/quantization/modelslim/modelslim.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,15 @@ def is_layer_skipped(
193193
):
194194
# adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
195195
proj_name = prefix.split(".")[-1]
196+
if not hasattr(self, "_quant_description_normalized"):
197+
quant_description = {}
198+
for prefix_, value in self.quant_description.items():
199+
prefix_ = prefix_.replace("language_model.", "")
200+
if "visual" in prefix_:
201+
prefix_ = prefix_.replace("model.", "")
202+
quant_description[prefix_] = value
203+
self.quant_description = quant_description
204+
self._quant_description_normalized = True
196205
if proj_name in fused_mapping:
197206
shard_prefixes = [
198207
prefix.replace(proj_name, shard_proj_name)

python/sglang/srt/models/qwen3_5.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
# Distributed
3535
from sglang.srt.distributed import get_pp_group
3636
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
37+
from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
3738

3839
# Layers - Attention
3940
from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
@@ -328,7 +329,7 @@ def __init__(
328329
config=config,
329330
quant_config=quant_config,
330331
alt_stream=alt_stream,
331-
prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
332+
prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
332333
)
333334
is_layer_sparse = True
334335
is_previous_layer_sparse = True
@@ -339,7 +340,7 @@ def __init__(
339340
intermediate_size=config.intermediate_size,
340341
hidden_act=config.hidden_act,
341342
quant_config=quant_config,
342-
prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
343+
prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
343344
)
344345
is_layer_sparse = False
345346
is_previous_layer_sparse = False
@@ -1318,5 +1319,14 @@ def load_fused_expert_weights(
13181319

13191320
return loaded_params
13201321

1322+
@classmethod
1323+
def get_model_config_for_expert_location(cls, config):
1324+
text_config = getattr(config, "text_config", config)
1325+
return ModelConfigForExpertLocation(
1326+
num_layers=text_config.num_hidden_layers,
1327+
num_logical_experts=text_config.num_experts,
1328+
num_groups=None,
1329+
)
1330+
13211331

13221332
EntryClass = [Qwen3_5MoeForConditionalGeneration, Qwen3_5ForConditionalGeneration]

0 commit comments

Comments (0)