Fix/qwen3 5 amd rope cutedsl fallback (#18753)

andyluo7 · seungrokj · web-flow · commit 944a9f6fcfe6 · 2026-02-14T22:09:44.000-08:00
Co-authored-by: seungrokj <seungrok.jung@amd.com>
diff --git a/python/sglang/srt/configs/qwen3_5.py b/python/sglang/srt/configs/qwen3_5.py
@@ -17,9 +17,18 @@ def __init__(
         self,
         **kwargs,
     ):
+        # HF Qwen3.5 checkpoints may provide RoPE settings under rope_parameters.
+        # Normalize it before parent init so downstream code sees the expected values.
+        rope_parameters = kwargs.pop("rope_parameters", None)
+        if kwargs.get("rope_scaling") is None and rope_parameters is not None:
+            kwargs["rope_scaling"] = rope_parameters
+
         super().__init__(**kwargs)
         if self.rope_scaling is None:
-            self.rope_scaling = {}
+            self.rope_scaling = rope_parameters or {}
+
+        # Keep both names for compatibility with model code paths that read either.
+        self.rope_parameters = rope_parameters or self.rope_scaling
 
 
 class Qwen3_5Config(PretrainedConfig):
diff --git a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
@@ -45,9 +45,14 @@
 
 if not is_cpu() and not is_npu():
     # fix import error on CPU device, no impacts when non-CPU path
-    from sglang.jit_kernel.cutedsl_gdn import (
-        cutedsl_fused_sigmoid_gating_delta_rule_update,
-    )
+    try:
+        from sglang.jit_kernel.cutedsl_gdn import (
+            cutedsl_fused_sigmoid_gating_delta_rule_update,
+        )
+    except ModuleNotFoundError:
+        # CuTe DSL path requires cuda-python (cuda.bindings.*). Keep runtime usable
+        # by falling back to non-CuTe kernels when it's unavailable.
+        cutedsl_fused_sigmoid_gating_delta_rule_update = None
     from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
     from sglang.srt.layers.attention.fla.chunk_delta_h import (
         CHUNK_SIZE as FLA_CHUNK_SIZE,
@@ -830,6 +835,12 @@ def __init__(self, model_runner: ModelRunner):
             ), f"{self.conv_states_shape[-1]=} should be less than {FLA_CHUNK_SIZE}"
 
         use_cutedsl = Envs.SGLANG_USE_CUTEDSL_GDN_DECODE.get()
+        if use_cutedsl and cutedsl_fused_sigmoid_gating_delta_rule_update is None:
+            rank0_log(
+                "CuTe DSL GDN decode requested but unavailable "
+                "(missing cuda.bindings). Falling back to FLA decode kernel."
+            )
+            use_cutedsl = False
         rank0_log(f"CuTe DSL GDN decode enabled: {use_cutedsl}")
         self._kernel_func = (
             cutedsl_fused_sigmoid_gating_delta_rule_update