Skip to content

Commit bcc6d84

Browse files
authored
Use fused_sigmoid_gating_delta_rule_update_kernel for KDA (#17108)
1 parent a618202 commit bcc6d84

File tree

3 files changed

+37
-17
lines changed

3 files changed

+37
-17
lines changed

python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def fused_sigmoid_gating_delta_rule_update_kernel(
3434
USE_INITIAL_STATE: tl.constexpr,
3535
USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
3636
IS_VARLEN: tl.constexpr,
37+
IS_KDA: tl.constexpr,
3738
):
3839
"""
3940
Fused kernel that combines sigmoid gating computation with recurrent delta rule update.
@@ -64,8 +65,12 @@ def fused_sigmoid_gating_delta_rule_update_kernel(
6465

6566
# Gating computation pointers
6667
p_A_log = A_log + i_hv
67-
p_a = a + bos * HV + i_hv
68-
p_dt_bias = dt_bias + i_hv
68+
if IS_KDA:
69+
p_a = a + (bos * HV + i_hv) * K + o_k
70+
p_dt_bias = dt_bias + i_hv * K + o_k
71+
else:
72+
p_a = a + bos * HV + i_hv
73+
p_dt_bias = dt_bias + i_hv
6974

7075
mask_k = o_k < K
7176
mask_v = o_v < V
@@ -119,7 +124,10 @@ def fused_sigmoid_gating_delta_rule_update_kernel(
119124
b_q = b_q * scale
120125

121126
# Apply gating to hidden state: h *= exp(g)
122-
b_h *= tl.exp(b_g)
127+
if IS_KDA:
128+
b_h *= tl.exp(b_g[:, None])
129+
else:
130+
b_h *= tl.exp(b_g)
123131

124132
# Delta rule: v -= sum(h * k, dim=0)
125133
b_v -= tl.sum(b_h * b_k[:, None], 0)
@@ -172,6 +180,7 @@ def fused_sigmoid_gating_delta_rule_update(
172180
scale: Optional[float] = None,
173181
use_qk_l2norm_in_kernel: bool = False,
174182
cu_seqlens: Optional[torch.Tensor] = None,
183+
is_kda: bool = False,
175184
):
176185
"""
177186
Fused triton implementation of sigmoid gating delta rule update.
@@ -221,6 +230,7 @@ def fused_sigmoid_gating_delta_rule_update(
221230
USE_INITIAL_STATE=initial_state_source is not None,
222231
USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
223232
IS_VARLEN=cu_seqlens is not None,
233+
IS_KDA=is_kda,
224234
num_warps=num_warps,
225235
num_stages=num_stages,
226236
)

python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrent import (
1818
fused_sigmoid_gating_delta_rule_update,
1919
)
20-
from sglang.srt.layers.attention.fla.kda import chunk_kda, fused_recurrent_kda
20+
from sglang.srt.layers.attention.fla.kda import chunk_kda
2121
from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
2222
PAD_SLOT_ID,
2323
causal_conv1d_fn,
@@ -647,6 +647,9 @@ def forward_decode(
647647
beta = kwargs["beta"]
648648
g = kwargs["gate"]
649649

650+
A_log = kwargs["A_log"]
651+
dt_bias = kwargs["dt_bias"]
652+
650653
layer_cache = self.req_to_token_pool.mamba2_layer_cache(layer_id)
651654
q_conv_state, k_conv_state, v_conv_state = layer_cache.conv
652655
ssm_states = layer_cache.temporal
@@ -686,21 +689,23 @@ def forward_decode(
686689
lambda x: rearrange(x, "n (h d) -> 1 n h d", d=head_dim), (q, k, v)
687690
)
688691

689-
initial_state = ssm_states[cache_indices].contiguous()
690-
(
691-
core_attn_out,
692-
last_recurrent_state,
693-
) = fused_recurrent_kda(
692+
core_attn_out = fused_sigmoid_gating_delta_rule_update(
693+
A_log=A_log,
694+
dt_bias=dt_bias,
694695
q=q,
695696
k=k,
696697
v=v,
697-
g=g,
698-
beta=beta,
699-
initial_state=initial_state,
700-
use_qk_l2norm_in_kernel=True,
698+
a=g,
699+
b=beta,
700+
initial_state_source=ssm_states,
701+
initial_state_indices=cache_indices,
701702
cu_seqlens=query_start_loc,
703+
use_qk_l2norm_in_kernel=True,
704+
softplus_beta=1.0,
705+
softplus_threshold=20.0,
706+
is_kda=True,
702707
)
703-
ssm_states[cache_indices] = last_recurrent_state
708+
704709
return core_attn_out
705710

706711
def forward_extend(

python/sglang/srt/models/kimi_linear.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -316,9 +316,12 @@ def forward(
316316

317317
beta = self.b_proj(hidden_states)[0].float().sigmoid()
318318
forget_gate = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0]
319-
forget_gate = fused_kda_gate(
320-
forget_gate, self.A_log, self.head_dim, g_bias=self.dt_bias
321-
)
319+
320+
# fused_kda_gate is fused into KimiLinearAttentionBackend during decode
321+
if not forward_batch.forward_mode.is_decode():
322+
forget_gate = fused_kda_gate(
323+
forget_gate, self.A_log, self.head_dim, g_bias=self.dt_bias
324+
)
322325
beta = beta.unsqueeze(0)
323326
forget_gate = forget_gate.unsqueeze(0)
324327

@@ -336,6 +339,8 @@ def forward(
336339
"layer_id": self.layer_idx,
337340
"beta": beta,
338341
"gate": forget_gate,
342+
"A_log": self.A_log,
343+
"dt_bias": self.dt_bias,
339344
}
340345

341346
core_attn_out = forward_batch.attn_backend.forward(

0 commit comments

Comments
 (0)