[AMD] Fix accuracy issue when running TP4 dsv3 model with mtp (sgl-project#18607)

1am9trash · yctseng0211 · kkHuang-amd · web-flow · commit e20e6c28b9bf · 2026-02-12T01:13:16.000-08:00
Co-authored-by: YC Tseng <yctseng@amd.com>
Co-authored-by: kkHuang-amd <wunhuang@amd.com>
diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile
@@ -21,7 +21,7 @@ ENV BUILD_TRITON="0"
 ENV BUILD_LLVM="0"
 ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
-ENV AITER_COMMIT="v0.1.10.post2"
+ENV AITER_COMMIT="v0.1.10.post3"
 
 # ===============================
 # Base image 950 and args
@@ -31,7 +31,7 @@ ENV BUILD_TRITON="0"
 ENV BUILD_LLVM="0"
 ENV BUILD_AITER_ALL="1"
 ENV BUILD_MOONCAKE="1"
-ENV AITER_COMMIT="v0.1.10.post2"
+ENV AITER_COMMIT="v0.1.10.post3"
 # ===============================
 # Chosen arch and args
 FROM ${GPU_ARCH}
diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -195,11 +195,15 @@ def __init__(
             )
             global _use_mla_ps_kernel, fast_mode, intra_batch_mode
 
+            if self.num_head == 32:
+                fast_mode = True
+                intra_batch_mode = False
+
             # current persist a16w16 mla_decode kernel does not support head_num = 128
             # need to fall back to non-persist
             # only use mla_ps_kernel when fp8 kv_cache
-            # for non-fp8 kv_cache, use non-persist kernel to avoid performance degradation
-            if self.kv_cache_dtype is not fp8_dtype:
+            # for non-fp8 kv_cache on tp8, use non-persist kernel to avoid performance degradation
+            if self.num_head == 16 and self.kv_cache_dtype is not fp8_dtype:
                 _use_mla_ps_kernel = False
                 fast_mode = False
                 intra_batch_mode = False
@@ -301,7 +305,7 @@ def make_mla_meta_data(
             kv_last_page_len,
             self.num_head // nhead_kv,
             nhead_kv,
-            True,
+            False,
             work_metadata,
             work_info_set,
             work_indptr,