diff --git a/backends/cuda/triton/kernels/fused_moe.py b/backends/cuda/triton/kernels/fused_moe.py
index d02f17a6cfa..89994d4d09c 100644
--- a/backends/cuda/triton/kernels/fused_moe.py
+++ b/backends/cuda/triton/kernels/fused_moe.py
@@ -580,9 +580,12 @@ def _fused_moe_fake(
 
 # ---------------------------------------------------------------------------
 # Fixed BLOCK_M for the batched kernel. Not autotuned because the token
-# sorting layout depends on it. 16 is the minimum for tl.dot and wastes
-# the least padding with typical Qwen3.5 expert load (~30 tokens/expert).
-_BATCHED_BLOCK_M = 16
+# sorting layout depends on it. Microbenchmarked on Qwen3.5 MoE prefill
+# (M=1696, top_k=8, 256 experts): BLOCK_M=64 is ~1.32x faster than 16
+# despite the extra padding, because the per-expert M block
+# (1696 tokens × 8 top_k / 256 experts ≈ 53 active rows/expert)
+# saturates 64-row tensor-core MMAs and reduces total program count.
+_BATCHED_BLOCK_M = 64
 
 
 def moe_align_block_size(
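
A quick back-of-envelope sketch (not part of the patch) that reproduces the padding arithmetic behind the new default. The shape numbers (M=1696, top_k=8, 256 experts) come from the kernel comment above; the uniform-routing assumption is mine, since real expert load is skewed.

import math

# Prefill shape from the comment: M tokens, each routed to TOP_K of E experts.
M, TOP_K, E = 1696, 8, 256
rows_per_expert = M * TOP_K / E  # ~53 active rows/expert under uniform routing

for block_m in (16, 64):
    blocks = math.ceil(rows_per_expert / block_m)  # M-axis programs per expert
    padded = blocks * block_m                      # rows after block padding
    waste = 1 - rows_per_expert / padded
    print(f"BLOCK_M={block_m}: {blocks} block(s)/expert, "
          f"{padded} padded rows, {waste:.0%} padding")

# BLOCK_M=16: 4 block(s)/expert, 64 padded rows, 17% padding
# BLOCK_M=64: 1 block(s)/expert, 64 padded rows, 17% padding

At this expert load the expected padding is identical for both settings; the win comes from 4x fewer programs on the M axis and full 64-row MMAs. With skewed routing, lightly loaded experts do pad more at BLOCK_M=64, which is the tradeoff the comment acknowledges.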