From 015476d09fa33d9671cb122ad487156a1fbe0814 Mon Sep 17 00:00:00 2001
From: Songhao Jia
Date: Thu, 30 Apr 2026 01:09:38 -0700
Subject: [PATCH] Bump batched MoE BLOCK_M from 16 to 64 on top of
 persistent kernel

Microbenchmarked on Qwen3.5 MoE prefill (M=1696, top_k=8, 256 experts):

  BLOCK_M=16: 3.62 ms
  BLOCK_M=32: 2.85 ms (1.27x)
  BLOCK_M=64: 2.75 ms (1.32x)

E2E (Qwen3.5-35B-A3B prefill, --moe-activation-dtype int8
--dense-prefill dequant --cuda_graph, p=1600 d=512, run_1..5 median):

  BLOCK_M=16: 5897 tok/s prefill (273 ms), 98.1 tok/s decode
  BLOCK_M=64: 6793 tok/s prefill (237 ms), 98.1 tok/s decode

Speedup: 1.152x prefill; decode is unchanged (decode uses the
non-batched fused_moe kernel).

Outputs are bit-identical between BLOCK_M=16 and BLOCK_M=64 in the
microbenchmark (max abs diff = 0).
---
 backends/cuda/triton/kernels/fused_moe.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/triton/kernels/fused_moe.py b/backends/cuda/triton/kernels/fused_moe.py
index d02f17a6cfa..89994d4d09c 100644
--- a/backends/cuda/triton/kernels/fused_moe.py
+++ b/backends/cuda/triton/kernels/fused_moe.py
@@ -580,9 +580,12 @@ def _fused_moe_fake(
 
 # ---------------------------------------------------------------------------
 # Fixed BLOCK_M for the batched kernel. Not autotuned because the token
-# sorting layout depends on it. 16 is the minimum for tl.dot and wastes
-# the least padding with typical Qwen3.5 expert load (~30 tokens/expert).
-_BATCHED_BLOCK_M = 16
+# sorting layout depends on it. Microbenchmarked on Qwen3.5 MoE prefill
+# (M=1696, top_k=8, 256 experts): BLOCK_M=64 is ~1.32x faster than 16
+# despite the extra padding, because the per-expert M block (M * top_k /
+# num_experts ~= 1696 * 8 / 256 ~= 53 active rows/expert) saturates
+# 64-row tensor-core MMAs and reduces total program count.
+_BATCHED_BLOCK_M = 64
 
 
 def moe_align_block_size(
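
Note (not part of the patch): a rough back-of-envelope sketch of the BLOCK_M
tradeoff described in the new comment. It assumes a perfectly even
token-to-expert split, which understates padding for skewed routing, and
batched_moe_block_m_estimate is a hypothetical helper, not a function in
fused_moe.py.

# Back-of-envelope estimate of padding waste and program count for the
# batched MoE kernel, assuming tokens are spread evenly across experts.
import math

def batched_moe_block_m_estimate(M, top_k, num_experts, block_m):
    rows_per_expert = M * top_k / num_experts        # ~53 for the benchmark shape
    m_blocks_per_expert = math.ceil(rows_per_expert / block_m)
    padded_rows = m_blocks_per_expert * block_m * num_experts
    active_rows = M * top_k
    return {
        "rows_per_expert": rows_per_expert,
        "m_blocks_per_expert": m_blocks_per_expert,  # programs along M per expert
        "padding_overhead": padded_rows / active_rows - 1.0,
    }

for block_m in (16, 32, 64):
    print(block_m, batched_moe_block_m_estimate(M=1696, top_k=8,
                                                num_experts=256, block_m=block_m))

For this particular shape, all three choices pad ~53 active rows per expert up
to 64, so the padding overhead is the same (~21%) while the number of M blocks
per expert drops from 4 to 1, matching the observation that fewer, larger MMAs
win here. Skewed routing would tilt the padding cost back toward smaller
BLOCK_M, which this uniform model does not capture.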