From 015476d09fa33d9671cb122ad487156a1fbe0814 Mon Sep 17 00:00:00 2001
From: Songhao Jia
Date: Thu, 30 Apr 2026 01:09:38 -0700
Subject: [PATCH] Bump batched MoE BLOCK_M from 16 to 64 on top of
 persistent kernel

Microbenchmarked on Qwen3.5 MoE prefill (M=1696, top_k=8, 256 experts):

  BLOCK_M=16: 3.62 ms
  BLOCK_M=32: 2.85 ms (1.27x)
  BLOCK_M=64: 2.75 ms (1.32x)

E2E (Qwen3.5-35B-A3B prefill, --moe-activation-dtype int8
--dense-prefill dequant --cuda_graph, p=1600 d=512, run_1..5 median):

  BLOCK_M=16: 5897 tok/s prefill (273 ms), 98.1 tok/s decode
  BLOCK_M=64: 6793 tok/s prefill (237 ms), 98.1 tok/s decode

Speedup: 1.152x prefill; decode is unchanged (decode uses the
non-batched fused_moe kernel).

Outputs are bit-identical between BLOCK_M=16 and BLOCK_M=64 in the
microbenchmark (max abs diff = 0).
---
 backends/cuda/triton/kernels/fused_moe.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/triton/kernels/fused_moe.py b/backends/cuda/triton/kernels/fused_moe.py
index d02f17a6cfa..89994d4d09c 100644
--- a/backends/cuda/triton/kernels/fused_moe.py
+++ b/backends/cuda/triton/kernels/fused_moe.py
@@ -580,9 +580,12 @@ def _fused_moe_fake(
 
 # ---------------------------------------------------------------------------
 # Fixed BLOCK_M for the batched kernel. Not autotuned because the token
-# sorting layout depends on it. 16 is the minimum for tl.dot and wastes
-# the least padding with typical Qwen3.5 expert load (~30 tokens/expert).
-_BATCHED_BLOCK_M = 16
+# sorting layout depends on it. Microbenchmarked on Qwen3.5 MoE prefill
+# (M=1696, top_k=8, 256 experts): BLOCK_M=64 is ~1.32x faster than 16
+# despite the extra padding, because the per-expert M block (M * top_k /
+# num_experts ~= 1696 * 8 / 256 ~= 53 active rows/expert) saturates
+# 64-row tensor-core MMAs and reduces total program count.
+_BATCHED_BLOCK_M = 64
 
 
 def moe_align_block_size(
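
Note (not part of the patch): a rough back-of-envelope sketch of the BLOCK_M
tradeoff described in the new comment. It assumes a perfectly even
token-to-expert split, which understates padding for skewed routing, and
batched_moe_block_m_estimate is a hypothetical helper, not a function in
fused_moe.py.

# Back-of-envelope estimate of padding waste and program count for the
# batched MoE kernel, assuming tokens are spread evenly across experts.
import math

def batched_moe_block_m_estimate(M, top_k, num_experts, block_m):
    rows_per_expert = M * top_k / num_experts        # ~53 for the benchmark shape
    m_blocks_per_expert = math.ceil(rows_per_expert / block_m)
    padded_rows = m_blocks_per_expert * block_m * num_experts
    active_rows = M * top_k
    return {
        "rows_per_expert": rows_per_expert,
        "m_blocks_per_expert": m_blocks_per_expert,  # programs along M per expert
        "padding_overhead": padded_rows / active_rows - 1.0,
    }

for block_m in (16, 32, 64):
    print(block_m, batched_moe_block_m_estimate(M=1696, top_k=8,
                                                num_experts=256, block_m=block_m))

For this particular shape, all three choices pad ~53 active rows per expert up
to 64, so the padding overhead is the same (~21%) while the number of M blocks
per expert drops from 4 to 1, matching the observation that fewer, larger MMAs
win here. Skewed routing would tilt the padding cost back toward smaller
BLOCK_M, which this uniform model does not capture.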