@@ -134,7 +134,8 @@ class Fp8GemmRunnerBackend(Enum):
134134 """Enum for FP8 GEMM runner backend selection."""
135135
136136 AUTO = "auto"
137- FLASHINFER = "flashinfer_trtllm"
137+ FLASHINFER_TRTLLM = "flashinfer_trtllm"
138+ FLASHINFER_DEEPGEMM = "flashinfer_deepgemm"
138139 CUTLASS = "cutlass"
139140 DEEP_GEMM = "deep_gemm"
140141 TRITON = "triton"
@@ -144,7 +145,10 @@ def is_auto(self) -> bool:
144145 return self == Fp8GemmRunnerBackend .AUTO
145146
    def is_flashinfer(self) -> bool:
        """Return True when this backend selects the FlashInfer TRT-LLM FP8 GEMM runner."""
        return self == Fp8GemmRunnerBackend.FLASHINFER_TRTLLM
    def is_flashinfer_deepgemm(self) -> bool:
        """Return True when this backend selects the FlashInfer DeepGEMM (SM90) runner."""
        return self == Fp8GemmRunnerBackend.FLASHINFER_DEEPGEMM
148152
    def is_cutlass(self) -> bool:
        """Return True when this backend selects the CUTLASS FP8 GEMM runner."""
        return self == Fp8GemmRunnerBackend.CUTLASS
@@ -170,6 +174,10 @@ def _check_cutlass_block_fp8_hardware_support() -> bool:
170174if is_blackwell_supported () and is_flashinfer_available ():
171175 from flashinfer .gemm import gemm_fp8_nt_groupwise
172176
177+ if is_sm90_supported () and is_flashinfer_available ():
178+ # FlashInfer SM90 DeepGEMM with automatic swapAB optimization for small M
179+ from flashinfer .gemm import fp8_blockscale_gemm_sm90
180+
173181
174182def dispatch_w8a8_block_fp8_linear () -> Callable :
175183 """
@@ -200,6 +208,15 @@ def _dispatch_explicit_backend(backend: Fp8GemmRunnerBackend) -> Callable:
200208 )
201209 return flashinfer_gemm_w8a8_block_fp8_linear_with_fallback
202210
211+ elif backend .is_flashinfer_deepgemm ():
212+ if not (is_sm90_supported () and is_flashinfer_available ()):
213+ raise RuntimeError (
214+ "FlashInfer DeepGEMM with swapAB requested via --fp8-gemm-backend=flashinfer_deepgemm, "
215+ "but it's not available. This backend requires Hopper (SM90) GPUs and FlashInfer "
216+ "to be installed."
217+ )
218+ return flashinfer_deepgemm_w8a8_block_fp8_linear_with_fallback
219+
203220 elif backend .is_cutlass ():
204221 if not _check_cutlass_block_fp8_hardware_support ():
205222 raise RuntimeError (
@@ -333,6 +350,60 @@ def flashinfer_gemm_w8a8_block_fp8_linear_with_fallback(
333350 return output .to (dtype = input_2d .dtype ).view (* output_shape )
334351
335352
def flashinfer_deepgemm_w8a8_block_fp8_linear_with_fallback(
    input: torch.Tensor,
    weight: torch.Tensor,
    block_size: List[int],
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    FlashInfer DeepGEMM backend for SM90 (Hopper) with swapAB optimization.

    Dispatches to ``flashinfer.gemm.fp8_blockscale_gemm_sm90``, which picks
    the swapAB kernel automatically for small M (M < 32) — the common
    decoding / low-batch case. Inputs that the SM90 kernel cannot handle
    (non-BF16 activations, or N not a multiple of 64 / K not a multiple of
    128) fall back to the Triton block-FP8 kernel.
    """
    # Dynamic (per-token) activation quantization only: a precomputed input
    # scale is not supported on this path.
    assert input_scale is None

    out_dtype = input.dtype
    n, k = weight.shape[0], weight.shape[1]

    # Kernel constraints: BF16 activations, N % 64 == 0, K % 128 == 0.
    kernel_ok = out_dtype == torch.bfloat16 and n % 64 == 0 and k % 128 == 0

    if not kernel_ok:
        # Triton fallback. Packed UE8M0 (int32) scales must be expanded
        # first, since the Triton kernel consumes unpacked scales.
        if weight_scale.dtype == torch.int32:
            weight_scale = _unpack_ue8m0_scale_for_triton(
                weight_scale, weight.shape, block_size
            )
        return triton_w8a8_block_fp8_linear(
            input, weight, block_size, weight_scale, input_scale, bias
        )

    # Collapse leading dims: the kernel works on a 2-D (M, K) activation.
    flat_input = input.view(-1, input.shape[-1])
    out_shape = [*input.shape[:-1], n]

    # - flat_input: (M, K) BF16 or FP8
    # - weight: (N, K) FP8 with weight_scale
    # - weight_scale: (N, K//128) for per-token or (N//128, K//128) for per-block
    result = fp8_blockscale_gemm_sm90(
        flat_input,
        weight,
        input_scale=None,  # BF16 input, internal quantization
        weight_scale=weight_scale,
        out_dtype=out_dtype,
    )

    if bias is not None:
        result += bias
    return result.view(*out_shape)
406+
336407def cutlass_w8a8_block_fp8_linear_with_fallback (
337408 input : torch .Tensor ,
338409 weight : torch .Tensor ,
0 commit comments