Skip to content

Commit 9c4b947

Browse files
HandH1998 and Johnsonms
authored and committed
Support mxint4 flashinfer_trtllm moe gemm (sgl-project#16892)
1 parent 15f4218 commit 9c4b947

File tree

4 files changed

+367
-6
lines changed

4 files changed

+367
-6
lines changed

python/sglang/srt/layers/moe/fused_moe_triton/layer.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
FusedMoEMethodBase,
5555
QuantizationConfig,
5656
)
57+
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
58+
CompressedTensorsMxInt4MoEMethod,
59+
)
5760
from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
5861
from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
5962
from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
@@ -253,6 +256,7 @@ def __init__(
253256
gemm1_alpha=gemm1_alpha,
254257
gemm1_clamp_limit=gemm1_clamp_limit,
255258
is_gated=is_gated,
259+
routing_method_type=routing_method_type,
256260
)
257261

258262
self.quant_method: Optional[FusedMoEMethodBase] = None
@@ -688,6 +692,7 @@ def _weight_loader_impl(
688692
isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
689693
or isinstance(self.quant_method, Fp8MoEMethod)
690694
or isinstance(self.quant_method, UnquantizedFusedMoEMethod)
695+
or isinstance(self.quant_method, CompressedTensorsMxInt4MoEMethod)
691696
):
692697
shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]
693698

@@ -1140,6 +1145,7 @@ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
11401145
router_logits = topk_output.router_logits
11411146
topk_config = topk_output.topk_config
11421147
correction_bias = topk_config.correction_bias
1148+
routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
11431149

11441150
if isinstance(self.quant_method, UnquantizedFusedMoEMethod):
11451151
# lazy import
@@ -1170,6 +1176,7 @@ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
11701176
local_expert_offset=self.moe_ep_rank * self.num_local_experts,
11711177
local_num_experts=self.num_local_experts,
11721178
routing_method_type=self.routing_method_type,
1179+
routed_scaling_factor=routed_scaling_factor,
11731180
tune_max_num_tokens=next_power_of_2(hidden_states.shape[0]),
11741181
)
11751182

python/sglang/srt/layers/moe/moe_runner/base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66

77
import torch
88

9-
from sglang.srt.layers.moe.utils import MoeA2ABackend, MoeRunnerBackend
9+
from sglang.srt.layers.moe.utils import (
10+
MoeA2ABackend,
11+
MoeRunnerBackend,
12+
RoutingMethodType,
13+
)
1014

1115
if TYPE_CHECKING:
1216
from sglang.srt.layers.moe.moe_runner.triton import (
@@ -33,6 +37,7 @@ class MoeRunnerConfig:
3337
top_k: Optional[int] = None
3438
num_fused_shared_experts: Optional[int] = None
3539
params_dtype: Optional[torch.dtype] = None
40+
routing_method_type: Optional[RoutingMethodType] = None
3641

3742
# Runner configuration
3843
activation: str = "silu"

python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,19 @@ def _is_wNa16_group_channel(
471471

472472
return is_channel_group and input_quant_none and is_symmetric and is_static
473473

474+
def _is_mxint4a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool:
    """Return True when the scheme is MXINT4 weight-only quantization.

    Matches a config with symmetric, static (non-dynamic) 4-bit INT weights
    using group-wise quantization with group_size 32, and no input
    (activation) quantization at all.
    """
    no_input_quant = input_quant is None
    # Weight side must be exactly INT4, group strategy, group size 32.
    weight_is_mxint4 = (
        weight_quant.num_bits == 4
        and weight_quant.type == QuantizationType.INT
        and weight_quant.strategy == QuantizationStrategy.GROUP.value
        and weight_quant.group_size == 32
    )
    return (
        weight_is_mxint4
        and no_input_quant
        and weight_quant.symmetric
        and not weight_quant.dynamic
    )
486+
474487
def _is_dynamic_token_w4(
475488
self, weight_quant: BaseModel, input_quant: BaseModel
476489
) -> bool:

0 commit comments

Comments (0)