Skip to content

Commit 4460376

Browse files
zju-stu-lizheng and 瑀澈 authored
fix(config): Support setting Mamba state dtype via config file (sgl-project#18532)
Co-authored-by: 瑀澈 <yuche.lz@alibaba-inc.com>
1 parent d0d387d commit 4460376

File tree

8 files changed

+103
-19
lines changed

8 files changed

+103
-19
lines changed

python/sglang/srt/configs/falcon_h1.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
from transformers.configuration_utils import PretrainedConfig
1919
from transformers.utils import logging
2020

21-
from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
21+
from sglang.srt.configs.mamba_utils import (
22+
Mamba2CacheParams,
23+
Mamba2StateShape,
24+
mamba2_state_dtype,
25+
)
2226

2327
logger = logging.get_logger(__name__)
2428

@@ -307,4 +311,6 @@ def mamba2_cache_params(self):
307311
state_size=self.mamba_d_state,
308312
conv_kernel=self.mamba_d_conv,
309313
)
310-
return Mamba2CacheParams(shape=shape, layers=self.linear_layer_ids)
314+
return Mamba2CacheParams(
315+
shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
316+
)

python/sglang/srt/configs/jet_nemotron.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33

44
from transformers.configuration_utils import PretrainedConfig
55

6-
from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
6+
from sglang.srt.configs.mamba_utils import (
7+
Mamba2CacheParams,
8+
Mamba2StateShape,
9+
mamba2_state_dtype,
10+
)
711

812

913
@dataclass
@@ -71,4 +75,6 @@ def mamba2_cache_params(self) -> Mamba2CacheParams:
7175
conv_kernel=jet_block_config.conv_size,
7276
)
7377

74-
return Mamba2CacheParams(shape=shape, layers=self.linear_layer_ids)
78+
return Mamba2CacheParams(
79+
shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
80+
)

python/sglang/srt/configs/lfm2.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,11 @@
2020
from transformers import Lfm2Config as HFLfm2Config
2121
from transformers.utils import logging
2222

23-
from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
23+
from sglang.srt.configs.mamba_utils import (
24+
Mamba2CacheParams,
25+
Mamba2StateShape,
26+
mamba2_state_dtype,
27+
)
2428

2529
logger = logging.get_logger(__name__)
2630

@@ -87,11 +91,10 @@ def mamba2_cache_params(self) -> Optional[Mamba2CacheParams]:
8791
conv_kernel=conv_kernel,
8892
)
8993

90-
# Uses default mamba2_state_dtype() which reads SGLANG_MAMBA_CONV_DTYPE env var
91-
# (defaults to bfloat16). Set SGLANG_MAMBA_CONV_DTYPE=float16 for fp16 inference.
9294
return Mamba2CacheParams(
9395
shape=shape,
9496
layers=conv_layer_ids,
97+
dtype=mamba2_state_dtype(self),
9598
)
9699

97100

python/sglang/srt/configs/mamba_utils.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# limitations under the License.
1313
"""Common config utils for mamba2 - NemotronH, FalconH1, Qwen3Next, LFM2, etc."""
1414

15+
import logging
1516
from abc import ABC
1617
from dataclasses import dataclass, field
1718
from typing import List, Optional
@@ -22,6 +23,8 @@
2223
from sglang.srt.distributed.utils import divide
2324
from sglang.srt.environ import envs
2425

26+
logger = logging.getLogger(__name__)
27+
2528

2629
def extra_groups_for_head_shards(ngroups: int, tp_size: int):
2730
"""Compute the increase in group numbers to account for
@@ -41,20 +44,72 @@ class Mamba2StateDType:
4144
temporal: torch.dtype
4245

4346

44-
def mamba2_state_dtype() -> Mamba2StateDType:
47+
def mamba2_state_dtype(config=None) -> Mamba2StateDType:
48+
"""
49+
Get mamba2 state dtype from config or environment variable.
50+
51+
Priority (from highest to lowest):
52+
1. Environment variable SGLANG_MAMBA_SSM_DTYPE
53+
2. Config file (config.mamba_ssm_dtype or config.text_config.mamba_ssm_dtype)
54+
3. Default "float32"
55+
56+
Args:
57+
config: Optional config object (PretrainedConfig). If provided, will read
58+
mamba_ssm_dtype from it. For VL models, reads from text_config.
59+
60+
Returns:
61+
Mamba2StateDType with conv and temporal dtypes
62+
"""
4563
dtype_map = {
4664
"float32": torch.float32,
4765
"bfloat16": torch.bfloat16,
4866
"float16": torch.float16,
4967
}
5068
conv_dtype = dtype_map.get(envs.SGLANG_MAMBA_CONV_DTYPE.get(), torch.bfloat16)
51-
ssm_dtype = dtype_map.get(envs.SGLANG_MAMBA_SSM_DTYPE.get(), torch.float32)
69+
70+
# Get SSM dtype: default -> config -> env var
71+
ssm_dtype = torch.float32 # Step 1: Default value
72+
73+
# Step 2: Try to read from config
74+
if config is not None:
75+
config_dtype = None
76+
if hasattr(config, "text_config") and hasattr(
77+
config.text_config, "mamba_ssm_dtype"
78+
):
79+
# VL model: read from text_config
80+
config_dtype = config.text_config.mamba_ssm_dtype
81+
elif hasattr(config, "mamba_ssm_dtype"):
82+
# Text model: read from root config
83+
config_dtype = config.mamba_ssm_dtype
84+
85+
if config_dtype is not None:
86+
if config_dtype not in dtype_map:
87+
logger.warning(
88+
f"Invalid mamba_ssm_dtype '{config_dtype}' in config. "
89+
f"Must be one of {list(dtype_map.keys())}. Using default 'float32'."
90+
)
91+
else:
92+
ssm_dtype = dtype_map[config_dtype]
93+
94+
# Step 3: Check environment variable, if not None, override
95+
env_ssm_dtype = envs.SGLANG_MAMBA_SSM_DTYPE.get()
96+
if env_ssm_dtype is not None:
97+
if env_ssm_dtype not in dtype_map:
98+
logger.warning(
99+
f"Invalid mamba_ssm_dtype '{env_ssm_dtype}' from environment variable. "
100+
f"Must be one of {list(dtype_map.keys())}. Using default 'float32'."
101+
)
102+
else:
103+
ssm_dtype = dtype_map[env_ssm_dtype]
104+
105+
logger.info(f"Mamba2 state dtype: conv_dtype={conv_dtype}, ssm_dtype={ssm_dtype}")
106+
52107
return Mamba2StateDType(conv=conv_dtype, temporal=ssm_dtype)
53108

54109

55110
@dataclass(kw_only=True, frozen=True)
56111
class BaseLinearStateParams(ABC):
57-
dtype: Mamba2StateDType = field(default_factory=mamba2_state_dtype)
112+
dtype: Mamba2StateDType = field(default_factory=lambda: mamba2_state_dtype(None))
58113
layers: list[int]
59114

60115
@property

python/sglang/srt/configs/nemotron_h.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
from transformers.configuration_utils import PretrainedConfig
2020
from transformers.utils import logging
2121

22-
from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
22+
from sglang.srt.configs.mamba_utils import (
23+
Mamba2CacheParams,
24+
Mamba2StateShape,
25+
mamba2_state_dtype,
26+
)
2327

2428
logger = logging.get_logger(__name__)
2529

@@ -305,4 +309,6 @@ def mamba2_cache_params(self) -> Mamba2CacheParams:
305309
conv_kernel=self.conv_kernel,
306310
)
307311

308-
return Mamba2CacheParams(shape=shape, layers=self.mamba_layer_ids)
312+
return Mamba2CacheParams(
313+
shape=shape, layers=self.mamba_layer_ids, dtype=mamba2_state_dtype(self)
314+
)

python/sglang/srt/configs/qwen3_next.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
from transformers.configuration_utils import PretrainedConfig
2020
from transformers.utils import logging
2121

22-
from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
22+
from sglang.srt.configs.mamba_utils import (
23+
Mamba2CacheParams,
24+
Mamba2StateShape,
25+
mamba2_state_dtype,
26+
)
2327
from sglang.srt.configs.update_config import adjust_tp_num_heads_if_necessary
2428
from sglang.srt.utils import is_cpu
2529

@@ -293,4 +297,6 @@ def mamba2_cache_params(self) -> Mamba2CacheParams:
293297
conv_kernel=self.linear_conv_kernel_dim,
294298
)
295299

296-
return Mamba2CacheParams(shape=shape, layers=self.linear_layer_ids)
300+
return Mamba2CacheParams(
301+
shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
302+
)

python/sglang/srt/environ.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ class Envs:
418418

419419
# Mamba
420420
SGLANG_MAMBA_CONV_DTYPE = EnvStr("bfloat16")
421-
SGLANG_MAMBA_SSM_DTYPE = EnvStr("float32")
421+
SGLANG_MAMBA_SSM_DTYPE = EnvStr(None)
422422

423423
# Release & Resume Memory
424424
SGLANG_MEMORY_SAVER_CUDA_GRAPH = EnvBool(False)

python/sglang/srt/server_args.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ class ServerArgs:
513513

514514
# Mamba cache
515515
max_mamba_cache_size: Optional[int] = None
516-
mamba_ssm_dtype: str = "float32"
516+
mamba_ssm_dtype: Optional[str] = None
517517
mamba_full_memory_ratio: float = 0.9
518518
mamba_scheduler_strategy: str = "auto"
519519
mamba_track_interval: int = 256
@@ -2600,7 +2600,8 @@ def _handle_tokenizer_batching(self):
26002600

26012601
def _handle_environment_variables(self):
26022602
envs.SGLANG_ENABLE_TORCH_COMPILE.set("1" if self.enable_torch_compile else "0")
2603-
envs.SGLANG_MAMBA_SSM_DTYPE.set(self.mamba_ssm_dtype)
2603+
if self.mamba_ssm_dtype is not None:
2604+
envs.SGLANG_MAMBA_SSM_DTYPE.set(self.mamba_ssm_dtype)
26042605
envs.SGLANG_DISABLE_OUTLINES_DISK_CACHE.set(
26052606
"1" if self.disable_outlines_disk_cache else "0"
26062607
)
@@ -4130,9 +4131,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
41304131
parser.add_argument(
41314132
"--mamba-ssm-dtype",
41324133
type=str,
4133-
default=ServerArgs.mamba_ssm_dtype,
4134+
default=None,
41344135
choices=MAMBA_SSM_DTYPE_CHOICES,
4135-
help="The data type of the SSM states in mamba cache.",
4136+
help="The data type of the SSM states in mamba cache. "
4137+
"If not set, will be read from model config (mamba_ssm_dtype).",
41364138
)
41374139
parser.add_argument(
41384140
"--mamba-full-memory-ratio",

0 commit comments

Comments (0)