Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
| GLM-5 | 754B | LLM | Yes | No | No | No | No |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
Expand Down
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
| GLM-4.5 | 355B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.5-Air | 106B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
| GLM-5 | 754B | LLM | Yes | No | No | No | No |
| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - |
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
Expand Down
33 changes: 1 addition & 32 deletions lmdeploy/pytorch/configurations/deepseek_v2.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from lmdeploy.pytorch.config import ModelConfig

from .builder import AutoModelConfigBuilder
from .utils import flash_mla_available


def _check_env_v32(device: str = 'cuda'):
"""Environment check."""
if device != 'cuda':
return

# check cuda
try:
import fast_hadamard_transform # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <fast_hadamard_transform>.')

try:
import flash_mla # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <flash_mla>.')

if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')


class DeepseekV2ModelConfigBuilder(AutoModelConfigBuilder):

@classmethod
def condition(cls, hf_config):
"""config."""
return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'deepseek_v32', 'kimi_k2']
return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'kimi_k2']

@classmethod
def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, spec_method: str = None, **kwargs):
Expand Down Expand Up @@ -77,13 +55,4 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False,
use_flash_mla=hf_config.use_flash_mla,
model_paradigm=model_paradigm,
)

if hf_config.model_type == 'deepseek_v32':
assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.'
index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
index_k_scale_shape = ([1], torch.float32)
config.cache_shapes = [index_k_shape, index_k_scale_shape]
config.use_mla_fp8_cache = True
config.mla_index_topk = hf_config.index_topk
config.check_env_func = _check_env_v32
return config
46 changes: 46 additions & 0 deletions lmdeploy/pytorch/configurations/deepseek_v32.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from .deepseek_v2 import DeepseekV2ModelConfigBuilder


def _check_env_v32(device: str = 'cuda'):
"""Environment check."""
if device != 'cuda':
return

# check cuda
try:
import fast_hadamard_transform # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <fast_hadamard_transform>.')

try:
import flash_mla # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <flash_mla>.')

if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')


class DeepseekV32ModelConfigBuilder(DeepseekV2ModelConfigBuilder):
    """Model config builder for DeepSeek V3.2 / GLM-5 (MoE-DSA) architectures.

    Extends the DeepSeek V2 builder with the sparse-attention (DSA) specific
    cache layout: an FP8 index-key cache plus its FP32 scale, top-k indexer
    configuration, and a V3.2-specific environment check.
    """

    @classmethod
    def condition(cls, hf_config):
        """Return True if `hf_config` describes a DeepSeek V3.2 family model."""
        return hf_config.model_type in ['deepseek_v32', 'glm_moe_dsa']

    @classmethod
    def build(cls, hf_config, model_path: str | None = None, **kwargs):
        """Build the model config, augmenting the V2 config with DSA cache info.

        Args:
            hf_config: HuggingFace model config; must carry `use_flash_mla`,
                `index_head_dim` and `index_topk` attributes.
            model_path (str | None): Optional path to the model weights.

        Returns:
            The model config with FP8 MLA cache shapes and indexer settings.

        Raises:
            RuntimeError: If flash_mla support is not enabled in the config.
        """
        config = DeepseekV2ModelConfigBuilder.build(hf_config, model_path=model_path, **kwargs)

        # Use an explicit raise rather than `assert`: asserts are stripped
        # under `python -O`, but this is a hard runtime requirement.
        if not hf_config.use_flash_mla:
            raise RuntimeError('DeepSeek-V3.2 requires flash_mla to be available.')
        # Index keys are cached in FP8 with a per-entry FP32 scale.
        index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
        index_k_scale_shape = ([1], torch.float32)
        config.cache_shapes = [index_k_shape, index_k_scale_shape]
        config.use_mla_fp8_cache = True
        config.mla_index_topk = hf_config.index_topk
        config.check_env_func = _check_env_v32
        return config
3 changes: 1 addition & 2 deletions lmdeploy/pytorch/models/deepseek_v32.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
device=device,
is_tp=False)
self.softmax_scale = self.head_dim**-0.5
self.scale_fmt = quant_config['scale_fmt']
self.apply_rotary_pos_emb = ApplyRotaryEmb()
self.indexer_topk = IndexerTopKFP8(self.index_topk, self.softmax_scale, block_size=128, fill=-1)

Expand Down Expand Up @@ -201,8 +200,8 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
rope_scaling = get_rope_parameters(config)
if rope_scaling is not None:
mscale_all_dim = rope_scaling.get('mscale_all_dim', 0)
scaling_factor = rope_scaling['factor']
if mscale_all_dim:
scaling_factor = rope_scaling['factor']
mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
self.softmax_scale = self.softmax_scale * mscale * mscale

Expand Down
3 changes: 3 additions & 0 deletions lmdeploy/pytorch/models/module_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@

MODULE_MAP.update({'Glm4MoeLiteForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v2.DeepseekV2ForCausalLM'})

# glm5
MODULE_MAP.update({'GlmMoeDsaForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v32.DeepseekV32ForCausalLM'})

# internlm
MODULE_MAP.update({
'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',
Expand Down