diff --git a/README.md b/README.md
index a26538c6d3..1b04ef4cd6 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
SDAR (1.7B-30B)
gpt-oss (20B, 120B)
GLM-4.7-Flash (30B)
+ GLM-5 (754B)
diff --git a/README_ja.md b/README_ja.md
index 902d356704..552848d644 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -143,6 +143,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
SDAR (1.7B-30B)
gpt-oss (20B, 120B)
GLM-4.7-Flash (30B)
+ GLM-5 (754B)
|
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 4e9224feae..1a3f6229f2 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -165,6 +165,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
SDAR (1.7B-30B)
gpt-oss (20B, 120B)
GLM-4.7-Flash (30B)
+ GLM-5 (754B)
|
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index ffb93a57e2..53012899b5 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -126,6 +126,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
+| GLM-5 | 754B | LLM | Yes | No | No | No | No |
```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 3da6a72d81..4f5b77482d 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -121,6 +121,7 @@
| GLM-4.5 | 355B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.5-Air | 106B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
+| GLM-5 | 754B | LLM | Yes | No | No | No | No |
| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - |
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
diff --git a/lmdeploy/pytorch/configurations/deepseek_v2.py b/lmdeploy/pytorch/configurations/deepseek_v2.py
index 8d51e1a0fd..0d692e8f1b 100644
--- a/lmdeploy/pytorch/configurations/deepseek_v2.py
+++ b/lmdeploy/pytorch/configurations/deepseek_v2.py
@@ -1,38 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-
from lmdeploy.pytorch.config import ModelConfig
from .builder import AutoModelConfigBuilder
from .utils import flash_mla_available
-def _check_env_v32(device: str = 'cuda'):
- """Environment check."""
- if device != 'cuda':
- return
-
- # check cuda
- try:
- import fast_hadamard_transform # noqa: F401
- except ImportError:
- raise ImportError('Deepseek V3.2 requires .')
-
- try:
- import flash_mla # noqa: F401
- except ImportError:
- raise ImportError('Deepseek V3.2 requires .')
-
- if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
- raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')
-
-
class DeepseekV2ModelConfigBuilder(AutoModelConfigBuilder):
@classmethod
def condition(cls, hf_config):
"""config."""
- return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'deepseek_v32', 'kimi_k2']
+ return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'kimi_k2']
@classmethod
def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, spec_method: str = None, **kwargs):
@@ -77,13 +55,4 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False,
use_flash_mla=hf_config.use_flash_mla,
model_paradigm=model_paradigm,
)
-
- if hf_config.model_type == 'deepseek_v32':
- assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.'
- index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
- index_k_scale_shape = ([1], torch.float32)
- config.cache_shapes = [index_k_shape, index_k_scale_shape]
- config.use_mla_fp8_cache = True
- config.mla_index_topk = hf_config.index_topk
- config.check_env_func = _check_env_v32
return config
diff --git a/lmdeploy/pytorch/configurations/deepseek_v32.py b/lmdeploy/pytorch/configurations/deepseek_v32.py
new file mode 100644
index 0000000000..cec2cf0781
--- /dev/null
+++ b/lmdeploy/pytorch/configurations/deepseek_v32.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .deepseek_v2 import DeepseekV2ModelConfigBuilder
+
+
+def _check_env_v32(device: str = 'cuda'):
+ """Environment check."""
+ if device != 'cuda':
+ return
+
+ # check cuda
+ try:
+ import fast_hadamard_transform # noqa: F401
+ except ImportError:
+ raise ImportError('Deepseek V3.2 requires the fast_hadamard_transform package.')
+
+ try:
+ import flash_mla # noqa: F401
+ except ImportError:
+ raise ImportError('Deepseek V3.2 requires the flash_mla package: https://github.com/deepseek-ai/FlashMLA.')
+
+ if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
+ raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')
+
+
+class DeepseekV32ModelConfigBuilder(DeepseekV2ModelConfigBuilder):
+
+ @classmethod
+ def condition(cls, hf_config):
+ """config."""
+ return hf_config.model_type in ['deepseek_v32', 'glm_moe_dsa']
+
+ @classmethod
+ def build(cls, hf_config, model_path: str | None = None, **kwargs):
+ """build."""
+ config = DeepseekV2ModelConfigBuilder.build(hf_config, model_path=model_path, **kwargs)
+
+ assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.'
+ index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
+ index_k_scale_shape = ([1], torch.float32)
+ config.cache_shapes = [index_k_shape, index_k_scale_shape]
+ config.use_mla_fp8_cache = True
+ config.mla_index_topk = hf_config.index_topk
+ config.check_env_func = _check_env_v32
+ return config
diff --git a/lmdeploy/pytorch/models/deepseek_v32.py b/lmdeploy/pytorch/models/deepseek_v32.py
index 29e1d57876..19ee10d420 100644
--- a/lmdeploy/pytorch/models/deepseek_v32.py
+++ b/lmdeploy/pytorch/models/deepseek_v32.py
@@ -81,7 +81,6 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
device=device,
is_tp=False)
self.softmax_scale = self.head_dim**-0.5
- self.scale_fmt = quant_config['scale_fmt']
self.apply_rotary_pos_emb = ApplyRotaryEmb()
self.indexer_topk = IndexerTopKFP8(self.index_topk, self.softmax_scale, block_size=128, fill=-1)
@@ -201,8 +200,8 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
rope_scaling = get_rope_parameters(config)
if rope_scaling is not None:
mscale_all_dim = rope_scaling.get('mscale_all_dim', 0)
- scaling_factor = rope_scaling['factor']
if mscale_all_dim:
+ scaling_factor = rope_scaling['factor']
mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
self.softmax_scale = self.softmax_scale * mscale * mscale
diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py
index dd96723eb9..57433eb47d 100644
--- a/lmdeploy/pytorch/models/module_map.py
+++ b/lmdeploy/pytorch/models/module_map.py
@@ -52,6 +52,9 @@
MODULE_MAP.update({'Glm4MoeLiteForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v2.DeepseekV2ForCausalLM'})
+# glm5
+MODULE_MAP.update({'GlmMoeDsaForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v32.DeepseekV32ForCausalLM'})
+
# internlm
MODULE_MAP.update({
'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',
|