From 6d8d219be0c9c7d3b51768fcde35ca95f23d3639 Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Thu, 12 Feb 2026 16:34:08 +0800 Subject: [PATCH 1/2] support glm5 --- README.md | 1 + README_ja.md | 1 + README_zh-CN.md | 1 + docs/en/supported_models/supported_models.md | 1 + .../supported_models/supported_models.md | 1 + .../pytorch/configurations/deepseek_v2.py | 33 +------------ .../pytorch/configurations/deepseek_v32.py | 46 +++++++++++++++++++ lmdeploy/pytorch/models/deepseek_v32.py | 3 +- 8 files changed, 53 insertions(+), 34 deletions(-) create mode 100644 lmdeploy/pytorch/configurations/deepseek_v32.py diff --git a/README.md b/README.md index a26538c6d3..1b04ef4cd6 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
  • SDAR (1.7B-30B)
  • gpt-oss (20B, 120B)
  • GLM-4.7-Flash (30B)
  • +
  • GLM-5 (754B)
  • diff --git a/README_ja.md b/README_ja.md index 902d356704..552848d644 100644 --- a/README_ja.md +++ b/README_ja.md @@ -143,6 +143,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • SDAR (1.7B-30B)
  • gpt-oss (20B, 120B)
  • GLM-4.7-Flash (30B)
  • +
  • GLM-5 (754B)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 4e9224feae..1a3f6229f2 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -165,6 +165,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • SDAR (1.7B-30B)
  • gpt-oss (20B, 120B)
  • GLM-4.7-Flash (30B)
  • +
  • GLM-5 (754B)
  • diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index ffb93a57e2..53012899b5 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -126,6 +126,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | | SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - | | GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No | +| GLM-5 | 754B | LLM | Yes | No | No | No | No | ```{note} * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 3da6a72d81..4f5b77482d 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -121,6 +121,7 @@ | GLM-4.5 | 355B | LLM | Yes | Yes | Yes | - | - | | GLM-4.5-Air | 106B | LLM | Yes | Yes | Yes | - | - | | GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No | +| GLM-5 | 754B | LLM | Yes | No | No | No | No | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | diff --git a/lmdeploy/pytorch/configurations/deepseek_v2.py b/lmdeploy/pytorch/configurations/deepseek_v2.py index 8d51e1a0fd..0d692e8f1b 100644 --- a/lmdeploy/pytorch/configurations/deepseek_v2.py +++ b/lmdeploy/pytorch/configurations/deepseek_v2.py @@ -1,38 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import torch - from lmdeploy.pytorch.config import ModelConfig from .builder import AutoModelConfigBuilder from .utils import flash_mla_available -def _check_env_v32(device: str = 'cuda'): - """Environment check.""" - if device != 'cuda': - return - - # check cuda - try: - import fast_hadamard_transform # noqa: F401 - except ImportError: - raise ImportError('Deepseek V3.2 requires .') - - try: - import flash_mla # noqa: F401 - except ImportError: - raise ImportError('Deepseek V3.2 requires .') - - if not hasattr(flash_mla, 'flash_mla_sparse_fwd'): - raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.') - - class DeepseekV2ModelConfigBuilder(AutoModelConfigBuilder): @classmethod def condition(cls, hf_config): """config.""" - return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'deepseek_v32', 'kimi_k2'] + return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'kimi_k2'] @classmethod def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, spec_method: str = None, **kwargs): @@ -77,13 +55,4 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, use_flash_mla=hf_config.use_flash_mla, model_paradigm=model_paradigm, ) - - if hf_config.model_type == 'deepseek_v32': - assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.' - index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn) - index_k_scale_shape = ([1], torch.float32) - config.cache_shapes = [index_k_shape, index_k_scale_shape] - config.use_mla_fp8_cache = True - config.mla_index_topk = hf_config.index_topk - config.check_env_func = _check_env_v32 return config diff --git a/lmdeploy/pytorch/configurations/deepseek_v32.py b/lmdeploy/pytorch/configurations/deepseek_v32.py new file mode 100644 index 0000000000..cec2cf0781 --- /dev/null +++ b/lmdeploy/pytorch/configurations/deepseek_v32.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from .deepseek_v2 import DeepseekV2ModelConfigBuilder + + +def _check_env_v32(device: str = 'cuda'): +    """Environment check.""" +    if device != 'cuda': +        return + +    # check cuda +    try: +        import fast_hadamard_transform  # noqa: F401 +    except ImportError: +        raise ImportError('Deepseek V3.2 requires fast_hadamard_transform.') + +    try: +        import flash_mla  # noqa: F401 +    except ImportError: +        raise ImportError('Deepseek V3.2 requires flash_mla.') + +    if not hasattr(flash_mla, 'flash_mla_sparse_fwd'): +        raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.') + + +class DeepseekV32ModelConfigBuilder(DeepseekV2ModelConfigBuilder): + +    @classmethod +    def condition(cls, hf_config): +        """config.""" +        return hf_config.model_type in ['deepseek_v32', 'glm_moe_dsa'] + +    @classmethod +    def build(cls, hf_config, model_path: str | None = None, **kwargs): +        """build.""" +        config = DeepseekV2ModelConfigBuilder.build(hf_config, model_path=model_path, **kwargs) + +        assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.' 
+ index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn) + index_k_scale_shape = ([1], torch.float32) + config.cache_shapes = [index_k_shape, index_k_scale_shape] + config.use_mla_fp8_cache = True + config.mla_index_topk = hf_config.index_topk + config.check_env_func = _check_env_v32 + return config diff --git a/lmdeploy/pytorch/models/deepseek_v32.py b/lmdeploy/pytorch/models/deepseek_v32.py index 29e1d57876..19ee10d420 100644 --- a/lmdeploy/pytorch/models/deepseek_v32.py +++ b/lmdeploy/pytorch/models/deepseek_v32.py @@ -81,7 +81,6 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic device=device, is_tp=False) self.softmax_scale = self.head_dim**-0.5 - self.scale_fmt = quant_config['scale_fmt'] self.apply_rotary_pos_emb = ApplyRotaryEmb() self.indexer_topk = IndexerTopKFP8(self.index_topk, self.softmax_scale, block_size=128, fill=-1) @@ -201,8 +200,8 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic rope_scaling = get_rope_parameters(config) if rope_scaling is not None: mscale_all_dim = rope_scaling.get('mscale_all_dim', 0) - scaling_factor = rope_scaling['factor'] if mscale_all_dim: + scaling_factor = rope_scaling['factor'] mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) self.softmax_scale = self.softmax_scale * mscale * mscale From 188a52113cd5a2665287157370f8c00b76ab22cd Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Thu, 12 Feb 2026 18:13:38 +0800 Subject: [PATCH 2/2] add map --- lmdeploy/pytorch/models/module_map.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index dd96723eb9..57433eb47d 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -52,6 +52,9 @@ MODULE_MAP.update({'Glm4MoeLiteForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v2.DeepseekV2ForCausalLM'}) +# glm5 +MODULE_MAP.update({'GlmMoeDsaForCausalLM': 
f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v32.DeepseekV32ForCausalLM'}) + # internlm MODULE_MAP.update({ 'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',