Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
<li>GLM-5 (754B)</li>
</ul>
</td>
<td>
Expand Down
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
| GLM-5 | 754B | LLM | Yes | No | No | No | No |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
Expand Down
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
| GLM-4.5 | 355B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.5-Air | 106B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
| GLM-5 | 754B | LLM | Yes | No | No | No | No |
| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - |
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
Expand Down
33 changes: 1 addition & 32 deletions lmdeploy/pytorch/configurations/deepseek_v2.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from lmdeploy.pytorch.config import ModelConfig

from .builder import AutoModelConfigBuilder
from .utils import flash_mla_available


def _check_env_v32(device: str = 'cuda'):
"""Environment check."""
if device != 'cuda':
return

# check cuda
try:
import fast_hadamard_transform # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <fast_hadamard_transform>.')

try:
import flash_mla # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <flash_mla>.')

if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')


class DeepseekV2ModelConfigBuilder(AutoModelConfigBuilder):

@classmethod
def condition(cls, hf_config):
"""config."""
return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'deepseek_v32', 'kimi_k2']
return hf_config.model_type in ['deepseek_v3', 'deepseek_v2', 'kimi_k2']

@classmethod
def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, spec_method: str = None, **kwargs):
Expand Down Expand Up @@ -77,13 +55,4 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False,
use_flash_mla=hf_config.use_flash_mla,
model_paradigm=model_paradigm,
)

if hf_config.model_type == 'deepseek_v32':
assert hf_config.use_flash_mla, 'DeepSeek-V3.2 requires flash_mla to be available.'
index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
index_k_scale_shape = ([1], torch.float32)
config.cache_shapes = [index_k_shape, index_k_scale_shape]
config.use_mla_fp8_cache = True
config.mla_index_topk = hf_config.index_topk
config.check_env_func = _check_env_v32
return config
46 changes: 46 additions & 0 deletions lmdeploy/pytorch/configurations/deepseek_v32.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from .deepseek_v2 import DeepseekV2ModelConfigBuilder


def _check_env_v32(device: str = 'cuda'):
"""Environment check."""
if device != 'cuda':
return

# check cuda
try:
import fast_hadamard_transform # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <fast_hadamard_transform>.')

try:
import flash_mla # noqa: F401
except ImportError:
raise ImportError('Deepseek V3.2 requires <flash_mla>.')

if not hasattr(flash_mla, 'flash_mla_sparse_fwd'):
raise RuntimeError('Latest flash_mla is required: https://github.com/deepseek-ai/FlashMLA.')


class DeepseekV32ModelConfigBuilder(DeepseekV2ModelConfigBuilder):
    """Model config builder for DeepSeek V3.2 / GLM-5 (MoE-DSA) architectures.

    Extends the DeepSeek V2 builder with the sparse-attention (DSA) specific
    cache layout: an FP8 index-key cache plus its FP32 scale, top-k indexer
    configuration, and a V3.2-specific environment check.
    """

    @classmethod
    def condition(cls, hf_config):
        """Return True if `hf_config` describes a DeepSeek V3.2 family model."""
        return hf_config.model_type in ['deepseek_v32', 'glm_moe_dsa']

    @classmethod
    def build(cls, hf_config, model_path: str | None = None, **kwargs):
        """Build the model config, augmenting the V2 config with DSA cache info.

        Args:
            hf_config: HuggingFace model config; must carry `use_flash_mla`,
                `index_head_dim` and `index_topk` attributes.
            model_path (str | None): Optional path to the model weights.

        Returns:
            The model config with FP8 MLA cache shapes and indexer settings.

        Raises:
            RuntimeError: If flash_mla support is not enabled in the config.
        """
        config = DeepseekV2ModelConfigBuilder.build(hf_config, model_path=model_path, **kwargs)

        # Use an explicit raise rather than `assert`: asserts are stripped
        # under `python -O`, but this is a hard runtime requirement.
        if not hf_config.use_flash_mla:
            raise RuntimeError('DeepSeek-V3.2 requires flash_mla to be available.')
        # Index keys are cached in FP8 with a per-entry FP32 scale.
        index_k_shape = ([hf_config.index_head_dim], torch.float8_e4m3fn)
        index_k_scale_shape = ([1], torch.float32)
        config.cache_shapes = [index_k_shape, index_k_scale_shape]
        config.use_mla_fp8_cache = True
        config.mla_index_topk = hf_config.index_topk
        config.check_env_func = _check_env_v32
        return config
3 changes: 1 addition & 2 deletions lmdeploy/pytorch/models/deepseek_v32.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
device=device,
is_tp=False)
self.softmax_scale = self.head_dim**-0.5
self.scale_fmt = quant_config['scale_fmt']
self.apply_rotary_pos_emb = ApplyRotaryEmb()
self.indexer_topk = IndexerTopKFP8(self.index_topk, self.softmax_scale, block_size=128, fill=-1)

Expand Down Expand Up @@ -201,8 +200,8 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
rope_scaling = get_rope_parameters(config)
if rope_scaling is not None:
mscale_all_dim = rope_scaling.get('mscale_all_dim', 0)
scaling_factor = rope_scaling['factor']
if mscale_all_dim:
scaling_factor = rope_scaling['factor']
mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
self.softmax_scale = self.softmax_scale * mscale * mscale

Expand Down
3 changes: 3 additions & 0 deletions lmdeploy/pytorch/models/module_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@

MODULE_MAP.update({'Glm4MoeLiteForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v2.DeepseekV2ForCausalLM'})

# glm5
MODULE_MAP.update({'GlmMoeDsaForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v32.DeepseekV32ForCausalLM'})

# internlm
MODULE_MAP.update({
'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',
Expand Down