diff --git a/docker/Dockerfile_ascend_a2_300i b/docker/Dockerfile_ascend_a2_300i index 47fc5fd8bc..5a3d989130 100644 --- a/docker/Dockerfile_ascend_a2_300i +++ b/docker/Dockerfile_ascend_a2_300i @@ -4,8 +4,8 @@ ARG ASCEND_DEVICE_TYPE=ascend_a2 ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub -FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-910b-ubuntu22.04-py3.11 AS ascend_a2_base -FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-310p-ubuntu22.04-py3.11 AS ascend_300i_base +FROM ${ASCEND_HUB}/cann:8.3.rc1-910b-ubuntu22.04-py3.11 AS ascend_a2_base +FROM ${ASCEND_HUB}/cann:8.3.rc1-310p-ubuntu22.04-py3.11 AS ascend_300i_base FROM ${ASCEND_DEVICE_TYPE}_base AS builder ENV DEBIAN_FRONTEND=noninteractive @@ -23,6 +23,6 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ - pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0rc1 torchvision==0.23.0 && \ + pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/docker/Dockerfile_ascend_a3 b/docker/Dockerfile_ascend_a3 index 9764c3e548..d8fc152ed1 100644 --- a/docker/Dockerfile_ascend_a3 +++ b/docker/Dockerfile_ascend_a3 @@ -4,7 +4,7 @@ ARG ASCEND_DEVICE_TYPE=ascend_a3 ARG ASCEND_HUB=swr.cn-south-1.myhuaweicloud.com/ascendhub -FROM ${ASCEND_HUB}/cann:8.3.rc1.alpha002-a3-openeuler24.03-py3.11 AS ascend_a3_base +FROM ${ASCEND_HUB}/cann:8.3.rc1-a3-openeuler24.03-py3.11 AS ascend_a3_base FROM ${ASCEND_DEVICE_TYPE}_base AS builder ENV DEBIAN_FRONTEND=noninteractive @@ -22,6 +22,6 @@ ARG LMDEPLOY_TAG=main RUN --mount=type=cache,target=/root/.cache \ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \ - pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0rc1 torchvision==0.23.0 && \ + pip install --no-cache-dir torch==2.8.0 torch-npu==2.8.0 torchvision==0.23.0 && \ TORCH_DEVICE_BACKEND_AUTOLOAD=0 DEVICE=ascend pip install git+https://github.com/DeepLink-org/dlinfer.git@${DLINFER_TAG} && \ LMDEPLOY_TARGET_DEVICE=ascend pip install git+https://github.com/InternLM/lmdeploy.git@${LMDEPLOY_TAG} diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 273a6ca42c..72bad9f30c 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -212,7 +212,7 @@ def get_total_slots(): elif is_unpaged_prefill: # prepare some params of unpaged_prefill attention stage. q_start_loc_cpu, kv_seqlens_cpu = None, None - q_seqlens_cpu = step_context.q_seqlens.cpu() + q_seqlens_cpu = step_context.q_seqlens.cpu().to(torch.int32) if SocVersion.is_Ascend910(): single_attention_mask = torch.logical_not( torch.tril( @@ -251,7 +251,7 @@ def get_total_slots(): step_context.block_offsets = step_context.block_offsets\ .repeat_interleave(step_context.q_seqlens, 0) dynamo.mark_dynamic(step_context.block_offsets, [0, 1]) - kv_seqlens = step_context.kv_seqlens.to(torch.int32) + kv_seqlens = step_context.kv_seqlens.cpu().to(torch.int32) if not step_context.is_decoding: if is_unpaged_prefill: if SocVersion.is_Ascend910(): @@ -269,11 +269,9 @@ def get_total_slots(): else: raise ValueError(f"dlinfer doesn't support {SocVersion.device_name()} device currently.") kv_seqlens = kv_seqlens.repeat_interleave(step_context.q_seqlens, 0) - if not is_unpaged_prefill and AscendOpsBackend.enable_aclgraph(): - kv_seqlens = kv_seqlens.cpu().tolist() else: if step_context.is_decoding: - kv_seqlens_cpu = step_context.kv_seqlens.cpu() + kv_seqlens_cpu = step_context.kv_seqlens.cpu().to(torch.int32) elif is_unpaged_prefill: pass else: diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 99bc54f29c..534076551f 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -27,9 +27,9 @@ class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): """Dlinfer softmax topk implementation builder.""" @staticmethod - def build(top_k: int, dim: int = -1): + def build(top_k: int, dim: int = -1, n_groups: int = -1): """build.""" - return DlinferSoftmaxTopKImpl(top_k, dim) + return DlinferSoftmaxTopKImpl(top_k, dim, n_groups) class DlinferFusedMoEImpl(FusedMoEImpl): diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py index 16dfc99dea..94cf428b39 100644 --- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -46,8 +46,7 @@ def __init__(self, dim: int, base: int = 10000, scaling_factor: float = 1.0): self.dim = dim self.base = base # yapf: disable - inv_freq = 1.0 / (self.base - ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)).float().cuda() + inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2, dtype=torch.float, device='cuda') / self.dim)) # yapf: enable self.register_buffer('inv_freq', inv_freq, persistent=False)