From 8e6dda3cfc6709ad984d27ab2a128a98e3d64daf Mon Sep 17 00:00:00 2001
From: yaofengchen
Date: Thu, 28 Aug 2025 07:11:02 +0000
Subject: [PATCH] remove ppu backend

---
 lmdeploy/cli/utils.py                        |  2 +-
 lmdeploy/messages.py                         |  3 +-
 .../pytorch/backends/dlinfer/ppu/__init__.py |  2 -
 .../backends/dlinfer/ppu/op_backend.py       | 87 -------------------
 lmdeploy/pytorch/backends/selector.py        |  3 -
 lmdeploy/pytorch/kernels/dispatcher.py       |  9 +-
 lmdeploy/pytorch/models/module_map.py        |  6 +-
 lmdeploy/utils.py                            |  5 +-
 requirements/runtime_ppu.txt                 | 21 -----
 requirements_ppu.txt                         |  4 -
 10 files changed, 5 insertions(+), 137 deletions(-)
 delete mode 100644 lmdeploy/pytorch/backends/dlinfer/ppu/__init__.py
 delete mode 100644 lmdeploy/pytorch/backends/dlinfer/ppu/op_backend.py
 delete mode 100644 requirements/runtime_ppu.txt
 delete mode 100644 requirements_ppu.txt

diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 10a9770e62..111af08fd4 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -388,7 +388,7 @@ def calib_search_scale(parser):
         )
 
     @staticmethod
-    def device(parser, default: str = 'cuda', choices: List[str] = ['cuda', 'ascend', 'maca', 'camb', 'ppu']):
+    def device(parser, default: str = 'cuda', choices: List[str] = ['cuda', 'ascend', 'maca', 'camb']):
         """Add argument device to parser."""
         return parser.add_argument('--device',
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index d3f4e5b116..0001bfb755 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -385,8 +385,7 @@ def __post_init__(self):
             'invalid max_prefill_token_num'
         assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks'
         assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
-        assert self.device_type in ['cuda', 'ascend', 'maca', 'camb',
-                                    'ppu'], (f'invalid device_type: {self.device_type}')
+        assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
         assert self.block_size >= 16 and (self.block_size & (self.block_size - 1)) == 0, \
             f'block_size must be >= 16 and a power of 2, but got {self.block_size}'
         if self.quant_policy > 0 and self.device_type not in ['cuda', 'ascend']:
diff --git a/lmdeploy/pytorch/backends/dlinfer/ppu/__init__.py b/lmdeploy/pytorch/backends/dlinfer/ppu/__init__.py
deleted file mode 100644
index 4a989a5e6d..0000000000
--- a/lmdeploy/pytorch/backends/dlinfer/ppu/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .op_backend import PpuOpsBackend  # noqa: F401
diff --git a/lmdeploy/pytorch/backends/dlinfer/ppu/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ppu/op_backend.py
deleted file mode 100644
index 81d530c4c5..0000000000
--- a/lmdeploy/pytorch/backends/dlinfer/ppu/op_backend.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch
-
-from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
-from lmdeploy.utils import get_logger
-
-from ..op_backend import DlinferOpsBackend
-
-logger = get_logger('lmdeploy')
-
-
-class PpuOpsBackend(DlinferOpsBackend):
-    """PPU layer backend."""
-    total_slots = None
-
-    @staticmethod
-    def get_name() -> str:
-        """Backend name."""
-        return 'ppu'
-
-    @classmethod
-    def update_step_context(cls, step_context):
-        """Update step context."""
-
-        def get_total_slots():
-            if cls.total_slots is None:
-                cls.total_slots = torch.arange(block_num * block_size,
-                                               dtype=torch.long,
-                                               device=step_context.block_offsets.device)
-                cls.total_slots = cls.total_slots.view(block_num, block_size)
-            return cls.total_slots
-
-        kv_start_indices, attention_mask = [], []
-        block_num, block_size, _, _ = step_context.kv_caches[0][1].shape
-
-        is_unpaged_prefill = False
-        if not step_context.is_decoding:
-            is_unpaged_prefill = torch.all(step_context.q_seqlens.eq(step_context.kv_seqlens))
-
-        q_start_loc = torch.cat(
-            (step_context.q_start_loc, (step_context.q_start_loc[-1] + step_context.q_seqlens[-1]).unsqueeze(0)))
-        q_seqlens = step_context.q_seqlens
-        kv_seqlens = step_context.kv_seqlens
-        max_q_seq_len = torch.max(q_seqlens)
-        max_kv_seq_len = torch.max(kv_seqlens)
-
-        if step_context.is_decoding:
-            idx = (step_context.kv_seqlens - 1) % block_size
-            b_num = (step_context.kv_seqlens - 1) // block_size
-            last_block = step_context.block_offsets.gather(1, b_num.view(-1, 1)).view(-1)
-            kv_start_indices = (last_block * block_size + idx).reshape((-1, 1))
-        else:
-            for i in range(step_context.q_start_loc.size(0)):
-                q_seq_len = int(step_context.q_seqlens[i])
-                kv_seq_len = int(step_context.kv_seqlens[i])
-                history_length = kv_seq_len - q_seq_len
-                total_slots = get_total_slots()
-                slot_tables = total_slots[step_context.block_offsets[i]].view(-1)
-                slots = slot_tables[history_length:kv_seq_len]
-                kv_start_indices.append(slots)
-            kv_start_indices = torch.cat(kv_start_indices)
-
-        attn_meta_cls = cls.get_attention_metadata_cls()
-        attn_metadata = attn_meta_cls(
-            step_context.is_decoding,
-            step_context.block_offsets,
-            q_start_loc=q_start_loc,
-            q_seqlens=q_seqlens,
-            kv_seqlens=kv_seqlens,
-            kv_start_indices=kv_start_indices,
-            block_size=block_size,
-            attention_mask=attention_mask,
-            is_unpaged_prefill=is_unpaged_prefill,
-            max_q_seq_len=max_q_seq_len,
-            max_kv_seq_len=max_kv_seq_len,
-        )
-
-        step_context.attn_metadata = attn_metadata
-        return step_context
-
-    @staticmethod
-    def build_graph_runner(model: torch.nn.Module, model_config: ModelConfig, cache_config: CacheConfig,
-                           backend_config: BackendConfig, device: torch.device):
-        """Build graph runner."""
-        from lmdeploy.pytorch.backends.cuda.graph_runner import CUDAGraphRunner
-        return CUDAGraphRunner(model, model_config, cache_config, backend_config, device)
diff --git a/lmdeploy/pytorch/backends/selector.py b/lmdeploy/pytorch/backends/selector.py
index 765127dd1f..1164cbe693 100644
--- a/lmdeploy/pytorch/backends/selector.py
+++ b/lmdeploy/pytorch/backends/selector.py
@@ -21,9 +21,6 @@ def _get_backend():
     if device_type == 'camb':
         from .dlinfer.camb import CambOpsBackend
         return CambOpsBackend
-    if device_type == 'ppu':
-        from .dlinfer.ppu import PpuOpsBackend
-        return PpuOpsBackend
     else:
         raise RuntimeError(f'Unsupported device type: {device_type}')
 
diff --git a/lmdeploy/pytorch/kernels/dispatcher.py b/lmdeploy/pytorch/kernels/dispatcher.py
index 78e9a87d79..fcf85c913f 100644
--- a/lmdeploy/pytorch/kernels/dispatcher.py
+++ b/lmdeploy/pytorch/kernels/dispatcher.py
@@ -64,14 +64,7 @@ def __init__(self, func_name: str):
         self.func_name = func_name
         self.dispatched_func = self.load_and_call
         self.device_manager.register_context_callback(self.device_callback)
-        self.device_map = {
-            'cuda': 'cuda',
-            'ascend': 'dlinfer',
-            'npu': 'dlinfer',
-            'maca': 'dlinfer',
-            'camb': 'dlinfer',
-            'ppu': 'dlinfer'
-        }
+        self.device_map = {'cuda': 'cuda', 'ascend': 'dlinfer', 'npu': 'dlinfer', 'maca': 'dlinfer', 'camb': 'dlinfer'}
 
     def device_callback(self, context: DeviceContext):
         """Device context callback."""
diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py
index 01d2c2fda6..46a9886ea7 100644
--- a/lmdeploy/pytorch/models/module_map.py
+++ b/lmdeploy/pytorch/models/module_map.py
@@ -7,12 +7,8 @@
 ASCEND_MODULE_MAP = dict()
 MACA_MODULE_MAP = dict()
 CAMB_MODULE_MAP = dict()
-PPU_MODULE_MAP = dict()
 
-DEVICE_SPECIAL_MODULE_MAP = dict(ascend=ASCEND_MODULE_MAP,
-                                 maca=MACA_MODULE_MAP,
-                                 camb=CAMB_MODULE_MAP,
-                                 ppu=PPU_MODULE_MAP)
+DEVICE_SPECIAL_MODULE_MAP = dict(ascend=ASCEND_MODULE_MAP, maca=MACA_MODULE_MAP, camb=CAMB_MODULE_MAP)
 
 # llama
 MODULE_MAP.update({
diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py
index b90754e7a1..3d4e98bfb8 100644
--- a/lmdeploy/utils.py
+++ b/lmdeploy/utils.py
@@ -335,7 +335,7 @@ def get_max_batch_size(device_type: str):
     Args:
         device_type (str): the type of device
     """
-    assert device_type in ['cuda', 'ascend', 'maca', 'camb', 'ppu']
+    assert device_type in ['cuda', 'ascend', 'maca', 'camb']
     if device_type == 'cuda':
         max_batch_size_map = {'a100': 256, 'a800': 256, 'h100': 512, 'h800': 512}
         import torch
@@ -352,8 +352,6 @@ def get_max_batch_size(device_type: str):
         return 256
     elif device_type == 'camb':
         return 256
-    elif device_type == 'ppu':
-        return 256
 
 
 def is_bf16_supported(device_type: str = 'cuda'):
@@ -401,7 +399,6 @@ def try_import_deeplink(device_type: str):
         'npu',
         'maca',
         'camb',
-        'ppu',
     ]
     if device_type in deeplink_device_type_list:
         try:
diff --git a/requirements/runtime_ppu.txt b/requirements/runtime_ppu.txt
deleted file mode 100644
index 84e7963dbf..0000000000
--- a/requirements/runtime_ppu.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-accelerate>=0.29.3
-einops
-fastapi
-fire
-mmengine-lite
-numpy<2.0.0
-openai
-outlines<0.1.0
-partial_json_parser
-peft<=0.11.1
-pillow
-protobuf
-pydantic>2.0.0
-safetensors
-sentencepiece
-shortuuid
-tiktoken
-torch>=2.6.0
-torchvision>=0.21.0
-transformers
-uvicorn
diff --git a/requirements_ppu.txt b/requirements_ppu.txt
deleted file mode 100644
index 94d8bf6eb3..0000000000
--- a/requirements_ppu.txt
+++ /dev/null
@@ -1,4 +0,0 @@
--r requirements/build.txt
--r requirements/runtime_ppu.txt
--r requirements/lite.txt
--r requirements/serve.txt