diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 111af08fd4..36e4d672ef 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -526,7 +526,7 @@ def max_log_len(parser): return parser.add_argument('--max-log-len', type=int, default=None, - help='Max number of prompt characters or prompt tokens being' + help='Max number of prompt characters or prompt tokens being ' 'printed in log. Default: Unlimited') @staticmethod @@ -552,8 +552,9 @@ def communicator(parser): return parser.add_argument('--communicator', type=str, default='nccl', - choices=['nccl', 'native'], - help='Communication backend for multi-GPU inference') + choices=['nccl', 'native', 'cuda-ipc'], + help='Communication backend for multi-GPU inference. The "native" option is ' + 'deprecated and serves as an alias for "cuda-ipc"') @staticmethod def enable_microbatch(parser): @@ -581,9 +582,9 @@ def role(parser): type=str, default='Hybrid', choices=['Hybrid', 'Prefill', 'Decode'], - help='Hybrid for Non-Disaggregated Engine;' - 'Prefill for Disaggregated Prefill Engine;' - 'Decode for Disaggregated Decode Engine;') + help='Hybrid for Non-Disaggregated Engine; ' + 'Prefill for Disaggregated Prefill Engine; ' + 'Decode for Disaggregated Decode Engine') @staticmethod def migration_backend(parser): diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index 3d4e98bfb8..0afeeb1ca5 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -337,7 +337,7 @@ def get_max_batch_size(device_type: str): """ assert device_type in ['cuda', 'ascend', 'maca', 'camb'] if device_type == 'cuda': - max_batch_size_map = {'a100': 256, 'a800': 256, 'h100': 512, 'h800': 512} + max_batch_size_map = {'a100': 384, 'a800': 384, 'h100': 1024, 'h800': 1024, 'l20y': 1024, 'h200': 1024} import torch device_name = torch.cuda.get_device_name(0).lower() for name, size in max_batch_size_map.items():