From 475aa469df5085a5537354144d852972ac5bf23c Mon Sep 17 00:00:00 2001 From: cr7258 Date: Thu, 1 May 2025 22:46:40 +0800 Subject: [PATCH 1/8] feat: add TensorRT-LLM as backend --- chart/templates/backends/tensorrt-llm.yaml | 52 ++++++++++++++++++++++ chart/values.global.yaml | 4 ++ docs/examples/tensorrt-llm/playground.yaml | 25 +++++++++++ test/config/backends/tensorrt-llm.yaml | 51 +++++++++++++++++++++ 4 files changed, 132 insertions(+) create mode 100644 chart/templates/backends/tensorrt-llm.yaml create mode 100644 docs/examples/tensorrt-llm/playground.yaml create mode 100644 test/config/backends/tensorrt-llm.yaml diff --git a/chart/templates/backends/tensorrt-llm.yaml b/chart/templates/backends/tensorrt-llm.yaml new file mode 100644 index 00000000..9a3083d4 --- /dev/null +++ b/chart/templates/backends/tensorrt-llm.yaml @@ -0,0 +1,52 @@ +{{- if .Values.backendRuntime.enabled -}} +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: tensorrt-llm +spec: + command: + - trtllm-serve + image: {{ .Values.backendRuntime.tensorrt_llm.image.repository }} + version: {{ .Values.backendRuntime.tensorrt_llm.image.tag }} + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. 
+ recommendedConfigs: + - name: default + args: + - "{{`{{ .ModelPath }}`}}" + - --host + - "0.0.0.0" + - --port + - "8080" + resources: + requests: + cpu: 4 + memory: 16Gi + limits: + cpu: 4 + memory: 16Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + {{- end }} diff --git a/chart/values.global.yaml b/chart/values.global.yaml index db67db00..d52fe509 100644 --- a/chart/values.global.yaml +++ b/chart/values.global.yaml @@ -14,6 +14,10 @@ backendRuntime: image: repository: lmsysorg/sglang tag: v0.4.5-cu121 + tensorrt_llm: + image: + repository: nvcr.io/nvidia/tritonserver + tag: 25.03-trtllm-python-py3 tgi: image: repository: ghcr.io/huggingface/text-generation-inference diff --git a/docs/examples/tensorrt-llm/playground.yaml b/docs/examples/tensorrt-llm/playground.yaml new file mode 100644 index 00000000..6bc41d87 --- /dev/null +++ b/docs/examples/tensorrt-llm/playground.yaml @@ -0,0 +1,25 @@ +apiVersion: llmaz.io/v1alpha1 +kind: OpenModel +metadata: + name: qwen2-0--5b +spec: + familyName: qwen2 + source: + modelHub: + modelID: Qwen/Qwen2-0.5B-Instruct + inferenceConfig: + flavors: + - name: a10 # GPU type + limits: + nvidia.com/gpu: 1 +--- +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 1 + modelClaim: + modelName: qwen2-0--5b + backendRuntimeConfig: + backendName: tensorrt-llm diff --git a/test/config/backends/tensorrt-llm.yaml b/test/config/backends/tensorrt-llm.yaml new file mode 100644 index 00000000..494630c1 --- /dev/null +++ b/test/config/backends/tensorrt-llm.yaml @@ -0,0 +1,51 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: 
backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: tensorrt-llm +spec: + command: + - trtllm-serve + image: nvcr.io/nvidia/tritonserver + version: 25.03-trtllm-python-py3 + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. + recommendedConfigs: + - name: default + args: + - "{{`{{ .ModelPath }}`}}" + - --host + - "0.0.0.0" + - --port + - "8080" + sharedMemorySize: 2Gi + resources: + requests: + cpu: 4 + memory: 16Gi + limits: + cpu: 4 + memory: 16Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 From 2538c1db03feefba8d1e6e3f26942f256d479d0d Mon Sep 17 00:00:00 2001 From: cr7258 Date: Thu, 1 May 2025 22:54:32 +0800 Subject: [PATCH 2/8] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cdd745c2..287a9246 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Easy, advanced inference platform for large language models on Kubernetes ## Key Features - **Easy of Use**: People can quick deploy a LLM service with minimal configurations. -- **Broad Backends Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). 
+- **Broad Backends Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Find the full list of supported backends [here](./docs/support-backends.md). - **Accelerator Fungibility**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance. - **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores. llmaz will automatically handle the model loading, requiring no effort from users. - **Multi-Host Support**: llmaz supports both single-host and multi-host scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 0. From 479dc5be92664f21bad3f5b3b4a86fd49412e7c6 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Thu, 1 May 2025 23:03:06 +0800 Subject: [PATCH 3/8] update readme --- examples/README.md | 4 ++++ {docs/examples => examples}/tensorrt-llm/playground.yaml | 0 site/content/en/docs/integrations/support-backends.md | 4 ++++ 3 files changed, 8 insertions(+) rename {docs/examples => examples}/tensorrt-llm/playground.yaml (100%) diff --git a/examples/README.md b/examples/README.md index 0f5ea9ae..58d9fb42 100644 --- a/examples/README.md +++ b/examples/README.md @@ -46,6 +46,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here. 
+### Deploy models via TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs, see [example](./tensorrt-llm/) here. + ### Deploy models via text-generation-inference [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here. diff --git a/docs/examples/tensorrt-llm/playground.yaml b/examples/tensorrt-llm/playground.yaml similarity index 100% rename from docs/examples/tensorrt-llm/playground.yaml rename to examples/tensorrt-llm/playground.yaml diff --git a/site/content/en/docs/integrations/support-backends.md b/site/content/en/docs/integrations/support-backends.md index 0d2d5821..2331016a 100644 --- a/site/content/en/docs/integrations/support-backends.md +++ b/site/content/en/docs/integrations/support-backends.md @@ -13,6 +13,10 @@ If you want to integrate more backends into llmaz, please refer to this [PR](htt [SGLang](https://github.com/sgl-project/sglang) is yet another fast serving framework for large language models and vision language models. +## TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that orchestrate the inference execution in a performant way. + ## Text-Generation-Inference [text-generation-inference](https://github.com/huggingface/text-generation-inference) is a Rust, Python and gRPC server for text generation inference. Used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. 
From ceaf5200403557ab47a1c902fb780fb5cae25aa2 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Fri, 2 May 2025 12:44:09 +0800 Subject: [PATCH 4/8] remove example to resolve conflicts --- examples/README.md | 28 +++++++++++++-------------- examples/tensorrt-llm/playground.yaml | 25 ------------------------ 2 files changed, 13 insertions(+), 40 deletions(-) delete mode 100644 examples/tensorrt-llm/playground.yaml diff --git a/examples/README.md b/examples/README.md index 58d9fb42..ddb9aaad 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,17 +4,19 @@ We provide a set of examples to help you serve large language models, by default ## Table of Contents -- [Deploy models from Huggingface](#deploy-models-from-huggingface) -- [Deploy models from ModelScope](#deploy-models-from-modelscope) -- [Deploy models from ObjectStore](#deploy-models-from-objectstore) -- [Deploy models via SGLang](#deploy-models-via-sglang) -- [Deploy models via llama.cpp](#deploy-models-via-llamacpp) -- [Deploy models via text-generation-inference](#deploy-models-via-tgi) -- [Deploy models via ollama](#ollama) -- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) -- [Deploy multi-host inference](#multi-host-inference) -- [Deploy host models](#deploy-host-models) -- [Envoy AI Gateway](#envoy-ai-gateway) +- [Examples](#examples) + - [Table of Contents](#table-of-contents) + - [Deploy models from Huggingface](#deploy-models-from-huggingface) + - [Deploy models from ModelScope](#deploy-models-from-modelscope) + - [Deploy models from ObjectStore](#deploy-models-from-objectstore) + - [Deploy models via SGLang](#deploy-models-via-sglang) + - [Deploy models via llama.cpp](#deploy-models-via-llamacpp) + - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) + - [Deploy models via ollama](#deploy-models-via-ollama) + - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) + - [Multi-Host Inference](#multi-host-inference) + - [Deploy 
Host Models](#deploy-host-models) + - [Envoy AI Gateway](#envoy-ai-gateway) ### Deploy models from Huggingface @@ -46,10 +48,6 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here. -### Deploy models via TensorRT-LLM - -[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs, see [example](./tensorrt-llm/) here. - ### Deploy models via text-generation-inference [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here. diff --git a/examples/tensorrt-llm/playground.yaml b/examples/tensorrt-llm/playground.yaml deleted file mode 100644 index 6bc41d87..00000000 --- a/examples/tensorrt-llm/playground.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: llmaz.io/v1alpha1 -kind: OpenModel -metadata: - name: qwen2-0--5b -spec: - familyName: qwen2 - source: - modelHub: - modelID: Qwen/Qwen2-0.5B-Instruct - inferenceConfig: - flavors: - - name: a10 # GPU type - limits: - nvidia.com/gpu: 1 ---- -apiVersion: inference.llmaz.io/v1alpha1 -kind: Playground -metadata: - name: qwen2-0--5b -spec: - replicas: 1 - modelClaim: - modelName: qwen2-0--5b - backendRuntimeConfig: - backendName: tensorrt-llm From 087b3d505da5cd6979ba4f6db331f10b15867542 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Fri, 2 May 2025 12:46:49 +0800 Subject: [PATCH 5/8] remove example to resolve conflicts --- examples/README.md | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/README.md b/examples/README.md index ddb9aaad..5a9883f8 100644 --- a/examples/README.md +++ 
b/examples/README.md @@ -4,19 +4,17 @@ We provide a set of examples to help you serve large language models, by default ## Table of Contents -- [Examples](#examples) - - [Table of Contents](#table-of-contents) - - [Deploy models from Huggingface](#deploy-models-from-huggingface) - - [Deploy models from ModelScope](#deploy-models-from-modelscope) - - [Deploy models from ObjectStore](#deploy-models-from-objectstore) - - [Deploy models via SGLang](#deploy-models-via-sglang) - - [Deploy models via llama.cpp](#deploy-models-via-llamacpp) - - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) - - [Deploy models via ollama](#deploy-models-via-ollama) - - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) - - [Multi-Host Inference](#multi-host-inference) - - [Deploy Host Models](#deploy-host-models) - - [Envoy AI Gateway](#envoy-ai-gateway) +- [Deploy models from Huggingface](#deploy-models-from-huggingface) +- [Deploy models from ModelScope](#deploy-models-from-modelscope) +- [Deploy models from ObjectStore](#deploy-models-from-objectstore) +- [Deploy models via SGLang](#deploy-models-via-sglang) +- [Deploy models via llama.cpp](#deploy-models-via-llamacpp) +- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) +- [Deploy models via ollama](#deploy-models-via-ollama) +- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) +- [Multi-Host Inference](#multi-host-inference) +- [Deploy Host Models](#deploy-host-models) +- [Envoy AI Gateway](#envoy-ai-gateway) ### Deploy models from Huggingface From 1eb8a21190cc249759bbfe925d3c830940820705 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Fri, 2 May 2025 12:49:13 +0800 Subject: [PATCH 6/8] fix --- examples/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/README.md b/examples/README.md index 5a9883f8..0f5ea9ae 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,11 +9,11 @@ 
We provide a set of examples to help you serve large language models, by default - [Deploy models from ObjectStore](#deploy-models-from-objectstore) - [Deploy models via SGLang](#deploy-models-via-sglang) - [Deploy models via llama.cpp](#deploy-models-via-llamacpp) -- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) -- [Deploy models via ollama](#deploy-models-via-ollama) +- [Deploy models via text-generation-inference](#deploy-models-via-tgi) +- [Deploy models via ollama](#ollama) - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) -- [Multi-Host Inference](#multi-host-inference) -- [Deploy Host Models](#deploy-host-models) +- [Deploy multi-host inference](#multi-host-inference) +- [Deploy host models](#deploy-host-models) - [Envoy AI Gateway](#envoy-ai-gateway) ### Deploy models from Huggingface From 38040dbb502829cf8c156daf5ae5684207a5ce61 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Fri, 2 May 2025 12:53:33 +0800 Subject: [PATCH 7/8] add tensorrt-llm example --- docs/examples/ tersorrt-llm/playground.yaml | 25 +++++++++++++++++++++ docs/examples/README.md | 13 +++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 docs/examples/ tersorrt-llm/playground.yaml diff --git a/docs/examples/ tersorrt-llm/playground.yaml b/docs/examples/ tersorrt-llm/playground.yaml new file mode 100644 index 00000000..6bc41d87 --- /dev/null +++ b/docs/examples/ tersorrt-llm/playground.yaml @@ -0,0 +1,25 @@ +apiVersion: llmaz.io/v1alpha1 +kind: OpenModel +metadata: + name: qwen2-0--5b +spec: + familyName: qwen2 + source: + modelHub: + modelID: Qwen/Qwen2-0.5B-Instruct + inferenceConfig: + flavors: + - name: a10 # GPU type + limits: + nvidia.com/gpu: 1 +--- +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 1 + modelClaim: + modelName: qwen2-0--5b + backendRuntimeConfig: + backendName: tensorrt-llm diff --git a/docs/examples/README.md 
b/docs/examples/README.md index 0f5ea9ae..6733d004 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -9,11 +9,12 @@ We provide a set of examples to help you serve large language models, by default - [Deploy models from ObjectStore](#deploy-models-from-objectstore) - [Deploy models via SGLang](#deploy-models-via-sglang) - [Deploy models via llama.cpp](#deploy-models-via-llamacpp) -- [Deploy models via text-generation-inference](#deploy-models-via-tgi) -- [Deploy models via ollama](#ollama) +- [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm) +- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) +- [Deploy models via ollama](#deploy-models-via-ollama) - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) -- [Deploy multi-host inference](#multi-host-inference) -- [Deploy host models](#deploy-host-models) +- [Multi-Host Inference](#multi-host-inference) +- [Deploy Host Models](#deploy-host-models) - [Envoy AI Gateway](#envoy-ai-gateway) ### Deploy models from Huggingface @@ -46,6 +47,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here. +### Deploy models via TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs, see [example](./tensorrt-llm/) here. + ### Deploy models via text-generation-inference [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here. 
From d0dd3915504d457a0daf20328d4abb287f518e05 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Fri, 2 May 2025 12:56:03 +0800 Subject: [PATCH 8/8] fix folder name --- docs/examples/{ tersorrt-llm => tensorrt-llm}/playground.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/examples/{ tersorrt-llm => tensorrt-llm}/playground.yaml (100%) diff --git a/docs/examples/ tersorrt-llm/playground.yaml b/docs/examples/tensorrt-llm/playground.yaml similarity index 100% rename from docs/examples/ tersorrt-llm/playground.yaml rename to docs/examples/tensorrt-llm/playground.yaml