From 475aa469df5085a5537354144d852972ac5bf23c Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Thu, 1 May 2025 22:46:40 +0800
Subject: [PATCH 1/8] feat: add TensorRT-LLM as backend

---
 chart/templates/backends/tensorrt-llm.yaml | 52 ++++++++++++++++++++++
 chart/values.global.yaml                   |  4 ++
 docs/examples/tensorrt-llm/playground.yaml | 25 +++++++++++
 test/config/backends/tensorrt-llm.yaml     | 51 +++++++++++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 chart/templates/backends/tensorrt-llm.yaml
 create mode 100644 docs/examples/tensorrt-llm/playground.yaml
 create mode 100644 test/config/backends/tensorrt-llm.yaml

diff --git a/chart/templates/backends/tensorrt-llm.yaml b/chart/templates/backends/tensorrt-llm.yaml
new file mode 100644
index 00000000..9a3083d4
--- /dev/null
+++ b/chart/templates/backends/tensorrt-llm.yaml
@@ -0,0 +1,52 @@
+{{- if .Values.backendRuntime.enabled -}}
+apiVersion: inference.llmaz.io/v1alpha1
+kind: BackendRuntime  # TensorRT-LLM backend; rendered only when backendRuntime.enabled is set
+metadata:
+  labels:
+    app.kubernetes.io/name: backendruntime
+    app.kubernetes.io/part-of: llmaz
+    app.kubernetes.io/created-by: llmaz
+  name: tensorrt-llm
+spec:
+  command:
+    - trtllm-serve
+  image: {{ .Values.backendRuntime.tensorrt_llm.image.repository | quote }}
+  version: {{ .Values.backendRuntime.tensorrt_llm.image.tag | quote }}  # quoted so a numeric-looking tag (e.g. 25.03) stays a string
+  # Do not edit the preset argument name unless you know what you are doing.
+  # Feel free to add more arguments to suit your requirements.
+  recommendedConfigs:
+    - name: default
+      args:
+        - "{{`{{ .ModelPath }}`}}"  # backtick-escaped so Helm emits the inner placeholder verbatim
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"  # the probes below target this same port
+      resources:
+        requests:
+          cpu: 4
+          memory: 16Gi
+        limits:
+          cpu: 4
+          memory: 16Gi
+  startupProbe:
+    periodSeconds: 10
+    failureThreshold: 30  # with Kubernetes probe semantics: 30 x 10s, i.e. up to ~300s to start
+    httpGet:
+      path: /health
+      port: 8080
+  livenessProbe:
+    initialDelaySeconds: 15
+    periodSeconds: 10
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080
+{{- end }}
diff --git a/chart/values.global.yaml b/chart/values.global.yaml
index db67db00..d52fe509 100644
--- a/chart/values.global.yaml
+++ b/chart/values.global.yaml
@@ -14,6 +14,10 @@ backendRuntime:
     image:
       repository: lmsysorg/sglang
       tag: v0.4.5-cu121
+  tensorrt_llm:  # read by chart/templates/backends/tensorrt-llm.yaml
+    image:
+      repository: nvcr.io/nvidia/tritonserver  # rendered into spec.image
+      tag: 25.03-trtllm-python-py3  # rendered into spec.version
   tgi:
     image:
       repository: ghcr.io/huggingface/text-generation-inference
diff --git a/docs/examples/tensorrt-llm/playground.yaml b/docs/examples/tensorrt-llm/playground.yaml
new file mode 100644
index 00000000..6bc41d87
--- /dev/null
+++ b/docs/examples/tensorrt-llm/playground.yaml
@@ -0,0 +1,25 @@
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: qwen2-0--5b
+spec:
+  familyName: qwen2
+  source:
+    modelHub:
+      modelID: Qwen/Qwen2-0.5B-Instruct
+  inferenceConfig:
+    flavors:
+      - name: a10 # GPU type
+        limits:
+          nvidia.com/gpu: 1
+---
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: qwen2-0--5b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: qwen2-0--5b
+  backendRuntimeConfig:
+    backendName: tensorrt-llm
diff --git a/test/config/backends/tensorrt-llm.yaml b/test/config/backends/tensorrt-llm.yaml
new file mode 100644
index 00000000..494630c1
--- /dev/null
+++ b/test/config/backends/tensorrt-llm.yaml
@@ -0,0 +1,51 @@
+apiVersion: inference.llmaz.io/v1alpha1
+kind: BackendRuntime  # static (non-Helm) copy of the TensorRT-LLM backend definition
+metadata:
+  labels:
+    app.kubernetes.io/name: backendruntime
+    app.kubernetes.io/part-of: llmaz
+    app.kubernetes.io/created-by: llmaz
+  name: tensorrt-llm
+spec:
+  command:
+    - trtllm-serve
+  image: nvcr.io/nvidia/tritonserver
+  version: 25.03-trtllm-python-py3
+  # Do not edit the preset argument name unless you know what you are doing.
+  # Feel free to add more arguments to suit your requirements.
+  recommendedConfigs:
+    - name: default
+      args:
+        - "{{`{{ .ModelPath }}`}}"  # NOTE(review): Helm-style backtick escaping in a non-chart file; confirm the consumer expects it
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"  # the probes below target this same port
+      sharedMemorySize: 2Gi  # NOTE(review): the chart template's default config does not set this; confirm which is intended
+      resources:
+        requests:
+          cpu: 4
+          memory: 16Gi
+        limits:
+          cpu: 4
+          memory: 16Gi
+  startupProbe:
+    periodSeconds: 10
+    failureThreshold: 30  # with Kubernetes probe semantics: 30 x 10s, i.e. up to ~300s to start
+    httpGet:
+      path: /health
+      port: 8080
+  livenessProbe:
+    initialDelaySeconds: 15
+    periodSeconds: 10
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080

From 2538c1db03feefba8d1e6e3f26942f256d479d0d Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Thu, 1 May 2025 22:54:32 +0800
Subject: [PATCH 2/8] update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cdd745c2..287a9246 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ Easy, advanced inference platform for large language models on Kubernetes
 ## Key Features
 
 - **Easy of Use**: People can quick deploy a LLM service with minimal configurations.
-- **Broad Backends Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md).
+- **Broad Backends Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Find the full list of supported backends [here](./docs/support-backends.md).
 - **Accelerator Fungibility**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance.
 - **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores. llmaz will automatically handle the model loading, requiring no effort from users.
 - **Multi-Host Support**: llmaz supports both single-host and multi-host scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 0. 

From 479dc5be92664f21bad3f5b3b4a86fd49412e7c6 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Thu, 1 May 2025 23:03:06 +0800
Subject: [PATCH 3/8] update readme

---
 examples/README.md                                       | 4 ++++
 {docs/examples => examples}/tensorrt-llm/playground.yaml | 0
 site/content/en/docs/integrations/support-backends.md    | 4 ++++
 3 files changed, 8 insertions(+)
 rename {docs/examples => examples}/tensorrt-llm/playground.yaml (100%)

diff --git a/examples/README.md b/examples/README.md
index 0f5ea9ae..58d9fb42 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -46,6 +46,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here.
 
+### Deploy models via TensorRT-LLM
+
+[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs, see [example](./tensorrt-llm/) here.
+
 ### Deploy models via text-generation-inference
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here.
diff --git a/docs/examples/tensorrt-llm/playground.yaml b/examples/tensorrt-llm/playground.yaml
similarity index 100%
rename from docs/examples/tensorrt-llm/playground.yaml
rename to examples/tensorrt-llm/playground.yaml
diff --git a/site/content/en/docs/integrations/support-backends.md b/site/content/en/docs/integrations/support-backends.md
index 0d2d5821..2331016a 100644
--- a/site/content/en/docs/integrations/support-backends.md
+++ b/site/content/en/docs/integrations/support-backends.md
@@ -13,6 +13,10 @@ If you want to integrate more backends into llmaz, please refer to this [PR](htt
 
 [SGLang](https://github.com/sgl-project/sglang) is yet another fast serving framework for large language models and vision language models.
 
+## TensorRT-LLM
+
+[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and supports state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that orchestrate the inference execution in a performant way.
+
 ## Text-Generation-Inference
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is a Rust, Python and gRPC server for text generation inference. Used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint.

From ceaf5200403557ab47a1c902fb780fb5cae25aa2 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Fri, 2 May 2025 12:44:09 +0800
Subject: [PATCH 4/8] remove example to resolve conflicts

---
 examples/README.md                    | 28 +++++++++++++--------------
 examples/tensorrt-llm/playground.yaml | 25 ------------------------
 2 files changed, 13 insertions(+), 40 deletions(-)
 delete mode 100644 examples/tensorrt-llm/playground.yaml

diff --git a/examples/README.md b/examples/README.md
index 58d9fb42..ddb9aaad 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -4,17 +4,19 @@ We provide a set of examples to help you serve large language models, by default
 
 ## Table of Contents
 
-- [Deploy models from Huggingface](#deploy-models-from-huggingface)
-- [Deploy models from ModelScope](#deploy-models-from-modelscope)
-- [Deploy models from ObjectStore](#deploy-models-from-objectstore)
-- [Deploy models via SGLang](#deploy-models-via-sglang)
-- [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
-- [Deploy models via text-generation-inference](#deploy-models-via-tgi)
-- [Deploy models via ollama](#ollama)
-- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
-- [Deploy multi-host inference](#multi-host-inference)
-- [Deploy host models](#deploy-host-models)
-- [Envoy AI Gateway](#envoy-ai-gateway)
+- [Examples](#examples)
+  - [Table of Contents](#table-of-contents)
+    - [Deploy models from Huggingface](#deploy-models-from-huggingface)
+    - [Deploy models from ModelScope](#deploy-models-from-modelscope)
+    - [Deploy models from ObjectStore](#deploy-models-from-objectstore)
+    - [Deploy models via SGLang](#deploy-models-via-sglang)
+    - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
+    - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
+    - [Deploy models via ollama](#deploy-models-via-ollama)
+    - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
+    - [Multi-Host Inference](#multi-host-inference)
+    - [Deploy Host Models](#deploy-host-models)
+    - [Envoy AI Gateway](#envoy-ai-gateway)
 
 ### Deploy models from Huggingface
 
@@ -46,10 +48,6 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here.
 
-### Deploy models via TensorRT-LLM
-
-[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and support state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs, see [example](./tensorrt-llm/) here.
-
 ### Deploy models via text-generation-inference
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here.
diff --git a/examples/tensorrt-llm/playground.yaml b/examples/tensorrt-llm/playground.yaml
deleted file mode 100644
index 6bc41d87..00000000
--- a/examples/tensorrt-llm/playground.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: llmaz.io/v1alpha1
-kind: OpenModel
-metadata:
-  name: qwen2-0--5b
-spec:
-  familyName: qwen2
-  source:
-    modelHub:
-      modelID: Qwen/Qwen2-0.5B-Instruct
-  inferenceConfig:
-    flavors:
-      - name: a10 # GPU type
-        limits:
-          nvidia.com/gpu: 1
----
-apiVersion: inference.llmaz.io/v1alpha1
-kind: Playground
-metadata:
-  name: qwen2-0--5b
-spec:
-  replicas: 1
-  modelClaim:
-    modelName: qwen2-0--5b
-  backendRuntimeConfig:
-    backendName: tensorrt-llm

From 087b3d505da5cd6979ba4f6db331f10b15867542 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Fri, 2 May 2025 12:46:49 +0800
Subject: [PATCH 5/8] remove example to resolve conflicts

---
 examples/README.md | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index ddb9aaad..5a9883f8 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -4,19 +4,17 @@ We provide a set of examples to help you serve large language models, by default
 
 ## Table of Contents
 
-- [Examples](#examples)
-  - [Table of Contents](#table-of-contents)
-    - [Deploy models from Huggingface](#deploy-models-from-huggingface)
-    - [Deploy models from ModelScope](#deploy-models-from-modelscope)
-    - [Deploy models from ObjectStore](#deploy-models-from-objectstore)
-    - [Deploy models via SGLang](#deploy-models-via-sglang)
-    - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
-    - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
-    - [Deploy models via ollama](#deploy-models-via-ollama)
-    - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
-    - [Multi-Host Inference](#multi-host-inference)
-    - [Deploy Host Models](#deploy-host-models)
-    - [Envoy AI Gateway](#envoy-ai-gateway)
+- [Deploy models from Huggingface](#deploy-models-from-huggingface)
+- [Deploy models from ModelScope](#deploy-models-from-modelscope)
+- [Deploy models from ObjectStore](#deploy-models-from-objectstore)
+- [Deploy models via SGLang](#deploy-models-via-sglang)
+- [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
+- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
+- [Deploy models via ollama](#deploy-models-via-ollama)
+- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
+- [Multi-Host Inference](#multi-host-inference)
+- [Deploy Host Models](#deploy-host-models)
+- [Envoy AI Gateway](#envoy-ai-gateway)
 
 ### Deploy models from Huggingface
 

From 1eb8a21190cc249759bbfe925d3c830940820705 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Fri, 2 May 2025 12:49:13 +0800
Subject: [PATCH 6/8] fix

---
 examples/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 5a9883f8..0f5ea9ae 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,11 +9,11 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models from ObjectStore](#deploy-models-from-objectstore)
 - [Deploy models via SGLang](#deploy-models-via-sglang)
 - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
-- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
-- [Deploy models via ollama](#deploy-models-via-ollama)
+- [Deploy models via text-generation-inference](#deploy-models-via-tgi)
+- [Deploy models via ollama](#ollama)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
-- [Multi-Host Inference](#multi-host-inference)
-- [Deploy Host Models](#deploy-host-models)
+- [Deploy multi-host inference](#multi-host-inference)
+- [Deploy host models](#deploy-host-models)
 - [Envoy AI Gateway](#envoy-ai-gateway)
 
 ### Deploy models from Huggingface

From 38040dbb502829cf8c156daf5ae5684207a5ce61 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Fri, 2 May 2025 12:53:33 +0800
Subject: [PATCH 7/8] add tensorrt-llm example

---
 docs/examples/ tersorrt-llm/playground.yaml | 25 +++++++++++++++++++++
 docs/examples/README.md                     | 13 +++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)
 create mode 100644 docs/examples/ tersorrt-llm/playground.yaml

diff --git a/docs/examples/ tersorrt-llm/playground.yaml b/docs/examples/ tersorrt-llm/playground.yaml
new file mode 100644
index 00000000..6bc41d87
--- /dev/null
+++ b/docs/examples/ tersorrt-llm/playground.yaml	
@@ -0,0 +1,25 @@
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel  # model definition claimed by the Playground below
+metadata:
+  name: qwen2-0--5b
+spec:
+  familyName: qwen2
+  source:
+    modelHub:
+      modelID: Qwen/Qwen2-0.5B-Instruct
+  inferenceConfig:
+    flavors:
+      - name: a10 # GPU type
+        limits:
+          nvidia.com/gpu: 1  # one GPU for this flavor
+---
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: qwen2-0--5b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: qwen2-0--5b  # same name as the OpenModel above
+  backendRuntimeConfig:
+    backendName: tensorrt-llm  # selects the BackendRuntime named tensorrt-llm
diff --git a/docs/examples/README.md b/docs/examples/README.md
index 0f5ea9ae..6733d004 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -9,11 +9,12 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models from ObjectStore](#deploy-models-from-objectstore)
 - [Deploy models via SGLang](#deploy-models-via-sglang)
 - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
-- [Deploy models via text-generation-inference](#deploy-models-via-tgi)
-- [Deploy models via ollama](#ollama)
+- [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
+- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
+- [Deploy models via ollama](#deploy-models-via-ollama)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
-- [Deploy multi-host inference](#multi-host-inference)
-- [Deploy host models](#deploy-host-models)
+- [Multi-Host Inference](#multi-host-inference)
+- [Deploy Host Models](#deploy-host-models)
 - [Envoy AI Gateway](#envoy-ai-gateway)
 
 ### Deploy models from Huggingface
@@ -46,6 +47,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here.
 
+### Deploy models via TensorRT-LLM
+
+[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an easy-to-use Python API to define Large Language Models (LLMs) and supports state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. See the [example](./tensorrt-llm/) here.
+
 ### Deploy models via text-generation-inference
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here.

From d0dd3915504d457a0daf20328d4abb287f518e05 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Fri, 2 May 2025 12:56:03 +0800
Subject: [PATCH 8/8] fix folder name

---
 docs/examples/{ tersorrt-llm => tensorrt-llm}/playground.yaml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/examples/{ tersorrt-llm => tensorrt-llm}/playground.yaml (100%)

diff --git a/docs/examples/ tersorrt-llm/playground.yaml b/docs/examples/tensorrt-llm/playground.yaml
similarity index 100%
rename from docs/examples/ tersorrt-llm/playground.yaml
rename to docs/examples/tensorrt-llm/playground.yaml