From b5d8563d11f41fac7edd4a8953fe8996bc7958ee Mon Sep 17 00:00:00 2001
From: kerthcet <kerthcet@gmail.com>
Date: Wed, 9 Oct 2024 19:00:19 +0800
Subject: [PATCH] Support TGI as another backendruntime

Signed-off-by: kerthcet <kerthcet@gmail.com>
---
 README.md                                     |  2 +-
 chart/templates/backends/tgi.yaml             | 29 +++++++++++++++++++
 docs/examples/README.md                       |  5 ++++
 docs/examples/tgi/model.yaml                  | 13 +++++++++
 docs/examples/tgi/playground.yaml             | 10 +++++++
 docs/support-backends.md                      |  6 ++++
 test/config/backends/tgi.yaml                 | 27 +++++++++++++++++
 .../controller/inference/playground_test.go   | 28 ++++++++++++++++++
 8 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 chart/templates/backends/tgi.yaml
 create mode 100644 docs/examples/tgi/model.yaml
 create mode 100644 docs/examples/tgi/playground.yaml
 create mode 100644 test/config/backends/tgi.yaml

diff --git a/README.md b/README.md
index a05f3d78..ffa42ce4 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Easy, advanced inference platform for large language models on Kubernetes
 ## Features Overview
 
 - **Easy of Use**: People can quick deploy a LLM service with minimal configurations.
-- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md).
+- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md).
 - **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to support elastic scenarios.
 - **Accelerator Fungibility (WIP)**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance.
 - **SOTA Inference**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes.
diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml
new file mode 100644
index 00000000..d6a67420
--- /dev/null
+++ b/chart/templates/backends/tgi.yaml
@@ -0,0 +1,29 @@
+{{- if .Values.backendRuntime.install -}}
+apiVersion: inference.llmaz.io/v1alpha1
+kind: BackendRuntime
+metadata:
+  labels:
+    app.kubernetes.io/name: backendruntime
+    app.kubernetes.io/part-of: llmaz
+    app.kubernetes.io/created-by: llmaz
+  name: tgi
+spec:
+  image: ghcr.io/huggingface/text-generation-inference
+  version: 2.3.1
+  # Do not edit the preset argument name unless you know what you're doing.
+  # Feel free to add more arguments to suit your requirements.
+  args:
+    - name: default
+      flags:
+        - --model-id
+        - "{{`{{ .ModelPath }}`}}"
+        - --port
+        - "8080"
+  resources:
+    requests:
+      cpu: 4
+      memory: 8Gi
+    limits:
+      cpu: 4
+      memory: 8Gi
+{{- end }}
diff --git a/docs/examples/README.md b/docs/examples/README.md
index b690c225..e011cbe6 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -9,6 +9,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models from ObjectStore](#deploy-models-from-objectstore)
 - [Deploy models via SGLang](#deploy-models-via-sglang)
 - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
+- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 
 ### Deploy models from Huggingface
@@ -41,6 +42,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here.
 
+### Deploy models via text-generation-inference
+
+[text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. See [example](./tgi/) here.
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/tgi/model.yaml b/docs/examples/tgi/model.yaml
new file mode 100644
index 00000000..fe0ef7c1
--- /dev/null
+++ b/docs/examples/tgi/model.yaml
@@ -0,0 +1,13 @@
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: qwen2-0--5b
+spec:
+  familyName: qwen2
+  source:
+    modelHub:
+      modelID: Qwen/Qwen2-0.5B-Instruct
+  inferenceFlavors:
+  - name: t4 # GPU type
+    requests:
+      nvidia.com/gpu: 1
diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml
new file mode 100644
index 00000000..4f453f80
--- /dev/null
+++ b/docs/examples/tgi/playground.yaml
@@ -0,0 +1,10 @@
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: qwen2-0--5b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: qwen2-0--5b
+  backendRuntimeConfig:
+    name: tgi
diff --git a/docs/support-backends.md b/docs/support-backends.md
index 49a57484..2a1a27a4 100644
--- a/docs/support-backends.md
+++ b/docs/support-backends.md
@@ -1,5 +1,7 @@
 # All Kinds of Supported Inference Backends
 
+If you want to integrate more backends into llmaz, please refer to this [PR](https://github.com/InftyAI/llmaz/pull/182). Contributions are always welcome.
+
 ## llama.cpp
 
 [llama.cpp](https://github.com/ggerganov/llama.cpp) is to enable LLM inference with minimal setup and state-of-the-art performance on a wide variety of hardware - locally and in the cloud.
@@ -8,6 +10,10 @@
 
 [SGLang](https://github.com/sgl-project/sglang) is yet another fast serving framework for large language models and vision language models.
 
+## Text-Generation-Inference
+
+[text-generation-inference](https://github.com/huggingface/text-generation-inference) is a Rust, Python and gRPC server for text generation inference. It is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint.
+
 ## vLLM
 
 [vLLM](https://github.com/vllm-project/vllm) is a high-throughput and memory-efficient inference and serving engine for LLMs
diff --git a/test/config/backends/tgi.yaml b/test/config/backends/tgi.yaml
new file mode 100644
index 00000000..69d4145e
--- /dev/null
+++ b/test/config/backends/tgi.yaml
@@ -0,0 +1,27 @@
+apiVersion: inference.llmaz.io/v1alpha1
+kind: BackendRuntime
+metadata:
+  labels:
+    app.kubernetes.io/name: backendruntime
+    app.kubernetes.io/part-of: llmaz
+    app.kubernetes.io/created-by: llmaz
+  name: tgi
+spec:
+  image: ghcr.io/huggingface/text-generation-inference
+  version: 2.3.1
+  # Do not edit the preset argument name unless you know what you're doing.
+  # Feel free to add more arguments to suit your requirements.
+  args:
+    - name: default
+      flags:
+        - --model-id
+        - "{{`{{ .ModelPath }}`}}"
+        - --port
+        - "8080"
+  resources:
+    requests:
+      cpu: 4
+      memory: 8Gi
+    limits:
+      cpu: 4
+      memory: 8Gi
diff --git a/test/integration/controller/inference/playground_test.go b/test/integration/controller/inference/playground_test.go
index 36f6b110..3814aa52 100644
--- a/test/integration/controller/inference/playground_test.go
+++ b/test/integration/controller/inference/playground_test.go
@@ -236,6 +236,34 @@ var _ = ginkgo.Describe("playground controller test", func() {
 				},
 			},
 		}),
+		ginkgo.Entry("advance configured Playground with tgi", &testValidatingCase{
+			makePlayground: func() *inferenceapi.Playground {
+				return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
+					BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR").
+					BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
+					Obj()
+			},
+			updates: []*update{
+				{
+					updateFunc: func(playground *inferenceapi.Playground) {
+						gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
+					},
+					checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) {
+						validation.ValidatePlayground(ctx, k8sClient, playground)
+						validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundProgressing, "Pending", metav1.ConditionTrue)
+					},
+				},
+				{
+					updateFunc: func(playground *inferenceapi.Playground) {
+						util.UpdateLwsToReady(ctx, k8sClient, playground.Name, playground.Namespace)
+					},
+					checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) {
+						validation.ValidatePlayground(ctx, k8sClient, playground)
+						validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
+					},
+				},
+			},
+		}),
 		ginkgo.Entry("playground is created when service exists with the same name", &testValidatingCase{
 			makePlayground: func() *inferenceapi.Playground {
 				return util.MockASamplePlayground(ns.Name)