From b5d8563d11f41fac7edd4a8953fe8996bc7958ee Mon Sep 17 00:00:00 2001 From: kerthcet Date: Wed, 9 Oct 2024 19:00:19 +0800 Subject: [PATCH] Support TGI as another backendruntime Signed-off-by: kerthcet --- README.md | 2 +- chart/templates/backends/tgi.yaml | 29 +++++++++++++++++++ docs/examples/README.md | 5 ++++ docs/examples/tgi/model.yaml | 13 +++++++++ docs/examples/tgi/playground.yaml | 10 +++++++ docs/support-backends.md | 6 ++++ test/config/backends/tgi.yaml | 27 +++++++++++++++++ .../controller/inference/playground_test.go | 28 ++++++++++++++++++ 8 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 chart/templates/backends/tgi.yaml create mode 100644 docs/examples/tgi/model.yaml create mode 100644 docs/examples/tgi/playground.yaml create mode 100644 test/config/backends/tgi.yaml diff --git a/README.md b/README.md index a05f3d78..ffa42ce4 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Easy, advanced inference platform for large language models on Kubernetes ## Features Overview - **Easy of Use**: People can quick deploy a LLM service with minimal configurations. -- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). +- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). 
- **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to support elastic scenarios. - **Accelerator Fungibility (WIP)**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance. - **SOTA Inference**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes. diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml new file mode 100644 index 00000000..d6a67420 --- /dev/null +++ b/chart/templates/backends/tgi.yaml @@ -0,0 +1,29 @@ +{{- if .Values.backendRuntime.install -}} +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: tgi +spec: + image: ghcr.io/huggingface/text-generation-inference + version: 2.3.1 + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. 
+ args: + - name: default + flags: + - --model-id + - "{{`{{ .ModelPath }}`}}" + - --port + - "8080" + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi +{{- end }} diff --git a/docs/examples/README.md b/docs/examples/README.md index b690c225..e011cbe6 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -9,6 +9,7 @@ We provide a set of examples to help you serve large language models, by default - [Deploy models from ObjectStore](#deploy-models-from-objectstore) - [Deploy models via SGLang](#deploy-models-via-sglang) - [Deploy models via llama.cpp](#deploy-models-via-llamacpp) +- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference) - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm) ### Deploy models from Huggingface @@ -41,6 +42,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference [llama.cpp](https://github.com/ggerganov/llama.cpp) can serve models on a wide variety of hardwares, such as CPU, see [example](./llamacpp/) here. +### Deploy models via text-generation-inference + +[text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. See [example](./tgi/) here. + ### Speculative Decoding with vLLM [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here. 
+ diff --git a/docs/examples/tgi/model.yaml b/docs/examples/tgi/model.yaml new file mode 100644 index 00000000..fe0ef7c1 --- /dev/null +++ b/docs/examples/tgi/model.yaml @@ -0,0 +1,13 @@ +apiVersion: llmaz.io/v1alpha1 +kind: OpenModel +metadata: + name: qwen2-0--5b +spec: + familyName: qwen2 + source: + modelHub: + modelID: Qwen/Qwen2-0.5B-Instruct + inferenceFlavors: + - name: t4 # GPU type + requests: + nvidia.com/gpu: 1 diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml new file mode 100644 index 00000000..4f453f80 --- /dev/null +++ b/docs/examples/tgi/playground.yaml @@ -0,0 +1,10 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 1 + modelClaim: + modelName: qwen2-0--5b + backendRuntimeConfig: + name: tgi diff --git a/docs/support-backends.md b/docs/support-backends.md index 49a57484..2a1a27a4 100644 --- a/docs/support-backends.md +++ b/docs/support-backends.md @@ -1,5 +1,7 @@ # All Kinds of Supported Inference Backends +If you want to integrate more backends into llmaz, please refer to this [PR](https://github.com/InftyAI/llmaz/pull/182). Contributions are always welcome. + ## llama.cpp [llama.cpp](https://github.com/ggerganov/llama.cpp) is to enable LLM inference with minimal setup and state-of-the-art performance on a wide variety of hardware - locally and in the cloud. @@ -8,6 +10,10 @@ [SGLang](https://github.com/sgl-project/sglang) is yet another fast serving framework for large language models and vision language models. +## Text-Generation-Inference + +[text-generation-inference](https://github.com/huggingface/text-generation-inference) is a Rust, Python and gRPC server for text generation inference. It is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. 
+ ## vLLM [vLLM](https://github.com/vllm-project/vllm) is a high-throughput and memory-efficient inference and serving engine for LLMs diff --git a/test/config/backends/tgi.yaml b/test/config/backends/tgi.yaml new file mode 100644 index 00000000..69d4145e --- /dev/null +++ b/test/config/backends/tgi.yaml @@ -0,0 +1,27 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: tgi +spec: + image: ghcr.io/huggingface/text-generation-inference + version: 2.3.1 + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. + args: + - name: default + flags: + - --model-id + - "{{`{{ .ModelPath }}`}}" + - --port + - "8080" + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi diff --git a/test/integration/controller/inference/playground_test.go b/test/integration/controller/inference/playground_test.go index 36f6b110..3814aa52 100644 --- a/test/integration/controller/inference/playground_test.go +++ b/test/integration/controller/inference/playground_test.go @@ -236,6 +236,34 @@ var _ = ginkgo.Describe("playground controller test", func() { }, }, }), + ginkgo.Entry("advance configured Playground with tgi", &testValidatingCase{ + makePlayground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). + BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). 
+ Obj() + }, + updates: []*update{ + { + updateFunc: func(playground *inferenceapi.Playground) { + gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed()) + }, + checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) { + validation.ValidatePlayground(ctx, k8sClient, playground) + validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundProgressing, "Pending", metav1.ConditionTrue) + }, + }, + { + updateFunc: func(playground *inferenceapi.Playground) { + util.UpdateLwsToReady(ctx, k8sClient, playground.Name, playground.Namespace) + }, + checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) { + validation.ValidatePlayground(ctx, k8sClient, playground) + validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue) + }, + }, + }, + }), ginkgo.Entry("playground is created when service exists with the same name", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return util.MockASamplePlayground(ns.Name)