From 28d8f3d6363a8a1a1db642addaca7cf410440342 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Sun, 27 Apr 2025 13:45:04 +0800
Subject: [PATCH 1/4] feat: add preStop hook for llamacpp and tgi in the
 BackendRuntime

---
 chart/templates/backends/llamacpp.yaml        | 19 +++++++++++++++++++
 chart/templates/backends/tgi.yaml             | 18 ++++++++++++++++++
 docs/examples/tgi/playground.yaml             |  2 +-
 .../inference/playground_controller.go        |  1 +
 4 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml
index 150e2378..fb6ae6d0 100644
--- a/chart/templates/backends/llamacpp.yaml
+++ b/chart/templates/backends/llamacpp.yaml
@@ -12,6 +12,24 @@ spec:
     - ./llama-server
   image: {{ .Values.backendRuntime.llamacpp.image.repository }}
   version: {{ .Values.backendRuntime.llamacpp.image.tag }}
+  lifecycle:
+    preStop:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do
+              RUNNING=$(curl -s http://localhost:8080/metrics | grep 'llamacpp:requests_processing' | grep -v '#' | awk '{print $2}')
+              WAITING=$(curl -s http://localhost:8080/metrics | grep 'llamacpp:requests_deferred' | grep -v '#' | awk '{print $2}')
+              if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              else
+                echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                sleep 5
+              fi
+            done
   # Do not edit the preset argument name unless you know what you're doing.
   # Free to add more arguments with your requirements.
   recommendedConfigs:
@@ -23,6 +41,7 @@ spec:
         - "0.0.0.0"
         - --port
         - "8080"
+        - --metrics
       resources:
         requests:
           cpu: 2
diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml
index 693964ee..39641705 100644
--- a/chart/templates/backends/tgi.yaml
+++ b/chart/templates/backends/tgi.yaml
@@ -10,6 +10,24 @@ metadata:
 spec:
   image: {{ .Values.backendRuntime.tgi.image.repository }}
   version: {{ .Values.backendRuntime.tgi.image.tag }}
+  lifecycle:
+    preStop:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do
+              RUNNING=$(curl -s http://localhost:8080/metrics | grep 'tgi_batch_current_size' | grep -v '#' | awk '{print $2}')
+              WAITING=$(curl -s http://localhost:8080/metrics | grep 'tgi_queue_size' | grep -v '#' | awk '{print $2}')
+              if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              else
+                echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                sleep 5
+              fi
+            done
   # Do not edit the preset argument name unless you know what you're doing.
   # Free to add more arguments with your requirements.
   recommendedConfigs:
diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml
index 3edad83a..d6d4021c 100644
--- a/docs/examples/tgi/playground.yaml
+++ b/docs/examples/tgi/playground.yaml
@@ -22,4 +22,4 @@ spec:
   modelClaim:
     modelName: qwen2-0--5b
   backendRuntimeConfig:
-    name: tgi
+    backendName: tgi
diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go
index f33866f1..ecb3c977 100644
--- a/pkg/controller/inference/playground_controller.go
+++ b/pkg/controller/inference/playground_controller.go
@@ -330,6 +330,7 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro
 
 	template := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
+			TerminationGracePeriodSeconds: ptr.To[int64](130),
 			// TODO: should we support image pull secret here?
 			Containers: []corev1.Container{
 				{

From d6e7c2122aaa111d49b15bb8f6bd42d232e6587e Mon Sep 17 00:00:00 2001
From: Se7en <chengzw258@163.com>
Date: Sun, 27 Apr 2025 21:14:17 +0800
Subject: [PATCH 2/4] Add TODO

---
 pkg/controller/inference/playground_controller.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go
index ecb3c977..b7b69a59 100644
--- a/pkg/controller/inference/playground_controller.go
+++ b/pkg/controller/inference/playground_controller.go
@@ -330,6 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro
 
 	template := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
+                         // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
+                         // We should support this in the upstream inference engines in the future.
 			TerminationGracePeriodSeconds: ptr.To[int64](130),
 			// TODO: should we support image pull secret here?
 			Containers: []corev1.Container{

From ae1ae3e6157d26c8ae0d47b63189dff0ede42f41 Mon Sep 17 00:00:00 2001
From: Se7en <chengzw258@163.com>
Date: Sun, 27 Apr 2025 21:15:49 +0800
Subject: [PATCH 3/4] Remove extra leading space from TODO comment

---
 pkg/controller/inference/playground_controller.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go
index b7b69a59..24f79c32 100644
--- a/pkg/controller/inference/playground_controller.go
+++ b/pkg/controller/inference/playground_controller.go
@@ -330,8 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro
 
 	template := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
-                         // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
-                         // We should support this in the upstream inference engines in the future.
+                        // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
+                        // We should support this in the upstream inference engines in the future.
 			TerminationGracePeriodSeconds: ptr.To[int64](130),
 			// TODO: should we support image pull secret here?
 			Containers: []corev1.Container{

From 5935aef2841701ad9b4cc9372e857d2f6c2e2a34 Mon Sep 17 00:00:00 2001
From: cr7258 <chengzw258@163.com>
Date: Sun, 27 Apr 2025 21:19:37 +0800
Subject: [PATCH 4/4] fix golang lint

---
 pkg/controller/inference/playground_controller.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go
index 24f79c32..73dd19ad 100644
--- a/pkg/controller/inference/playground_controller.go
+++ b/pkg/controller/inference/playground_controller.go
@@ -330,8 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro
 
 	template := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
-                        // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
-                        // We should support this in the upstream inference engines in the future.
+			// TODO: The grace period value is borrowed from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
+			// In the future, graceful termination (request draining) should be supported by the upstream inference engines themselves.
 			TerminationGracePeriodSeconds: ptr.To[int64](130),
 			// TODO: should we support image pull secret here?
 			Containers: []corev1.Container{