InftyAI · InftyAI-Agent · Apr 27, 2025 · Apr 27, 2025 · Apr 27, 2025 · Apr 27, 2025
diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml
@@ -12,6 +12,31 @@ spec:
     - ./llama-server
   image: {{ .Values.backendRuntime.llamacpp.image.repository }}
   version: {{ .Values.backendRuntime.llamacpp.image.tag }}
+  # Graceful shutdown: the preStop hook polls the llama-server metrics endpoint
+  # (served when the server is started with --metrics) every 5 seconds and
+  # exits only once both the processing and deferred request gauges read 0.
+  # Progress is logged to the stdout of PID 1.
+  lifecycle:
+    preStop:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do
+              # Scrape once per iteration so both gauges come from the same snapshot;
+              # --max-time keeps an unresponsive server from stalling the loop.
+              METRICS=$(curl -s --max-time 5 http://localhost:8080/metrics)
+              RUNNING=$(printf '%s\n' "$METRICS" | awk '/llamacpp:requests_processing/ && !/#/ {print $2}')
+              WAITING=$(printf '%s\n' "$METRICS" | awk '/llamacpp:requests_deferred/ && !/#/ {print $2}')
+              if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              else
+                echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                sleep 5
+              fi
+            done
   # Do not edit the preset argument name unless you know what you're doing.
   # Free to add more arguments with your requirements.
   recommendedConfigs:
@@ -23,6 +41,7 @@ spec:
         - "0.0.0.0"
         - --port
         - "8080"
+        - --metrics
       resources:
         requests:
           cpu: 2

diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml
@@ -10,6 +10,31 @@ metadata:
 spec:
   image: {{ .Values.backendRuntime.tgi.image.repository }}
   version: {{ .Values.backendRuntime.tgi.image.tag }}
+  # Graceful shutdown: the preStop hook polls the TGI metrics endpoint every
+  # 5 seconds and exits only once both tgi_batch_current_size and
+  # tgi_queue_size read 0. Progress is logged to the stdout of PID 1.
+  # NOTE(review): assumes the image ships curl and serves on port 8080 — confirm.
+  lifecycle:
+    preStop:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do
+              # Scrape once per iteration so both gauges come from the same snapshot;
+              # --max-time keeps an unresponsive server from stalling the loop.
+              METRICS=$(curl -s --max-time 5 http://localhost:8080/metrics)
+              RUNNING=$(printf '%s\n' "$METRICS" | awk '/tgi_batch_current_size/ && !/#/ {print $2}')
+              WAITING=$(printf '%s\n' "$METRICS" | awk '/tgi_queue_size/ && !/#/ {print $2}')
+              if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              else
+                echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                sleep 5
+              fi
+            done
   # Do not edit the preset argument name unless you know what you're doing.
   # Free to add more arguments with your requirements.
   recommendedConfigs:

diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml
@@ -22,4 +22,4 @@ spec:
   modelClaim:
     modelName: qwen2-0--5b
   backendRuntimeConfig:
-    name: tgi
+    backendName: tgi
diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go
@@ -330,6 +330,9 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro
 
 	template := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
+			// TODO: The timeout value is borrowed from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
+			// We should support this in the upstream inference engines in the future.
+			TerminationGracePeriodSeconds: ptr.To[int64](130),
 			// TODO: should we support image pull secret here?
 			Containers: []corev1.Container{
 				{