From 28d8f3d6363a8a1a1db642addaca7cf410440342 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Sun, 27 Apr 2025 13:45:04 +0800 Subject: [PATCH 1/4] feat: add preStop hook for llamacpp and tgi in the BackendRuntime --- chart/templates/backends/llamacpp.yaml | 19 +++++++++++++++++++ chart/templates/backends/tgi.yaml | 18 ++++++++++++++++++ docs/examples/tgi/playground.yaml | 2 +- .../inference/playground_controller.go | 1 + 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml index 150e2378..fb6ae6d0 100644 --- a/chart/templates/backends/llamacpp.yaml +++ b/chart/templates/backends/llamacpp.yaml @@ -12,6 +12,24 @@ spec: - ./llama-server image: {{ .Values.backendRuntime.llamacpp.image.repository }} version: {{ .Values.backendRuntime.llamacpp.image.tag }} + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - | + while true; do + RUNNING=$(curl -s http://localhost:8080/metrics | grep 'llamacpp:requests_processing' | grep -v '#' | awk '{print $2}') + WAITING=$(curl -s http://localhost:8080/metrics | grep 'llamacpp:requests_deferred' | grep -v '#' | awk '{print $2}') + if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then + echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1 + exit 0 + else + echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1 + sleep 5 + fi + done # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. recommendedConfigs: @@ -23,6 +41,7 @@ spec: - "0.0.0.0" - --port - "8080" + - --metrics resources: requests: cpu: 2 diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml index 693964ee..39641705 100644 --- a/chart/templates/backends/tgi.yaml +++ b/chart/templates/backends/tgi.yaml @@ -10,6 +10,24 @@ metadata: spec: image: {{ .Values.backendRuntime.tgi.image.repository }} version: {{ .Values.backendRuntime.tgi.image.tag }} + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - | + while true; do + RUNNING=$(curl -s http://localhost:8080/metrics | grep 'tgi_batch_current_size' | grep -v '#' | awk '{print $2}') + WAITING=$(curl -s http://localhost:8080/metrics | grep 'tgi_queue_size' | grep -v '#' | awk '{print $2}') + if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then + echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1 + exit 0 + else + echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1 + sleep 5 + fi + done # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. recommendedConfigs: diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml index 3edad83a..d6d4021c 100644 --- a/docs/examples/tgi/playground.yaml +++ b/docs/examples/tgi/playground.yaml @@ -22,4 +22,4 @@ spec: modelClaim: modelName: qwen2-0--5b backendRuntimeConfig: - name: tgi + backendName: tgi diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index f33866f1..ecb3c977 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -330,6 +330,7 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ + TerminationGracePeriodSeconds: ptr.To[int64](130), // TODO: should we support image pull secret here? Containers: []corev1.Container{ { From d6e7c2122aaa111d49b15bb8f6bd42d232e6587e Mon Sep 17 00:00:00 2001 From: Se7en Date: Sun, 27 Apr 2025 21:14:17 +0800 Subject: [PATCH 2/4] Add TODO --- pkg/controller/inference/playground_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index ecb3c977..b7b69a59 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -330,6 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ + // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. + // We should support this in the upstream inference engines in the future. TerminationGracePeriodSeconds: ptr.To[int64](130), // TODO: should we support image pull secret here? Containers: []corev1.Container{ From ae1ae3e6157d26c8ae0d47b63189dff0ede42f41 Mon Sep 17 00:00:00 2001 From: Se7en Date: Sun, 27 Apr 2025 21:15:49 +0800 Subject: [PATCH 3/4] Remove extra blank --- pkg/controller/inference/playground_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index b7b69a59..24f79c32 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -330,8 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. - // We should support this in the upstream inference engines in the future. + // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. + // We should support this in the upstream inference engines in the future. TerminationGracePeriodSeconds: ptr.To[int64](130), // TODO: should we support image pull secret here? Containers: []corev1.Container{ From 5935aef2841701ad9b4cc9372e857d2f6c2e2a34 Mon Sep 17 00:00:00 2001 From: cr7258 Date: Sun, 27 Apr 2025 21:19:37 +0800 Subject: [PATCH 4/4] fix golang lint --- pkg/controller/inference/playground_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index 24f79c32..73dd19ad 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -330,8 +330,8 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. - // We should support this in the upstream inference engines in the future. + // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. + // We should support this in the upstream inference engines in the future. TerminationGracePeriodSeconds: ptr.To[int64](130), // TODO: should we support image pull secret here? Containers: []corev1.Container{