diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml
index 150e2378..fb6ae6d0 100644
--- a/chart/templates/backends/llamacpp.yaml
+++ b/chart/templates/backends/llamacpp.yaml
@@ -12,6 +12,24 @@ spec:
     - ./llama-server
   image: {{ .Values.backendRuntime.llamacpp.image.repository }}
   version: {{ .Values.backendRuntime.llamacpp.image.tag }}
+  lifecycle:
+    preStop:  # Drain hook: block container shutdown until llama-server reports no processing/deferred requests.
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do  # the kubelet still enforces terminationGracePeriodSeconds on this loop
+              METRICS=$(curl -sf --max-time 5 http://localhost:8080/metrics) || { echo "Terminating: metrics endpoint unavailable, skip draining" >> /proc/1/fd/1; exit 0; }
+              RUNNING=$(printf '%s\n' "$METRICS" | awk '!/^#/ && /llamacpp:requests_processing/ {print $2}')  # both gauges come from one scrape so they describe the same instant
+              WAITING=$(printf '%s\n' "$METRICS" | awk '!/^#/ && /llamacpp:requests_deferred/ {print $2}')
+              if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              fi
+              echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+              sleep 5  # poll interval between scrapes
+            done
   # Do not edit the preset argument name unless you know what you're doing.
   # Free to add more arguments with your requirements.
recommendedConfigs: @@ -23,6 +41,7 @@ spec: - "0.0.0.0" - --port - "8080" + - --metrics resources: requests: cpu: 2 diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml index 693964ee..39641705 100644 --- a/chart/templates/backends/tgi.yaml +++ b/chart/templates/backends/tgi.yaml @@ -10,6 +10,24 @@ metadata: spec: image: {{ .Values.backendRuntime.tgi.image.repository }} version: {{ .Values.backendRuntime.tgi.image.tag }} + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - | + while true; do + RUNNING=$(curl -s http://localhost:8080/metrics | grep 'tgi_batch_current_size' | grep -v '#' | awk '{print $2}') + WAITING=$(curl -s http://localhost:8080/metrics | grep 'tgi_queue_size' | grep -v '#' | awk '{print $2}') + if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then + echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1 + exit 0 + else + echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1 + sleep 5 + fi + done # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. 
recommendedConfigs: diff --git a/docs/examples/tgi/playground.yaml b/docs/examples/tgi/playground.yaml index 3edad83a..d6d4021c 100644 --- a/docs/examples/tgi/playground.yaml +++ b/docs/examples/tgi/playground.yaml @@ -22,4 +22,4 @@ spec: modelClaim: modelName: qwen2-0--5b backendRuntimeConfig: - name: tgi + backendName: tgi diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index f33866f1..73dd19ad 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -330,6 +330,9 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ + // TODO: The timeout is mainly quoted from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226. + // We should support this in the upstream inference engines in the future. + TerminationGracePeriodSeconds: ptr.To[int64](130), // TODO: should we support image pull secret here? Containers: []corev1.Container{ {