Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions chart/templates/backends/llamacpp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,24 @@ spec:
- ./llama-server
image: {{ .Values.backendRuntime.llamacpp.image.repository }}
version: {{ .Values.backendRuntime.llamacpp.image.tag }}
# Graceful shutdown: the preStop hook blocks container termination until
# llama.cpp reports no in-flight and no deferred requests on its Prometheus
# endpoint (enabled by the --metrics server flag). The loop itself has no
# upper bound; the pod's terminationGracePeriodSeconds is what ultimately
# caps how long the hook may run.
lifecycle:
  preStop:
    exec:
      command:
        - /bin/sh
        - -c
        - |
          while true; do
            # Scrape once per iteration so that both gauges are read from
            # the same snapshot; --max-time keeps a hung server from
            # blocking a single iteration indefinitely.
            METRICS=$(curl -s --max-time 5 http://localhost:8080/metrics)
            STATUS=$?
            if [ "$STATUS" -eq 7 ]; then
              # curl exit code 7 means the connection could not be
              # established: the server is no longer listening, so there
              # is nothing left to drain.
              echo "Terminating: Metrics endpoint is not reachable, safe to terminate" >> /proc/1/fd/1
              exit 0
            fi
            RUNNING=$(printf '%s\n' "$METRICS" | grep 'llamacpp:requests_processing' | grep -v '#' | awk '{print $2}')
            WAITING=$(printf '%s\n' "$METRICS" | grep 'llamacpp:requests_deferred' | grep -v '#' | awk '{print $2}')
            if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
              # Messages go to PID 1's stdout so they appear in the
              # container log.
              echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
              exit 0
            fi
            echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
            sleep 5
          done
# Do not edit the preset argument name unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
recommendedConfigs:
Expand All @@ -23,6 +41,7 @@ spec:
- "0.0.0.0"
- --port
- "8080"
- --metrics
resources:
requests:
cpu: 2
Expand Down
18 changes: 18 additions & 0 deletions chart/templates/backends/tgi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,24 @@ metadata:
spec:
image: {{ .Values.backendRuntime.tgi.image.repository }}
version: {{ .Values.backendRuntime.tgi.image.tag }}
# Graceful shutdown: the preStop hook blocks container termination until
# TGI reports an empty current batch and an empty queue on its Prometheus
# endpoint. The loop itself has no upper bound; the pod's
# terminationGracePeriodSeconds is what ultimately caps how long the hook
# may run.
lifecycle:
  preStop:
    exec:
      command:
        - /bin/sh
        - -c
        - |
          while true; do
            # Scrape once per iteration so that both gauges are read from
            # the same snapshot; --max-time keeps a hung server from
            # blocking a single iteration indefinitely.
            METRICS=$(curl -s --max-time 5 http://localhost:8080/metrics)
            STATUS=$?
            if [ "$STATUS" -eq 7 ]; then
              # curl exit code 7 means the connection could not be
              # established: the server is no longer listening, so there
              # is nothing left to drain.
              echo "Terminating: Metrics endpoint is not reachable, safe to terminate" >> /proc/1/fd/1
              exit 0
            fi
            RUNNING=$(printf '%s\n' "$METRICS" | grep 'tgi_batch_current_size' | grep -v '#' | awk '{print $2}')
            WAITING=$(printf '%s\n' "$METRICS" | grep 'tgi_queue_size' | grep -v '#' | awk '{print $2}')
            if [ "$RUNNING" = "0" ] && [ "$WAITING" = "0" ]; then
              # Messages go to PID 1's stdout so they appear in the
              # container log.
              echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
              exit 0
            fi
            echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
            sleep 5
          done
# Do not edit the preset argument name unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
recommendedConfigs:
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/tgi/playground.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ spec:
modelClaim:
modelName: qwen2-0--5b
backendRuntimeConfig:
name: tgi
backendName: tgi
3 changes: 3 additions & 0 deletions pkg/controller/inference/playground_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro

template := corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
// TODO: The timeout value is taken from https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.3.0/config/manifests/vllm/gpu-deployment.yaml#L170-L226.
// We should support this in the upstream inference engines in the future.
TerminationGracePeriodSeconds: ptr.To[int64](130),
Comment thread
cr7258 marked this conversation as resolved.
Comment thread
cr7258 marked this conversation as resolved.
// TODO: should we support image pull secret here?
Containers: []corev1.Container{
{
Expand Down
Loading