diff --git a/README.md b/README.md index 46c40893..eb7e846e 100644 --- a/README.md +++ b/README.md @@ -67,9 +67,9 @@ spec: modelHub: modelID: facebook/opt-125m inferenceFlavors: - - name: t4 # GPU type - requests: - nvidia.com/gpu: 1 + - name: t4 # GPU type + requests: + nvidia.com/gpu: 1 ``` #### Inference Playground @@ -124,12 +124,11 @@ If you want to learn more about this project, please refer to [develop.md](./doc - CLI tool support - Model training, fine tuning in the long-term - ## Community Join us for more discussions: -* **Slack Channel**: [#llmaz](https://inftyai.slack.com/archives/C06D0BGEQ1G) +- **Slack Channel**: [#llmaz](https://inftyai.slack.com/archives/C06D0BGEQ1G) ## Contributions diff --git a/api/inference/v1alpha1/backendruntime_types.go b/api/inference/v1alpha1/backendruntime_types.go index 72dc3408..f766f7e6 100644 --- a/api/inference/v1alpha1/backendruntime_types.go +++ b/api/inference/v1alpha1/backendruntime_types.go @@ -63,6 +63,22 @@ type BackendRuntimeSpec struct { // accelerators like GPU should not be defined here, but at the model flavors, // or the values here will be overwritten. Resources ResourceRequirements `json:"resources"` + // Periodic probe of backend liveness. + // Backend will be restarted if the probe fails. + // Cannot be updated. + // +optional + LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"` + // Periodic probe of backend readiness. + // Backend will be removed from service endpoints if the probe fails. + // +optional + ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"` + // StartupProbe indicates that the Backend has successfully initialized. + // If specified, no other probes are executed until this completes successfully. + // If this probe fails, the backend will be restarted, just as if the livenessProbe failed. 
+ // This can be used to provide different probe parameters at the beginning of a backend's lifecycle, + // when it might take a long time to load data or warm a cache, than during steady-state operation. + // +optional + StartupProbe *corev1.Probe `json:"startupProbe,omitempty"` } // BackendRuntimeStatus defines the observed state of BackendRuntime diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml index 634c5e8a..2eedfab6 100644 --- a/chart/templates/backends/llamacpp.yaml +++ b/chart/templates/backends/llamacpp.yaml @@ -23,6 +23,7 @@ spec: - "0.0.0.0" - --port - "8080" + # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. - name: speculative-decoding flags: - -m @@ -40,4 +41,24 @@ spec: limits: cpu: 2 memory: 4Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 {{- end }} diff --git a/chart/templates/backends/sglang.yaml b/chart/templates/backends/sglang.yaml index f7cc1e8c..86a5b44d 100644 --- a/chart/templates/backends/sglang.yaml +++ b/chart/templates/backends/sglang.yaml @@ -34,4 +34,24 @@ spec: limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health_generate + port: 8080 {{- end }} diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml index d6a67420..5a2bd87c 100644 --- a/chart/templates/backends/tgi.yaml +++ b/chart/templates/backends/tgi.yaml @@ -26,4 +26,24 @@ spec: 
limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 {{- end }} diff --git a/chart/templates/backends/vllm.yaml b/chart/templates/backends/vllm.yaml index 1a52874e..2a2888c0 100644 --- a/chart/templates/backends/vllm.yaml +++ b/chart/templates/backends/vllm.yaml @@ -107,4 +107,24 @@ spec: limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 {{- end }} diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml index 21c8639f..81967e3c 100644 --- a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml +++ b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml @@ -194,6 +194,160 @@ spec: Image represents the default image registry of the backendRuntime. It will work together with version to make up a real image. type: string + livenessProbe: + description: |- + Periodic probe of backend liveness. + Backend will be restarted if the probe fails. + Cannot be updated. + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + properties: + port: + description: Port number of the gRPC service. Number must + be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows + repeated headers. + items: + description: HTTPHeader describes a custom header to be + used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). 
+ This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object multiHostCommands: description: |- MultiHostCommands represents leader and worker commands for nodes with @@ -208,6 +362,159 @@ spec: type: string type: array type: object + readinessProbe: + description: |- + Periodic probe of backend readiness. + Backend will be removed from service endpoints if the probe fails. + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + properties: + port: + description: Port number of the gRPC service. Number must + be in the range 1 to 65535. + format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). 
+ + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows + repeated headers. + items: + description: HTTPHeader describes a custom header to be + used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. 
+ format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. 
+ More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object resources: description: |- Resources represents the resource requirements for backendRuntime, like cpu/mem, @@ -239,6 +546,162 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object + startupProbe: + description: |- + StartupProbe indicates that the Backend has successfully initialized. + If specified, no other probes are executed until this completes successfully. + If this probe fails, the backend will be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters at the beginning of a backend's lifecycle, + when it might take a long time to load data or warm a cache, than during steady-state operation. + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: |- + Command is the command line to execute inside the container, the working directory for the + command is root ('/') in the container's filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use + a shell, you need to explicitly call out to that shell. + Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + properties: + port: + description: Port number of the gRPC service. Number must + be in the range 1 to 65535. 
+ format: int32 + type: integer + service: + default: "" + description: |- + Service is the name of the service to place in the gRPC HealthCheckRequest + (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + + If this is not specified, the default behavior is defined by gRPC. + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: |- + Host name to connect to, defaults to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows + repeated headers. + items: + description: HTTPHeader describes a custom header to be + used in HTTP probes + properties: + name: + description: |- + The header field name. + This will be canonicalized upon output, so case-variant names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + x-kubernetes-list-type: atomic + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Name or number of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: |- + Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. 
+ format: int32 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: |- + Number or name of the port to access on the container. + Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: |- + Optional duration in seconds the pod needs to terminate gracefully upon probe failure. + The grace period is the duration in seconds after the processes running in the pod are sent + a termination signal and the time when the processes are forcibly halted with a kill signal. + Set this value longer than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this + value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates stop immediately via + the kill signal (no opportunity to shut down). + This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. + Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + type: object version: description: |- Version represents the default version of the backendRuntime. 
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 045cb509..aab3a173 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -4,5 +4,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: inftyai/llmaz - newTag: v0.0.9 + newName: inftyai/test + newTag: llmaz-011701 diff --git a/docs/examples/multi-nodes/model.yaml b/docs/examples/multi-nodes/model.yaml index 11bd2a30..513939fb 100644 --- a/docs/examples/multi-nodes/model.yaml +++ b/docs/examples/multi-nodes/model.yaml @@ -7,7 +7,7 @@ spec: source: modelHub: # TODO: - modelID: Qwen/Qwen2-0.5B + modelID: meta-llama/Llama-3.1-405B inferenceFlavors: - name: a100-80gb requests: diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index ea9f3780..c6174747 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -313,6 +313,17 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro args = nil } + var livenessProbe, readinessProbe, startupProbe *corev1.Probe + if backendRuntime.Spec.StartupProbe != nil { + startupProbe = backendRuntime.Spec.StartupProbe + } + if backendRuntime.Spec.LivenessProbe != nil { + livenessProbe = backendRuntime.Spec.LivenessProbe + } + if backendRuntime.Spec.ReadinessProbe != nil { + readinessProbe = backendRuntime.Spec.ReadinessProbe + } + template := corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ // TODO: should we support image pull secret here? 
@@ -332,6 +343,9 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro ContainerPort: modelSource.DEFAULT_BACKEND_PORT, }, }, + StartupProbe: startupProbe, + LivenessProbe: livenessProbe, + ReadinessProbe: readinessProbe, }, }, }, diff --git a/test/config/backends/llamacpp.yaml b/test/config/backends/llamacpp.yaml index da57e3d3..df1fe360 100644 --- a/test/config/backends/llamacpp.yaml +++ b/test/config/backends/llamacpp.yaml @@ -20,6 +20,7 @@ spec: - "0.0.0.0" - --port - "8080" + # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. - name: speculative-decoding flags: - -m @@ -30,6 +31,10 @@ spec: - "0.0.0.0" - --port - "8080" + - --draft-max + - "16" + - --draft-min + - "5" resources: requests: cpu: 2 @@ -37,3 +42,23 @@ spec: limits: cpu: 2 memory: 4Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 diff --git a/test/config/backends/ollama.yaml b/test/config/backends/ollama.yaml index 23ed462e..d5e347b0 100644 --- a/test/config/backends/ollama.yaml +++ b/test/config/backends/ollama.yaml @@ -16,14 +16,14 @@ spec: - name: default flags: - "ollama serve & - while true; do output=$(ollama list 2>&1); - if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; - ollama run {{`{{ .ModelName }}`}}; - while true;do sleep 60;done" + while true; do output=$(ollama list 2>&1); + if ! 
echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; + ollama run {{`{{ .ModelName }}`}}; + while true;do sleep 60;done" resources: requests: cpu: 2 memory: 4Gi limits: cpu: 2 - memory: 4Gi \ No newline at end of file + memory: 4Gi diff --git a/test/config/backends/sglang.yaml b/test/config/backends/sglang.yaml index 8d5b4eaf..3eb4fab7 100644 --- a/test/config/backends/sglang.yaml +++ b/test/config/backends/sglang.yaml @@ -31,3 +31,23 @@ spec: limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health_generate + port: 8080 diff --git a/test/config/backends/tgi.yaml b/test/config/backends/tgi.yaml index 13eeed7a..75235192 100644 --- a/test/config/backends/tgi.yaml +++ b/test/config/backends/tgi.yaml @@ -25,3 +25,23 @@ spec: limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 diff --git a/test/config/backends/vllm.yaml b/test/config/backends/vllm.yaml index 384d9a97..7ecbd873 100644 --- a/test/config/backends/vllm.yaml +++ b/test/config/backends/vllm.yaml @@ -106,3 +106,23 @@ spec: limits: cpu: 4 memory: 8Gi + startupProbe: + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /health + port: 8080 + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + httpGet: + path: 
/health + port: 8080 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8080 diff --git a/test/util/validation/validate_playground.go b/test/util/validation/validate_playground.go index 10181b4d..0ef25a89 100644 --- a/test/util/validation/validate_playground.go +++ b/test/util/validation/validate_playground.go @@ -112,6 +112,8 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground // compare the same part of leader and worker template, image, version, env, resources. if playground.Spec.BackendRuntimeConfig != nil { + + // compare image & version if playground.Spec.BackendRuntimeConfig.Version != nil { if parser.Image(*playground.Spec.BackendRuntimeConfig.Version) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image { return fmt.Errorf("expected container image %s, got %s", parser.Image(*playground.Spec.BackendRuntimeConfig.Version), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image) @@ -187,6 +189,41 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground } } } + + // compare probes + if backendRuntime.Spec.StartupProbe != nil { + if multiHost { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { + return fmt.Errorf("unexpected startupProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { + return fmt.Errorf("unexpected startupProbe") + } + } + } + if backendRuntime.Spec.LivenessProbe != nil { + if multiHost { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { + return 
fmt.Errorf("unexpected livenessProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { + return fmt.Errorf("unexpected livenessProbe") + } + } + } + if backendRuntime.Spec.ReadinessProbe != nil { + if multiHost { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { + return fmt.Errorf("unexpected readinessProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { + return fmt.Errorf("unexpected readinessProbe") + } + } + } } // compare the different parts. diff --git a/test/util/wrapper/backend.go b/test/util/wrapper/backend.go index 4deb1b52..c82a7887 100644 --- a/test/util/wrapper/backend.go +++ b/test/util/wrapper/backend.go @@ -17,7 +17,7 @@ limitations under the License. 
package wrapper
 
 import (
-	v1 "k8s.io/api/core/v1"
+	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
@@ -72,16 +72,35 @@ func (w *BackendRuntimeWrapper) Arg(name string, flags []string) *BackendRuntime
 
+// Request sets the resource request named r to the quantity parsed from v,
+// allocating the Requests list on first use, and returns w for chaining.
 func (w *BackendRuntimeWrapper) Request(r, v string) *BackendRuntimeWrapper {
 	if w.Spec.Resources.Requests == nil {
-		w.Spec.Resources.Requests = v1.ResourceList{}
+		w.Spec.Resources.Requests = corev1.ResourceList{}
 	}
-	w.Spec.Resources.Requests[v1.ResourceName(r)] = resource.MustParse(v)
+	w.Spec.Resources.Requests[corev1.ResourceName(r)] = resource.MustParse(v)
 	return w
 }
 
+// Limit sets the resource limit named r to the quantity parsed from v,
+// allocating the Limits list on first use, and returns w for chaining.
 func (w *BackendRuntimeWrapper) Limit(r, v string) *BackendRuntimeWrapper {
 	if w.Spec.Resources.Limits == nil {
-		w.Spec.Resources.Limits = v1.ResourceList{}
+		w.Spec.Resources.Limits = corev1.ResourceList{}
 	}
-	w.Spec.Resources.Limits[v1.ResourceName(r)] = resource.MustParse(v)
+	w.Spec.Resources.Limits[corev1.ResourceName(r)] = resource.MustParse(v)
+	return w
+}
+
+// Probe attaches probe to the spec field selected by name: "liveness",
+// "readiness" or "startup". Any other name leaves the spec untouched.
+// It returns w for chaining.
+func (w *BackendRuntimeWrapper) Probe(name string, probe *corev1.Probe) *BackendRuntimeWrapper {
+	switch name {
+	case "liveness":
+		w.Spec.LivenessProbe = probe
+	case "readiness":
+		w.Spec.ReadinessProbe = probe
+	case "startup":
+		w.Spec.StartupProbe = probe
+	}
 	return w
 }