From cd2963192bedb45b8d3175a8ac840163ef83505f Mon Sep 17 00:00:00 2001 From: kerthcet Date: Tue, 18 Feb 2025 11:21:13 +0800 Subject: [PATCH 1/3] Add recommendedConfigs to backendRuntime Signed-off-by: kerthcet --- .../v1alpha1/backendruntime_types.go | 61 +- api/inference/v1alpha1/config_types.go | 56 +- api/inference/v1alpha1/playground_types.go | 15 +- .../v1alpha1/zz_generated.deepcopy.go | 143 +- chart/templates/backends/llamacpp.yaml | 29 +- chart/templates/backends/ollama.yaml | 34 +- chart/templates/backends/sglang.yaml | 18 +- chart/templates/backends/tgi.yaml | 18 +- chart/templates/backends/vllm.yaml | 22 +- .../inference/v1alpha1/backendruntimearg.go | 49 - .../v1alpha1/backendruntimeconfig.go | 48 +- .../inference/v1alpha1/elasticconfig.go | 22 +- .../inference/v1alpha1/scaletriggerref.go | 38 - client-go/applyconfiguration/utils.go | 4 - .../inference.llmaz.io_backendruntimes.yaml | 1236 +++++++++-------- .../bases/inference.llmaz.io_playgrounds.yaml | 127 +- docs/examples/hpa/playground.yaml | 13 +- docs/examples/llamacpp/playground.yaml | 7 +- docs/examples/ollama/playground.yaml | 2 +- docs/examples/sglang/playground.yaml | 2 +- .../llamacpp/playground.yaml | 6 +- .../inference/playground_controller.go | 127 +- .../inference/service_controller.go | 2 +- .../{ => backendruntime}/backendruntime.go | 67 +- .../backendruntime_test.go | 0 pkg/controller_helper/helper.go | 16 +- .../{model_source => modelsource}/modelhub.go | 0 .../modelsource.go | 0 .../modelsource_test.go | 0 .../{model_source => modelsource}/uri.go | 0 pkg/webhook/backendruntime_webhook.go | 21 +- pkg/webhook/openmodel_webhook.go | 2 +- pkg/webhook/playground_webhook.go | 10 +- pkg/webhook/service_webhook.go | 2 +- test/config/backends/fake_backend.yaml | 51 +- test/config/backends/llamacpp.yaml | 22 +- test/config/backends/ollama.yaml | 22 +- test/config/backends/sglang.yaml | 20 +- test/config/backends/tgi.yaml | 18 +- test/config/backends/vllm.yaml | 22 +- 
test/e2e/playground_test.go | 4 +- .../controller/inference/hpa_test.go | 9 +- .../controller/inference/playground_test.go | 12 +- .../webhook/backendruntime_test.go | 13 +- test/util/mock.go | 2 +- test/util/validation/validate_playground.go | 222 ++- test/util/validation/validate_service.go | 2 +- test/util/wrapper/backend.go | 78 +- test/util/wrapper/playground.go | 36 +- 49 files changed, 1322 insertions(+), 1408 deletions(-) delete mode 100644 client-go/applyconfiguration/inference/v1alpha1/backendruntimearg.go delete mode 100644 client-go/applyconfiguration/inference/v1alpha1/scaletriggerref.go rename pkg/controller_helper/{ => backendruntime}/backendruntime.go (66%) rename pkg/controller_helper/{ => backendruntime}/backendruntime_test.go (100%) rename pkg/controller_helper/{model_source => modelsource}/modelhub.go (100%) rename pkg/controller_helper/{model_source => modelsource}/modelsource.go (100%) rename pkg/controller_helper/{model_source => modelsource}/modelsource_test.go (100%) rename pkg/controller_helper/{model_source => modelsource}/uri.go (100%) diff --git a/api/inference/v1alpha1/backendruntime_types.go b/api/inference/v1alpha1/backendruntime_types.go index cee21e2d..d26829df 100644 --- a/api/inference/v1alpha1/backendruntime_types.go +++ b/api/inference/v1alpha1/backendruntime_types.go @@ -19,22 +19,10 @@ package v1alpha1 import ( autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// BackendRuntimeArg is the preset arguments for easy to use. -// Three preset names are provided: default, speculative-decoding, model-parallelism, -// do not change the name. -type BackendRuntimeArg struct { - // Name represents the identifier of the backendRuntime argument. - // +kubebuilder:default=default - // +optional - Name *string `json:"name,omitempty"` - // Flags represents all the preset configurations. 
- // Flag around with {{ .CONFIG }} is a configuration waiting for render. - Flags []string `json:"flags,omitempty"` -} - // HPATrigger represents the configuration of the HorizontalPodAutoscaler. // Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec. // Note: HPA component should be installed in prior. @@ -55,17 +43,6 @@ type HPATrigger struct { Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` } -// NamedScaleTrigger defines the rules to scale the workloads. -// Only one trigger cloud work at a time. The name is used to identify -// the trigger in backendRuntime. -type NamedScaleTrigger struct { - // Name represents the identifier of the scale trigger, e.g. some triggers defined for - // latency sensitive workloads, some are defined for throughput sensitive workloads. - Name string `json:"name,omitempty"` - // HPA represents the trigger configuration of the HorizontalPodAutoscaler. - HPA *HPATrigger `json:"hpa,omitempty"` -} - // ScaleTrigger defines the rules to scale the workloads. // Only one trigger cloud work at a time, mostly used in Playground. type ScaleTrigger struct { @@ -83,6 +60,30 @@ type MultiHostCommands struct { Worker []string `json:"worker,omitempty"` } +// RecommendedConfig represents the recommended configurations for the backendRuntime, +// user can choose one of them to apply. +type RecommendedConfig struct { + // Name represents the identifier of the config. + Name string `json:"name"` + // Args represents all the arguments for the command. + // Argument around with {{ .CONFIG }} is a configuration waiting for render. + // +optional + Args []string `json:"args,omitempty"` + // Resources represents the resource requirements for backend, like cpu/mem, + // accelerators like GPU should not be defined here, but at the model flavors, + // or the values here will be overwritten. 
+ // +optional + Resources *ResourceRequirements `json:"resources,omitempty"` + // SharedMemorySize represents the size of /dev/shm required in the runtime of + // inference workload. + // +optional + SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"` + // ScaleTrigger defines the rules to scale the workloads. + // Only one trigger could work at a time. + // +optional + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` +} + // BackendRuntimeSpec defines the desired state of BackendRuntime type BackendRuntimeSpec struct { // Commands represents the default commands for the backendRuntime. @@ -98,16 +99,9 @@ type BackendRuntimeSpec struct { // Version represents the default version of the backendRuntime. // It will be appended to the image as a tag. Version string `json:"version"` - // Args represents the preset arguments of the backendRuntime. - // They can be appended or overwritten by the Playground backendRuntimeConfig. - Args []BackendRuntimeArg `json:"args,omitempty"` // Envs represents the environments set to the container. // +optional Envs []corev1.EnvVar `json:"envs,omitempty"` - // Resources represents the resource requirements for backendRuntime, like cpu/mem, - // accelerators like GPU should not be defined here, but at the model flavors, - // or the values here will be overwritten. - Resources ResourceRequirements `json:"resources"` // Periodic probe of backend liveness. // Backend will be restarted if the probe fails. // Cannot be updated. @@ -124,10 +118,9 @@ type BackendRuntimeSpec struct { // when it might take a long time to load data or warm a cache, than during steady-state operation. // +optional StartupProbe *corev1.Probe `json:"startupProbe,omitempty"` - // ScaleTriggers represents a set of triggers preset to be used by Playground. - // If Playground not specify the scale trigger, the 0-index trigger will be used. + // RecommendedConfigs represents the recommended configurations for the backendRuntime.
// +optional - ScaleTriggers []NamedScaleTrigger `json:"scaleTriggers,omitempty"` + RecommendedConfigs []RecommendedConfig `json:"recommendedConfigs,omitempty"` } // BackendRuntimeStatus defines the observed state of BackendRuntime diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go index 3bcf7fca..d95add61 100644 --- a/api/inference/v1alpha1/config_types.go +++ b/api/inference/v1alpha1/config_types.go @@ -28,10 +28,10 @@ const ( ) type BackendRuntimeConfig struct { - // Name represents the inference backend under the hood, e.g. vLLM. + // BackendName represents the inference backend under the hood, e.g. vLLM. // +kubebuilder:default=vllm // +optional - Name *BackendName `json:"name,omitempty"` + BackendName *BackendName `json:"backendName,omitempty"` // Version represents the backend version if you want a different one // from the default version. // +optional @@ -39,18 +39,32 @@ type BackendRuntimeConfig struct { // Envs represents the environments set to the container. // +optional Envs []corev1.EnvVar `json:"envs,omitempty"` - + // ConfigName represents the recommended configuration name for the backend, + // It will be inferred from the models in the runtime if not specified, e.g. default, + // speculative-decoding or model-parallelism. + ConfigName *string `json:"configName,omitempty"` + // Args represents all the arguments for the command. + // Argument around with {{ .CONFIG }} is a configuration waiting for render. + // +optional + // Args defined here will "append" the args in the recommendedConfig. + // +optional + Args []string `json:"args,omitempty"` // Resources represents the resource requirements for backend, like cpu/mem, // accelerators like GPU should not be defined here, but at the model flavors, // or the values here will be overwritten. + // Resources defined here will "overwrite" the resources in the recommendedConfig. 
+ // +optional Resources *ResourceRequirements `json:"resources,omitempty"` // SharedMemorySize represents the size of /dev/shm required in the runtime of // inference workload. + // SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig. // +optional SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"` - // Args represents the specified arguments of the backendRuntime, - // will be append to the backendRuntime.spec.Args. - Args *BackendRuntimeArg `json:"args,omitempty"` + // ScaleTrigger defines the rules to scale the workloads. + // Only one trigger could work at a time, mostly used in Playground. + // ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig. + // +optional + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` } // TODO: Do not support DRA yet, we can support that once needed. @@ -66,33 +80,3 @@ type ResourceRequirements struct { // +optional Requests corev1.ResourceList `json:"requests,omitempty"` } - -// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime. -type ScaleTriggerRef struct { - // Name represents the scale trigger name defined in the backendRuntime.scaleTriggers. - Name string `json:"name"` -} - -type ElasticConfig struct { - // MinReplicas indicates the minimum number of inference workloads based on the traffic. - // Default to 1. - // MinReplicas couldn't be 0 now, will support serverless in the future. - // +kubebuilder:default=1 - // +optional - MinReplicas *int32 `json:"minReplicas,omitempty"` - // MaxReplicas indicates the maximum number of inference workloads based on the traffic. - // Default to nil means there's no limit for the instance number. - // +optional - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - // ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime - // with tuned target value. - // ScaleTriggerRef and ScaleTrigger can't be set at the same time.
- // +optional - ScaleTriggerRef *ScaleTriggerRef `json:"scaleTriggerRef,omitempty"` - // ScaleTrigger defines a set of triggers to scale the workloads. - // If not defined, trigger configured in backendRuntime will be used, - // otherwise, trigger defined here will overwrite the defaulted ones. - // ScaleTriggerRef and ScaleTrigger can't be set at the same time. - // +optional - ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` -} diff --git a/api/inference/v1alpha1/playground_types.go b/api/inference/v1alpha1/playground_types.go index 1afaf33a..4c18ae75 100644 --- a/api/inference/v1alpha1/playground_types.go +++ b/api/inference/v1alpha1/playground_types.go @@ -44,11 +44,22 @@ type PlaygroundSpec struct { BackendRuntimeConfig *BackendRuntimeConfig `json:"backendRuntimeConfig,omitempty"` // ElasticConfig defines the configuration for elastic usage, // e.g. the max/min replicas. - // Note: this requires to install the HPA first or will report error. - // +optional ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"` } +type ElasticConfig struct { + // MinReplicas indicates the minimum number of inference workloads based on the traffic. + // Default to 1. + // MinReplicas couldn't be 0 now, will support serverless in the future. + // +kubebuilder:default=1 + // +optional + MinReplicas *int32 `json:"minReplicas,omitempty"` + // MaxReplicas indicates the maximum number of inference workloads based on the traffic. + // Default to nil means there's no limit for the instance number. + // +optional + MaxReplicas *int32 `json:"maxReplicas,omitempty"` +} + const ( // PlaygroundProgressing means the Playground is progressing now, such as waiting for the // inference service creation, rolling update or scaling up and down. 
diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go index 4a7a8cff..cde4189b 100644 --- a/api/inference/v1alpha1/zz_generated.deepcopy.go +++ b/api/inference/v1alpha1/zz_generated.deepcopy.go @@ -55,36 +55,11 @@ func (in *BackendRuntime) DeepCopyObject() runtime.Object { return nil } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *BackendRuntimeArg) DeepCopyInto(out *BackendRuntimeArg) { - *out = *in - if in.Name != nil { - in, out := &in.Name, &out.Name - *out = new(string) - **out = **in - } - if in.Flags != nil { - in, out := &in.Flags, &out.Flags - *out = make([]string, len(*in)) - copy(*out, *in) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackendRuntimeArg. -func (in *BackendRuntimeArg) DeepCopy() *BackendRuntimeArg { - if in == nil { - return nil - } - out := new(BackendRuntimeArg) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *BackendRuntimeConfig) DeepCopyInto(out *BackendRuntimeConfig) { *out = *in - if in.Name != nil { - in, out := &in.Name, &out.Name + if in.BackendName != nil { + in, out := &in.BackendName, &out.BackendName *out = new(BackendName) **out = **in } @@ -93,11 +68,6 @@ func (in *BackendRuntimeConfig) DeepCopyInto(out *BackendRuntimeConfig) { *out = new(string) **out = **in } - if in.Args != nil { - in, out := &in.Args, &out.Args - *out = new(BackendRuntimeArg) - (*in).DeepCopyInto(*out) - } if in.Envs != nil { in, out := &in.Envs, &out.Envs *out = make([]v1.EnvVar, len(*in)) @@ -105,6 +75,16 @@ func (in *BackendRuntimeConfig) DeepCopyInto(out *BackendRuntimeConfig) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.ConfigName != nil { + in, out := &in.ConfigName, &out.ConfigName + *out = new(string) + **out = **in + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(ResourceRequirements) @@ -115,6 +95,11 @@ func (in *BackendRuntimeConfig) DeepCopyInto(out *BackendRuntimeConfig) { x := (*in).DeepCopy() *out = &x } + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackendRuntimeConfig. 
@@ -172,13 +157,6 @@ func (in *BackendRuntimeSpec) DeepCopyInto(out *BackendRuntimeSpec) { *out = new(MultiHostCommands) (*in).DeepCopyInto(*out) } - if in.Args != nil { - in, out := &in.Args, &out.Args - *out = make([]BackendRuntimeArg, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } if in.Envs != nil { in, out := &in.Envs, &out.Envs *out = make([]v1.EnvVar, len(*in)) @@ -186,7 +164,6 @@ func (in *BackendRuntimeSpec) DeepCopyInto(out *BackendRuntimeSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - in.Resources.DeepCopyInto(&out.Resources) if in.LivenessProbe != nil { in, out := &in.LivenessProbe, &out.LivenessProbe *out = new(v1.Probe) @@ -202,9 +179,9 @@ func (in *BackendRuntimeSpec) DeepCopyInto(out *BackendRuntimeSpec) { *out = new(v1.Probe) (*in).DeepCopyInto(*out) } - if in.ScaleTriggers != nil { - in, out := &in.ScaleTriggers, &out.ScaleTriggers - *out = make([]NamedScaleTrigger, len(*in)) + if in.RecommendedConfigs != nil { + in, out := &in.RecommendedConfigs, &out.RecommendedConfigs + *out = make([]RecommendedConfig, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -256,16 +233,6 @@ func (in *ElasticConfig) DeepCopyInto(out *ElasticConfig) { *out = new(int32) **out = **in } - if in.ScaleTriggerRef != nil { - in, out := &in.ScaleTriggerRef, &out.ScaleTriggerRef - *out = new(ScaleTriggerRef) - **out = **in - } - if in.ScaleTrigger != nil { - in, out := &in.ScaleTrigger, &out.ScaleTrigger - *out = new(ScaleTrigger) - (*in).DeepCopyInto(*out) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ElasticConfig. @@ -330,26 +297,6 @@ func (in *MultiHostCommands) DeepCopy() *MultiHostCommands { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *NamedScaleTrigger) DeepCopyInto(out *NamedScaleTrigger) { - *out = *in - if in.HPA != nil { - in, out := &in.HPA, &out.HPA - *out = new(HPATrigger) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NamedScaleTrigger. -func (in *NamedScaleTrigger) DeepCopy() *NamedScaleTrigger { - if in == nil { - return nil - } - out := new(NamedScaleTrigger) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Playground) DeepCopyInto(out *Playground) { *out = *in @@ -471,6 +418,41 @@ func (in *PlaygroundStatus) DeepCopy() *PlaygroundStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RecommendedConfig) DeepCopyInto(out *RecommendedConfig) { + *out = *in + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.SharedMemorySize != nil { + in, out := &in.SharedMemorySize, &out.SharedMemorySize + x := (*in).DeepCopy() + *out = &x + } + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecommendedConfig. +func (in *RecommendedConfig) DeepCopy() *RecommendedConfig { + if in == nil { + return nil + } + out := new(RecommendedConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ResourceRequirements) DeepCopyInto(out *ResourceRequirements) { *out = *in @@ -520,21 +502,6 @@ func (in *ScaleTrigger) DeepCopy() *ScaleTrigger { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ScaleTriggerRef) DeepCopyInto(out *ScaleTriggerRef) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScaleTriggerRef. -func (in *ScaleTriggerRef) DeepCopy() *ScaleTriggerRef { - if in == nil { - return nil - } - out := new(ScaleTriggerRef) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Service) DeepCopyInto(out *Service) { *out = *in diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml index 2b85c24c..2923f07d 100644 --- a/chart/templates/backends/llamacpp.yaml +++ b/chart/templates/backends/llamacpp.yaml @@ -14,33 +14,22 @@ spec: version: server # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. - args: + recommendedConfigs: - name: default - flags: + args: - -m - "{{`{{ .ModelPath }}`}}" - --host - "0.0.0.0" - --port - "8080" - # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. 
- # - name: speculative-decoding - # flags: - # - -m - # - "{{`{{ .ModelPath }}`}}" - # - -md - # - "{{`{{ .DraftModelPath }}`}}" - # - --host - # - "0.0.0.0" - # - --port - # - "8080" - resources: - requests: - cpu: 2 - memory: 4Gi - limits: - cpu: 2 - memory: 4Gi + resources: + requests: + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 4Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/chart/templates/backends/ollama.yaml b/chart/templates/backends/ollama.yaml index e931d616..83efb1d7 100644 --- a/chart/templates/backends/ollama.yaml +++ b/chart/templates/backends/ollama.yaml @@ -13,24 +13,24 @@ spec: - -c image: ollama/ollama version: latest + envs: + - name: OLLAMA_HOST + value: 0.0.0.0:8080 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. - args: + recommendedConfigs: - name: default - flags: + args: - "ollama serve & - while true; do output=$(ollama list 2>&1); - if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; - ollama run {{`{{ .ModelName }}`}}; - while true;do sleep 60;done" - envs: - - name: OLLAMA_HOST - value: 0.0.0.0:8080 - resources: - requests: - cpu: 2 - memory: 4Gi - limits: - cpu: 2 - memory: 4Gi -{{- end }} \ No newline at end of file + while true; do output=$(ollama list 2>&1); + if ! 
echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; + ollama run {{`{{ .ModelName }}`}}; + while true;do sleep 60;done" + resources: + requests: + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 4Gi +{{- end }} diff --git a/chart/templates/backends/sglang.yaml b/chart/templates/backends/sglang.yaml index 86a5b44d..335b3a01 100644 --- a/chart/templates/backends/sglang.yaml +++ b/chart/templates/backends/sglang.yaml @@ -16,9 +16,9 @@ spec: version: v0.2.10-cu121 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. - args: + recommendedConfigs: - name: default - flags: + args: - --model-path - "{{`{{ .ModelPath }}`}}" - --served-model-name @@ -27,13 +27,13 @@ spec: - "0.0.0.0" - --port - "8080" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/chart/templates/backends/tgi.yaml b/chart/templates/backends/tgi.yaml index 5a2bd87c..812be7e0 100644 --- a/chart/templates/backends/tgi.yaml +++ b/chart/templates/backends/tgi.yaml @@ -12,20 +12,20 @@ spec: version: 2.3.1 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. 
- args: + recommendedConfigs: - name: default - flags: + args: - --model-id - "{{`{{ .ModelPath }}`}}" - --port - "8080" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/chart/templates/backends/vllm.yaml b/chart/templates/backends/vllm.yaml index 2a2888c0..318cc87c 100644 --- a/chart/templates/backends/vllm.yaml +++ b/chart/templates/backends/vllm.yaml @@ -59,9 +59,9 @@ spec: version: v0.6.0 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. - args: + recommendedConfigs: - name: default - flags: + args: - --model - "{{`{{ .ModelPath }}`}}" - --served-model-name @@ -70,8 +70,15 @@ spec: - "0.0.0.0" - --port - "8080" + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi - name: speculative-decoding - flags: + args: - --model - "{{`{{ .ModelPath }}`}}" - --served-model-name @@ -87,7 +94,7 @@ spec: - -tp - "1" - name: model-parallelism - flags: + args: - --model - "{{`{{ .ModelPath }}`}}" - --served-model-name @@ -100,13 +107,6 @@ spec: - "{{`{{ .TP }}`}}" - --pipeline-parallel-size - "{{`{{ .PP }}`}}" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/client-go/applyconfiguration/inference/v1alpha1/backendruntimearg.go b/client-go/applyconfiguration/inference/v1alpha1/backendruntimearg.go deleted file mode 100644 index 231aa87a..00000000 --- a/client-go/applyconfiguration/inference/v1alpha1/backendruntimearg.go +++ /dev/null @@ -1,49 +0,0 @@ -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// BackendRuntimeArgApplyConfiguration represents a declarative configuration of the BackendRuntimeArg type for use -// with apply. -type BackendRuntimeArgApplyConfiguration struct { - Name *string `json:"name,omitempty"` - Flags []string `json:"flags,omitempty"` -} - -// BackendRuntimeArgApplyConfiguration constructs a declarative configuration of the BackendRuntimeArg type for use with -// apply. -func BackendRuntimeArg() *BackendRuntimeArgApplyConfiguration { - return &BackendRuntimeArgApplyConfiguration{} -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *BackendRuntimeArgApplyConfiguration) WithName(value string) *BackendRuntimeArgApplyConfiguration { - b.Name = &value - return b -} - -// WithFlags adds the given value to the Flags field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Flags field. 
-func (b *BackendRuntimeArgApplyConfiguration) WithFlags(values ...string) *BackendRuntimeArgApplyConfiguration { - for i := range values { - b.Flags = append(b.Flags, values[i]) - } - return b -} diff --git a/client-go/applyconfiguration/inference/v1alpha1/backendruntimeconfig.go b/client-go/applyconfiguration/inference/v1alpha1/backendruntimeconfig.go index 9f34a792..3cda1928 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/backendruntimeconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/backendruntimeconfig.go @@ -26,12 +26,14 @@ import ( // BackendRuntimeConfigApplyConfiguration represents a declarative configuration of the BackendRuntimeConfig type for use // with apply. type BackendRuntimeConfigApplyConfiguration struct { - Name *inferencev1alpha1.BackendName `json:"name,omitempty"` + BackendName *inferencev1alpha1.BackendName `json:"backendName,omitempty"` Version *string `json:"version,omitempty"` - Args *BackendRuntimeArgApplyConfiguration `json:"args,omitempty"` Envs []v1.EnvVar `json:"envs,omitempty"` + ConfigName *string `json:"configName,omitempty"` + Args []string `json:"flags,omitempty"` Resources *ResourceRequirementsApplyConfiguration `json:"resources,omitempty"` SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"` + ScaleTrigger *ScaleTriggerApplyConfiguration `json:"scaleTrigger,omitempty"` } // BackendRuntimeConfigApplyConfiguration constructs a declarative configuration of the BackendRuntimeConfig type for use with @@ -40,11 +42,11 @@ func BackendRuntimeConfig() *BackendRuntimeConfigApplyConfiguration { return &BackendRuntimeConfigApplyConfiguration{} } -// WithName sets the Name field in the declarative configuration to the given value +// WithBackendName sets the BackendName field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. 
-// If called multiple times, the Name field is set to the value of the last call. -func (b *BackendRuntimeConfigApplyConfiguration) WithName(value inferencev1alpha1.BackendName) *BackendRuntimeConfigApplyConfiguration { - b.Name = &value +// If called multiple times, the BackendName field is set to the value of the last call. +func (b *BackendRuntimeConfigApplyConfiguration) WithBackendName(value inferencev1alpha1.BackendName) *BackendRuntimeConfigApplyConfiguration { + b.BackendName = &value return b } @@ -56,14 +58,6 @@ func (b *BackendRuntimeConfigApplyConfiguration) WithVersion(value string) *Back return b } -// WithArgs sets the Args field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Args field is set to the value of the last call. -func (b *BackendRuntimeConfigApplyConfiguration) WithArgs(value *BackendRuntimeArgApplyConfiguration) *BackendRuntimeConfigApplyConfiguration { - b.Args = value - return b -} - // WithEnvs adds the given value to the Envs field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, values provided by each call will be appended to the Envs field. @@ -74,6 +68,24 @@ func (b *BackendRuntimeConfigApplyConfiguration) WithEnvs(values ...v1.EnvVar) * return b } +// WithConfigName sets the ConfigName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ConfigName field is set to the value of the last call. 
+func (b *BackendRuntimeConfigApplyConfiguration) WithConfigName(value string) *BackendRuntimeConfigApplyConfiguration { + b.ConfigName = &value + return b +} + +// WithArgs adds the given value to the Args field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Args field. +func (b *BackendRuntimeConfigApplyConfiguration) WithArgs(values ...string) *BackendRuntimeConfigApplyConfiguration { + for i := range values { + b.Args = append(b.Args, values[i]) + } + return b +} + // WithResources sets the Resources field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Resources field is set to the value of the last call. @@ -89,3 +101,11 @@ func (b *BackendRuntimeConfigApplyConfiguration) WithSharedMemorySize(value reso b.SharedMemorySize = &value return b } + +// WithScaleTrigger sets the ScaleTrigger field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ScaleTrigger field is set to the value of the last call. 
+func (b *BackendRuntimeConfigApplyConfiguration) WithScaleTrigger(value *ScaleTriggerApplyConfiguration) *BackendRuntimeConfigApplyConfiguration { + b.ScaleTrigger = value + return b +} diff --git a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go index 7603a088..69a06a75 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go @@ -20,10 +20,8 @@ package v1alpha1 // ElasticConfigApplyConfiguration represents a declarative configuration of the ElasticConfig type for use // with apply. type ElasticConfigApplyConfiguration struct { - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - ScaleTriggerRef *ScaleTriggerRefApplyConfiguration `json:"scaleTriggerRef,omitempty"` - ScaleTrigger *ScaleTriggerApplyConfiguration `json:"scaleTrigger,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas *int32 `json:"maxReplicas,omitempty"` } // ElasticConfigApplyConfiguration constructs a declarative configuration of the ElasticConfig type for use with @@ -47,19 +45,3 @@ func (b *ElasticConfigApplyConfiguration) WithMaxReplicas(value int32) *ElasticC b.MaxReplicas = &value return b } - -// WithScaleTriggerRef sets the ScaleTriggerRef field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ScaleTriggerRef field is set to the value of the last call. 
-func (b *ElasticConfigApplyConfiguration) WithScaleTriggerRef(value *ScaleTriggerRefApplyConfiguration) *ElasticConfigApplyConfiguration { - b.ScaleTriggerRef = value - return b -} - -// WithScaleTrigger sets the ScaleTrigger field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ScaleTrigger field is set to the value of the last call. -func (b *ElasticConfigApplyConfiguration) WithScaleTrigger(value *ScaleTriggerApplyConfiguration) *ElasticConfigApplyConfiguration { - b.ScaleTrigger = value - return b -} diff --git a/client-go/applyconfiguration/inference/v1alpha1/scaletriggerref.go b/client-go/applyconfiguration/inference/v1alpha1/scaletriggerref.go deleted file mode 100644 index ba87d027..00000000 --- a/client-go/applyconfiguration/inference/v1alpha1/scaletriggerref.go +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// ScaleTriggerRefApplyConfiguration represents a declarative configuration of the ScaleTriggerRef type for use -// with apply. -type ScaleTriggerRefApplyConfiguration struct { - Name *string `json:"name,omitempty"` -} - -// ScaleTriggerRefApplyConfiguration constructs a declarative configuration of the ScaleTriggerRef type for use with -// apply. 
-func ScaleTriggerRef() *ScaleTriggerRefApplyConfiguration { - return &ScaleTriggerRefApplyConfiguration{} -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *ScaleTriggerRefApplyConfiguration) WithName(value string) *ScaleTriggerRefApplyConfiguration { - b.Name = &value - return b -} diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 62e75b80..e412064f 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -33,8 +33,6 @@ import ( func ForKind(kind schema.GroupVersionKind) interface{} { switch kind { // Group=inference.llmaz.io, Version=v1alpha1 - case v1alpha1.SchemeGroupVersion.WithKind("BackendRuntimeArg"): - return &inferencev1alpha1.BackendRuntimeArgApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("BackendRuntimeConfig"): return &inferencev1alpha1.BackendRuntimeConfigApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ElasticConfig"): @@ -51,8 +49,6 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &inferencev1alpha1.ResourceRequirementsApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ScaleTrigger"): return &inferencev1alpha1.ScaleTriggerApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ScaleTriggerRef"): - return &inferencev1alpha1.ScaleTriggerRefApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Service"): return &inferencev1alpha1.ServiceApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ServiceSpec"): diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml index 7b7f89da..cf10d168 100644 --- a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml +++ 
b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml @@ -41,30 +41,6 @@ spec: spec: description: BackendRuntimeSpec defines the desired state of BackendRuntime properties: - args: - description: |- - Args represents the preset arguments of the backendRuntime. - They can be appended or overwritten by the Playground backendRuntimeConfig. - items: - description: |- - BackendRuntimeArg is the preset arguments for easy to use. - Three preset names are provided: default, speculative-decoding, model-parallelism, - do not change the name. - properties: - flags: - description: |- - Flags represents all the preset configurations. - Flag around with {{ .CONFIG }} is a configuration waiting for render. - items: - type: string - type: array - name: - default: default - description: Name represents the identifier of the backendRuntime - argument. - type: string - type: object - type: array commands: description: Commands represents the default commands for the backendRuntime. items: @@ -516,648 +492,679 @@ spec: format: int32 type: integer type: object - resources: - description: |- - Resources represents the resource requirements for backendRuntime, like cpu/mem, - accelerators like GPU should not be defined here, but at the model flavors, - or the values here will be overwritten. - properties: - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. 
- More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - scaleTriggers: - description: |- - ScaleTriggers represents a set of triggers preset to be used by Playground. - If Playground not specify the scale trigger, the 0-index trigger will be used. + recommendedConfigs: + description: RecommendedConfigs represents the recommended configurations + for the backendRuntime. items: description: |- - NamedScaleTrigger defines the rules to scale the workloads. - Only one trigger cloud work at a time. The name is used to identify - the trigger in backendRuntime. + RecommendedConfig represents the recommended configurations for the backendRuntime, + user can choose one of them to apply. properties: - hpa: - description: HPA represents the trigger configuration of the - HorizontalPodAutoscaler. + args: + description: |- + Args represents all the arguments for the command. + Argument around with {{ .CONFIG }} is a configuration waiting for render. + items: + type: string + type: array + name: + description: Name represents the identifier of the config. + type: string + resources: + description: |- + Resources represents the resource requirements for backend, like cpu/mem, + accelerators like GPU should not be defined here, but at the model flavors, + or the values here will be overwritten. 
properties: - behavior: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: |- - behavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). - If not set, the default HPAScalingRules for scale up and scale down are used. + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + scaleTrigger: + description: |- + ScaleTrigger defines the rules to scale the workloads. + Only one trigger cloud work at a time. + properties: + hpa: + description: HPA represents the trigger configuration of + the HorizontalPodAutoscaler. properties: - scaleDown: + behavior: description: |- - scaleDown is scaling policy for scaling Down. - If not set, the default value is to allow to scale down to minReplicas pods, with a - 300 second stabilization window (i.e., the highest recommendation for - the last 300sec is used). + behavior configures the scaling behavior of the target + in both Up and Down directions (scaleUp and scaleDown fields respectively). 
+ If not set, the default HPAScalingRules for scale up and scale down are used. properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid - items: - description: HPAScalingPolicy is a single policy - which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling - policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: + scaleDown: description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: + scaleDown is scaling policy for scaling Down. + If not set, the default value is to allow to scale down to minReplicas pods, with a + 300 second stabilization window (i.e., the highest recommendation for + the last 300sec is used). + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + items: + description: HPAScalingPolicy is a single + policy which must hold true for a specified + past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. 
+ PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the + scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + type: object + scaleUp: description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer + scaleUp is scaling policy for scaling Up. + If not set, the default value is the higher of: + * increase no more than 4 pods per 60 seconds + * double the number of pods per 60 seconds + No stabilization is used. + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. 
+ At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid + items: + description: HPAScalingPolicy is a single + policy which must hold true for a specified + past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the + scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + type: object type: object - scaleUp: + metrics: description: |- - scaleUp is scaling policy for scaling Up. - If not set, the default value is the higher of: - * increase no more than 4 pods per 60 seconds - * double the number of pods per 60 seconds - No stabilization is used. - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. 
- At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid - items: - description: HPAScalingPolicy is a single policy - which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling - policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - type: object - type: object - metrics: - description: |- - metrics contains the specifications for which to use to calculate the - desired replica count (the maximum replica count across all metrics will - be used). The desired replica count is calculated multiplying the - ratio between the target value and the current value by the current - number of pods. Ergo, metrics used must decrease as the pod count is - increased, and vice-versa. 
See the individual metric source types for - more information about how each type of metric must respond. - items: - description: |- - MetricSpec specifies how to scale based on a single metric - (only `type` and one other matching field should be set at once). - properties: - containerResource: + metrics contains the specifications for which to use to calculate the + desired replica count (the maximum replica count across all metrics will + be used). The desired replica count is calculated multiplying the + ratio between the target value and the current value by the current + number of pods. Ergo, metrics used must decrease as the pod count is + increased, and vice-versa. See the individual metric source types for + more information about how each type of metric must respond. + items: description: |- - containerResource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing a single container in - each pod of the current scale target (e.g. CPU or memory). Such metrics are - built in to Kubernetes, and have special scaling options on top of those - available to normal per-pod metrics using the "pods" source. + MetricSpec specifies how to scale based on a single metric + (only `type` and one other matching field should be set at once). properties: - container: - description: container is the name of the container - in the pods of the scaling target - type: string - name: - description: name is the name of the resource - in question. - type: string - target: - description: target specifies the target value - for the given metric + containerResource: + description: |- + containerResource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing a single container in + each pod of the current scale target (e.g. CPU or memory). 
Such metrics are + built in to Kubernetes, and have special scaling options on top of those + available to normal per-pod metrics using the "pods" source. properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric - type is Utilization, Value, or AverageValue + container: + description: container is the name of the + container in the pods of the scaling target type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of - the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - container - - name - - target - type: object - external: - description: |- - external refers to a global metric that is not associated - with any Kubernetes object. It allows autoscaling based on information - coming from components running outside of cluster - (for example length of queue in cloud messaging service, or - QPS from loadbalancer running outside of cluster). - properties: - metric: - description: metric identifies the target metric - by name and selector - properties: name: - description: name is the name of the given - metric + description: name is the name of the resource + in question. 
type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. + target: + description: target specifies the target value + for the given metric properties: - matchExpressions: - description: matchExpressions is a list - of label selector requirements. The - requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key - that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string + averageUtilization: description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. 
+ Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the + metric type is Utilization, Value, or + AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value + of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type type: object - x-kubernetes-map-type: atomic required: + - container - name + - target type: object - target: - description: target specifies the target value - for the given metric + external: + description: |- + external refers to a global metric that is not associated + with any Kubernetes object. It allows autoscaling based on information + coming from components running outside of cluster + (for example length of queue in cloud messaging service, or + QPS from loadbalancer running outside of cluster). properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. 
- Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric - type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of - the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - object: - description: |- - object refers to a metric describing a single kubernetes object - (for example, hits-per-second on an Ingress object). 
- properties: - describedObject: - description: describedObject specifies the descriptions - of a object,such as kind,name apiVersion - properties: - apiVersion: - description: apiVersion is the API version - of the referent - type: string - kind: - description: 'kind is the kind of the referent; - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - name: - description: 'name is the name of the referent; - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string + metric: + description: metric identifies the target + metric by name and selector + properties: + name: + description: name is the name of the given + metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value + for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the + metric type is Utilization, Value, or + AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value + of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object required: - - kind - - name + - metric + - target type: object - metric: - description: metric identifies the target metric - by name and selector + object: + description: |- + object refers to a metric describing a single kubernetes object + (for example, hits-per-second on an Ingress object). properties: - name: - description: name is the name of the given - metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. + describedObject: + description: describedObject specifies the + descriptions of a object,such as kind,name + apiVersion properties: - matchExpressions: - description: matchExpressions is a list - of label selector requirements. The - requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key - that the selector applies to. 
- type: string - operator: + apiVersion: + description: apiVersion is the API version + of the referent + type: string + kind: + description: 'kind is the kind of the + referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'name is the name of the + referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + required: + - kind + - name + type: object + metric: + description: metric identifies the target + metric by name and selector + properties: + name: + description: name is the name of the given + metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value + for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. 
+ Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the + metric type is Utilization, Value, or + AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value + of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type type: object - x-kubernetes-map-type: atomic required: - - name + - describedObject + - metric + - target type: object - target: - description: target specifies the target value - for the given metric + pods: + description: |- + pods refers to a metric describing each pod in the current scale target + (for example, transactions-processed-per-second). The values will be + averaged together before being compared to the target value. properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. 
- Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric - type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of - the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - describedObject - - metric - - target - type: object - pods: - description: |- - pods refers to a metric describing each pod in the current scale target - (for example, transactions-processed-per-second). The values will be - averaged together before being compared to the target value. - properties: - metric: - description: metric identifies the target metric - by name and selector - properties: - name: - description: name is the name of the given - metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. + metric: + description: metric identifies the target + metric by name and selector properties: - matchExpressions: - description: matchExpressions is a list - of label selector requirements. The - requirements are ANDed. 
- items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key - that the selector applies to. - type: string - operator: + name: + description: name is the name of the given + metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. 
If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value + for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. 
+ Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the + metric type is Utilization, Value, or + AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value + of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type type: object - x-kubernetes-map-type: atomic required: - - name + - metric + - target type: object - target: - description: target specifies the target value - for the given metric + resource: + description: |- + resource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing each pod in the + current scale target (e.g. CPU or memory). Such metrics are built in to + Kubernetes, and have special scaling options on top of those available + to normal per-pod metrics using the "pods" source. properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. 
- Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric - type is Utilization, Value, or AverageValue + name: + description: name is the name of the resource + in question. type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of - the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true + target: + description: target specifies the target value + for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the + metric type is Utilization, Value, or + AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value + of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object required: - - type + - name + - target type: object - required: - - metric - - target - type: object - resource: - description: |- - resource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing each pod in the - current scale target (e.g. CPU or memory). Such metrics are built in to - Kubernetes, and have special scaling options on top of those available - to normal per-pod metrics using the "pods" source. - properties: - name: - description: name is the name of the resource - in question. + type: + description: |- + type is the type of metric source. It should be one of "ContainerResource", "External", + "Object", "Pods" or "Resource", each mapping to a matching field in the object. type: string - target: - description: target specifies the target value - for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric - type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of - the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object required: - - name - - target + - type type: object - type: - description: |- - type is the type of metric source. It should be one of "ContainerResource", "External", - "Object", "Pods" or "Resource", each mapping to a matching field in the object. - type: string - required: - - type - type: object - type: array + type: array + type: object type: object - name: + sharedMemorySize: + anyOf: + - type: integer + - type: string description: |- - Name represents the identifier of the scale trigger, e.g. some triggers defined for - latency sensitive workloads, some are defined for throughput sensitive workloads. - type: string + SharedMemorySize represents the size of /dev/shm required in the runtime of + inference workload. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - name type: object type: array startupProbe: @@ -1323,7 +1330,6 @@ spec: type: string required: - image - - resources - version type: object status: diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml index f7b62761..5774f499 100644 --- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml +++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml @@ -48,22 +48,23 @@ spec: properties: args: description: |- - Args represents the specified arguments of the backendRuntime, - will be append to the backendRuntime.spec.Args. - properties: - flags: - description: |- - Flags represents all the preset configurations. - Flag around with {{ .CONFIG }} is a configuration waiting for render. 
- items: - type: string - type: array - name: - default: default - description: Name represents the identifier of the backendRuntime - argument. - type: string - type: object + Args represents all the arguments for the command. + Argument around with {{ .CONFIG }} is a configuration waiting for render. + Args defined here will "append" the args in the recommendedConfig. + items: + type: string + type: array + backendName: + default: vllm + description: BackendName represents the inference backend under + the hood, e.g. vLLM. + type: string + configName: + description: |- + ConfigName represents the recommended configuration name for the backend, + It will be inferred from the models in the runtime if not specified, e.g. default, + speculative-decoding or model-parallelism. + type: string envs: description: Envs represents the environments set to the container. items: @@ -184,16 +185,12 @@ spec: - name type: object type: array - name: - default: vllm - description: Name represents the inference backend under the hood, - e.g. vLLM. - type: string resources: description: |- Resources represents the resource requirements for backend, like cpu/mem, accelerators like GPU should not be defined here, but at the model flavors, or the values here will be overwritten. + Resources defined here will "overwrite" the resources in the recommendedConfig. properties: limits: additionalProperties: @@ -220,47 +217,11 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - sharedMemorySize: - anyOf: - - type: integer - - type: string - description: |- - SharedMemorySize represents the size of /dev/shm required in the runtime of - inference workload. 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - version: - description: |- - Version represents the backend version if you want a different one - from the default version. - type: string - type: object - elasticConfig: - description: |- - ElasticConfig defines the configuration for elastic usage, - e.g. the max/min replicas. - Note: this requires to install the HPA first or will report error. - properties: - maxReplicas: - description: |- - MaxReplicas indicates the maximum number of inference workloads based on the traffic. - Default to nil means there's no limit for the instance number. - format: int32 - type: integer - minReplicas: - default: 1 - description: |- - MinReplicas indicates the minimum number of inference workloads based on the traffic. - Default to 1. - MinReplicas couldn't be 0 now, will support serverless in the future. - format: int32 - type: integer scaleTrigger: description: |- - ScaleTrigger defines a set of triggers to scale the workloads. - If not defined, trigger configured in backendRuntime will be used, - otherwise, trigger defined here will overwrite the defaulted ones. - ScaleTriggerRef and ScaleTrigger can't be set at the same time. + ScaleTrigger defines the rules to scale the workloads. + Only one trigger cloud work at a time, mostly used in Playground. + ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig. properties: hpa: description: HPA represents the trigger configuration of the @@ -869,19 +830,41 @@ spec: type: array type: object type: object - scaleTriggerRef: + sharedMemorySize: + anyOf: + - type: integer + - type: string description: |- - ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime - with tuned target value. - ScaleTriggerRef and ScaleTrigger can't be set at the same time. 
- properties: - name: - description: Name represents the scale trigger name defined - in the backendRuntime.scaleTriggers. - type: string - required: - - name - type: object + SharedMemorySize represents the size of /dev/shm required in the runtime of + inference workload. + SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + version: + description: |- + Version represents the backend version if you want a different one + from the default version. + type: string + type: object + elasticConfig: + description: |- + ElasticConfig defines the configuration for elastic usage, + e.g. the max/min replicas. + properties: + maxReplicas: + description: |- + MaxReplicas indicates the maximum number of inference workloads based on the traffic. + Default to nil means there's no limit for the instance number. + format: int32 + type: integer + minReplicas: + default: 1 + description: |- + MinReplicas indicates the minimum number of inference workloads based on the traffic. + Default to 1. + MinReplicas couldn't be 0 now, will support serverless in the future. 
+ format: int32 + type: integer type: object modelClaim: description: |- diff --git a/docs/examples/hpa/playground.yaml b/docs/examples/hpa/playground.yaml index 07b68770..34fd84da 100644 --- a/docs/examples/hpa/playground.yaml +++ b/docs/examples/hpa/playground.yaml @@ -1,20 +1,19 @@ apiVersion: inference.llmaz.io/v1alpha1 kind: Playground metadata: - name: qwen2-0--5b + name: qwen2-0--5b-hpa spec: replicas: 1 modelClaim: modelName: qwen2-0--5b-gguf - backendRuntimeConfig: - name: llamacpp - args: - name: "default" - flags: - - -fa # use flash attention elasticConfig: minReplicas: 1 maxReplicas: 3 + backendRuntimeConfig: + backendName: llamacpp + configName: default + args: + - -fa # use flash attention scaleTrigger: hpa: metrics: diff --git a/docs/examples/llamacpp/playground.yaml b/docs/examples/llamacpp/playground.yaml index de621667..95e6524f 100644 --- a/docs/examples/llamacpp/playground.yaml +++ b/docs/examples/llamacpp/playground.yaml @@ -7,8 +7,7 @@ spec: modelClaim: modelName: qwen2-0--5b-gguf backendRuntimeConfig: - name: llamacpp + backendName: llamacpp + configName: default args: - name: "default" - flags: - - -fa # use flash attention + - -fa # use flash attention diff --git a/docs/examples/ollama/playground.yaml b/docs/examples/ollama/playground.yaml index f91949ac..62f47c9b 100644 --- a/docs/examples/ollama/playground.yaml +++ b/docs/examples/ollama/playground.yaml @@ -7,4 +7,4 @@ spec: modelClaim: modelName: qwen2-0--5b backendRuntimeConfig: - name: ollama + backendName: ollama diff --git a/docs/examples/sglang/playground.yaml b/docs/examples/sglang/playground.yaml index a94a55f8..6b76e133 100644 --- a/docs/examples/sglang/playground.yaml +++ b/docs/examples/sglang/playground.yaml @@ -7,4 +7,4 @@ spec: modelClaim: modelName: qwen2-0--5b backendRuntimeConfig: - name: sglang + backendName: sglang diff --git a/docs/examples/speculative-decoding/llamacpp/playground.yaml b/docs/examples/speculative-decoding/llamacpp/playground.yaml index 
4d797263..870f692f 100644 --- a/docs/examples/speculative-decoding/llamacpp/playground.yaml +++ b/docs/examples/speculative-decoding/llamacpp/playground.yaml @@ -14,11 +14,9 @@ spec: - name: llama2-7b-q2-k-gguf # the draft model role: draft backendRuntimeConfig: - name: llamacpp + backendName: llamacpp args: - name: "speculative-decoding" - flags: - - -fa # use flash attention + - -fa # use flash attention resources: requests: cpu: 4 diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index f322ea41..fc1648c7 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -47,7 +47,8 @@ import ( coreclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/core/v1alpha1" inferenceclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/inference/v1alpha1" helper "github.com/inftyai/llmaz/pkg/controller_helper" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + backendruntime "github.com/inftyai/llmaz/pkg/controller_helper/backendruntime" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" "github.com/inftyai/llmaz/pkg/util" ) @@ -106,8 +107,8 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request) } backendRuntimeName := inferenceapi.DefaultBackend - if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil { - backendRuntimeName = *playground.Spec.BackendRuntimeConfig.Name + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.BackendName != nil { + backendRuntimeName = *playground.Spec.BackendRuntimeConfig.BackendName } backendRuntime := &inferenceapi.BackendRuntime{} if err := r.Get(ctx, types.NamespacedName{Name: string(backendRuntimeName)}, backendRuntime); err != nil { @@ -129,7 +130,7 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request) return 
ctrl.Result{}, err } - scalingConfiguration := buildScalingConfiguration(playground, backendRuntime) + scalingConfiguration := buildScalingConfiguration(models, playground, backendRuntime) if scalingConfiguration != nil { if err := setControllerReferenceForScalingConfiguration(playground, scalingConfiguration, r.Scheme); err != nil { logger.Error(err, "failed to set OwnerReference for scaling workload", "workload", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name), "kind", scalingConfiguration.Kind) @@ -244,8 +245,7 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf // Model flavors will not be considered but in inferenceService controller to support accelerator fungibility. func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (lws.LeaderWorkerSetSpec, error) { workload := lws.LeaderWorkerSetSpec{ - // Use the default policy defined in lws. - StartupPolicy: lws.LeaderCreatedStartupPolicy, + StartupPolicy: lws.LeaderReadyStartupPolicy, RolloutStrategy: lws.RolloutStrategy{ Type: lws.RollingUpdateStrategyType, }, @@ -265,7 +265,7 @@ func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi if multiHost { workload.LeaderWorkerTemplate.LeaderTemplate = &template - workload.LeaderWorkerTemplate.WorkerTemplate = buildWorkerTemplate(playground, backendRuntime) + workload.LeaderWorkerTemplate.WorkerTemplate = buildWorkerTemplate(models, playground, backendRuntime) } else { workload.LeaderWorkerTemplate.WorkerTemplate = template } @@ -274,33 +274,35 @@ func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi } func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime, multiHost bool) (corev1.PodTemplateSpec, error) { - parser := helper.NewBackendRuntimeParser(backendRuntime) + parser := 
backendruntime.NewBackendRuntimeParser(backendRuntime, models, playground) - commands := parser.Commands() - if multiHost { - commands = parser.LeaderCommands() + // envs + envs := parser.Envs() + if playground.Spec.BackendRuntimeConfig != nil { + envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...) } - args, err := parser.Args(playground, models, multiHost) + // args + args, err := parser.Args() if err != nil { return corev1.PodTemplateSpec{}, err } - envs := parser.Envs() - if playground.Spec.BackendRuntimeConfig != nil { - envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...) - if playground.Spec.BackendRuntimeConfig.Args != nil { - args = append(args, playground.Spec.BackendRuntimeConfig.Args.Flags...) - } + args = append(args, playground.Spec.BackendRuntimeConfig.Args...) } + // resources + r := parser.Resources() + if r == nil { + r = &inferenceapi.ResourceRequirements{} + } resources := corev1.ResourceRequirements{ - Requests: parser.Resources().Requests, - Limits: parser.Resources().Limits, + Requests: r.Requests, + Limits: r.Limits, } if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Resources != nil { - limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, parser.Resources().Limits) - requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, parser.Resources().Requests) + limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, r.Limits) + requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, r.Requests) resources = corev1.ResourceRequirements{ Limits: limits, @@ -308,26 +310,31 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro } // Make sure the limits are always greater than requests. 
- for k, v := range resources.Limits { + for k, v := range resources.Requests { if k == corev1.ResourceCPU || k == corev1.ResourceMemory { - if v.Cmp(requests[k]) == -1 { + if v.Cmp(limits[k]) == 1 { resources.Limits[k] = requests[k] } } } } + // image version version := parser.Version() if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil { version = *playground.Spec.BackendRuntimeConfig.Version } - // Pod can not accept shell commands with args together, merge the args with the commands. + // commands + commands := parser.Commands() if multiHost { + commands = parser.LeaderCommands() + // Pod can not accept shell commands with args together, merge the args with the commands. commands = util.MergeArgsWithCommands(commands, args) args = nil } + // probe var livenessProbe, readinessProbe, startupProbe *corev1.Probe if backendRuntime.Spec.StartupProbe != nil { startupProbe = backendRuntime.Spec.StartupProbe @@ -366,14 +373,19 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro }, } - // construct /dev/shm size + // sharedMemorySize + sharedMemorySize := parser.SharedMemorySize() if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.SharedMemorySize != nil { + sharedMemorySize = playground.Spec.BackendRuntimeConfig.SharedMemorySize + } + if sharedMemorySize != nil { + // construct /dev/shm size template.Spec.Volumes = append(template.Spec.Volumes, corev1.Volume{ Name: "dshm", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{ Medium: corev1.StorageMediumMemory, - SizeLimit: playground.Spec.BackendRuntimeConfig.SharedMemorySize, + SizeLimit: sharedMemorySize, }, }, }) @@ -389,21 +401,25 @@ func buildTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playgro // This is a copy of buildTemplate with some refactors, only used in multi-nodes cases. // Worker template has no args, no contain port. 
-func buildWorkerTemplate(playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) corev1.PodTemplateSpec { - parser := helper.NewBackendRuntimeParser(backendRuntime) +func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) corev1.PodTemplateSpec { + parser := backendruntime.NewBackendRuntimeParser(backendRuntime, models, playground) envs := parser.Envs() if playground.Spec.BackendRuntimeConfig != nil { envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...) } + r := parser.Resources() + if r == nil { + r = &inferenceapi.ResourceRequirements{} + } resources := corev1.ResourceRequirements{ - Requests: parser.Resources().Requests, - Limits: parser.Resources().Limits, + Requests: r.Requests, + Limits: r.Limits, } if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Resources != nil { - limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, parser.Resources().Limits) - requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, parser.Resources().Requests) + limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, r.Limits) + requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, r.Requests) resources = corev1.ResourceRequirements{ Limits: limits, @@ -411,9 +427,9 @@ func buildWorkerTemplate(playground *inferenceapi.Playground, backendRuntime *in } // Make sure the limits are always greater than requests. 
- for k, v := range resources.Limits { + for k, v := range resources.Requests { if k == corev1.ResourceCPU || k == corev1.ResourceMemory { - if v.Cmp(requests[k]) == -1 { + if v.Cmp(limits[k]) == 1 { resources.Limits[k] = requests[k] } } @@ -441,14 +457,18 @@ func buildWorkerTemplate(playground *inferenceapi.Playground, backendRuntime *in }, } - // construct /dev/shm size + sharedMemorySize := parser.SharedMemorySize() if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.SharedMemorySize != nil { + sharedMemorySize = playground.Spec.BackendRuntimeConfig.SharedMemorySize + } + if sharedMemorySize != nil { + // construct /dev/shm size template.Spec.Volumes = append(template.Spec.Volumes, corev1.Volume{ Name: "dshm", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{ Medium: corev1.StorageMediumMemory, - SizeLimit: playground.Spec.BackendRuntimeConfig.SharedMemorySize, + SizeLimit: sharedMemorySize, }, }, }) @@ -564,33 +584,28 @@ func setControllerReferenceForService(owner metav1.Object, saf *inferenceclientg } // buildScalingConfiguration supports HPA only now. -func buildScalingConfiguration(playground *inferenceapi.Playground, backend *inferenceapi.BackendRuntime) *autoscalingv2.HorizontalPodAutoscaler { +func buildScalingConfiguration(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backend *inferenceapi.BackendRuntime) *autoscalingv2.HorizontalPodAutoscaler { if playground.Spec.ElasticConfig == nil { return nil } - // Handle HPA. - if playground.Spec.ElasticConfig.ScaleTrigger != nil && playground.Spec.ElasticConfig.ScaleTrigger.HPA != nil { + // Prefer the playground config. 
+ if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.ScaleTrigger != nil { hpa := newHPA(playground) - hpa.Spec.Metrics = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Metrics - hpa.Spec.Behavior = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Behavior + hpa.Spec.Metrics = playground.Spec.BackendRuntimeConfig.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = playground.Spec.BackendRuntimeConfig.ScaleTrigger.HPA.Behavior return hpa + } - if len(backend.Spec.ScaleTriggers) > 0 { - hpa := newHPA(playground) - if playground.Spec.ElasticConfig.ScaleTriggerRef != nil { - for _, trigger := range backend.Spec.ScaleTriggers { - if trigger.Name == playground.Spec.ElasticConfig.ScaleTriggerRef.Name { - hpa.Spec.Metrics = trigger.HPA.Metrics - hpa.Spec.Behavior = trigger.HPA.Behavior - return hpa - } - } - } else { - // use the 0-index as the default value. - hpa.Spec.Metrics = backend.Spec.ScaleTriggers[0].HPA.Metrics - hpa.Spec.Behavior = backend.Spec.ScaleTriggers[0].HPA.Behavior + _, multiHost := helper.MultiHostInference(models[0], playground) + mode := helper.DetectArgFrom(playground, multiHost) + + for _, recommend := range backend.Spec.RecommendedConfigs { + if recommend.Name == mode && recommend.ScaleTrigger != nil { + hpa := newHPA(playground) + hpa.Spec.Metrics = recommend.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = recommend.ScaleTrigger.HPA.Behavior return hpa } } diff --git a/pkg/controller/inference/service_controller.go b/pkg/controller/inference/service_controller.go index 3aae6e8e..cc0de1ec 100644 --- a/pkg/controller/inference/service_controller.go +++ b/pkg/controller/inference/service_controller.go @@ -45,7 +45,7 @@ import ( coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" helper "github.com/inftyai/llmaz/pkg/controller_helper" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + modelSource 
"github.com/inftyai/llmaz/pkg/controller_helper/modelsource" "github.com/inftyai/llmaz/pkg/util" ) diff --git a/pkg/controller_helper/backendruntime.go b/pkg/controller_helper/backendruntime/backendruntime.go similarity index 66% rename from pkg/controller_helper/backendruntime.go rename to pkg/controller_helper/backendruntime/backendruntime.go index bdb8bfba..2c1eb021 100644 --- a/pkg/controller_helper/backendruntime.go +++ b/pkg/controller_helper/backendruntime/backendruntime.go @@ -22,19 +22,33 @@ import ( "strings" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + helper "github.com/inftyai/llmaz/pkg/controller_helper" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" ) // TODO: add unit tests. type BackendRuntimeParser struct { - backendRuntime *inferenceapi.BackendRuntime + backendRuntime *inferenceapi.BackendRuntime + models []*coreapi.OpenModel + playground *inferenceapi.Playground + recommendConfigName string + multiHost bool } -func NewBackendRuntimeParser(backendRuntime *inferenceapi.BackendRuntime) *BackendRuntimeParser { - return &BackendRuntimeParser{backendRuntime} +func NewBackendRuntimeParser(backendRuntime *inferenceapi.BackendRuntime, models []*coreapi.OpenModel, playground *inferenceapi.Playground) *BackendRuntimeParser { + _, multiHost := helper.MultiHostInference(models[0], playground) + name := helper.RecommendedConfigName(playground, multiHost) + return &BackendRuntimeParser{ + backendRuntime, + models, + playground, + name, + multiHost, + } } func (p *BackendRuntimeParser) Commands() []string { @@ -59,16 +73,8 @@ func (p *BackendRuntimeParser) Envs() []corev1.EnvVar { return p.backendRuntime.Spec.Envs } -func (p *BackendRuntimeParser) Args(playground *inferenceapi.Playground, models []*coreapi.OpenModel, 
multiNodes bool) ([]string, error) { - var argName string - if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Args != nil { - argName = *playground.Spec.BackendRuntimeConfig.Args.Name - } else { - // Auto detect the args from model roles. - argName = DetectArgFrom(playground, multiNodes) - } - - mainModel := models[0] +func (p *BackendRuntimeParser) Args() ([]string, error) { + mainModel := p.models[0] source := modelSource.NewModelSourceProvider(mainModel) modelInfo := map[string]string{ @@ -76,8 +82,8 @@ func (p *BackendRuntimeParser) Args(playground *inferenceapi.Playground, models "ModelName": source.ModelName(), } - if multiNodes { - flavors := FirstAssignedFlavor(mainModel, playground) + if p.multiHost { + flavors := helper.FirstAssignedFlavor(mainModel, p.playground) if len(flavors) > 0 { modelInfo["PP"] = flavors[0].Params["PP"] modelInfo["TP"] = flavors[0].Params["TP"] @@ -86,13 +92,13 @@ func (p *BackendRuntimeParser) Args(playground *inferenceapi.Playground, models // TODO: This is not that reliable because two models doesn't always means speculative-decoding. // Revisit this later. 
- if len(models) > 1 { - modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(models[1]).ModelPath() + if len(p.models) > 1 { + modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(p.models[1]).ModelPath() } - for _, arg := range p.backendRuntime.Spec.Args { - if *arg.Name == argName { - return renderFlags(arg.Flags, modelInfo) + for _, recommend := range p.backendRuntime.Spec.RecommendedConfigs { + if recommend.Name == p.recommendConfigName { + return renderFlags(recommend.Args, modelInfo) } } @@ -108,8 +114,23 @@ func (p *BackendRuntimeParser) Version() string { return p.backendRuntime.Spec.Version } -func (p *BackendRuntimeParser) Resources() inferenceapi.ResourceRequirements { - return p.backendRuntime.Spec.Resources +func (p *BackendRuntimeParser) Resources() *inferenceapi.ResourceRequirements { + for _, recommend := range p.backendRuntime.Spec.RecommendedConfigs { + if recommend.Name == p.recommendConfigName { + return recommend.Resources + } + } + // We should not reach here. 
+ return nil +} + +func (p *BackendRuntimeParser) SharedMemorySize() *resource.Quantity { + for _, recommend := range p.backendRuntime.Spec.RecommendedConfigs { + if recommend.Name == p.recommendConfigName { + return recommend.SharedMemorySize + } + } + return nil } func renderFlags(flags []string, modelInfo map[string]string) ([]string, error) { diff --git a/pkg/controller_helper/backendruntime_test.go b/pkg/controller_helper/backendruntime/backendruntime_test.go similarity index 100% rename from pkg/controller_helper/backendruntime_test.go rename to pkg/controller_helper/backendruntime/backendruntime_test.go diff --git a/pkg/controller_helper/helper.go b/pkg/controller_helper/helper.go index 99cad552..85699fbe 100644 --- a/pkg/controller_helper/helper.go +++ b/pkg/controller_helper/helper.go @@ -33,6 +33,18 @@ const ( ModelParallelismArg string = "model-parallelism" ) +func RecommendedConfigName(playground *inferenceapi.Playground, multiNodes bool) string { + var name string + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.ConfigName != nil { + name = *playground.Spec.BackendRuntimeConfig.ConfigName + } else { + // Auto detect the args from model roles. + name = DetectArgFrom(playground, multiNodes) + } + + return name +} + // DetectArgFrom wil auto detect the arg from model roles if not set explicitly. func DetectArgFrom(playground *inferenceapi.Playground, isMultiNodesInference bool) string { if isMultiNodesInference { @@ -91,7 +103,7 @@ func fetchModels(ctx context.Context, k8sClient client.Client, mrs []coreapi.Mod return models, nil } -// FirstAssignedFlavor will return the first assigned flavor of the model, always the 0-index flavor. +// FirstAssignedFlavor will return the first assigned flavor of the model. 
func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Playground) []coreapi.Flavor { var flavors []coreapi.FlavorName if playground.Spec.ModelClaim != nil { @@ -117,7 +129,7 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play return nil } -// MultiHostInference returns two values, the first one is the TP size, +// MultiHostInference returns two values, the first one is the PP size, // the second one is whether this is a multi-host inference. func MultiHostInference(model *coreapi.OpenModel, playground *inferenceapi.Playground) (int32, bool) { flavors := FirstAssignedFlavor(model, playground) diff --git a/pkg/controller_helper/model_source/modelhub.go b/pkg/controller_helper/modelsource/modelhub.go similarity index 100% rename from pkg/controller_helper/model_source/modelhub.go rename to pkg/controller_helper/modelsource/modelhub.go diff --git a/pkg/controller_helper/model_source/modelsource.go b/pkg/controller_helper/modelsource/modelsource.go similarity index 100% rename from pkg/controller_helper/model_source/modelsource.go rename to pkg/controller_helper/modelsource/modelsource.go diff --git a/pkg/controller_helper/model_source/modelsource_test.go b/pkg/controller_helper/modelsource/modelsource_test.go similarity index 100% rename from pkg/controller_helper/model_source/modelsource_test.go rename to pkg/controller_helper/modelsource/modelsource_test.go diff --git a/pkg/controller_helper/model_source/uri.go b/pkg/controller_helper/modelsource/uri.go similarity index 100% rename from pkg/controller_helper/model_source/uri.go rename to pkg/controller_helper/modelsource/uri.go diff --git a/pkg/webhook/backendruntime_webhook.go b/pkg/webhook/backendruntime_webhook.go index babd1b76..3b7c3519 100644 --- a/pkg/webhook/backendruntime_webhook.go +++ b/pkg/webhook/backendruntime_webhook.go @@ -78,20 +78,25 @@ func (w *BackendRuntimeWebhook) generateValidate(obj runtime.Object) field.Error var allErrs field.ErrorList // 
Validate resources. - for k, v := range backend.Spec.Resources.Limits { - if requestV, ok := backend.Spec.Resources.Requests[k]; ok { - if v.Cmp(requestV) == -1 { - allErrs = append(allErrs, field.Forbidden(specPath.Child("resources"), fmt.Sprintf("resource limit of %s is less than resource request", k))) + for _, recommend := range backend.Spec.RecommendedConfigs { + if recommend.Resources == nil { + continue + } + for k, v := range recommend.Resources.Limits { + if requestV, ok := recommend.Resources.Requests[k]; ok { + if v.Cmp(requestV) == -1 { + allErrs = append(allErrs, field.Forbidden(specPath.Child("resources"), fmt.Sprintf("resource limit of %s is less than resource request", k))) + } } } } names := []string{} - for _, arg := range backend.Spec.Args { - if util.In(names, *arg.Name) { - allErrs = append(allErrs, field.Forbidden(specPath.Child("args", "name"), fmt.Sprintf("duplicated name %s", *arg.Name))) + for _, recommend := range backend.Spec.RecommendedConfigs { + if util.In(names, recommend.Name) { + allErrs = append(allErrs, field.Forbidden(specPath.Child("args", "name"), fmt.Sprintf("duplicated name %s", recommend.Name))) } - names = append(names, *arg.Name) + names = append(names, recommend.Name) } return allErrs } diff --git a/pkg/webhook/openmodel_webhook.go b/pkg/webhook/openmodel_webhook.go index 9ede0ce5..d78c02b7 100644 --- a/pkg/webhook/openmodel_webhook.go +++ b/pkg/webhook/openmodel_webhook.go @@ -27,7 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" "github.com/inftyai/llmaz/pkg/util" ) diff --git a/pkg/webhook/playground_webhook.go b/pkg/webhook/playground_webhook.go index 53c25839..47ac0ff3 100644 --- a/pkg/webhook/playground_webhook.go +++ b/pkg/webhook/playground_webhook.go @@ -141,7 +141,7 @@ func (w 
*PlaygroundWebhook) generateValidate(obj runtime.Object) field.ErrorList } if playground.Spec.ElasticConfig != nil { - if *playground.Spec.ElasticConfig.MinReplicas == 0 { + if playground.Spec.ElasticConfig.MinReplicas != nil && *playground.Spec.ElasticConfig.MinReplicas == 0 { allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.minReplicas"), "minReplicas couldn't be 0")) } @@ -150,11 +150,11 @@ func (w *PlaygroundWebhook) generateValidate(obj runtime.Object) field.ErrorList allErrs = append(allErrs, field.Invalid(specPath.Child("elasticConfig.scaleTrigger.hpa"), *playground.Spec.ElasticConfig.MinReplicas, "minReplicas must be less than maxReplicas")) } } + } - if playground.Spec.ElasticConfig.ScaleTrigger != nil { - if playground.Spec.ElasticConfig.ScaleTrigger.HPA == nil { - allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.scaleTrigger.hpa"), "hpa couldn't be nil")) - } + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.ScaleTrigger != nil { + if playground.Spec.BackendRuntimeConfig.ScaleTrigger.HPA == nil { + allErrs = append(allErrs, field.Forbidden(specPath.Child("backendRuntime.scaleTrigger.hpa"), "hpa couldn't be nil")) } } diff --git a/pkg/webhook/service_webhook.go b/pkg/webhook/service_webhook.go index fd21b2e5..008dc820 100644 --- a/pkg/webhook/service_webhook.go +++ b/pkg/webhook/service_webhook.go @@ -28,7 +28,7 @@ import ( coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" ) type ServiceWebhook struct{} diff --git a/test/config/backends/fake_backend.yaml b/test/config/backends/fake_backend.yaml index 250f1621..ba573040 100644 --- a/test/config/backends/fake_backend.yaml +++ b/test/config/backends/fake_backend.yaml @@ -13,42 +13,33 @@ spec: - echo "hello" 
image: busybox version: latest - args: + recommendedConfigs: - name: default - flags: + args: - mode - "default" + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi + sharedMemorySize: 1Gi + scaleTrigger: + hpa: + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 20 - name: speculative-decoding - flags: + args: - mode - "speculative-decoding" - name: fuz - flags: + args: - mode - "fuz" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi - scaleTriggers: - - name: hpa - hpa: - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 50 - - name: hpa2 - hpa: - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 80 diff --git a/test/config/backends/llamacpp.yaml b/test/config/backends/llamacpp.yaml index ea4554e4..ae3d04b9 100644 --- a/test/config/backends/llamacpp.yaml +++ b/test/config/backends/llamacpp.yaml @@ -11,18 +11,27 @@ spec: - ./llama-server image: ghcr.io/ggerganov/llama.cpp version: server - args: + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. + recommendedConfigs: - name: default - flags: + args: - -m - "{{ .ModelPath }}" - --host - "0.0.0.0" - --port - "8080" + resources: + requests: + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 4Gi # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. 
# - name: speculative-decoding - # flags: + # args: # - -m # - "{{ .ModelPath }}" # - -md @@ -35,13 +44,6 @@ spec: # - "16" # - --draft-min # - "5" - resources: - requests: - cpu: 2 - memory: 4Gi - limits: - cpu: 2 - memory: 4Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/test/config/backends/ollama.yaml b/test/config/backends/ollama.yaml index d5e347b0..35f93e79 100644 --- a/test/config/backends/ollama.yaml +++ b/test/config/backends/ollama.yaml @@ -12,18 +12,20 @@ spec: - -c image: ollama/ollama version: latest - args: + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. + recommendedConfigs: - name: default - flags: + args: - "ollama serve & while true; do output=$(ollama list 2>&1); if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; - ollama run {{`{{ .ModelName }}`}}; + ollama run {{ .ModelName }}; while true;do sleep 60;done" - resources: - requests: - cpu: 2 - memory: 4Gi - limits: - cpu: 2 - memory: 4Gi + resources: + requests: + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 4Gi diff --git a/test/config/backends/sglang.yaml b/test/config/backends/sglang.yaml index 3eb4fab7..6a8dc60d 100644 --- a/test/config/backends/sglang.yaml +++ b/test/config/backends/sglang.yaml @@ -13,9 +13,11 @@ spec: - sglang.launch_server image: lmsysorg/sglang version: v0.2.10-cu121 - args: + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. 
+ recommendedConfigs: - name: default - flags: + args: - --model-path - "{{ .ModelPath }}" - --served-model-name @@ -24,13 +26,13 @@ spec: - "0.0.0.0" - --port - "8080" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/test/config/backends/tgi.yaml b/test/config/backends/tgi.yaml index 75235192..b16eed65 100644 --- a/test/config/backends/tgi.yaml +++ b/test/config/backends/tgi.yaml @@ -11,20 +11,20 @@ spec: version: 2.3.1 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. - args: + recommendedConfigs: - name: default - flags: + args: - --model-id - "{{ .ModelPath }}" - --port - "8080" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/test/config/backends/vllm.yaml b/test/config/backends/vllm.yaml index 7ecbd873..35e6cf67 100644 --- a/test/config/backends/vllm.yaml +++ b/test/config/backends/vllm.yaml @@ -58,9 +58,9 @@ spec: version: v0.6.0 # Do not edit the preset argument name unless you know what you're doing. # Free to add more arguments with your requirements. 
- args: + recommendedConfigs: - name: default - flags: + args: - --model - "{{ .ModelPath }}" - --served-model-name @@ -69,8 +69,15 @@ spec: - "0.0.0.0" - --port - "8080" + resources: + requests: + cpu: 4 + memory: 8Gi + limits: + cpu: 4 + memory: 8Gi - name: speculative-decoding - flags: + args: - --model - "{{ .ModelPath }}" - --served-model-name @@ -86,7 +93,7 @@ spec: - -tp - "1" - name: model-parallelism - flags: + args: - --model - "{{ .ModelPath }}" - --served-model-name @@ -99,13 +106,6 @@ spec: - "{{ .TP }}" - --pipeline-parallel-size - "{{ .PP }}" - resources: - requests: - cpu: 4 - memory: 8Gi - limits: - cpu: 4 - memory: 8Gi startupProbe: periodSeconds: 10 failureThreshold: 30 diff --git a/test/e2e/playground_test.go b/test/e2e/playground_test.go index 3640df92..c3c05938 100644 --- a/test/e2e/playground_test.go +++ b/test/e2e/playground_test.go @@ -54,7 +54,7 @@ var _ = ginkgo.Describe("playground e2e tests", func() { Image("ollama/ollama").Version("latest"). Command([]string{"sh", "-c"}). Arg("default", []string{"ollama serve & while true;do output=$(ollama list 2>&1);if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done;ollama run {{.ModelName}};while true;do sleep 60;done"}). - Request("cpu", "2").Request("memory", "4Gi").Limit("cpu", "4").Limit("memory", "4Gi").Obj() + Request("default", "cpu", "2").Request("default", "memory", "4Gi").Limit("default", "cpu", "4").Limit("default", "memory", "4Gi").Obj() gomega.Expect(k8sClient.Create(ctx, backendRuntime)).To(gomega.Succeed()) model := wrapper.MakeModel("qwen2-0--5b").FamilyName("qwen2").ModelSourceWithURI("ollama://qwen2:0.5b").Obj() @@ -91,7 +91,7 @@ var _ = ginkgo.Describe("playground e2e tests", func() { Image("ghcr.io/ggerganov/llama.cpp").Version("server"). Command([]string{"./llama-server"}). 
Arg("default", []string{"-m", "{{.ModelPath}}", "--host", "0.0.0.0", "--port", "8080"}). - Request("cpu", "2").Request("memory", "4Gi").Limit("cpu", "4").Limit("memory", "4Gi").Obj() + Request("default", "cpu", "2").Request("default", "memory", "4Gi").Limit("default", "cpu", "4").Limit("default", "memory", "4Gi").Obj() gomega.Expect(k8sClient.Create(ctx, backendRuntime)).To(gomega.Succeed()) model := wrapper.MakeModel("qwen2-0-5b-gguf").FamilyName("qwen2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("Qwen/Qwen2-0.5B-Instruct-GGUF", "qwen2-0_5b-instruct-q5_k_m.gguf", "", nil, nil).Obj() diff --git a/test/integration/controller/inference/hpa_test.go b/test/integration/controller/inference/hpa_test.go index 57a40993..afcd3b82 100644 --- a/test/integration/controller/inference/hpa_test.go +++ b/test/integration/controller/inference/hpa_test.go @@ -98,7 +98,7 @@ var _ = ginkgo.Describe("hpa test", func() { if err := k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, hpa); err != nil { return err } - if diff := cmp.Diff(playground.Spec.ElasticConfig.ScaleTrigger.HPA.Metrics, hpa.Spec.Metrics); diff != "" { + if diff := cmp.Diff(playground.Spec.BackendRuntimeConfig.ScaleTrigger.HPA.Metrics, hpa.Spec.Metrics); diff != "" { return fmt.Errorf("metrics not match: %s", diff) } return nil @@ -129,7 +129,7 @@ var _ = ginkgo.Describe("hpa test", func() { if err := k8sClient.Get(ctx, types.NamespacedName{Name: "fake-backend"}, backend); err != nil { return err } - if diff := cmp.Diff(backend.Spec.ScaleTriggers[0].HPA.Metrics, hpa.Spec.Metrics); diff != "" { + if diff := cmp.Diff(backend.Spec.RecommendedConfigs[0].ScaleTrigger.HPA.Metrics, hpa.Spec.Metrics); diff != "" { return fmt.Errorf("metrics not match: %s", diff) } return nil @@ -141,7 +141,8 @@ var _ = ginkgo.Describe("hpa test", func() { ginkgo.Entry("playground with scaleTrigger overwrite backendRuntime's", &testValidatingCase{ makePlayground: func() 
*inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - ElasticConfig(1, 3).ScaleTriggerRef("hpa2"). + ElasticConfig(1, 3). + HPA(util.MockASimpleHPATrigger()). BackendRuntime("fake-backend"). Obj() }, @@ -160,7 +161,7 @@ var _ = ginkgo.Describe("hpa test", func() { if err := k8sClient.Get(ctx, types.NamespacedName{Name: "fake-backend"}, backend); err != nil { return err } - if diff := cmp.Diff(backend.Spec.ScaleTriggers[1].HPA.Metrics, hpa.Spec.Metrics); diff != "" { + if diff := cmp.Diff(util.MockASimpleHPATrigger().Metrics, hpa.Spec.Metrics); diff != "" { return fmt.Errorf("metrics not match: %s", diff) } return nil diff --git a/test/integration/controller/inference/playground_test.go b/test/integration/controller/inference/playground_test.go index d62de815..fc7d7a97 100644 --- a/test/integration/controller/inference/playground_test.go +++ b/test/integration/controller/inference/playground_test.go @@ -194,7 +194,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("advance configured Playground with sglang", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - BackendRuntime("sglang").BackendRuntimeVersion("main").BackendRuntimeArgs("default", []string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntime("sglang").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). 
Obj() }, @@ -222,7 +222,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("advance configured Playground with llamacpp", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - BackendRuntime("llamacpp").BackendRuntimeVersion("main").BackendRuntimeArgs("default", []string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntime("llamacpp").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). Obj() }, @@ -250,7 +250,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("advance configured Playground with tgi", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgs("default", []string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR"). BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). Obj() }, @@ -278,7 +278,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("advance configured Playground with ollama", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - BackendRuntime("ollama").BackendRuntimeVersion("main").BackendRuntimeArgs("default", []string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). 
+ BackendRuntime("ollama").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). Obj() }, @@ -306,7 +306,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("advance configured Playground with argName set", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). - BackendRuntime("fake-backend").BackendRuntimeVersion("main").BackendRuntimeArgs("fuz", []string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntime("fake-backend").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR"). BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). Obj() }, @@ -468,7 +468,7 @@ var _ = ginkgo.Describe("playground controller test", func() { ginkgo.Entry("Playground with shared memory size configured", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return wrapper.MakePlayground("playground", ns.Name).ModelClaim(multiNodesModel.Name).Label(coreapi.ModelNameLabelKey, multiNodesModel.Name). - SharedMemorySize("1Gi"). + SharedMemorySize("2Gi"). 
Obj() }, updates: []*update{ diff --git a/test/integration/webhook/backendruntime_test.go b/test/integration/webhook/backendruntime_test.go index 220dc2ad..92dfea4f 100644 --- a/test/integration/webhook/backendruntime_test.go +++ b/test/integration/webhook/backendruntime_test.go @@ -23,6 +23,7 @@ import ( inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" "github.com/inftyai/llmaz/test/util" + "github.com/inftyai/llmaz/test/util/wrapper" ) var _ = ginkgo.Describe("BackendRuntime default and validation", func() { @@ -74,7 +75,7 @@ var _ = ginkgo.Describe("BackendRuntime default and validation", func() { }), ginkgo.Entry("BackendRuntime creation with limits less than requests", &testValidatingCase{ creationFunc: func() *inferenceapi.BackendRuntime { - return util.MockASampleBackendRuntime().Limit("cpu", "1").Obj() + return util.MockASampleBackendRuntime().Limit("default", "cpu", "1").Obj() }, createFailed: true, }), @@ -84,11 +85,15 @@ var _ = ginkgo.Describe("BackendRuntime default and validation", func() { }, createFailed: false, }), - ginkgo.Entry("BackendRuntime creation with duplicated argument name", &testValidatingCase{ + ginkgo.Entry("BackendRuntime creation with no resources", &testValidatingCase{ creationFunc: func() *inferenceapi.BackendRuntime { - return util.MockASampleBackendRuntime().Arg("default", []string{"foo", "bar"}).Obj() + return wrapper.MakeBackendRuntime("vllm"). + Image("vllm/vllm-openai").Version("v0.6.0"). + Command([]string{"python3", "-m", "vllm.entrypoints.openai.api_server"}). + Arg("default", []string{"--model", "{{.ModelPath}}", "--served-model-name", "{{.ModelName}}", "--host", "0.0.0.0", "--port", "8080"}). 
+ Obj() }, - createFailed: true, + createFailed: false, }), ) }) diff --git a/test/util/mock.go b/test/util/mock.go index 9a19c161..5069bc98 100644 --- a/test/util/mock.go +++ b/test/util/mock.go @@ -56,7 +56,7 @@ func MockASampleBackendRuntime() *wrapper.BackendRuntimeWrapper { Image("vllm/vllm-openai").Version("v0.6.0"). Command([]string{"python3", "-m", "vllm.entrypoints.openai.api_server"}). Arg("default", []string{"--model", "{{.ModelPath}}", "--served-model-name", "{{.ModelName}}", "--host", "0.0.0.0", "--port", "8080"}). - Request("cpu", "4").Limit("cpu", "4") + Request("default", "cpu", "4").Limit("default", "cpu", "4") } func MockASimpleHPATrigger() *inferenceapi.HPATrigger { diff --git a/test/util/validation/validate_playground.go b/test/util/validation/validate_playground.go index 4bfb1fc5..9617651b 100644 --- a/test/util/validation/validate_playground.go +++ b/test/util/validation/validate_playground.go @@ -33,7 +33,8 @@ import ( coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" helper "github.com/inftyai/llmaz/pkg/controller_helper" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + backendruntime "github.com/inftyai/llmaz/pkg/controller_helper/backendruntime" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" pkgutil "github.com/inftyai/llmaz/pkg/util" "github.com/inftyai/llmaz/test/util" "github.com/inftyai/llmaz/test/util/format" @@ -90,15 +91,15 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground } backendRuntimeName := inferenceapi.DefaultBackend - if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil { - backendRuntimeName = *playground.Spec.BackendRuntimeConfig.Name + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.BackendName != nil { + backendRuntimeName = *playground.Spec.BackendRuntimeConfig.BackendName } 
backendRuntime := inferenceapi.BackendRuntime{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: string(backendRuntimeName)}, &backendRuntime); err != nil { return errors.New("failed to get backendRuntime") } - parser := helper.NewBackendRuntimeParser(&backendRuntime) + parser := backendruntime.NewBackendRuntimeParser(&backendRuntime, models, playground) multiHost := service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate != nil if service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Name != modelSource.MODEL_RUNNER_CONTAINER_NAME { @@ -110,144 +111,98 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground } } - // compare the same part of leader and worker template, image, version, env, resources. - if playground.Spec.BackendRuntimeConfig != nil { + // compare fields both backendRuntime and playground can configure. - // compare image & version - if playground.Spec.BackendRuntimeConfig.Version != nil { - if parser.Image(*playground.Spec.BackendRuntimeConfig.Version) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image { - return fmt.Errorf("expected container image %s, got %s", parser.Image(*playground.Spec.BackendRuntimeConfig.Version), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image) - } - if multiHost { - if parser.Image(*playground.Spec.BackendRuntimeConfig.Version) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image { - return fmt.Errorf("expected container image %s, got %s", parser.Image(*playground.Spec.BackendRuntimeConfig.Version), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image) - } - } - } else { - if parser.Image(parser.Version()) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image { - return fmt.Errorf("expected container image %s, got %s", 
parser.Image(parser.Version()), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image) - } - if multiHost { - if parser.Image(parser.Version()) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image { - return fmt.Errorf("expected container image %s, got %s", parser.Image(parser.Version()), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image) - } + sharedMemorySize := parser.SharedMemorySize() + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.SharedMemorySize != nil { + sharedMemorySize = playground.Spec.BackendRuntimeConfig.SharedMemorySize + } + if sharedMemorySize != nil { + if multiHost { + if *sharedMemorySize != *service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].EmptyDir.SizeLimit { + return fmt.Errorf("expected SharedMemorySize %s, got %s", sharedMemorySize.String(), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].EmptyDir.SizeLimit.String()) } } - - if playground.Spec.BackendRuntimeConfig.Envs != nil { - if diff := cmp.Diff(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Env, playground.Spec.BackendRuntimeConfig.Envs); diff != "" { - return fmt.Errorf("unexpected envs") - } - if multiHost { - if diff := cmp.Diff(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Env, playground.Spec.BackendRuntimeConfig.Envs); diff != "" { - return fmt.Errorf("unexpected envs") - } - } + if *sharedMemorySize != *service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].EmptyDir.SizeLimit { + return fmt.Errorf("expected SharedMemorySize %s, got %s", sharedMemorySize.String(), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].EmptyDir.SizeLimit.String()) } + } - if playground.Spec.BackendRuntimeConfig.Resources != nil { - for k, v := 
range playground.Spec.BackendRuntimeConfig.Resources.Limits { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { - return fmt.Errorf("unexpected limits for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k]) - } - if multiHost { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { - return fmt.Errorf("unexpected limits for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k]) - } - } - } - for k, v := range playground.Spec.BackendRuntimeConfig.Resources.Requests { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { - return fmt.Errorf("unexpected requests for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k]) - } - if multiHost { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { - return fmt.Errorf("unexpected requests for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k]) - } - } - } - } else { - // Validate default resources requirements. 
- for k, v := range parser.Resources().Limits { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { - return fmt.Errorf("unexpected limit for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k]) - } - if multiHost { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { - return fmt.Errorf("unexpected limit for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k]) - } - } - } - for k, v := range parser.Resources().Requests { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { - return fmt.Errorf("unexpected limit for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k]) - } - if multiHost { - if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { - return fmt.Errorf("unexpected limit for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k]) - } - } + resources := parser.Resources() + if resources == nil { + resources = &inferenceapi.ResourceRequirements{} + } + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Resources != nil { + resources = playground.Spec.BackendRuntimeConfig.Resources + } + for k, v := range resources.Limits { + if multiHost { + if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { + return fmt.Errorf("unexpected limits for %s, want %v, got %v", k, v, 
service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Limits[k]) } } - - // compare probes - if backendRuntime.Spec.StartupProbe != nil { - if multiHost { - if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { - return fmt.Errorf("unexpected startupProbe") - } - } else { - if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { - return fmt.Errorf("unexpected startupProbe") - } - } + if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k].Equal(v) { + return fmt.Errorf("unexpected limits for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits[k]) } - if backendRuntime.Spec.LivenessProbe != nil { - if multiHost { - if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { - return fmt.Errorf("unexpected livenessProbe") - } - } else { - if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { - return fmt.Errorf("unexpected livenessProbe") - } + } + for k, v := range resources.Requests { + if multiHost { + if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { + return fmt.Errorf("unexpected requests for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Resources.Requests[k]) } } - if backendRuntime.Spec.ReadinessProbe != nil { - if multiHost { - if diff := 
cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { - return fmt.Errorf("unexpected readinessProbe") - } - } else { - if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { - return fmt.Errorf("unexpected readinessProbe") - } - } + if !service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k].Equal(v) { + return fmt.Errorf("unexpected requests for %s, want %v, got %v", k, v, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k]) } } - // compare the different parts. + version := parser.Version() + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil { + version = *playground.Spec.BackendRuntimeConfig.Version + } + if parser.Image(version) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image { + return fmt.Errorf("expected container image %s, got %s", parser.Image(version), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image) + } + if multiHost { + if parser.Image(version) != service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image { + return fmt.Errorf("expected container image %s, got %s", parser.Image(version), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image) + } + } - args, err := parser.Args(playground, models, multiHost) + envs := parser.Envs() + if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Envs != nil { + envs = playground.Spec.BackendRuntimeConfig.Envs + } + if diff := cmp.Diff(envs, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Env); diff != "" { + 
return fmt.Errorf("unexpected envs") + } + if multiHost { + if diff := cmp.Diff(envs, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Env); diff != "" { + return fmt.Errorf("unexpected envs") + } + } + + args, err := parser.Args() if err != nil { return err } if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Args != nil { - args = append(args, playground.Spec.BackendRuntimeConfig.Args.Flags...) + args = append(args, playground.Spec.BackendRuntimeConfig.Args...) } - for _, arg := range args { - if multiHost { - if len(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Args) != 0 { - return fmt.Errorf("args should be empty, but got: %v", service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Args) - } - } else { + if multiHost { + if len(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Args) != 0 { + return fmt.Errorf("args should be empty, but got: %v", service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Args) + } + } else { + for _, arg := range args { if !slices.Contains(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Args, arg) { return fmt.Errorf("didn't contain arg: %s", arg) } } } + // compare commands if multiHost { if diff := cmp.Diff(pkgutil.MergeArgsWithCommands(parser.LeaderCommands(), args), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Command); diff != "" { return errors.New("command not right") @@ -261,14 +216,39 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground } } - if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.SharedMemorySize != nil { + // compare fields only can be configured in backend. 
+ + if backendRuntime.Spec.StartupProbe != nil { if multiHost { - if *playground.Spec.BackendRuntimeConfig.SharedMemorySize != *service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].EmptyDir.SizeLimit { - return fmt.Errorf("expected SharedMemorySize %s, got %s", playground.Spec.BackendRuntimeConfig.SharedMemorySize.String(), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].EmptyDir.SizeLimit.String()) + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { + return fmt.Errorf("unexpected startupProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].StartupProbe, *backendRuntime.Spec.StartupProbe); diff != "" { + return fmt.Errorf("unexpected startupProbe") } } - if *playground.Spec.BackendRuntimeConfig.SharedMemorySize != *service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].EmptyDir.SizeLimit { - return fmt.Errorf("expected SharedMemorySize %s, got %s", playground.Spec.BackendRuntimeConfig.SharedMemorySize.String(), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].EmptyDir.SizeLimit.String()) + } + if backendRuntime.Spec.LivenessProbe != nil { + if multiHost { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { + return fmt.Errorf("unexpected livenessProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].LivenessProbe, *backendRuntime.Spec.LivenessProbe); diff != "" { + return fmt.Errorf("unexpected livenessProbe") + } + } + } + if backendRuntime.Spec.ReadinessProbe != nil { + if multiHost { + if diff := 
cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { + return fmt.Errorf("unexpected readinessProbe") + } + } else { + if diff := cmp.Diff(*service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].ReadinessProbe, *backendRuntime.Spec.ReadinessProbe); diff != "" { + return fmt.Errorf("unexpected readinessProbe") + } } } diff --git a/test/util/validation/validate_service.go b/test/util/validation/validate_service.go index aaf65236..c66ecc43 100644 --- a/test/util/validation/validate_service.go +++ b/test/util/validation/validate_service.go @@ -33,7 +33,7 @@ import ( coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" "github.com/inftyai/llmaz/pkg" - modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source" + modelSource "github.com/inftyai/llmaz/pkg/controller_helper/modelsource" "github.com/inftyai/llmaz/test/util" ) diff --git a/test/util/wrapper/backend.go b/test/util/wrapper/backend.go index 26faf2b8..7d4148a8 100644 --- a/test/util/wrapper/backend.go +++ b/test/util/wrapper/backend.go @@ -20,7 +20,6 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/utils/ptr" inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1" ) @@ -64,26 +63,79 @@ func (w *BackendRuntimeWrapper) Command(commands []string) *BackendRuntimeWrappe } func (w *BackendRuntimeWrapper) Arg(name string, flags []string) *BackendRuntimeWrapper { - w.Spec.Args = append(w.Spec.Args, inferenceapi.BackendRuntimeArg{ - Name: ptr.To[string](name), - Flags: flags, - }) + if w.Spec.RecommendedConfigs == nil { + w.Spec.RecommendedConfigs = []inferenceapi.RecommendedConfig{ + { + Name: name, + }, + } + } + for _, recommend := range w.Spec.RecommendedConfigs { + if recommend.Name == name { + recommend.Args = 
flags + break + } + } return w } -func (w *BackendRuntimeWrapper) Request(r, v string) *BackendRuntimeWrapper { - if w.Spec.Resources.Requests == nil { - w.Spec.Resources.Requests = corev1.ResourceList{} +func (w *BackendRuntimeWrapper) Request(name, r, v string) *BackendRuntimeWrapper { + if w.Spec.RecommendedConfigs == nil { + w.Spec.RecommendedConfigs = []inferenceapi.RecommendedConfig{ + { + Name: name, + }, + } + } + for i, recommend := range w.Spec.RecommendedConfigs { + if recommend.Name == name { + if w.Spec.RecommendedConfigs[i].Resources == nil { + w.Spec.RecommendedConfigs[i].Resources = &inferenceapi.ResourceRequirements{} + } + if w.Spec.RecommendedConfigs[i].Resources.Requests == nil { + w.Spec.RecommendedConfigs[i].Resources.Requests = corev1.ResourceList{} + } + w.Spec.RecommendedConfigs[i].Resources.Requests[corev1.ResourceName(r)] = resource.MustParse(v) + break + } } - w.Spec.Resources.Requests[corev1.ResourceName(r)] = resource.MustParse(v) return w } -func (w *BackendRuntimeWrapper) Limit(r, v string) *BackendRuntimeWrapper { - if w.Spec.Resources.Limits == nil { - w.Spec.Resources.Limits = corev1.ResourceList{} +func (w *BackendRuntimeWrapper) Limit(name, r, v string) *BackendRuntimeWrapper { + if w.Spec.RecommendedConfigs == nil { + w.Spec.RecommendedConfigs = []inferenceapi.RecommendedConfig{ + { + Name: name, + }, + } + } + for i, recommend := range w.Spec.RecommendedConfigs { + if recommend.Name == name { + if w.Spec.RecommendedConfigs[i].Resources.Limits == nil { + w.Spec.RecommendedConfigs[i].Resources.Limits = corev1.ResourceList{} + } + w.Spec.RecommendedConfigs[i].Resources.Limits[corev1.ResourceName(r)] = resource.MustParse(v) + break + } + } + return w +} + +func (w *BackendRuntimeWrapper) SharedMemorySize(name, v string) *BackendRuntimeWrapper { + if w.Spec.RecommendedConfigs == nil { + w.Spec.RecommendedConfigs = []inferenceapi.RecommendedConfig{ + { + Name: name, + }, + } + } + for i, recommend := range w.Spec.RecommendedConfigs 
{ + if recommend.Name == name { + value := resource.MustParse(v) + w.Spec.RecommendedConfigs[i].SharedMemorySize = &value + } } - w.Spec.Resources.Limits[corev1.ResourceName(r)] = resource.MustParse(v) return w } diff --git a/test/util/wrapper/playground.go b/test/util/wrapper/playground.go index 10502ab3..0b76cdd9 100644 --- a/test/util/wrapper/playground.go +++ b/test/util/wrapper/playground.go @@ -97,7 +97,7 @@ func (w *PlaygroundWrapper) BackendRuntime(name string) *PlaygroundWrapper { w.Spec.BackendRuntimeConfig = &inferenceapi.BackendRuntimeConfig{} } backendName := inferenceapi.BackendName(name) - w.Spec.BackendRuntimeConfig.Name = &backendName + w.Spec.BackendRuntimeConfig.BackendName = &backendName return w } @@ -109,15 +109,11 @@ func (w *PlaygroundWrapper) BackendRuntimeVersion(version string) *PlaygroundWra return w } -func (w *PlaygroundWrapper) BackendRuntimeArgs(name string, args []string) *PlaygroundWrapper { +func (w *PlaygroundWrapper) BackendRuntimeArgs(args []string) *PlaygroundWrapper { if w.Spec.BackendRuntimeConfig == nil { w = w.BackendRuntime("vllm") } - if w.Spec.BackendRuntimeConfig.Args == nil { - w.Spec.BackendRuntimeConfig.Args = &inferenceapi.BackendRuntimeArg{} - } - w.Spec.BackendRuntimeConfig.Args.Name = &name - w.Spec.BackendRuntimeConfig.Args.Flags = args + w.Spec.BackendRuntimeConfig.Args = args return w } @@ -161,32 +157,22 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper } func (w *PlaygroundWrapper) ElasticConfig(minReplicas, maxReplicas int32) *PlaygroundWrapper { - w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{ - MaxReplicas: ptr.To[int32](maxReplicas), - MinReplicas: ptr.To[int32](minReplicas), - } - return w -} - -func (w *PlaygroundWrapper) HPA(config *inferenceapi.HPATrigger) *PlaygroundWrapper { if w.Spec.ElasticConfig == nil { w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{} } - if w.Spec.ElasticConfig.ScaleTrigger == nil { - w.Spec.ElasticConfig.ScaleTrigger = 
&inferenceapi.ScaleTrigger{} - } - w.Spec.ElasticConfig.ScaleTrigger.HPA = config + w.Spec.ElasticConfig.MaxReplicas = ptr.To[int32](maxReplicas) + w.Spec.ElasticConfig.MinReplicas = ptr.To[int32](minReplicas) return w } -func (w *PlaygroundWrapper) ScaleTriggerRef(name string) *PlaygroundWrapper { - if w.Spec.ElasticConfig == nil { - w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{} +func (w *PlaygroundWrapper) HPA(config *inferenceapi.HPATrigger) *PlaygroundWrapper { + if w.Spec.BackendRuntimeConfig == nil { + w.Spec.BackendRuntimeConfig = &inferenceapi.BackendRuntimeConfig{} } - if w.Spec.ElasticConfig.ScaleTriggerRef == nil { - w.Spec.ElasticConfig.ScaleTriggerRef = &inferenceapi.ScaleTriggerRef{} + if w.Spec.BackendRuntimeConfig.ScaleTrigger == nil { + w.Spec.BackendRuntimeConfig.ScaleTrigger = &inferenceapi.ScaleTrigger{} } - w.Spec.ElasticConfig.ScaleTriggerRef.Name = name + w.Spec.BackendRuntimeConfig.ScaleTrigger.HPA = config return w } From 1b4885fb85fda08398f701c09ee093c73315a209 Mon Sep 17 00:00:00 2001 From: kerthcet Date: Tue, 18 Feb 2025 11:51:40 +0800 Subject: [PATCH 2/3] change E2E timeout to 5minutes Signed-off-by: kerthcet --- test/util/consts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/util/consts.go b/test/util/consts.go index 4d89170c..969de5e2 100644 --- a/test/util/consts.go +++ b/test/util/consts.go @@ -20,6 +20,6 @@ import "time" const ( IntegrationTimeout = 10 * time.Second Interval = time.Millisecond * 250 - E2ETimeout = 3 * time.Minute + E2ETimeout = 5 * time.Minute E2EInterval = 1 * time.Second ) From a719916319e180b4f254651cb1d5582402154e31 Mon Sep 17 00:00:00 2001 From: kerthcet Date: Tue, 18 Feb 2025 14:30:24 +0800 Subject: [PATCH 3/3] Fix e2e test error Signed-off-by: kerthcet --- test/util/wrapper/backend.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/util/wrapper/backend.go b/test/util/wrapper/backend.go index 7d4148a8..3f73ec58 100644 --- 
a/test/util/wrapper/backend.go +++ b/test/util/wrapper/backend.go @@ -62,7 +62,7 @@ func (w *BackendRuntimeWrapper) Command(commands []string) *BackendRuntimeWrappe return w } -func (w *BackendRuntimeWrapper) Arg(name string, flags []string) *BackendRuntimeWrapper { +func (w *BackendRuntimeWrapper) Arg(name string, args []string) *BackendRuntimeWrapper { if w.Spec.RecommendedConfigs == nil { w.Spec.RecommendedConfigs = []inferenceapi.RecommendedConfig{ { @@ -70,9 +70,9 @@ func (w *BackendRuntimeWrapper) Arg(name string, flags []string) *BackendRuntime }, } } - for _, recommend := range w.Spec.RecommendedConfigs { + for i, recommend := range w.Spec.RecommendedConfigs { if recommend.Name == name { - recommend.Args = flags + w.Spec.RecommendedConfigs[i].Args = args break } }