From f1f36d28c0dd612b3d428e325c6b81f640a58a38 Mon Sep 17 00:00:00 2001 From: kerthcet Date: Mon, 2 Sep 2024 15:10:56 +0800 Subject: [PATCH] [1/N] Add speculativeDecoding support Signed-off-by: kerthcet --- README.md | 15 ++- api/core/v1alpha1/model_types.go | 49 +++++---- api/core/v1alpha1/zz_generated.deepcopy.go | 5 - api/inference/v1alpha1/config_types.go | 1 + api/inference/v1alpha1/playground_types.go | 18 ++-- api/inference/v1alpha1/service_types.go | 11 +- .../v1alpha1/zz_generated.deepcopy.go | 18 +--- .../core/v1alpha1/multimodelsclaim.go | 22 ++-- .../inference/v1alpha1/playgroundspec.go | 23 ++--- .../inference/v1alpha1/servicespec.go | 21 ++-- .../bases/inference.llmaz.io_playgrounds.yaml | 97 +++++++++--------- .../bases/inference.llmaz.io_services.yaml | 75 ++++++-------- config/crd/bases/llmaz.io_openmodels.yaml | 9 +- config/manager/kustomization.yaml | 4 +- docs/assets/.DS_Store | Bin 6148 -> 0 bytes docs/examples/README.md | 9 +- docs/examples/llamacpp/model.yaml | 2 +- docs/examples/sglang/model.yaml | 2 +- .../speculative-decoding/vllm/model.yaml | 25 +++++ .../speculative-decoding/vllm/playground.yaml | 18 ++++ llmaz/README.md | 4 +- .../inference/playground_controller.go | 50 ++++++--- .../inference/service_controller.go | 38 +++---- pkg/controller_helper/backend/backend.go | 12 ++- pkg/controller_helper/backend/llamacpp.go | 10 +- pkg/controller_helper/backend/sglang.go | 10 +- pkg/controller_helper/backend/vllm.go | 27 ++++- .../model_source/modelhub.go | 15 ++- .../model_source/modelsource.go | 6 +- pkg/controller_helper/model_source/uri.go | 14 ++- pkg/webhook/playground_webhook.go | 19 +++- .../controller/inference/playground_test.go | 25 ++++- .../controller/inference/service_test.go | 24 ++++- test/integration/webhook/playground_test.go | 30 ++++++ test/integration/webhook/service_test.go | 3 +- test/util/mock.go | 2 +- test/util/validation/validate_playground.go | 58 +++++++---- test/util/validation/validate_service.go | 47 +++++---- 
test/util/wrapper/playground.go | 21 ++++ test/util/wrapper/service.go | 8 +- 40 files changed, 549 insertions(+), 298 deletions(-) delete mode 100644 docs/assets/.DS_Store create mode 100644 docs/examples/speculative-decoding/vllm/model.yaml create mode 100644 docs/examples/speculative-decoding/vllm/playground.yaml diff --git a/README.md b/README.md index b9f0b4d5..6365cc01 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,11 @@ Easy, advanced inference platform for large language models on Kubernetes ## Feature Overview - **Easy of Use**: People can quick deploy a LLM service with minimal configurations. -- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for high performance, like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). +- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). - **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to support elastic scenarios. - **Accelerator Fungibility (WIP)**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance. -- **SOTA Inference (WIP)**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677) to run on Kubernetes. 
-- **Various Model Providers**: llmaz automatically loads models from various providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores(aliyun OSS, more on the way). +- **SOTA Inference**: llmaz supports the latest cutting-edge research like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes. +- **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores(aliyun OSS, more on the way). llmaz automatically handles model loading, requiring no effort from users. - **Multi-hosts Support**: llmaz supports both single-host and multi-hosts scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 1. ## Quick Start @@ -110,10 +110,19 @@ curl http://localhost:8080/v1/completions \ ## Roadmap - Gateway support for traffic routing +- Metrics support - Serverless support for cloud-agnostic users - CLI tool support - Model training, fine tuning in the long-term +## Project Structure + +```structure +llmaz # root +├── llmaz # where the model loader logic lives +├── pkg # where the main logic for Kubernetes controllers lives +``` + ## Contributions 🚀 All kinds of contributions are welcomed ! Please follow [Contributing](./CONTRIBUTING.md). Thanks to all these contributors. diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go index 7fbad86d..e0ca3d62 100644 --- a/api/core/v1alpha1/model_types.go +++ b/api/core/v1alpha1/model_types.go @@ -92,9 +92,9 @@ type Flavor struct { // the requests here will be covered. // +optional Requests v1.ResourceList `json:"requests,omitempty"` - // NodeSelector defines the labels to filter specified nodes, like - // cloud-provider.com/accelerator: nvidia-a100. - // NodeSelector will be auto injected to the Pods as scheduling primitives.
+ // NodeSelector represents the node candidates for Pod placements, if a node doesn't + // meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin. + // If nodeSelector is empty, it means every node is a candidate. // +optional NodeSelector map[string]string `json:"nodeSelector,omitempty"` // Params stores other useful parameters and will be consumed by the autoscaling components @@ -107,39 +107,47 @@ type Flavor struct { type ModelName string -// ModelClaim represents the references to one model. -// It's a simple config for most of the cases compared to multiModelsClaim. +// ModelClaim represents claiming for one model, it's the standard claimMode +// of multiModelsClaim compared to other modes like SpeculativeDecoding. type ModelClaim struct { - // ModelName represents a list of models, there maybe multiple models here - // to support state-of-the-art technologies like speculative decoding. + // ModelName represents the name of the Model. ModelName ModelName `json:"modelName,omitempty"` - // InferenceFlavors represents a list of flavors with fungibility supports - // to serve the model. The flavor names should be a subset of the model - // configured flavors. If not set, will use the model configured flavors. + // InferenceFlavors represents a list of flavors with fungibility support + // to serve the model. + // If set, The flavor names should be a subset of the model configured flavors. + // If not set, Model configured flavors will be used by default. // +optional InferenceFlavors []FlavorName `json:"inferenceFlavors,omitempty"` } -// MultiModelsClaim represents the references to multiple models. -// It's an advanced and more complicated config comparing to modelClaim. 
+type InferenceMode string + +const ( + Standard InferenceMode = "Standard" + SpeculativeDecoding InferenceMode = "SpeculativeDecoding" +) + +// MultiModelsClaim represents claiming for multiple models with different claimModes, +// like standard or speculative-decoding to support different inference scenarios. type MultiModelsClaim struct { // ModelNames represents a list of models, there maybe multiple models here // to support state-of-the-art technologies like speculative decoding. + // If the composedMode is SpeculativeDecoding, the first model is the target model, + // and the second model is the draft model. // +kubebuilder:validation:MinItems=1 ModelNames []ModelName `json:"modelNames,omitempty"` + // Mode represents the paradigm to serve the model, whether via a standard way + // or via an advanced technique like SpeculativeDecoding. + // +kubebuilder:default=Standard + // +kubebuilder:validation:Enum={Standard,SpeculativeDecoding} + // +optional + InferenceMode InferenceMode `json:"inferenceMode,omitempty"` // InferenceFlavors represents a list of flavors with fungibility supported // to serve the model. // - If not set, always apply with the 0-index model by default. // - If set, will lookup the flavor names following the model orders. // +optional InferenceFlavors []FlavorName `json:"inferenceFlavors,omitempty"` - // Rate works only when multiple claims declared, it represents the replicas rates of - // the sub-workload, like when claim1.rate:claim2.rate = 1:2 and 3 replicas defined in - // workload, then sub-workload1 will have 1 replica, and sub-workload2 will have 2 replicas. - // This is mostly designed for state-of-the-art technology called splitwise, the prefill - // and decode phase will be separated and requires different accelerators. - // The sum of the rates should be divisible by replicas. 
- Rate *int32 `json:"rate,omitempty"` } // ModelSpec defines the desired state of Model @@ -151,7 +159,8 @@ type ModelSpec struct { // the model such as loading from huggingface, OCI registry, s3, host path and so on. Source ModelSource `json:"source"` // InferenceFlavors represents the accelerator requirements to serve the model. - // Flavors are fungible following the priority of slice order. + // Flavors are fungible following the priority represented by the slice order. + // +kubebuilder:validation:MaxItems=8 // +optional InferenceFlavors []Flavor `json:"inferenceFlavors,omitempty"` } diff --git a/api/core/v1alpha1/zz_generated.deepcopy.go b/api/core/v1alpha1/zz_generated.deepcopy.go index 00dfcf4e..8ad44d3e 100644 --- a/api/core/v1alpha1/zz_generated.deepcopy.go +++ b/api/core/v1alpha1/zz_generated.deepcopy.go @@ -195,11 +195,6 @@ func (in *MultiModelsClaim) DeepCopyInto(out *MultiModelsClaim) { *out = make([]FlavorName, len(*in)) copy(*out, *in) } - if in.Rate != nil { - in, out := &in.Rate, &out.Rate - *out = new(int32) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MultiModelsClaim. diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go index d3078e75..ebe466e1 100644 --- a/api/inference/v1alpha1/config_types.go +++ b/api/inference/v1alpha1/config_types.go @@ -39,6 +39,7 @@ type BackendConfig struct { // +optional Version *string `json:"version,omitempty"` // Args represents the arguments passed to the backend. + // You can add new args or overwrite the default args. // +optional Args []string `json:"args,omitempty"` // Envs represents the environments set to the container. 
diff --git a/api/inference/v1alpha1/playground_types.go b/api/inference/v1alpha1/playground_types.go index 1d18f270..792fe421 100644 --- a/api/inference/v1alpha1/playground_types.go +++ b/api/inference/v1alpha1/playground_types.go @@ -28,19 +28,17 @@ type PlaygroundSpec struct { // +kubebuilder:default=1 // +optional Replicas *int32 `json:"replicas,omitempty"` - // ModelClaim represents one modelClaim, it's a simple configuration - // compared to multiModelsClaims only work for one model and one claim. - // ModelClaim and multiModelsClaims are exclusive configured. - // Note: properties (nodeSelectors, resources, e.g.) of the model flavors - // will be applied to the workload if not exist. + // ModelClaim represents claiming for one model, it's the standard claimMode + // of multiModelsClaim compared to other modes like SpeculativeDecoding. + // Most of the time, modelClaim is enough. + // ModelClaim and multiModelsClaim are exclusive configured. // +optional ModelClaim *coreapi.ModelClaim `json:"modelClaim,omitempty"` - // MultiModelsClaims represents multiple modelClaim, which is useful when different - // sub-workload has different accelerator requirements, like the state-of-the-art - // technology called splitwise, the workload template is shared by both. - // ModelClaim and multiModelsClaims are exclusive configured. + // MultiModelsClaim represents claiming for multiple models with different claimModes, + // like standard or speculative-decoding to support different inference scenarios. + // ModelClaim and multiModelsClaim are exclusive configured. // +optional - MultiModelsClaims []coreapi.MultiModelsClaim `json:"multiModelsClaims,omitempty"` + MultiModelsClaim *coreapi.MultiModelsClaim `json:"multiModelsClaim,omitempty"` // BackendConfig represents the inference backend configuration // under the hood, e.g. vLLM, which is the default backend. 
// +optional diff --git a/api/inference/v1alpha1/service_types.go b/api/inference/v1alpha1/service_types.go index 7af3f22a..9ab675b9 100644 --- a/api/inference/v1alpha1/service_types.go +++ b/api/inference/v1alpha1/service_types.go @@ -27,14 +27,9 @@ import ( // Service controller will maintain multi-flavor of workloads with // different accelerators for cost or performance considerations. type ServiceSpec struct { - // MultiModelsClaims represents multiple modelClaim, which is useful when different - // sub-workload has different accelerator requirements, like the state-of-the-art - // technology called splitwise, the workload template is shared by both. - // Most of the time, one modelClaim is enough. - // Note: properties (nodeSelectors, resources, e.g.) of the model flavors - // will be applied to the workload if not exist. - // +kubebuilder:validation:MinItems=1 - MultiModelsClaims []coreapi.MultiModelsClaim `json:"multiModelsClaims,omitempty"` + // MultiModelsClaim represents claiming for multiple models with different claimModes, + // like standard or speculative-decoding to support different inference scenarios. + MultiModelsClaim coreapi.MultiModelsClaim `json:"multiModelsClaim,omitempty"` // WorkloadTemplate defines the underlying workload layout and configuration. // Note: the LWS spec might be twisted with various LWS instances to support // accelerator fungibility or other cutting-edge researches. 
diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go index d9d94588..cfdad843 100644 --- a/api/inference/v1alpha1/zz_generated.deepcopy.go +++ b/api/inference/v1alpha1/zz_generated.deepcopy.go @@ -166,12 +166,10 @@ func (in *PlaygroundSpec) DeepCopyInto(out *PlaygroundSpec) { *out = new(corev1alpha1.ModelClaim) (*in).DeepCopyInto(*out) } - if in.MultiModelsClaims != nil { - in, out := &in.MultiModelsClaims, &out.MultiModelsClaims - *out = make([]corev1alpha1.MultiModelsClaim, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } + if in.MultiModelsClaim != nil { + in, out := &in.MultiModelsClaim, &out.MultiModelsClaim + *out = new(corev1alpha1.MultiModelsClaim) + (*in).DeepCopyInto(*out) } if in.BackendConfig != nil { in, out := &in.BackendConfig, &out.BackendConfig @@ -303,13 +301,7 @@ func (in *ServiceList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServiceSpec) DeepCopyInto(out *ServiceSpec) { *out = *in - if in.MultiModelsClaims != nil { - in, out := &in.MultiModelsClaims, &out.MultiModelsClaims - *out = make([]corev1alpha1.MultiModelsClaim, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } + in.MultiModelsClaim.DeepCopyInto(&out.MultiModelsClaim) in.WorkloadTemplate.DeepCopyInto(&out.WorkloadTemplate) if in.ElasticConfig != nil { in, out := &in.ElasticConfig, &out.ElasticConfig diff --git a/client-go/applyconfiguration/core/v1alpha1/multimodelsclaim.go b/client-go/applyconfiguration/core/v1alpha1/multimodelsclaim.go index f086f03e..3c6a8bc3 100644 --- a/client-go/applyconfiguration/core/v1alpha1/multimodelsclaim.go +++ b/client-go/applyconfiguration/core/v1alpha1/multimodelsclaim.go @@ -24,9 +24,9 @@ import ( // MultiModelsClaimApplyConfiguration represents an declarative configuration of the MultiModelsClaim type for use // with apply. 
type MultiModelsClaimApplyConfiguration struct { - ModelNames []v1alpha1.ModelName `json:"modelNames,omitempty"` - InferenceFlavors []v1alpha1.FlavorName `json:"inferenceFlavors,omitempty"` - Rate *int32 `json:"rate,omitempty"` + ModelNames []v1alpha1.ModelName `json:"modelNames,omitempty"` + InferenceMode *v1alpha1.InferenceMode `json:"inferenceMode,omitempty"` + InferenceFlavors []v1alpha1.FlavorName `json:"inferenceFlavors,omitempty"` } // MultiModelsClaimApplyConfiguration constructs an declarative configuration of the MultiModelsClaim type for use with @@ -45,6 +45,14 @@ func (b *MultiModelsClaimApplyConfiguration) WithModelNames(values ...v1alpha1.M return b } +// WithInferenceMode sets the InferenceMode field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the InferenceMode field is set to the value of the last call. +func (b *MultiModelsClaimApplyConfiguration) WithInferenceMode(value v1alpha1.InferenceMode) *MultiModelsClaimApplyConfiguration { + b.InferenceMode = &value + return b +} + // WithInferenceFlavors adds the given value to the InferenceFlavors field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, values provided by each call will be appended to the InferenceFlavors field. @@ -54,11 +62,3 @@ func (b *MultiModelsClaimApplyConfiguration) WithInferenceFlavors(values ...v1al } return b } - -// WithRate sets the Rate field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Rate field is set to the value of the last call. 
-func (b *MultiModelsClaimApplyConfiguration) WithRate(value int32) *MultiModelsClaimApplyConfiguration { - b.Rate = &value - return b -} diff --git a/client-go/applyconfiguration/inference/v1alpha1/playgroundspec.go b/client-go/applyconfiguration/inference/v1alpha1/playgroundspec.go index 02604da3..6c39c925 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/playgroundspec.go +++ b/client-go/applyconfiguration/inference/v1alpha1/playgroundspec.go @@ -24,10 +24,10 @@ import ( // PlaygroundSpecApplyConfiguration represents an declarative configuration of the PlaygroundSpec type for use // with apply. type PlaygroundSpecApplyConfiguration struct { - Replicas *int32 `json:"replicas,omitempty"` - ModelClaim *v1alpha1.ModelClaimApplyConfiguration `json:"modelClaim,omitempty"` - MultiModelsClaims []v1alpha1.MultiModelsClaimApplyConfiguration `json:"multiModelsClaims,omitempty"` - BackendConfig *BackendConfigApplyConfiguration `json:"backendConfig,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` + ModelClaim *v1alpha1.ModelClaimApplyConfiguration `json:"modelClaim,omitempty"` + MultiModelsClaim *v1alpha1.MultiModelsClaimApplyConfiguration `json:"multiModelsClaim,omitempty"` + BackendConfig *BackendConfigApplyConfiguration `json:"backendConfig,omitempty"` } // PlaygroundSpecApplyConfiguration constructs an declarative configuration of the PlaygroundSpec type for use with @@ -52,16 +52,11 @@ func (b *PlaygroundSpecApplyConfiguration) WithModelClaim(value *v1alpha1.ModelC return b } -// WithMultiModelsClaims adds the given value to the MultiModelsClaims field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the MultiModelsClaims field. 
-func (b *PlaygroundSpecApplyConfiguration) WithMultiModelsClaims(values ...*v1alpha1.MultiModelsClaimApplyConfiguration) *PlaygroundSpecApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithMultiModelsClaims") - } - b.MultiModelsClaims = append(b.MultiModelsClaims, *values[i]) - } +// WithMultiModelsClaim sets the MultiModelsClaim field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MultiModelsClaim field is set to the value of the last call. +func (b *PlaygroundSpecApplyConfiguration) WithMultiModelsClaim(value *v1alpha1.MultiModelsClaimApplyConfiguration) *PlaygroundSpecApplyConfiguration { + b.MultiModelsClaim = value return b } diff --git a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go index 4095eb91..f31e425f 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/servicespec.go +++ b/client-go/applyconfiguration/inference/v1alpha1/servicespec.go @@ -25,9 +25,9 @@ import ( // ServiceSpecApplyConfiguration represents an declarative configuration of the ServiceSpec type for use // with apply. 
type ServiceSpecApplyConfiguration struct { - MultiModelsClaims []v1alpha1.MultiModelsClaimApplyConfiguration `json:"multiModelsClaims,omitempty"` - WorkloadTemplate *v1.LeaderWorkerSetSpec `json:"workloadTemplate,omitempty"` - ElasticConfig *ElasticConfigApplyConfiguration `json:"elasticConfig,omitempty"` + MultiModelsClaim *v1alpha1.MultiModelsClaimApplyConfiguration `json:"multiModelsClaim,omitempty"` + WorkloadTemplate *v1.LeaderWorkerSetSpec `json:"workloadTemplate,omitempty"` + ElasticConfig *ElasticConfigApplyConfiguration `json:"elasticConfig,omitempty"` } // ServiceSpecApplyConfiguration constructs an declarative configuration of the ServiceSpec type for use with @@ -36,16 +36,11 @@ func ServiceSpec() *ServiceSpecApplyConfiguration { return &ServiceSpecApplyConfiguration{} } -// WithMultiModelsClaims adds the given value to the MultiModelsClaims field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the MultiModelsClaims field. -func (b *ServiceSpecApplyConfiguration) WithMultiModelsClaims(values ...*v1alpha1.MultiModelsClaimApplyConfiguration) *ServiceSpecApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithMultiModelsClaims") - } - b.MultiModelsClaims = append(b.MultiModelsClaims, *values[i]) - } +// WithMultiModelsClaim sets the MultiModelsClaim field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MultiModelsClaim field is set to the value of the last call. 
+func (b *ServiceSpecApplyConfiguration) WithMultiModelsClaim(value *v1alpha1.MultiModelsClaimApplyConfiguration) *ServiceSpecApplyConfiguration { + b.MultiModelsClaim = value return b } diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml index 69991078..766444dd 100644 --- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml +++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml @@ -45,7 +45,9 @@ spec: under the hood, e.g. vLLM, which is the default backend. properties: args: - description: Args represents the arguments passed to the backend. + description: |- + Args represents the arguments passed to the backend. + You can add new args or overwrite the default args. items: type: string type: array @@ -222,66 +224,59 @@ spec: type: object modelClaim: description: |- - ModelClaim represents one modelClaim, it's a simple configuration - compared to multiModelsClaims only work for one model and one claim. - ModelClaim and multiModelsClaims are exclusive configured. - Note: properties (nodeSelectors, resources, e.g.) of the model flavors - will be applied to the workload if not exist. + ModelClaim represents claiming for one model, it's the standard claimMode + of multiModelsClaim compared to other modes like SpeculativeDecoding. + Most of the time, modelClaim is enough. + ModelClaim and multiModelsClaim are exclusive configured. properties: inferenceFlavors: description: |- - InferenceFlavors represents a list of flavors with fungibility supports - to serve the model. The flavor names should be a subset of the model - configured flavors. If not set, will use the model configured flavors. + InferenceFlavors represents a list of flavors with fungibility support + to serve the model. + If set, The flavor names should be a subset of the model configured flavors. + If not set, Model configured flavors will be used by default. 
items: type: string type: array modelName: - description: |- - ModelName represents a list of models, there maybe multiple models here - to support state-of-the-art technologies like speculative decoding. + description: ModelName represents the name of the Model. type: string type: object - multiModelsClaims: + multiModelsClaim: description: |- - MultiModelsClaims represents multiple modelClaim, which is useful when different - sub-workload has different accelerator requirements, like the state-of-the-art - technology called splitwise, the workload template is shared by both. - ModelClaim and multiModelsClaims are exclusive configured. - items: - description: |- - MultiModelsClaim represents the references to multiple models. - It's an advanced and more complicated config comparing to modelClaim. - properties: - inferenceFlavors: - description: |- - InferenceFlavors represents a list of flavors with fungibility supported - to serve the model. - - If not set, always apply with the 0-index model by default. - - If set, will lookup the flavor names following the model orders. - items: - type: string - type: array - modelNames: - description: |- - ModelNames represents a list of models, there maybe multiple models here - to support state-of-the-art technologies like speculative decoding. - items: - type: string - minItems: 1 - type: array - rate: - description: |- - Rate works only when multiple claims declared, it represents the replicas rates of - the sub-workload, like when claim1.rate:claim2.rate = 1:2 and 3 replicas defined in - workload, then sub-workload1 will have 1 replica, and sub-workload2 will have 2 replicas. - This is mostly designed for state-of-the-art technology called splitwise, the prefill - and decode phase will be separated and requires different accelerators. - The sum of the rates should be divisible by replicas. 
- format: int32 - type: integer - type: object - type: array + MultiModelsClaim represents claiming for multiple models with different claimModes, + like standard or speculative-decoding to support different inference scenarios. + ModelClaim and multiModelsClaim are exclusive configured. + properties: + inferenceFlavors: + description: |- + InferenceFlavors represents a list of flavors with fungibility supported + to serve the model. + - If not set, always apply with the 0-index model by default. + - If set, will lookup the flavor names following the model orders. + items: + type: string + type: array + inferenceMode: + default: Standard + description: |- + Mode represents the paradigm to serve the model, whether via a standard way + or via an advanced technique like SpeculativeDecoding. + enum: + - Standard + - SpeculativeDecoding + type: string + modelNames: + description: |- + ModelNames represents a list of models, there maybe multiple models here + to support state-of-the-art technologies like speculative decoding. + If the composedMode is SpeculativeDecoding, the first model is the target model, + and the second model is the draft model. + items: + type: string + minItems: 1 + type: array + type: object replicas: default: 1 description: Replicas represents the replica number of inference workloads. diff --git a/config/crd/bases/inference.llmaz.io_services.yaml b/config/crd/bases/inference.llmaz.io_services.yaml index 17e7ab57..e6bc503d 100644 --- a/config/crd/bases/inference.llmaz.io_services.yaml +++ b/config/crd/bases/inference.llmaz.io_services.yaml @@ -65,49 +65,40 @@ spec: format: int32 type: integer type: object - multiModelsClaims: + multiModelsClaim: description: |- - MultiModelsClaims represents multiple modelClaim, which is useful when different - sub-workload has different accelerator requirements, like the state-of-the-art - technology called splitwise, the workload template is shared by both. - Most of the time, one modelClaim is enough. 
- Note: properties (nodeSelectors, resources, e.g.) of the model flavors - will be applied to the workload if not exist. - items: - description: |- - MultiModelsClaim represents the references to multiple models. - It's an advanced and more complicated config comparing to modelClaim. - properties: - inferenceFlavors: - description: |- - InferenceFlavors represents a list of flavors with fungibility supported - to serve the model. - - If not set, always apply with the 0-index model by default. - - If set, will lookup the flavor names following the model orders. - items: - type: string - type: array - modelNames: - description: |- - ModelNames represents a list of models, there maybe multiple models here - to support state-of-the-art technologies like speculative decoding. - items: - type: string - minItems: 1 - type: array - rate: - description: |- - Rate works only when multiple claims declared, it represents the replicas rates of - the sub-workload, like when claim1.rate:claim2.rate = 1:2 and 3 replicas defined in - workload, then sub-workload1 will have 1 replica, and sub-workload2 will have 2 replicas. - This is mostly designed for state-of-the-art technology called splitwise, the prefill - and decode phase will be separated and requires different accelerators. - The sum of the rates should be divisible by replicas. - format: int32 - type: integer - type: object - minItems: 1 - type: array + MultiModelsClaim represents claiming for multiple models with different claimModes, + like standard or speculative-decoding to support different inference scenarios. + properties: + inferenceFlavors: + description: |- + InferenceFlavors represents a list of flavors with fungibility supported + to serve the model. + - If not set, always apply with the 0-index model by default. + - If set, will lookup the flavor names following the model orders. 
+ items: + type: string + type: array + inferenceMode: + default: Standard + description: |- + Mode represents the paradigm to serve the model, whether via a standard way + or via an advanced technique like SpeculativeDecoding. + enum: + - Standard + - SpeculativeDecoding + type: string + modelNames: + description: |- + ModelNames represents a list of models, there maybe multiple models here + to support state-of-the-art technologies like speculative decoding. + If the composedMode is SpeculativeDecoding, the first model is the target model, + and the second model is the draft model. + items: + type: string + minItems: 1 + type: array + type: object workloadTemplate: description: |- WorkloadTemplate defines the underlying workload layout and configuration. diff --git a/config/crd/bases/llmaz.io_openmodels.yaml b/config/crd/bases/llmaz.io_openmodels.yaml index e89cfd6a..7b3f0734 100644 --- a/config/crd/bases/llmaz.io_openmodels.yaml +++ b/config/crd/bases/llmaz.io_openmodels.yaml @@ -47,7 +47,7 @@ spec: inferenceFlavors: description: |- InferenceFlavors represents the accelerator requirements to serve the model. - Flavors are fungible following the priority of slice order. + Flavors are fungible following the priority represented by the slice order. items: description: |- Flavor defines the accelerator requirements for a model and the necessary parameters @@ -63,9 +63,9 @@ spec: additionalProperties: type: string description: |- - NodeSelector defines the labels to filter specified nodes, like - cloud-provider.com/accelerator: nvidia-a100. - NodeSelector will be auto injected to the Pods as scheduling primitives. + NodeSelector represents the node candidates for Pod placements, if a node doesn't + meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin. + If nodeSelector is empty, it means every node is a candidate. 
type: object params: additionalProperties: @@ -97,6 +97,7 @@ spec: required: - name type: object + maxItems: 8 type: array source: description: |- diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index beaa32a8..9714fe0d 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -4,5 +4,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: inftyai/llmaz - newTag: main + newName: inftyai/llmaz-test + newTag: 0901-04 diff --git a/docs/assets/.DS_Store b/docs/assets/.DS_Store deleted file mode 100644 index e848b39addecbb9e17266eeafa98c9433c8141d9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO;5r=5PgdrFq+8GV}Aj}e^3Y}k{B;UJ!<(74OK`f7(DeK`4jcc?1q$3u7;SI zWcJO@SKHU7y9>Z|SK}+737|n0tetYG5Q&TKNiBHlM5jGkSl|QGemWoRWt-y|8IX6^ zK#Do0)YAJKrt=|V@?m^aTR$qaD)JZk@OHJ#GsbQ(zy?d?nBX04Jg{qu73~SVZS;^Z zme9KAe1c59GqrNOVni!v^pSNLW{fVFEncba^XqzHe1i_{Han;6oZyT0m=z<X8(z>zY*HCv={=FvuD zz!)$FwhYMsA)*S#0Sk}%>)@bA0HSOoRcOnWQEH-qalpbOBNXRLiN4f@BZl+k^hY5t z4p?~f<#6Hf;lh<&IH5SbI{QbO4i|g0(HJlWb{RPF)26Kd*N4ylyG1rL28@A!#el2# z2E8sHDXgt~4<~DFM17`;NM7M_TS7yZV&=+Hd`VTIKN5o&2P{0&L$Qs3(qMxz@TUy? 
E0*UNb 0 { + w.Spec.ModelClaim.InferenceFlavors = fNames + } + return w +} + func (w *PlaygroundWrapper) Backend(name string) *PlaygroundWrapper { if w.Spec.BackendConfig == nil { w.Spec.BackendConfig = &inferenceapi.BackendConfig{} diff --git a/test/util/wrapper/service.go b/test/util/wrapper/service.go index 16f898ed..512f074d 100644 --- a/test/util/wrapper/service.go +++ b/test/util/wrapper/service.go @@ -45,7 +45,7 @@ func (w *ServiceWrapper) Obj() *inferenceapi.Service { return &w.Service } -func (w *ServiceWrapper) ModelsClaim(modelNames []string, flavorNames []string, rate *int32) *ServiceWrapper { +func (w *ServiceWrapper) ModelsClaim(modelNames []string, mode coreapi.InferenceMode, flavorNames []string) *ServiceWrapper { names := []coreapi.ModelName{} for i := range modelNames { names = append(names, coreapi.ModelName(modelNames[i])) @@ -54,11 +54,11 @@ func (w *ServiceWrapper) ModelsClaim(modelNames []string, flavorNames []string, for i := range flavorNames { flavors = append(flavors, coreapi.FlavorName(flavorNames[i])) } - w.Spec.MultiModelsClaims = append(w.Spec.MultiModelsClaims, coreapi.MultiModelsClaim{ + w.Spec.MultiModelsClaim = coreapi.MultiModelsClaim{ ModelNames: names, + InferenceMode: mode, InferenceFlavors: flavors, - Rate: rate, - }) + } return w }