Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions api/inference/v1alpha1/config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,16 @@ type BackendConfig struct {
// from the default version.
// +optional
Version *string `json:"version,omitempty"`
// Args represents the arguments passed to the backend.
// Args represents the arguments appended to the backend.
// You can add new args or overwrite the default args.
// +optional
Args []string `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`
// Resources represents the resource requirements for backend, like cpu/mem,
// accelerators like GPU should not be defined here, but at the Model flavors,
// or the same accelerator requirements defined there will be covered and
// the workload will lose the fungibility capacity.
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
Resources *ResourceRequirements `json:"resources,omitempty"`
}

Expand Down
7 changes: 3 additions & 4 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ spec:
properties:
args:
description: |-
Args represents the arguments passed to the backend.
Args represents the arguments appended to the backend.
You can add new args or overwrite the default args.
items:
type: string
Expand Down Expand Up @@ -187,9 +187,8 @@ spec:
resources:
description: |-
Resources represents the resource requirements for backend, like cpu/mem,
accelerators like GPU should not be defined here, but at the Model flavors,
or the same accelerator requirements defined there will be covered and
the workload will lose the fungibility capacity.
accelerators like GPU should not be defined here, but at the model flavors,
or the values here will be overwritten.
properties:
limits:
additionalProperties:
Expand Down
Binary file modified docs/assets/arch.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions pkg/controller/inference/playground_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,15 @@ func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.P
Limits: limits,
Requests: requests,
}

// Make sure the cpu/memory limits are never less than the requests.
for k, v := range resources.Limits {
if k == corev1.ResourceCPU || k == corev1.ResourceMemory {
if v.Cmp(requests[k]) == -1 {
resources.Limits[k] = requests[k]
}
}
}
}

template := corev1.PodTemplateSpec{
Expand Down
12 changes: 9 additions & 3 deletions pkg/controller/inference/service_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,14 @@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
if err := r.Get(ctx, types.NamespacedName{Name: string(mr.Name)}, model); err != nil {
return ctrl.Result{}, err
}
models = append(models, model)
// Make sure the main model is always the 0-index model.
// We only have one main model right now; if this changes,
// the logic here may need to change as well.
if *mr.Role == coreapi.MainRole {
models = append([]*coreapi.OpenModel{model}, models...)
} else {
models = append(models, model)
}
}

workloadApplyConfiguration := buildWorkloadApplyConfiguration(service, models)
Expand Down Expand Up @@ -152,8 +159,7 @@ func injectModelProperties(template *applyconfigurationv1.LeaderWorkerTemplateAp
source.InjectModelLoader(template.WorkerTemplate, i)
}

// We treat the 0-index model as the main model, we only consider the main model's requirements,
// like label, flavor. Note: this may change in the future, let's see.
// We only consider the main model's requirements for now.
template.WorkerTemplate.Labels = util.MergeKVs(template.WorkerTemplate.Labels, modelLabels(models[0]))
injectModelFlavor(template, models[0])
}
Expand Down
29 changes: 29 additions & 0 deletions test/integration/controller/inference/playground_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package inference

import (
"context"
"fmt"

"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
Expand Down Expand Up @@ -323,5 +324,33 @@ var _ = ginkgo.Describe("playground controller test", func() {
},
},
}),
ginkgo.Entry("Playground with backendConfig's resource requests greater than limits", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRequest("cpu", "10").
Obj()
},
updates: []*update{
{
updateFunc: func(playground *inferenceapi.Playground) {
gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
},
checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) {
gomega.Eventually(func() error {
service := inferenceapi.Service{}
if err := k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, &service); err != nil {
return err
}
if service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits.Cpu().CmpInt64(10) != 0 {
return fmt.Errorf("unexpected Cpu limit value, want %d, got %d", 10, service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Limits.Cpu().Size())
}
return nil
}).Should(gomega.Succeed())
validation.ValidatePlayground(ctx, k8sClient, playground)
validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundProgressing, "Pending", metav1.ConditionTrue)
},
},
},
}),
)
})