Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions api/core/v1alpha1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,24 @@ type Flavor struct {
Name FlavorName `json:"name"`
// Requests defines the required accelerators to serve the model for each replica,
// like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
// the resource requirements for each replica. This may change in the future.
// the resource requirements for each replica, usually equal to the TP size.
// Not recommended to set the cpu and memory usage here:
// - if using playground, you can define the cpu/mem usage at backendConfig.
// - if using inference service, you can define the cpu/mem at the container resources.
// However, if you define the same accelerator requests at playground/service as well,
// the requests here will be covered.
// the requests will be overwritten by the flavor requests.
// +optional
Requests v1.ResourceList `json:"requests,omitempty"`
// NodeSelector represents the node candidates for Pod placements, if a node doesn't
// meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
// If nodeSelector is empty, it means every node is a candidate.
// +optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// Params stores other useful parameters and will be consumed by the autoscaling components
// like cluster-autoscaler, Karpenter.
// E.g. when scaling up nodes with 8x Nvidia A00, the parameter can be injected with
// instance-type: p4d.24xlarge for AWS.
// Params stores other useful parameters. They are either consumed by cluster-autoscaler / Karpenter
// for autoscaling, or define model parallelism parameters like the TP or PP size.
// E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
// with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
// Preset parameters: TP, PP, INSTANCE-TYPE.
// +optional
Params map[string]string `json:"params,omitempty"`
}
Expand Down
18 changes: 14 additions & 4 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// BackendRuntimeArg is preset arguments for easy to use.
// Do not edit the preset names unless set the argument name explicitly
// in Playground backendRuntimeConfig.
// BackendRuntimeArg is a preset set of arguments provided for ease of use.
// Three preset names are provided: default, speculative-decoding and model-parallelism.
// Do not change these names.
type BackendRuntimeArg struct {
// Name represents the identifier of the backendRuntime argument.
Name string `json:"name"`
Expand All @@ -32,11 +32,21 @@ type BackendRuntimeArg struct {
Flags []string `json:"flags,omitempty"`
}

// MultiHostCommands represents leader & worker commands for multiple nodes scenarios.
type MultiHostCommands struct {
	// Leader is the command used for the leader role.
	Leader []string `json:"leader,omitempty"`
	// Worker is the command used for the worker role.
	Worker []string `json:"worker,omitempty"`
}

// BackendRuntimeSpec defines the desired state of BackendRuntime
type BackendRuntimeSpec struct {
// Commands represents the default command of the backendRuntime.
// Commands represents the default commands for the backendRuntime.
// +optional
Commands []string `json:"commands,omitempty"`
// MultiHostCommands represents leader and worker commands for nodes with
// different roles.
// +optional
MultiHostCommands *MultiHostCommands `json:"multiHostCommands,omitempty"`
// Image represents the default image registry of the backendRuntime.
// It will work together with version to make up a real image.
Image string `json:"image"`
Expand Down
6 changes: 6 additions & 0 deletions api/inference/v1alpha1/service_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ import (
coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
)

// InferenceServiceFlavorsAnnoKey is the annotation key for the flavors
// specified in the inference service. Its value is a comma-separated list
// of flavor names.
const InferenceServiceFlavorsAnnoKey = "llmaz.io/inference-service-flavors"

// ServiceSpec defines the desired state of Service.
// Service controller will maintain multi-flavor of workloads with
// different accelerators for cost or performance considerations.
Expand Down
30 changes: 30 additions & 0 deletions api/inference/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 57 additions & 1 deletion chart/templates/backends/vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,49 @@ spec:
- python3
- -m
- vllm.entrypoints.openai.api_server
multiHostCommands:
  # Leader: start the Ray head, poll until the number of alive Ray nodes equals
  # LWS_GROUP_SIZE (giving up after about 60 polls, 5s apart), then launch the
  # vLLM OpenAI-compatible API server.
  leader:
    - sh
    - -c
    - |
      ray start --head --disable-usage-stats --include-dashboard false

      i=0
      while true; do
        active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
        if [ $active_nodes -eq $(LWS_GROUP_SIZE) ]; then
          echo "All ray workers are active and the ray cluster is initialized successfully."
          break
        fi
        if [ $i -eq 60 ]; then
          echo "Initialization failed. Exiting..."
          exit 1
        fi
        echo "Wait for $active_nodes/$(LWS_GROUP_SIZE) workers to be active."
        i=$((i+1))
        sleep 5s;
      done

      python3 -m vllm.entrypoints.openai.api_server
  # Worker: keep trying to join the Ray head at LWS_LEADER_ADDRESS:6379, retrying
  # every 5s and giving up after about 60 failed attempts.
  worker:
    - sh
    - -c
    - |
      i=0
      while true; do
        ray start --address=$(LWS_LEADER_ADDRESS):6379 --block

        if [ $? -eq 0 ]; then
          echo "Worker: Ray runtime started with head address $(LWS_LEADER_ADDRESS):6379"
          break
        fi
        if [ $i -eq 60 ]; then
          echo "Initialization failed. Exiting..."
          exit 1
        fi
        echo "Waiting until the ray worker is active..."
        # Count the failed attempt; without this the retry limit above never triggers.
        i=$((i+1))
        sleep 5s;
      done
image: vllm/vllm-openai
version: v0.6.0
# Do not edit the preset argument name unless you know what you're doing.
Expand Down Expand Up @@ -39,11 +82,24 @@ spec:
- "0.0.0.0"
- --port
- "8080"
- --use-v2-block-manager
- --num_speculative_tokens
- "5"
- -tp
- "1"
- name: model-parallelism
flags:
- --model
- "{{`{{ .ModelPath }}`}}"
- --served-model-name
- "{{`{{ .ModelName }}`}}"
- --host
- "0.0.0.0"
- --port
- "8080"
- --tensor-parallel-size
- "{{`{{ .TP }}`}}"
- --pipeline-parallel-size
- "{{`{{ .PP }}`}}"
resources:
requests:
cpu: 4
Expand Down
34 changes: 17 additions & 17 deletions chart/values.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
controllerManager:
kubeRbacProxy:
args:
- --secure-listen-address=0.0.0.0:8443
- --upstream=http://127.0.0.1:8080/
- --logtostderr=true
- --v=0
- --secure-listen-address=0.0.0.0:8443
- --upstream=http://127.0.0.1:8080/
- --logtostderr=true
- --v=0
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
- ALL
image:
repository: gcr.io/kubebuilder/kube-rbac-proxy
tag: v0.15.0
Expand All @@ -22,15 +22,15 @@ controllerManager:
memory: 64Mi
manager:
args:
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
- --leader-elect
- --namespace=llmaz-system
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
- --leader-elect
- --namespace=llmaz-system
containerSecurityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
- ALL
image:
repository: inftyai/llmaz
tag: v0.0.9
Expand All @@ -47,14 +47,14 @@ controllerManager:
kubernetesClusterDomain: cluster.local
metricsService:
ports:
- name: https
port: 8443
protocol: TCP
targetPort: https
- name: https
port: 8443
protocol: TCP
targetPort: https
type: ClusterIP
webhookService:
ports:
- port: 443
protocol: TCP
targetPort: 9443
- port: 443
protocol: TCP
targetPort: 9443
type: ClusterIP
22 changes: 18 additions & 4 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ spec:
They can be appended or overwritten by the Playground backendRuntimeConfig.
items:
description: |-
BackendRuntimeArg is preset arguments for easy to use.
Do not edit the preset names unless set the argument name explicitly
in Playground backendRuntimeConfig.
BackendRuntimeArg is a preset set of arguments provided for ease of use.
Three preset names are provided: default, speculative-decoding and model-parallelism.
Do not change these names.
properties:
flags:
description: |-
Expand All @@ -67,7 +67,7 @@ spec:
type: object
type: array
commands:
description: Commands represents the default command of the backendRuntime.
description: Commands represents the default commands for the backendRuntime.
items:
type: string
type: array
Expand Down Expand Up @@ -194,6 +194,20 @@ spec:
Image represents the default image registry of the backendRuntime.
It will work together with version to make up a real image.
type: string
multiHostCommands:
description: |-
MultiHostCommands represents leader and worker commands for nodes with
different roles.
properties:
leader:
items:
type: string
type: array
worker:
items:
type: string
type: array
type: object
resources:
description: |-
Resources represents the resource requirements for backendRuntime, like cpu/mem,
Expand Down
3 changes: 3 additions & 0 deletions config/crd/bases/inference.llmaz.io_services.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16378,6 +16378,9 @@ spec:
description: |-
SubdomainPolicy determines the policy that will be used when creating
the headless service, defaults to shared
enum:
- Shared
- UniquePerReplica
type: string
required:
- subdomainPolicy
Expand Down
13 changes: 7 additions & 6 deletions config/crd/bases/llmaz.io_openmodels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,11 @@ spec:
additionalProperties:
type: string
description: |-
Params stores other useful parameters and will be consumed by the autoscaling components
like cluster-autoscaler, Karpenter.
E.g. when scaling up nodes with 8x Nvidia A00, the parameter can be injected with
instance-type: p4d.24xlarge for AWS.
Params stores other useful parameters. They are either consumed by cluster-autoscaler / Karpenter
for autoscaling, or define model parallelism parameters like the TP or PP size.
E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
Preset parameters: TP, PP, INSTANCE-TYPE.
type: object
requests:
additionalProperties:
Expand All @@ -88,12 +89,12 @@ spec:
description: |-
Requests defines the required accelerators to serve the model for each replica,
like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
the resource requirements for each replica. This may change in the future.
the resource requirements for each replica, usually equal to the TP size.
Not recommended to set the cpu and memory usage here:
- if using playground, you can define the cpu/mem usage at backendConfig.
- if using inference service, you can define the cpu/mem at the container resources.
However, if you define the same accelerator requests at playground/service as well,
the requests here will be covered.
the requests will be overwritten by the flavor requests.
type: object
required:
- name
Expand Down
5 changes: 5 additions & 0 deletions docs/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
- [Deploy models via text-generation-inference](#deploy-models-via-tgi)
- [Deploy models via ollama](#ollama)
- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
- [Deploy multi-host inference](#multi-host-inference)

### Deploy models from Huggingface

Expand Down Expand Up @@ -54,3 +55,7 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
### Speculative Decoding with vLLM

[Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.

### Multi-Host Inference

Model sizes keep growing. Llama 3.1 405B in FP16 requires more than 750 GB of GPU memory for the weights alone, before the KV cache is even considered. A single host with 8 x Nvidia H100 GPUs (80 GB of HBM each, 640 GB in total) cannot hold it, so a multi-host deployment is required; see the [example](./multi-nodes/) here.
6 changes: 3 additions & 3 deletions docs/examples/huggingface/model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ spec:
modelHub:
modelID: facebook/opt-125m
inferenceFlavors:
- name: t4 # GPU type
requests:
nvidia.com/gpu: 1
- name: t4 # GPU type
requests:
nvidia.com/gpu: 1
25 changes: 25 additions & 0 deletions docs/examples/multi-nodes/model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
name: llama3-405b-instruct
spec:
familyName: llama3
source:
modelHub:
# TODO: placeholder model; replace with the Llama 3.1 405B Instruct model ID.
modelID: Qwen/Qwen2-0.5B
inferenceFlavors:
- name: a100-80gb
requests:
nvidia.com/gpu: 1 # single node request
params:
TP: "8"
PP: "2"
# - name: h100
# requests:
# nvidia.com/gpu: 8 # single node request
# params:
# TP: "8"
# PP: "2"
# nodeSelector:
# gpu.h100: "true"
Loading