diff --git a/docs/developing/README.md b/docs/developing/README.md index 969e1188..0de5a426 100644 --- a/docs/developing/README.md +++ b/docs/developing/README.md @@ -102,7 +102,7 @@ So we don't need to manually update it every time. ### Prerequisites -- [yq](https://github.com/mikefarah/yq) (for YAML → JSON conversion) +- [yq](https://github.com/mikefarah/yq) >= 4.12.0 (for YAML → JSON conversion) ### Architecture diff --git a/internal/apis/v1/handlers/nodes/gpu.go b/internal/apis/v1/handlers/nodes/gpu.go index 95139eb6..85f7b7ae 100644 --- a/internal/apis/v1/handlers/nodes/gpu.go +++ b/internal/apis/v1/handlers/nodes/gpu.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "reflect" - "strconv" "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" @@ -104,7 +103,13 @@ func (h *helper) listLocalGpuCards() ([]gpu.GpuCard, error) { return nil, err } - vgpuProfiles, hexProfilesMap := listVgpuProfiles(device, hexGpu) + hexProfilesMap := map[uint32]gpu.VgpuProfileFromHex{} + hexProfileCollection := gpu.VgpuProfileCollectionFromHex{} + + if isVgpu(hexGpu) { + hexProfilesMap, hexProfileCollection = cubecos.GetNodeVgpuProfilesMap(hexGpu.PciAddress) + } + attachedInstances := listAttachedInstances(listAttachedInstancesOpts{ Device: device, DeviceUUID: uuid, @@ -116,22 +121,20 @@ func (h *helper) listLocalGpuCards() ([]gpu.GpuCard, error) { HexProfilesMap: hexProfilesMap, }) - if vgpuProfiles != nil && attachedInstances != nil { - updateVgpuProfilesRemaining(*vgpuProfiles, *attachedInstances) - } + profileCollection := toProfileCollection(hexProfileCollection, attachedInstances) gpuCards = append(gpuCards, gpu.GpuCard{ Id: hexGpu.Id, Name: hexGpu.Name, ResourceType: hexGpu.Type, SupportResourceTypes: hexGpu.SupportTypes, - Vram: &gpu.VramInfo{ + Vram: gpu.VramInfo{ AllocatedMiB: memoryUsedMiB, TotalMiB: memoryTotalMiB, - UtilizationPercent: float64(memoryUtilizationPercent), + UtilizationPercent: memoryUtilizationPercent, }, - Gpu: &gpu.GpuInfo{ - UtilizationPercent: float64(gpuUtilizationPercent), + Gpu: gpu.GpuInfo{ + UtilizationPercent: gpuUtilizationPercent, }, PciAddress: pciAddress, Status: gpu.GpuStatusInfo{ @@ -139,9 +142,8 @@ func (h *helper) listLocalGpuCards() ([]gpu.GpuCard, error) { IsProcessing: false, }, AllocationSummary: hexGpu.Allocation, - VramLimitMiB: memoryTotalMiB, ProfileCountLimit: hexGpu.ProfileCountLimit, - Profiles: vgpuProfiles, + Profiles: profileCollection, AttachedInstances: attachedInstances, }) } @@ -190,48 +192,6 @@ func extractPciAddress(pciInfo nvml.PciInfo) string { return address } -// Returns vGPU profiles with `Remaining = Count`, or returns `nil` for non-vGPU. -// The exact `Remaining` value should be calculated based on the profile's `Count` -// and the amount of attached instances created with this profile. -func listVgpuProfiles(device nvml.Device, hexGpu gpu.GpuFromHex) (*[]gpu.VgpuProfile, map[uint32]gpu.VgpuProfileFromHex) { - hexProfilesMap := map[uint32]gpu.VgpuProfileFromHex{} - - if !isVgpu(hexGpu) { - return nil, hexProfilesMap - } - - vgpuProfiles := []gpu.VgpuProfile{} - hexProfilesMap = cubecos.GetNodeVgpuProfilesMap(hexGpu.PciAddress) - - for i := 0; ; i++ { - nvmlProfile, ret := device.GetGpuInstanceProfileInfo(i) - - if ret == nvml.ERROR_INVALID_ARGUMENT { - // No more profiles. - break - } - - if ret != nvml.SUCCESS { - log.Errorf("nvml: failed to get gpu instance profile info for gpu %s at index %d: %v", hexGpu.Id, i, nvml.ErrorString(ret)) - continue - } - - hexProfile := hexProfilesMap[nvmlProfile.Id] - - vgpuProfiles = append(vgpuProfiles, gpu.VgpuProfile{ - Id: strconv.FormatUint(uint64(nvmlProfile.Id), 10), - Name: hexProfile.Name, - VramMiB: nvmlProfile.MemorySizeMB, - AliasName: hexProfile.Alias, - Count: hexProfile.Count, - // `Remaining` will be calculated later after getting attached instances. - Remaining: hexProfile.Count, - }) - } - - return &vgpuProfiles, hexProfilesMap -} - func isVgpu(hexGpu gpu.GpuFromHex) bool { return hexGpu.Type == gpu.ResourceTypeSriovVgpu || hexGpu.Type == gpu.ResourceTypeMigBackedVgpu } @@ -353,7 +313,7 @@ func listVgpuAttachedInstances(opts listAttachedInstancesOpts) *[]gpu.AttachedIn attachedInstances = append(attachedInstances, gpu.AttachedInstance{ Id: vmId, Name: instanceName, - ProfileAlias: &profileAlias, + ProfileAlias: profileAlias, UtilizationPercent: utilizationPercent, MemoryUsage: gpu.InstanceMemoryUsage{ AllocatedMiB: bytesToMiB(fbUsage), @@ -420,20 +380,83 @@ func buildInstanceLinks(vmId string) gpu.InstanceLinks { } } -func updateVgpuProfilesRemaining(profiles []gpu.VgpuProfile, attachedInstances []gpu.AttachedInstance) { - profileMapByAlias := map[string]gpu.VgpuProfile{} - for _, profile := range profiles { - profileMapByAlias[profile.AliasName] = profile +func toProfileCollection( + hexProfileCollection gpu.VgpuProfileCollectionFromHex, + attachedInstances *[]gpu.AttachedInstance, +) gpu.GpuProfileCollection { + collection := gpu.GpuProfileCollection{ + SriovVgpu: []gpu.VgpuProfile{}, + MigBackedVgpu: []gpu.VgpuProfile{}, + } + + if hexProfileCollection.Sriov != nil { + for _, profile := range *hexProfileCollection.Sriov { + collection.SriovVgpu = append(collection.SriovVgpu, gpu.VgpuProfile{ + Id: profile.Id, + Name: profile.Name, + VramMiB: profile.VramMiB, + Count: profile.Count, + Remaining: nil, + AliasName: profile.Alias, + CountLimit: profile.VmCountLimit, + }) + } + } + + migProfileRemainingMap := createMigProfileRemainingMap(hexProfileCollection.MigBacked, attachedInstances) + + if hexProfileCollection.MigBacked != nil { + for _, profile := range *hexProfileCollection.MigBacked { + remaining := migProfileRemainingMap[profile.Id] + + collection.MigBackedVgpu = append(collection.MigBackedVgpu, gpu.VgpuProfile{ + Id: profile.Id, + Name: profile.Name, + VramMiB: profile.VramMiB, + Count: profile.Count, + Remaining: &remaining, + AliasName: profile.Alias, + CountLimit: profile.VmCountLimit, + }) + } + } + + return collection +} + +// Returns a map with profile ID as key, and remaining count as value. +func createMigProfileRemainingMap( + migProfiles *[]gpu.VgpuProfileFromHex, + attachedInstances *[]gpu.AttachedInstance, +) map[uint32]int { + // Key: profile ID. Value: remaining count. + remainingMap := map[uint32]int{} + + if migProfiles == nil || attachedInstances == nil { + return remainingMap } - profileInstanceCountMap := map[string]int{} - for _, instance := range attachedInstances { - profile := profileMapByAlias[*instance.ProfileAlias] - profileInstanceCountMap[profile.Id]++ + // Key: profile alias. Value: profile ID. + profileIdMap := map[string]uint32{} + + for _, profile := range *migProfiles { + if profile.Alias == nil || len(*profile.Alias) == 0 { + continue + } + remainingMap[profile.Id] = profile.Count + profileIdMap[*profile.Alias] = profile.Id } - for _, profile := range profiles { - instanceCount := profileInstanceCountMap[profile.Id] - profile.Remaining = profile.Count - instanceCount + for _, instance := range *attachedInstances { + if instance.ProfileAlias == nil || len(*instance.ProfileAlias) == 0 { + continue + } + + profileId, exists := profileIdMap[*instance.ProfileAlias] + if exists { + remainingMap[profileId] = max(remainingMap[profileId]-1, 0) + } } + + return remainingMap } diff --git a/internal/cubecos/nodes.go b/internal/cubecos/nodes.go index e7668baa..b4726b53 100644 --- a/internal/cubecos/nodes.go +++ b/internal/cubecos/nodes.go @@ -421,41 +421,46 @@ func listNodeGpus(nodeName string) ([]gpu.GpuFromHex, error) { return gpus, nil } -func GetNodeVgpuProfilesMap(gpuId string) map[uint32]gpu.VgpuProfileFromHex { - profiles := listNodeVgpuProfiles(gpuId) +func GetNodeVgpuProfilesMap(gpuId string) (map[uint32]gpu.VgpuProfileFromHex, gpu.VgpuProfileCollectionFromHex) { + collection := getNodeVgpuProfileCollection(gpuId) + + // Profile IDs will not conflict between SR-IOV and MIG. profilesMap := map[uint32]gpu.VgpuProfileFromHex{} - for _, profile := range *profiles { - profilesMap[profile.Id] = profile + if collection.Sriov != nil { + for _, profile := range *collection.Sriov { + profilesMap[profile.Id] = profile + } + } + + if collection.MigBacked != nil { + for _, profile := range *collection.MigBacked { + profilesMap[profile.Id] = profile + } } - return profilesMap + return profilesMap, collection } -func listNodeVgpuProfiles(gpuId string) *[]gpu.VgpuProfileFromHex { +func getNodeVgpuProfileCollection(gpuId string) gpu.VgpuProfileCollectionFromHex { ctx, cancel := context.WithTimeout(wait.CtxSeconds(30)) defer cancel() - profiles := []gpu.VgpuProfileFromHex{} + collection := gpu.VgpuProfileCollectionFromHex{} - out, err := exec.CommandContext(ctx, "hex_sdk", "gpu_vgpu_profile_list", "-gpuId", gpuId).CombinedOutput() + out, err := exec.CommandContext(ctx, "hex_sdk", "gpu_vgpu_profile_list", gpuId).CombinedOutput() if err != nil { log.Errorf("nodes: failed to list vgpu profiles for gpu %s: %v", gpuId, err) - return &profiles - } - - if !IsHexSuccessful(err) { - log.Errorf("nodes: output error when listing vgpu profiles for gpu %s via hex_sdk: %v", gpuId, err) - return &profiles + return collection } - err = json.Unmarshal(out, &profiles) + err = json.Unmarshal(out, &collection) if err != nil { log.Errorf("nodes: failed to parse output when listing vgpu profiles for gpu %s via hex_sdk: %v", gpuId, err) - return &profiles + return collection } - return &profiles + return collection } func GetNodePgpuAttachedInstance(pciAddress string) *gpu.PgpuAttachedInstanceFromHex { diff --git a/internal/definition/v1/gpu/gpu.go b/internal/definition/v1/gpu/gpu.go index e9a03dd9..6305da7e 100644 --- a/internal/definition/v1/gpu/gpu.go +++ b/internal/definition/v1/gpu/gpu.go @@ -30,11 +30,18 @@ type GpuFromHex struct { Allocation *AllocationSummary `json:"allocation"` } +type VgpuProfileCollectionFromHex struct { + Sriov *[]VgpuProfileFromHex `json:"sriov"` + MigBacked *[]VgpuProfileFromHex `json:"migBacked"` +} + type VgpuProfileFromHex struct { - Id uint32 `json:"id"` - Name string `json:"name"` - Count int `json:"count"` - Alias string `json:"alias"` + Id uint32 `json:"id"` + Name string `json:"name"` + VramMiB uint64 `json:"vramMiB"` + Count int `json:"count"` + Alias *string `json:"alias"` + VmCountLimit *int `json:"vmCountLimit"` } type PgpuAttachedInstanceFromHex struct { @@ -48,24 +55,23 @@ type GpuCard struct { PciAddress string `json:"pciAddress"` ResourceType ResourceType `json:"resourceType"` SupportResourceTypes []SupportResourceType `json:"supportResourceTypes"` - Vram *VramInfo `json:"vram"` - Gpu *GpuInfo `json:"gpu"` + Vram VramInfo `json:"vram"` + Gpu GpuInfo `json:"gpu"` AllocationSummary *AllocationSummary `json:"allocationSummary"` - VramLimitMiB int `json:"vramLimitMiB"` ProfileCountLimit *int `json:"profileCountLimit"` - Profiles *[]VgpuProfile `json:"profiles"` + Profiles GpuProfileCollection `json:"profiles"` AttachedInstances *[]AttachedInstance `json:"attachedInstances"` Status GpuStatusInfo `json:"status"` } type VramInfo struct { - AllocatedMiB int `json:"allocatedMiB"` - TotalMiB int `json:"totalMiB"` - UtilizationPercent float64 `json:"utilizationPercent"` + AllocatedMiB int `json:"allocatedMiB"` + TotalMiB int `json:"totalMiB"` + UtilizationPercent uint32 `json:"utilizationPercent"` } type GpuInfo struct { - UtilizationPercent float64 `json:"utilizationPercent"` + UtilizationPercent uint32 `json:"utilizationPercent"` } type AllocationSummary struct { @@ -73,13 +79,19 @@ type AllocationSummary struct { Total int `json:"total"` } +type GpuProfileCollection struct { + SriovVgpu []VgpuProfile `json:"sriovVgpu"` + MigBackedVgpu []VgpuProfile `json:"migBackedVgpu"` +} + type VgpuProfile struct { - Id string `json:"id"` - Name string `json:"name"` - VramMiB uint64 `json:"vramMiB"` - AliasName string `json:"aliasName"` - Count int `json:"count"` - Remaining int `json:"remaining"` + Id uint32 `json:"id"` + Name string `json:"name"` + VramMiB uint64 `json:"vramMiB"` + Count int `json:"count"` + Remaining *int `json:"remaining"` + AliasName *string `json:"aliasName"` + CountLimit *int `json:"countLimit"` } type AttachedInstance struct {