Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/developing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ So we don't need to manually update it every time.

### Prerequisites

- [yq](https://github.com/mikefarah/yq) (for YAML → JSON conversion)
- [yq](https://github.com/mikefarah/yq) >= 4.12.0 (for YAML → JSON conversion)
Comment thread
steven-chiu-bigstack marked this conversation as resolved.

### Architecture

Expand Down
153 changes: 88 additions & 65 deletions internal/apis/v1/handlers/nodes/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"reflect"
"strconv"
"strings"

"github.com/NVIDIA/go-nvml/pkg/nvml"
Expand Down Expand Up @@ -104,7 +103,13 @@ func (h *helper) listLocalGpuCards() ([]gpu.GpuCard, error) {
return nil, err
}

vgpuProfiles, hexProfilesMap := listVgpuProfiles(device, hexGpu)
hexProfilesMap := map[uint32]gpu.VgpuProfileFromHex{}
hexProfileCollection := gpu.VgpuProfileCollectionFromHex{}

if isVgpu(hexGpu) {
hexProfilesMap, hexProfileCollection = cubecos.GetNodeVgpuProfilesMap(hexGpu.PciAddress)
}

attachedInstances := listAttachedInstances(listAttachedInstancesOpts{
Device: device,
DeviceUUID: uuid,
Expand All @@ -116,32 +121,29 @@ func (h *helper) listLocalGpuCards() ([]gpu.GpuCard, error) {
HexProfilesMap: hexProfilesMap,
})

if vgpuProfiles != nil && attachedInstances != nil {
updateVgpuProfilesRemaining(*vgpuProfiles, *attachedInstances)
}
profileCollection := toProfileCollection(hexProfileCollection, attachedInstances)
Comment thread
steven-chiu-bigstack marked this conversation as resolved.

gpuCards = append(gpuCards, gpu.GpuCard{
Id: hexGpu.Id,
Name: hexGpu.Name,
ResourceType: hexGpu.Type,
SupportResourceTypes: hexGpu.SupportTypes,
Vram: &gpu.VramInfo{
Vram: gpu.VramInfo{
AllocatedMiB: memoryUsedMiB,
TotalMiB: memoryTotalMiB,
UtilizationPercent: float64(memoryUtilizationPercent),
UtilizationPercent: memoryUtilizationPercent,
},
Gpu: &gpu.GpuInfo{
UtilizationPercent: float64(gpuUtilizationPercent),
Gpu: gpu.GpuInfo{
UtilizationPercent: gpuUtilizationPercent,
},
PciAddress: pciAddress,
Status: gpu.GpuStatusInfo{
Current: hexGpu.Status,
IsProcessing: false,
},
AllocationSummary: hexGpu.Allocation,
VramLimitMiB: memoryTotalMiB,
ProfileCountLimit: hexGpu.ProfileCountLimit,
Profiles: vgpuProfiles,
Profiles: profileCollection,
AttachedInstances: attachedInstances,
})
}
Expand Down Expand Up @@ -190,48 +192,6 @@ func extractPciAddress(pciInfo nvml.PciInfo) string {
return address
}

// listVgpuProfiles returns the vGPU profiles of a device together with the
// hex profile lookup map keyed by profile ID.
//
// For a non-vGPU device it returns a nil profile list and an empty map.
// Every returned profile has `Remaining = Count`. The exact `Remaining` value
// should be calculated by the caller based on the profile's `Count` and the
// amount of attached instances created with this profile.
func listVgpuProfiles(device nvml.Device, hexGpu gpu.GpuFromHex) (*[]gpu.VgpuProfile, map[uint32]gpu.VgpuProfileFromHex) {
	hexProfilesMap := map[uint32]gpu.VgpuProfileFromHex{}

	if !isVgpu(hexGpu) {
		return nil, hexProfilesMap
	}

	vgpuProfiles := []gpu.VgpuProfile{}
	hexProfilesMap = cubecos.GetNodeVgpuProfilesMap(hexGpu.PciAddress)

	// Bound the scan by the number of profile slots NVML defines. Without the
	// bound, an error that is returned for every index (lost GPU, NVML not
	// initialized, MIG not supported, ...) would never hit the
	// ERROR_INVALID_ARGUMENT exit below and the loop would spin forever.
	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
		nvmlProfile, ret := device.GetGpuInstanceProfileInfo(i)

		if ret == nvml.ERROR_INVALID_ARGUMENT {
			// No more profiles.
			break
		}

		if ret != nvml.SUCCESS {
			// Skip this profile slot and keep scanning the remaining ones.
			log.Errorf("nvml: failed to get gpu instance profile info for gpu %s at index %d: %v", hexGpu.Id, i, nvml.ErrorString(ret))
			continue
		}

		// A profile unknown to hex yields the zero value, i.e. empty name/alias
		// and a count of 0.
		hexProfile := hexProfilesMap[nvmlProfile.Id]

		vgpuProfiles = append(vgpuProfiles, gpu.VgpuProfile{
			Id:        strconv.FormatUint(uint64(nvmlProfile.Id), 10),
			Name:      hexProfile.Name,
			VramMiB:   nvmlProfile.MemorySizeMB,
			AliasName: hexProfile.Alias,
			Count:     hexProfile.Count,
			// `Remaining` will be calculated later after getting attached instances.
			Remaining: hexProfile.Count,
		})
	}

	return &vgpuProfiles, hexProfilesMap
}

// isVgpu reports whether the GPU's current resource type is one of the vGPU
// types: SR-IOV vGPU or MIG-backed vGPU.
func isVgpu(hexGpu gpu.GpuFromHex) bool {
	switch hexGpu.Type {
	case gpu.ResourceTypeSriovVgpu, gpu.ResourceTypeMigBackedVgpu:
		return true
	default:
		return false
	}
}
Expand Down Expand Up @@ -353,7 +313,7 @@ func listVgpuAttachedInstances(opts listAttachedInstancesOpts) *[]gpu.AttachedIn
attachedInstances = append(attachedInstances, gpu.AttachedInstance{
Id: vmId,
Name: instanceName,
ProfileAlias: &profileAlias,
ProfileAlias: profileAlias,
UtilizationPercent: utilizationPercent,
MemoryUsage: gpu.InstanceMemoryUsage{
AllocatedMiB: bytesToMiB(fbUsage),
Expand Down Expand Up @@ -420,20 +380,83 @@ func buildInstanceLinks(vmId string) gpu.InstanceLinks {
}
}

func updateVgpuProfilesRemaining(profiles []gpu.VgpuProfile, attachedInstances []gpu.AttachedInstance) {
profileMapByAlias := map[string]gpu.VgpuProfile{}
for _, profile := range profiles {
profileMapByAlias[profile.AliasName] = profile
// toProfileCollection converts the vGPU profiles reported by hex into the
// profile collection exposed on a GPU card.
//
// SR-IOV profiles are copied with a nil Remaining. MIG-backed profiles get a
// Remaining taken from createMigProfileRemainingMap. That helper returns no
// entry for a profile when attachedInstances is nil or when the profile has no
// alias; in that case no instance can be attributed to the profile, so
// Remaining falls back to the profile's full Count instead of the map's zero
// value (which would wrongly report the profile as exhausted).
//
// Both slices of the result are always non-nil, so they are JSON-encoded as
// `[]` rather than `null`.
func toProfileCollection(
	hexProfileCollection gpu.VgpuProfileCollectionFromHex,
	attachedInstances *[]gpu.AttachedInstance,
) gpu.GpuProfileCollection {
	collection := gpu.GpuProfileCollection{
		SriovVgpu:     []gpu.VgpuProfile{},
		MigBackedVgpu: []gpu.VgpuProfile{},
	}

	if hexProfileCollection.Sriov != nil {
		sriovProfiles := *hexProfileCollection.Sriov
		collection.SriovVgpu = make([]gpu.VgpuProfile, 0, len(sriovProfiles))

		for _, profile := range sriovProfiles {
			collection.SriovVgpu = append(collection.SriovVgpu, gpu.VgpuProfile{
				Id:         profile.Id,
				Name:       profile.Name,
				VramMiB:    profile.VramMiB,
				Count:      profile.Count,
				Remaining:  nil,
				AliasName:  profile.Alias,
				CountLimit: profile.VmCountLimit,
			})
		}
	}

	if hexProfileCollection.MigBacked == nil {
		return collection
	}

	migProfiles := *hexProfileCollection.MigBacked
	migProfileRemainingMap := createMigProfileRemainingMap(hexProfileCollection.MigBacked, attachedInstances)
	collection.MigBackedVgpu = make([]gpu.VgpuProfile, 0, len(migProfiles))

	for _, profile := range migProfiles {
		remaining, tracked := migProfileRemainingMap[profile.Id]
		if !tracked {
			// Nothing is known to consume this profile: all of it remains.
			remaining = profile.Count
		}

		collection.MigBackedVgpu = append(collection.MigBackedVgpu, gpu.VgpuProfile{
			Id:        profile.Id,
			Name:      profile.Name,
			VramMiB:   profile.VramMiB,
			Count:     profile.Count,
			// `remaining` is declared per iteration, so each profile owns its pointer.
			Remaining:  &remaining,
			AliasName:  profile.Alias,
			CountLimit: profile.VmCountLimit,
		})
	}

	return collection
}

// Returns a map with profile ID as key, and remaining count as value.
func createMigProfileRemainingMap(
migProfiles *[]gpu.VgpuProfileFromHex,
attachedInstances *[]gpu.AttachedInstance,
) map[uint32]int {
// Key: profile ID. Value: remaining count.
remainingMap := map[uint32]int{}

if migProfiles == nil || attachedInstances == nil {
return remainingMap
}

profileInstanceCountMap := map[string]int{}
for _, instance := range attachedInstances {
profile := profileMapByAlias[*instance.ProfileAlias]
profileInstanceCountMap[profile.Id]++
// Key: profile alias. Value: profile ID.
profileIdMap := map[string]uint32{}

for _, profile := range *migProfiles {
if profile.Alias == nil || len(*profile.Alias) == 0 {
continue
}
remainingMap[profile.Id] = profile.Count
profileIdMap[*profile.Alias] = profile.Id
}

for _, profile := range profiles {
instanceCount := profileInstanceCountMap[profile.Id]
profile.Remaining = profile.Count - instanceCount
for _, instance := range *attachedInstances {
if instance.ProfileAlias == nil || len(*instance.ProfileAlias) == 0 {
continue
}

profileId, exists := profileIdMap[*instance.ProfileAlias]
if exists {
remainingMap[profileId] = max(remainingMap[profileId]-1, 0)
}
}

return remainingMap
}
39 changes: 22 additions & 17 deletions internal/cubecos/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,41 +421,46 @@ func listNodeGpus(nodeName string) ([]gpu.GpuFromHex, error) {
return gpus, nil
}

func GetNodeVgpuProfilesMap(gpuId string) map[uint32]gpu.VgpuProfileFromHex {
profiles := listNodeVgpuProfiles(gpuId)
func GetNodeVgpuProfilesMap(gpuId string) (map[uint32]gpu.VgpuProfileFromHex, gpu.VgpuProfileCollectionFromHex) {
collection := getNodeVgpuProfileCollection(gpuId)

// Profile IDs will not conflict between SR-IOV and MIG.
profilesMap := map[uint32]gpu.VgpuProfileFromHex{}

for _, profile := range *profiles {
profilesMap[profile.Id] = profile
if collection.Sriov != nil {
for _, profile := range *collection.Sriov {
profilesMap[profile.Id] = profile
}
}

if collection.MigBacked != nil {
for _, profile := range *collection.MigBacked {
profilesMap[profile.Id] = profile
}
}

return profilesMap
return profilesMap, collection
}

func listNodeVgpuProfiles(gpuId string) *[]gpu.VgpuProfileFromHex {
func getNodeVgpuProfileCollection(gpuId string) gpu.VgpuProfileCollectionFromHex {
ctx, cancel := context.WithTimeout(wait.CtxSeconds(30))
defer cancel()

profiles := []gpu.VgpuProfileFromHex{}
collection := gpu.VgpuProfileCollectionFromHex{}

out, err := exec.CommandContext(ctx, "hex_sdk", "gpu_vgpu_profile_list", "-gpuId", gpuId).CombinedOutput()
out, err := exec.CommandContext(ctx, "hex_sdk", "gpu_vgpu_profile_list", gpuId).CombinedOutput()
if err != nil {
log.Errorf("nodes: failed to list vgpu profiles for gpu %s: %v", gpuId, err)
return &profiles
}

if !IsHexSuccessful(err) {
log.Errorf("nodes: output error when listing vgpu profiles for gpu %s via hex_sdk: %v", gpuId, err)
return &profiles
return collection
}

err = json.Unmarshal(out, &profiles)
err = json.Unmarshal(out, &collection)
if err != nil {
log.Errorf("nodes: failed to parse output when listing vgpu profiles for gpu %s via hex_sdk: %v", gpuId, err)
return &profiles
return collection
}

return &profiles
return collection
}

func GetNodePgpuAttachedInstance(pciAddress string) *gpu.PgpuAttachedInstanceFromHex {
Expand Down
48 changes: 30 additions & 18 deletions internal/definition/v1/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,18 @@ type GpuFromHex struct {
Allocation *AllocationSummary `json:"allocation"`
}

// VgpuProfileCollectionFromHex is the vGPU profile listing decoded from the
// JSON output of `hex_sdk gpu_vgpu_profile_list`, grouped by vGPU type.
// Either list pointer may be nil, so consumers must nil-check before
// dereferencing.
type VgpuProfileCollectionFromHex struct {
// Sriov holds the profiles found under the "sriov" key.
Sriov *[]VgpuProfileFromHex `json:"sriov"`
// MigBacked holds the profiles found under the "migBacked" key.
MigBacked *[]VgpuProfileFromHex `json:"migBacked"`
}

type VgpuProfileFromHex struct {
Id uint32 `json:"id"`
Name string `json:"name"`
Count int `json:"count"`
Alias string `json:"alias"`
Id uint32 `json:"id"`
Name string `json:"name"`
VramMiB uint64 `json:"vramMiB"`
Count int `json:"count"`
Alias *string `json:"alias"`
VmCountLimit *int `json:"vmCountLimit"`
}

type PgpuAttachedInstanceFromHex struct {
Expand All @@ -48,38 +55,43 @@ type GpuCard struct {
PciAddress string `json:"pciAddress"`
ResourceType ResourceType `json:"resourceType"`
SupportResourceTypes []SupportResourceType `json:"supportResourceTypes"`
Vram *VramInfo `json:"vram"`
Gpu *GpuInfo `json:"gpu"`
Vram VramInfo `json:"vram"`
Gpu GpuInfo `json:"gpu"`
AllocationSummary *AllocationSummary `json:"allocationSummary"`
VramLimitMiB int `json:"vramLimitMiB"`
ProfileCountLimit *int `json:"profileCountLimit"`
Profiles *[]VgpuProfile `json:"profiles"`
Profiles GpuProfileCollection `json:"profiles"`
AttachedInstances *[]AttachedInstance `json:"attachedInstances"`
Status GpuStatusInfo `json:"status"`
}

type VramInfo struct {
AllocatedMiB int `json:"allocatedMiB"`
TotalMiB int `json:"totalMiB"`
UtilizationPercent float64 `json:"utilizationPercent"`
AllocatedMiB int `json:"allocatedMiB"`
TotalMiB int `json:"totalMiB"`
UtilizationPercent uint32 `json:"utilizationPercent"`
}

type GpuInfo struct {
UtilizationPercent float64 `json:"utilizationPercent"`
UtilizationPercent uint32 `json:"utilizationPercent"`
}

// AllocationSummary is a pair of counters that is decoded from the
// "allocation" field of a hex GPU record and exposed unchanged on a GPU card
// as "allocationSummary".
// NOTE(review): presumably Current is the number of allocations in use and
// Total the capacity — confirm against the hex_sdk output.
type AllocationSummary struct {
Current int `json:"current"`
Total int `json:"total"`
}

// GpuProfileCollection is the "profiles" payload of a GPU card: its vGPU
// profiles split by vGPU type.
type GpuProfileCollection struct {
// SriovVgpu lists the SR-IOV vGPU profiles, encoded under "sriovVgpu".
SriovVgpu []VgpuProfile `json:"sriovVgpu"`
// MigBackedVgpu lists the MIG-backed vGPU profiles, encoded under "migBackedVgpu".
MigBackedVgpu []VgpuProfile `json:"migBackedVgpu"`
}

type VgpuProfile struct {
Id string `json:"id"`
Name string `json:"name"`
VramMiB uint64 `json:"vramMiB"`
AliasName string `json:"aliasName"`
Count int `json:"count"`
Remaining int `json:"remaining"`
Id uint32 `json:"id"`
Comment thread
steven-chiu-bigstack marked this conversation as resolved.
Name string `json:"name"`
VramMiB uint64 `json:"vramMiB"`
Count int `json:"count"`
Remaining *int `json:"remaining"`
AliasName *string `json:"aliasName"`
CountLimit *int `json:"countLimit"`
}

type AttachedInstance struct {
Expand Down
Loading