diff --git a/api/cube-cos-openapi b/api/cube-cos-openapi index 221c16f2..c8ff77ef 160000 --- a/api/cube-cos-openapi +++ b/api/cube-cos-openapi @@ -1 +1 @@ -Subproject commit 221c16f26079be819bd357b9bf76f51b1f1724ff +Subproject commit c8ff77ef107e0a56de4024e4faf0fd2c5fa5c170 diff --git a/internal/apis/v1/handlers/grafana/handlers.go b/internal/apis/v1/handlers/grafana/handlers.go index 97a2f7de..89209ae0 100644 --- a/internal/apis/v1/handlers/grafana/handlers.go +++ b/internal/apis/v1/handlers/grafana/handlers.go @@ -7,6 +7,7 @@ import ( "github.com/bigstack-oss/cube-cos-api/internal/apis/v1/bodies" "github.com/bigstack-oss/cube-cos-api/internal/cubecos" "github.com/bigstack-oss/cube-cos-api/internal/definition/v1/grafana" + "github.com/bigstack-oss/cube-cos-api/internal/definition/v1/nodes" "github.com/gin-gonic/gin" ) @@ -54,6 +55,12 @@ var ( Path: "/grafana/storages", Func: forwardStoragesLink, }, + { + Version: apis.V1, + Method: http.MethodGet, + Path: "/grafana/gpuWorkloadHistory/:hostname", + Func: forwardGpuWorkloadHistoryLinks, + }, } ) @@ -133,3 +140,20 @@ func forwardStoragesLink(c *gin.Context) { }, ) } + +// Returns the device dashboard deep-links for a physical node's GPU workload +// history (panel 50 = GPU Util, 51 = VRAM). Both filter by var-GPU_HOST, whose +// value must equal the gpu.host `host` tag (verified equal to Node.Hostname). +// Enabled is gated on node existence (cluster-wide); GetNodeGpusMap is NOT used +// here because it is local-only and would report the wrong node for remote ones. 
+func forwardGpuWorkloadHistoryLinks(c *gin.Context) {
+	// Both links are always returned; Enabled only reports whether the
+	// requested hostname is a known node.
+	history := grafana.GpuWorkloadHistory{
+		GpuUtilizationUrl: genGpuUtilizationHistoryLink(c),
+		VramUrl:           genGpuVramHistoryLink(c),
+		Enabled:           nodes.IsExist(c.Param("hostname")),
+	}
+
+	bodies.SetOk(c, "fetch gpu workload history links successfully", history)
+}
diff --git a/internal/apis/v1/handlers/grafana/links.go b/internal/apis/v1/handlers/grafana/links.go
index ffb43af9..6c799e51 100644
--- a/internal/apis/v1/handlers/grafana/links.go
+++ b/internal/apis/v1/handlers/grafana/links.go
@@ -57,3 +57,25 @@ func genStoragesLink() string {
 		base.DataCenterVip,
 	)
 }
+
+// genGpuUtilizationHistoryLink builds the deep-link to panel 50 (GPU
+// Utilization) of the device dashboard (UID i-device). The panel is filtered
+// by the hidden $GPU_HOST variable (gpu.host's `host` tag), not by $HOST.
+// NOTE(review): the hostname path parameter is interpolated without query
+// escaping; confirm upstream validation restricts it to URL-safe characters.
+func genGpuUtilizationHistoryLink(c *gin.Context) string {
+	const linkFormat = "https://%s/grafana/d/i-device/device?orgId=1&var-GPU_HOST=%s&from=now-3h&to=now&viewPanel=50"
+
+	hostname := c.Param("hostname")
+	return fmt.Sprintf(linkFormat, base.DataCenterVip, hostname)
+}
+
+// genGpuVramHistoryLink builds the deep-link to panel 51 (GPU VRAM Usage) of
+// the device dashboard (UID i-device), using the same var-GPU_HOST query
+// parameter as the utilization link.
+func genGpuVramHistoryLink(c *gin.Context) string {
+	const linkFormat = "https://%s/grafana/d/i-device/device?orgId=1&var-GPU_HOST=%s&from=now-3h&to=now&viewPanel=51"
+
+	hostname := c.Param("hostname")
+	return fmt.Sprintf(linkFormat, base.DataCenterVip, hostname)
+}
diff --git a/internal/definition/v1/grafana/grafana.go b/internal/definition/v1/grafana/grafana.go
index 4e39b824..ae250f91 100644
--- a/internal/definition/v1/grafana/grafana.go
+++ b/internal/definition/v1/grafana/grafana.go
@@ -8,3 +8,11 @@ type Dashboard struct {
 	Link    string `json:"link"`
 	Enabled bool   `json:"enabled"`
 }
+
+// GpuWorkloadHistory is the response payload carrying the Grafana links for a
+// node's GPU utilization and VRAM history panels, plus an Enabled flag.
+type GpuWorkloadHistory struct {
+	GpuUtilizationUrl string `json:"gpuUtilizationUrl"`
+	VramUrl           string `json:"vramUrl"`
+	Enabled           bool   `json:"enabled"`
+}