Skip to content

Commit 1f392ec

Browse files
feat: add additional labels to the existing scheduler_attempts_total metric (kubernetes-sigs#2545)
* Add selected endpoint info to SchedulerAttemptsTotal metric
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* add model_name label to scheduler_attempts_total
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* add unit tests
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* document metric
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* fix typo
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* check primaryResults
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* add targetModelName even when attempt failed
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* fix golden file
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
---------
Signed-off-by: Lionel Villard <villard@us.ibm.com>
1 parent d2928ea commit 1f392ec

File tree

8 files changed

+168
-46
lines changed

8 files changed

+168
-46
lines changed

pkg/epp/metrics/metrics.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828

2929
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/common/observability/logging"
3030
metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/common/observability/metrics"
31+
schedulingframework "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/scheduling"
3132
)
3233

3334
const (
@@ -55,6 +56,7 @@ var (
5556
modelLabels = []string{"model_name", "target_model_name"}
5657
modelTypeLabels = []string{"model_name", "target_model_name", "type"}
5758
poolLabels = []string{"name"}
59+
endpointLabels = []string{"pod_name", "namespace", "port"}
5860

5961
// --- Common Buckets ---
6062

@@ -320,7 +322,7 @@ var (
320322
Name: "scheduler_attempts_total",
321323
Help: metricsutil.HelpMsgWithStability("Total number of scheduling attempts.", compbasemetrics.ALPHA),
322324
},
323-
[]string{"status"}, // "success", "failure"
325+
append([]string{"status", "target_model_name"}, endpointLabels...),
324326
)
325327

326328
pluginProcessingLatencies = prometheus.NewHistogramVec(
@@ -770,13 +772,29 @@ func RecordSchedulerE2ELatency(duration time.Duration) {
770772
schedulerE2ELatency.WithLabelValues().Observe(duration.Seconds())
771773
}
772774

773-
// RecordSchedulerAttempt records a scheduling attempt with status.
774-
func RecordSchedulerAttempt(err error) {
775+
// RecordSchedulerAttempt records a scheduling attempt with status and endpoint information.
776+
func RecordSchedulerAttempt(err error, targetModelName string, result *schedulingframework.SchedulingResult) {
775777
if err != nil {
776-
schedulerAttemptsTotal.WithLabelValues(SchedulerStatusFailure).Inc()
777-
} else {
778-
schedulerAttemptsTotal.WithLabelValues(SchedulerStatusSuccess).Inc()
778+
schedulerAttemptsTotal.WithLabelValues(SchedulerStatusFailure, targetModelName, "", "", "").Inc()
779+
return
779780
}
781+
782+
if result != nil {
783+
// Collect endpoint information for successful scheduling attempts
784+
primaryResults := result.ProfileResults[result.PrimaryProfileName]
785+
if primaryResults != nil {
786+
// prepareRequest (in director.go) selects the first endpoint. Do the same here.
787+
if len(primaryResults.TargetEndpoints) > 0 {
788+
metadata := primaryResults.TargetEndpoints[0].GetMetadata()
789+
if metadata != nil {
790+
schedulerAttemptsTotal.WithLabelValues(SchedulerStatusSuccess, targetModelName, metadata.PodName, metadata.NamespacedName.Namespace, metadata.Port).Inc()
791+
return
792+
}
793+
}
794+
}
795+
}
796+
797+
schedulerAttemptsTotal.WithLabelValues(SchedulerStatusSuccess, targetModelName, "", "", "").Inc()
780798
}
781799

782800
const (

pkg/epp/metrics/metrics_test.go

Lines changed: 128 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@ import (
2626
"github.com/prometheus/client_golang/prometheus"
2727
dto "github.com/prometheus/client_model/go"
2828
"github.com/stretchr/testify/require"
29+
k8stypes "k8s.io/apimachinery/pkg/types"
2930
"k8s.io/component-base/metrics/testutil"
3031
"sigs.k8s.io/controller-runtime/pkg/metrics"
3132

3233
errcommon "sigs.k8s.io/gateway-api-inference-extension/pkg/common/error"
3334
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/common/observability/logging"
35+
fwkdl "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/datalayer"
36+
schedulingframework "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/scheduling"
3437
)
3538

3639
const (
@@ -796,46 +799,134 @@ func TestFlowControlEnqueueDurationMetric(t *testing.T) {
796799

797800
func TestSchedulerAttemptsTotal(t *testing.T) {
798801

799-
scenarios := []struct {
800-
name string
801-
successCount int
802-
failureCount int
803-
}{
804-
{
805-
name: "mixed success and failure attempts",
806-
successCount: 10,
807-
failureCount: 5,
808-
},
802+
compareMetrics := func(t *testing.T, goldenFile string) {
803+
t.Helper()
804+
wantMetrics, err := os.Open(goldenFile)
805+
if err != nil {
806+
t.Fatal(err)
807+
}
808+
defer func() {
809+
if err = wantMetrics.Close(); err != nil {
810+
t.Error(err)
811+
}
812+
}()
813+
if err := testutil.GatherAndCompare(
814+
metrics.Registry,
815+
wantMetrics,
816+
"inference_extension_scheduler_attempts_total",
817+
); err != nil {
818+
t.Errorf("metric comparison failed: %v", err)
819+
}
809820
}
810821

811-
for _, scenario := range scenarios {
812-
t.Run(scenario.name, func(t *testing.T) {
813-
Reset()
814-
for i := 0; i < scenario.successCount; i++ {
815-
RecordSchedulerAttempt(nil)
816-
}
817-
for i := 0; i < scenario.failureCount; i++ {
818-
RecordSchedulerAttempt(errors.New("simulated scheduling failure"))
819-
}
822+
t.Run("success with endpoint metadata", func(t *testing.T) {
823+
Reset()
824+
result := &schedulingframework.SchedulingResult{
825+
PrimaryProfileName: "primary",
826+
ProfileResults: map[string]*schedulingframework.ProfileRunResult{
827+
"primary": {
828+
TargetEndpoints: []schedulingframework.Endpoint{
829+
schedulingframework.NewEndpoint(
830+
&fwkdl.EndpointMetadata{
831+
NamespacedName: k8stypes.NamespacedName{Name: "pod-1", Namespace: "ns-1"},
832+
PodName: "pod-1",
833+
Port: "8080",
834+
},
835+
nil, nil,
836+
),
837+
},
838+
},
839+
},
840+
}
841+
RecordSchedulerAttempt(nil, "modelA", result)
842+
RecordSchedulerAttempt(nil, "modelA", result)
843+
compareMetrics(t, "testdata/scheduler_attempts_with_result_metrics")
844+
})
820845

821-
wantMetrics, err := os.Open("testdata/scheduler_attempts_total_metrics")
822-
defer func() {
823-
if err = wantMetrics.Close(); err != nil {
824-
t.Error(err)
825-
}
826-
}()
827-
if err != nil {
828-
t.Fatal(err)
829-
}
830-
if err := testutil.GatherAndCompare(
831-
metrics.Registry,
832-
wantMetrics,
833-
"inference_extension_scheduler_attempts_total",
834-
); err != nil {
835-
t.Errorf("metric comparison failed: %v", err)
836-
}
837-
})
838-
}
846+
t.Run("success with multiple endpoints uses first", func(t *testing.T) {
847+
Reset()
848+
result := &schedulingframework.SchedulingResult{
849+
PrimaryProfileName: "primary",
850+
ProfileResults: map[string]*schedulingframework.ProfileRunResult{
851+
"primary": {
852+
TargetEndpoints: []schedulingframework.Endpoint{
853+
schedulingframework.NewEndpoint(
854+
&fwkdl.EndpointMetadata{
855+
NamespacedName: k8stypes.NamespacedName{Name: "pod-1", Namespace: "ns-1"},
856+
PodName: "pod-1",
857+
Port: "8080",
858+
},
859+
nil, nil,
860+
),
861+
schedulingframework.NewEndpoint(
862+
&fwkdl.EndpointMetadata{
863+
NamespacedName: k8stypes.NamespacedName{Name: "pod-2", Namespace: "ns-2"},
864+
PodName: "pod-2",
865+
Port: "9090",
866+
},
867+
nil, nil,
868+
),
869+
},
870+
},
871+
},
872+
}
873+
RecordSchedulerAttempt(nil, "modelA", result)
874+
RecordSchedulerAttempt(nil, "modelB", result)
875+
compareMetrics(t, "testdata/scheduler_attempts_multiple_endpoints_metrics")
876+
})
877+
878+
t.Run("success with different models and endpoints", func(t *testing.T) {
879+
Reset()
880+
resultA := &schedulingframework.SchedulingResult{
881+
PrimaryProfileName: "primary",
882+
ProfileResults: map[string]*schedulingframework.ProfileRunResult{
883+
"primary": {
884+
TargetEndpoints: []schedulingframework.Endpoint{
885+
schedulingframework.NewEndpoint(
886+
&fwkdl.EndpointMetadata{
887+
NamespacedName: k8stypes.NamespacedName{Name: "pod-1", Namespace: "ns-1"},
888+
PodName: "pod-1",
889+
Port: "8080",
890+
},
891+
nil, nil,
892+
),
893+
},
894+
},
895+
},
896+
}
897+
resultB := &schedulingframework.SchedulingResult{
898+
PrimaryProfileName: "primary",
899+
ProfileResults: map[string]*schedulingframework.ProfileRunResult{
900+
"primary": {
901+
TargetEndpoints: []schedulingframework.Endpoint{
902+
schedulingframework.NewEndpoint(
903+
&fwkdl.EndpointMetadata{
904+
NamespacedName: k8stypes.NamespacedName{Name: "pod-2", Namespace: "ns-2"},
905+
PodName: "pod-2",
906+
Port: "9090",
907+
},
908+
nil, nil,
909+
),
910+
},
911+
},
912+
},
913+
}
914+
RecordSchedulerAttempt(nil, "modelA", resultA)
915+
RecordSchedulerAttempt(nil, "modelA", resultA)
916+
RecordSchedulerAttempt(nil, "modelB", resultB)
917+
compareMetrics(t, "testdata/scheduler_attempts_different_models_metrics")
918+
})
919+
920+
t.Run("mixed success and failure attempts", func(t *testing.T) {
921+
Reset()
922+
for i := 0; i < 10; i++ {
923+
RecordSchedulerAttempt(nil, "modelA", nil)
924+
}
925+
for i := 0; i < 5; i++ {
926+
RecordSchedulerAttempt(errors.New("simulated scheduling failure"), "modelA", nil)
927+
}
928+
compareMetrics(t, "testdata/scheduler_attempts_total_metrics")
929+
})
839930
}
840931

841932
func TestPrefixCacheMetrics(t *testing.T) {
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{namespace="ns-1",pod_name="pod-1",port="8080",status="success",target_model_name="modelA"} 2
4+
inference_extension_scheduler_attempts_total{namespace="ns-2",pod_name="pod-2",port="9090",status="success",target_model_name="modelB"} 1
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{namespace="ns-1",pod_name="pod-1",port="8080",status="success",target_model_name="modelA"} 1
4+
inference_extension_scheduler_attempts_total{namespace="ns-1",pod_name="pod-1",port="8080",status="success",target_model_name="modelB"} 1
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
22
# TYPE inference_extension_scheduler_attempts_total counter
3-
inference_extension_scheduler_attempts_total{status="failure"} 5
4-
inference_extension_scheduler_attempts_total{status="success"} 10
3+
inference_extension_scheduler_attempts_total{namespace="",pod_name="",port="",status="failure",target_model_name="modelA"} 5
4+
inference_extension_scheduler_attempts_total{namespace="",pod_name="",port="",status="success",target_model_name="modelA"} 10
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{namespace="ns-1",pod_name="pod-1",port="8080",status="success",target_model_name="modelA"} 2

pkg/epp/scheduling/scheduler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func (s *Scheduler) Schedule(ctx context.Context, request *framework.LLMRequest,
5757
scheduleStart := time.Now()
5858
defer func() {
5959
metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart))
60-
metrics.RecordSchedulerAttempt(err)
60+
metrics.RecordSchedulerAttempt(err, request.TargetModel, result)
6161
}()
6262

6363
profileRunResults := map[string]*framework.ProfileRunResult{}

site-src/guides/metrics-and-observability.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ This guide describes the current state of exposed metrics and how to scrape them
4646
| inference_pool_per_pod_queue_size | Gauge | The queue size of each model server pod under the inference pool. | `model_server_pod`=&lt;model-server-pod-name&gt; <br> `name`=&lt;inference-pool-name&gt; | ALPHA |
4747
| inference_pool_ready_pods | Gauge | The number of ready pods for an inference server pool. | `name`=&lt;inference-pool-name&gt; | ALPHA |
4848
| inference_extension_info | Gauge | The general information of the current build. | `commit`=&lt;hash-of-the-build&gt; <br> `build_ref`=&lt;ref-to-the-build&gt; | ALPHA |
49+
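| inference_extension_scheduler_attempts_total | Counter | Total number of scheduling attempts. The `pod_name`, `namespace`, and `port` labels describe the first target endpoint of the primary scheduling profile; they are empty when the attempt fails or no endpoint metadata is available. | `status`=&lt;success\|failure&gt; <br> `target_model_name`=&lt;target-model-name&gt; <br> `pod_name`=&lt;pod-name&gt; <br> `namespace`=&lt;namespace&gt; <br> `port`=&lt;port&gt; | ALPHA |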
50+
4951

5052
### Dynamic LoRA Adapter Sidecar
5153

0 commit comments

Comments
 (0)