From 86e92d56020f9873204594aa884152aefbf95277 Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Fri, 20 Aug 2021 09:33:24 -0700 Subject: [PATCH 1/4] Send new label statusFamily for ingestor failures Signed-off-by: Daniel Blando --- CHANGELOG.md | 1 + pkg/distributor/distributor.go | 19 ++++++++++--- pkg/distributor/distributor_test.go | 42 ++++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b7bc3950a..8ceb6c60848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ * [BUGFIX] Ingester: fixed ingester stuck on start up (LEAVING ring state) when `-ingester.heartbeat-period=0` and `-ingester.unregister-on-shutdown=false`. #4366 * [BUGFIX] Ingester: panic during shutdown while fetching batches from cache. #4397 * [BUGFIX] Querier: After query-frontend restart, querier may have lower than configured concurrency. #4417 +* [FEATURE] Distributor: Add label `statusFamily` to metric `cortex_distributor_ingester_append_failures_total` #4441 ## 1.10.0 / 2021-08-03 diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 872c7f0435a..c6fc85570ab 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -57,6 +57,9 @@ const ( typeSamples = "samples" typeMetadata = "metadata" + statusFamily5xx = "5xx" + statusFamily4xx = "4xx" + instanceIngestionRateTickInterval = time.Second ) @@ -300,7 +303,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Namespace: "cortex", Name: "distributor_ingester_append_failures_total", Help: "The total number of failed batch appends sent to ingesters.", - }, []string{"ingester", "type"}), + }, []string{"ingester", "type", "statusFamily"}), ingesterQueries: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Namespace: "cortex", Name: "distributor_ingester_queries_total", @@ -819,19 +822,29 @@ func (d *Distributor) send(ctx context.Context, ingester ring.InstanceDesc, time if len(metadata) > 0 { d.ingesterAppends.WithLabelValues(ingester.Addr, typeMetadata).Inc() if err != nil { - d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata).Inc() + d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata, getStatusFamily(err)).Inc() } } if len(timeseries) > 0 { d.ingesterAppends.WithLabelValues(ingester.Addr, typeSamples).Inc() if err != nil { - d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples).Inc() + d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples, getStatusFamily(err)).Inc() } } return err } +func getStatusFamily(err error) string { + statusFamily := statusFamily5xx + httpResp, ok := httpgrpc.HTTPResponseFromError(err) + if ok && httpResp.Code/100 == 4 { + statusFamily = statusFamily4xx + } + + return statusFamily +} + // ForReplicationSet runs f, in parallel, for all ingesters in the input replication set. func (d *Distributor) ForReplicationSet(ctx context.Context, replicationSet ring.ReplicationSet, f func(context.Context, ingester_client.IngesterClient) (interface{}, error)) ([]interface{}, error) { return replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) { diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 921701ca018..5bcd13e3d6e 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -48,7 +48,7 @@ import ( ) var ( - errFail = fmt.Errorf("Fail") + errFail = httpgrpc.Errorf(http.StatusInternalServerError, "Fail") emptyResponse = &cortexpb.WriteResponse{} ) @@ -124,6 +124,7 @@ func TestDistributor_Push(t *testing.T) { expectedResponse *cortexpb.WriteResponse expectedError error expectedMetrics string + ingesterError error }{ "A push of no samples shouldn't block or return error, even if ingesters are sad": { numIngesters: 3, @@ -203,7 +204,7 @@ func TestDistributor_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. # TYPE cortex_distributor_ingester_append_failures_total counter - cortex_distributor_ingester_append_failures_total{ingester="2",type="samples"} 1 + cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="5xx",type="samples"} 1 # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. # TYPE cortex_distributor_ingester_appends_total counter cortex_distributor_ingester_appends_total{ingester="0",type="samples"} 1 @@ -218,10 +219,30 @@ func TestDistributor_Push(t *testing.T) { metadata: 1, metricNames: []string{distributorAppend, distributorAppendFailure}, expectedResponse: emptyResponse, + ingesterError: httpgrpc.Errorf(http.StatusInternalServerError, "Fail"), expectedMetrics: ` # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. # TYPE cortex_distributor_ingester_append_failures_total counter - cortex_distributor_ingester_append_failures_total{ingester="2",type="metadata"} 1 + cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="5xx",type="metadata"} 1 + # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. + # TYPE cortex_distributor_ingester_appends_total counter + cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1 + cortex_distributor_ingester_appends_total{ingester="1",type="metadata"} 1 + cortex_distributor_ingester_appends_total{ingester="2",type="metadata"} 1 + `, + }, + "A push to overloaded ingesters should report the correct metrics": { + numIngesters: 3, + happyIngesters: 2, + samples: samplesIn{num: 0, startTimestampMs: 123456789000}, + metadata: 1, + metricNames: []string{distributorAppend, distributorAppendFailure}, + expectedResponse: emptyResponse, + ingesterError: httpgrpc.Errorf(http.StatusTooManyRequests, "Fail"), + expectedMetrics: ` + # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. + # TYPE cortex_distributor_ingester_append_failures_total counter + cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="4xx",type="metadata"} 1 # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. # TYPE cortex_distributor_ingester_appends_total counter cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1 @@ -243,6 +264,7 @@ func TestDistributor_Push(t *testing.T) { numDistributors: 1, shardByAllLabels: shardByAllLabels, limits: limits, + errFail: tc.ingesterError, }) defer stopAll(ds, r) @@ -1905,6 +1927,7 @@ type prepConfig struct { maxInflightRequests int maxIngestionRate float64 replicationFactor int + errFail error } func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *ring.Ring, []*prometheus.Registry) { @@ -1916,9 +1939,15 @@ func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *rin }) } for i := cfg.happyIngesters; i < cfg.numIngesters; i++ { - ingesters = append(ingesters, mockIngester{ + mi := mockIngester{ queryDelay: cfg.queryDelay, - }) + errFail: errFail, + } + if cfg.errFail != nil { + mi.errFail = cfg.errFail + } + + ingesters = append(ingesters, mi) } // Use a real ring with a mock KV store to test ring RF logic. @@ -2149,6 +2178,7 @@ type mockIngester struct { client.IngesterClient grpc_health_v1.HealthClient happy bool + errFail error stats client.UsersStatsResponse timeseries map[uint32]*cortexpb.PreallocTimeseries metadata map[uint32]map[cortexpb.MetricMetadata]struct{} @@ -2187,7 +2217,7 @@ func (i *mockIngester) Push(ctx context.Context, req *cortexpb.WriteRequest, opt i.trackCall("Push") if !i.happy { - return nil, errFail + return nil, i.errFail } if i.timeseries == nil { From 58f08a5168d3272a557dbc87b63c3f4de84ea068 Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Tue, 24 Aug 2021 14:47:25 -0700 Subject: [PATCH 2/4] update changelog Signed-off-by: Daniel Blando --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ceb6c60848..598d048fb27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 * [FEATURE] Query Frontend: Add `cortex_query_fetched_series_total` and `cortex_query_fetched_chunks_bytes_total` per-user counters to expose the number of series and bytes fetched as part of queries. These metrics can be enabled with the `-frontend.query-stats-enabled` flag (or its respective YAML config option `query_stats_enabled`). #4343 * [FEATURE] AlertManager: Add support for SNS Receiver. #4382 +* [FEATURE] Distributor: Add label `statusFamily` to metric ``cortex_distributor_ingester_append_failures_total`` #4441 * [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262 * [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341 * [ENHANCEMENT] Ring: allow experimental configuration of disabling of heartbeat timeouts by setting the relevant configuration value to zero. Applies to the following: #4342 @@ -48,7 +49,6 @@ * [BUGFIX] Ingester: fixed ingester stuck on start up (LEAVING ring state) when `-ingester.heartbeat-period=0` and `-ingester.unregister-on-shutdown=false`. #4366 * [BUGFIX] Ingester: panic during shutdown while fetching batches from cache. #4397 * [BUGFIX] Querier: After query-frontend restart, querier may have lower than configured concurrency. #4417 -* [FEATURE] Distributor: Add label `statusFamily` to metric `cortex_distributor_ingester_append_failures_total` #4441 ## 1.10.0 / 2021-08-03 From 5f788a14f4d710e4d115363e6ee1f658b04d8370 Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Tue, 24 Aug 2021 21:48:03 -0700 Subject: [PATCH 3/4] Change changelog Signed-off-by: Daniel Blando --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 598d048fb27..d96ef3ef132 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ * [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 * [FEATURE] Query Frontend: Add `cortex_query_fetched_series_total` and `cortex_query_fetched_chunks_bytes_total` per-user counters to expose the number of series and bytes fetched as part of queries. These metrics can be enabled with the `-frontend.query-stats-enabled` flag (or its respective YAML config option `query_stats_enabled`). #4343 * [FEATURE] AlertManager: Add support for SNS Receiver. #4382 -* [FEATURE] Distributor: Add label `statusFamily` to metric ``cortex_distributor_ingester_append_failures_total`` #4441 +* [FEATURE] Distributor: Add label `status` to metric `cortex_distributor_ingester_append_failures_total` #4442 * [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262 * [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341 * [ENHANCEMENT] Ring: allow experimental configuration of disabling of heartbeat timeouts by setting the relevant configuration value to zero. Applies to the following: #4342 From 3efe52858127a37ceda8c4d09f0437eedf249a2c Mon Sep 17 00:00:00 2001 From: Daniel Blando Date: Thu, 26 Aug 2021 11:22:31 -0700 Subject: [PATCH 4/4] Update label name Signed-off-by: Daniel Blando --- pkg/distributor/distributor.go | 17 +++++++---------- pkg/distributor/distributor_test.go | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index c6fc85570ab..212da5046ae 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -57,9 +57,6 @@ const ( typeSamples = "samples" typeMetadata = "metadata" - statusFamily5xx = "5xx" - statusFamily4xx = "4xx" - instanceIngestionRateTickInterval = time.Second ) @@ -303,7 +300,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Namespace: "cortex", Name: "distributor_ingester_append_failures_total", Help: "The total number of failed batch appends sent to ingesters.", - }, []string{"ingester", "type", "statusFamily"}), + }, []string{"ingester", "type", "status"}), ingesterQueries: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Namespace: "cortex", Name: "distributor_ingester_queries_total", @@ -822,27 +819,27 @@ func (d *Distributor) send(ctx context.Context, ingester ring.InstanceDesc, time if len(metadata) > 0 { d.ingesterAppends.WithLabelValues(ingester.Addr, typeMetadata).Inc() if err != nil { - d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata, getStatusFamily(err)).Inc() + d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata, getErrorStatus(err)).Inc() } } if len(timeseries) > 0 { d.ingesterAppends.WithLabelValues(ingester.Addr, typeSamples).Inc() if err != nil { - d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples, getStatusFamily(err)).Inc() + d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples, getErrorStatus(err)).Inc() } } return err } -func getStatusFamily(err error) string { - statusFamily := statusFamily5xx +func getErrorStatus(err error) string { + status := "5xx" httpResp, ok := httpgrpc.HTTPResponseFromError(err) if ok && httpResp.Code/100 == 4 { - statusFamily = statusFamily4xx + status = "4xx" } - return statusFamily + return status } // ForReplicationSet runs f, in parallel, for all ingesters in the input replication set. diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 5bcd13e3d6e..caa06a4d8c4 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -124,7 +124,7 @@ func TestDistributor_Push(t *testing.T) { expectedResponse *cortexpb.WriteResponse expectedError error expectedMetrics string - ingesterError error + ingesterError error }{ "A push of no samples shouldn't block or return error, even if ingesters are sad": { numIngesters: 3, @@ -204,7 +204,7 @@ func TestDistributor_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. # TYPE cortex_distributor_ingester_append_failures_total counter - cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="5xx",type="samples"} 1 + cortex_distributor_ingester_append_failures_total{ingester="2",status="5xx",type="samples"} 1 # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. # TYPE cortex_distributor_ingester_appends_total counter cortex_distributor_ingester_appends_total{ingester="0",type="samples"} 1 @@ -223,7 +223,7 @@ func TestDistributor_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. # TYPE cortex_distributor_ingester_append_failures_total counter - cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="5xx",type="metadata"} 1 + cortex_distributor_ingester_append_failures_total{ingester="2",status="5xx",type="metadata"} 1 # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. # TYPE cortex_distributor_ingester_appends_total counter cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1 @@ -242,7 +242,7 @@ func TestDistributor_Push(t *testing.T) { expectedMetrics: ` # HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters. # TYPE cortex_distributor_ingester_append_failures_total counter - cortex_distributor_ingester_append_failures_total{ingester="2",statusFamily="4xx",type="metadata"} 1 + cortex_distributor_ingester_append_failures_total{ingester="2",status="4xx",type="metadata"} 1 # HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters. # TYPE cortex_distributor_ingester_appends_total counter cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1 @@ -1939,15 +1939,15 @@ func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *rin }) } for i := cfg.happyIngesters; i < cfg.numIngesters; i++ { - mi := mockIngester{ - queryDelay: cfg.queryDelay, - errFail: errFail, - } + miError := errFail if cfg.errFail != nil { - mi.errFail = cfg.errFail + miError = cfg.errFail } - ingesters = append(ingesters, mi) + ingesters = append(ingesters, mockIngester{ + queryDelay: cfg.queryDelay, + failResp: miError, + }) } // Use a real ring with a mock KV store to test ring RF logic. @@ -2178,7 +2178,7 @@ type mockIngester struct { client.IngesterClient grpc_health_v1.HealthClient happy bool - errFail error + failResp error stats client.UsersStatsResponse timeseries map[uint32]*cortexpb.PreallocTimeseries metadata map[uint32]map[cortexpb.MetricMetadata]struct{} @@ -2217,7 +2217,7 @@ func (i *mockIngester) Push(ctx context.Context, req *cortexpb.WriteRequest, opt i.trackCall("Push") if !i.happy { - return nil, i.errFail + return nil, i.failResp } if i.timeseries == nil {