diff --git a/CHANGELOG.md b/CHANGELOG.md index c34d28f53e5..2b6c3743ce5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 * [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590 * [FEATURE] Ingester: Support out-of-order native histogram ingestion. It automatically enabled when `-ingester.out-of-order-time-window > 0` and `-blocks-storage.tsdb.enable-native-histograms=true`. #6626 #6663 +* [ENHANCEMENT] Alertmanager: Add nflog and silences maintenance metrics. #6659 * [ENHANCEMENT] Querier: limit label APIs to query only ingesters if `start` param is not been specified. #6618 * [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605 * [ENHANCEMENT] Update prometheus version to v3.1.0. #6583 diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 21d77d2b4b0..527e4581798 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -30,6 +30,8 @@ type alertmanagerMetrics struct { nflogQueryErrorsTotal *prometheus.Desc nflogQueryDuration *prometheus.Desc nflogPropagatedMessagesTotal *prometheus.Desc + nflogMaintenanceTotal *prometheus.Desc + nflogMaintenanceErrorsTotal *prometheus.Desc // exported metrics, gathered from Alertmanager Marker markerAlerts *prometheus.Desc @@ -43,6 +45,8 @@ type alertmanagerMetrics struct { silencesQueryDuration *prometheus.Desc silences *prometheus.Desc silencesPropagatedMessagesTotal *prometheus.Desc + silencesMaintenanceTotal *prometheus.Desc + silencesMaintenanceErrorsTotal *prometheus.Desc // The alertmanager config hash. configHashValue *prometheus.Desc @@ -127,6 +131,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "cortex_alertmanager_nflog_gossip_messages_propagated_total", "Number of received gossip messages that have been further gossiped.", nil, nil), + nflogMaintenanceTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_maintenance_total", + "How many maintenances were executed for the notification log.", + nil, nil), + nflogMaintenanceErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_nflog_maintenance_errors_total", + "How many maintenances were executed for the notification log that failed.", + nil, nil), markerAlerts: prometheus.NewDesc( "cortex_alertmanager_alerts", "How many alerts by state.", @@ -163,6 +175,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "cortex_alertmanager_silences", "How many silences by state.", []string{"user", "state"}, nil), + silencesMaintenanceTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_maintenance_total", + "How many maintenances were executed for silences.", + nil, nil), + silencesMaintenanceErrorsTotal: prometheus.NewDesc( + "cortex_alertmanager_silences_maintenance_errors_total", + "How many maintenances were executed for silences that failed.", + nil, nil), configHashValue: prometheus.NewDesc( "cortex_alertmanager_config_hash", "Hash of the currently loaded alertmanager configuration.", @@ -268,6 +288,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.nflogQueryErrorsTotal out <- m.nflogQueryDuration out <- m.nflogPropagatedMessagesTotal + out <- m.nflogMaintenanceTotal + out <- m.nflogMaintenanceErrorsTotal out <- m.silencesGCDuration out <- m.silencesSnapshotDuration out <- m.silencesSnapshotSize @@ -276,6 +298,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.silencesQueryDuration out <- m.silencesPropagatedMessagesTotal out <- m.silences + out <- m.silencesMaintenanceTotal + out <- m.silencesMaintenanceErrorsTotal out <- m.configHashValue out <- m.partialMerges out <- m.partialMergesFailed @@ -317,6 +341,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + data.SendSumOfCounters(out, m.nflogMaintenanceTotal, "alertmanager_nflog_maintenance_total") + data.SendSumOfCounters(out, m.nflogMaintenanceErrorsTotal, "alertmanager_nflog_maintenance_errors_total") data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") @@ -326,6 +352,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state") + data.SendSumOfCounters(out, m.silencesMaintenanceTotal, "alertmanager_silences_maintenance_total") + data.SendSumOfCounters(out, m.silencesMaintenanceErrorsTotal, "alertmanager_silences_maintenance_errors_total") data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash") diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 25cad5f344a..3716e8e6d05 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -104,6 +104,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) { # HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes. # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log. + # TYPE cortex_alertmanager_nflog_maintenance_total counter + cortex_alertmanager_nflog_maintenance_total 111 + # HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed. + # TYPE cortex_alertmanager_nflog_maintenance_errors_total counter + cortex_alertmanager_nflog_maintenance_errors_total 111 # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. # TYPE cortex_alertmanager_notification_latency_seconds histogram cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14 @@ -277,6 +283,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) { # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge cortex_alertmanager_silences_snapshot_size_bytes 111 + # HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences. + # TYPE cortex_alertmanager_silences_maintenance_total counter + cortex_alertmanager_silences_maintenance_total 111 + # HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed. + # TYPE cortex_alertmanager_silences_maintenance_errors_total counter + cortex_alertmanager_silences_maintenance_errors_total 111 # HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica. # TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter cortex_alertmanager_state_fetch_replica_state_failed_total 0 @@ -414,6 +426,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge cortex_alertmanager_nflog_snapshot_size_bytes 111 + # HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log. + # TYPE cortex_alertmanager_nflog_maintenance_total counter + cortex_alertmanager_nflog_maintenance_total 111 + # HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed. + # TYPE cortex_alertmanager_nflog_maintenance_errors_total counter + cortex_alertmanager_nflog_maintenance_errors_total 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. # TYPE cortex_alertmanager_notification_latency_seconds histogram cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14 @@ -598,6 +617,14 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes. # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge cortex_alertmanager_silences_snapshot_size_bytes 111 + + # HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences. + # TYPE cortex_alertmanager_silences_maintenance_total counter + cortex_alertmanager_silences_maintenance_total 111 + # HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed. + # TYPE cortex_alertmanager_silences_maintenance_errors_total counter + cortex_alertmanager_silences_maintenance_errors_total 111 + # HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica. # TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter cortex_alertmanager_state_fetch_replica_state_failed_total 0 @@ -715,6 +742,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge cortex_alertmanager_nflog_snapshot_size_bytes 11 + # HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log. + # TYPE cortex_alertmanager_nflog_maintenance_total counter + cortex_alertmanager_nflog_maintenance_total 111 + # HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed. + # TYPE cortex_alertmanager_nflog_maintenance_errors_total counter + cortex_alertmanager_nflog_maintenance_errors_total 111 + # HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds. # TYPE cortex_alertmanager_notification_latency_seconds histogram cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14 @@ -863,6 +897,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge cortex_alertmanager_silences_snapshot_size_bytes 11 + # HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences. + # TYPE cortex_alertmanager_silences_maintenance_total counter + cortex_alertmanager_silences_maintenance_total 111 + # HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed. + # TYPE cortex_alertmanager_silences_maintenance_errors_total counter + cortex_alertmanager_silences_maintenance_errors_total 111 + # HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica. # TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter cortex_alertmanager_state_fetch_replica_state_failed_total 0 @@ -913,6 +954,8 @@ func populateAlertmanager(base float64) *prometheus.Registry { s.silencesActive.Set(base) s.silencesExpired.Set(base * 2) s.silencesPending.Set(base * 3) + s.silencesMaintenanceTotal.Add(base) + s.silencesMaintenanceErrorsTotal.Add(base) n := newNflogMetrics(reg) n.gcDuration.Observe(base) @@ -922,6 +965,8 @@ func populateAlertmanager(base float64) *prometheus.Registry { n.queryErrorsTotal.Add(base) n.queryDuration.Observe(base) n.propagatedMessagesTotal.Add(base) + n.maintenanceTotal.Add(base) + n.maintenanceErrorsTotal.Add(base) nm := newNotifyMetrics(reg) for i, integration := range integrations { @@ -967,6 +1012,8 @@ type nflogMetrics struct { queryErrorsTotal prometheus.Counter queryDuration prometheus.Histogram propagatedMessagesTotal prometheus.Counter + maintenanceTotal prometheus.Counter + maintenanceErrorsTotal prometheus.Counter } func newNflogMetrics(r prometheus.Registerer) *nflogMetrics { @@ -1002,22 +1049,32 @@ func newNflogMetrics(r prometheus.Registerer) *nflogMetrics { Name: "alertmanager_nflog_gossip_messages_propagated_total", Help: "Number of received gossip messages that have been further gossiped.", }) + m.maintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_maintenance_total", + Help: "How many maintenances were executed for the notification log.", + }) + m.maintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_nflog_maintenance_errors_total", + Help: "How many maintenances were executed for the notification log that failed.", + }) return m } // Copied from github.com/alertmanager/silence/silence.go type silenceMetrics struct { - gcDuration prometheus.Summary - snapshotDuration prometheus.Summary - snapshotSize prometheus.Gauge - queriesTotal prometheus.Counter - queryErrorsTotal prometheus.Counter - queryDuration prometheus.Histogram - silencesActive prometheus.Gauge - silencesPending prometheus.Gauge - silencesExpired prometheus.Gauge - propagatedMessagesTotal prometheus.Counter + gcDuration prometheus.Summary + snapshotDuration prometheus.Summary + snapshotSize prometheus.Gauge + queriesTotal prometheus.Counter + queryErrorsTotal prometheus.Counter + queryDuration prometheus.Histogram + silencesActive prometheus.Gauge + silencesPending prometheus.Gauge + silencesExpired prometheus.Gauge + propagatedMessagesTotal prometheus.Counter + silencesMaintenanceTotal prometheus.Counter + silencesMaintenanceErrorsTotal prometheus.Counter } func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics { @@ -1068,6 +1125,14 @@ func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics { Help: "How many silences by state.", ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)}, }) + m.silencesMaintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_maintenance_total", + Help: "How many maintenances were executed for silences.", + }) + m.silencesMaintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_maintenance_errors_total", + Help: "How many maintenances were executed for silences that failed.", + }) return m }