Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
* [FEATURE] Ingester: Support out-of-order native histogram ingestion. It is automatically enabled when `-ingester.out-of-order-time-window > 0` and `-blocks-storage.tsdb.enable-native-histograms=true`. #6626 #6663
* [ENHANCEMENT] Alertmanager: Add nflog and silences maintenance metrics. #6659
* [ENHANCEMENT] Querier: Limit label APIs to query only ingesters if the `start` param is not specified. #6618
* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605
* [ENHANCEMENT] Update prometheus version to v3.1.0. #6583
Expand Down
28 changes: 28 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ type alertmanagerMetrics struct {
nflogQueryErrorsTotal *prometheus.Desc
nflogQueryDuration *prometheus.Desc
nflogPropagatedMessagesTotal *prometheus.Desc
nflogMaintenanceTotal *prometheus.Desc
nflogMaintenanceErrorsTotal *prometheus.Desc

// exported metrics, gathered from Alertmanager Marker
markerAlerts *prometheus.Desc
Expand All @@ -43,6 +45,8 @@ type alertmanagerMetrics struct {
silencesQueryDuration *prometheus.Desc
silences *prometheus.Desc
silencesPropagatedMessagesTotal *prometheus.Desc
silencesMaintenanceTotal *prometheus.Desc
silencesMaintenanceErrorsTotal *prometheus.Desc

// The alertmanager config hash.
configHashValue *prometheus.Desc
Expand Down Expand Up @@ -127,6 +131,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
"cortex_alertmanager_nflog_gossip_messages_propagated_total",
"Number of received gossip messages that have been further gossiped.",
nil, nil),
nflogMaintenanceTotal: prometheus.NewDesc(
"cortex_alertmanager_nflog_maintenance_total",
"How many maintenances were executed for the notification log.",
nil, nil),
nflogMaintenanceErrorsTotal: prometheus.NewDesc(
"cortex_alertmanager_nflog_maintenance_errors_total",
"How many maintenances were executed for the notification log that failed.",
nil, nil),
markerAlerts: prometheus.NewDesc(
"cortex_alertmanager_alerts",
"How many alerts by state.",
Expand Down Expand Up @@ -163,6 +175,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
"cortex_alertmanager_silences",
"How many silences by state.",
[]string{"user", "state"}, nil),
silencesMaintenanceTotal: prometheus.NewDesc(
"cortex_alertmanager_silences_maintenance_total",
"How many maintenances were executed for silences.",
nil, nil),
silencesMaintenanceErrorsTotal: prometheus.NewDesc(
"cortex_alertmanager_silences_maintenance_errors_total",
"How many maintenances were executed for silences that failed.",
nil, nil),
configHashValue: prometheus.NewDesc(
"cortex_alertmanager_config_hash",
"Hash of the currently loaded alertmanager configuration.",
Expand Down Expand Up @@ -268,6 +288,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.nflogQueryErrorsTotal
out <- m.nflogQueryDuration
out <- m.nflogPropagatedMessagesTotal
out <- m.nflogMaintenanceTotal
out <- m.nflogMaintenanceErrorsTotal
out <- m.silencesGCDuration
out <- m.silencesSnapshotDuration
out <- m.silencesSnapshotSize
Expand All @@ -276,6 +298,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.silencesQueryDuration
out <- m.silencesPropagatedMessagesTotal
out <- m.silences
out <- m.silencesMaintenanceTotal
out <- m.silencesMaintenanceErrorsTotal
out <- m.configHashValue
out <- m.partialMerges
out <- m.partialMergesFailed
Expand Down Expand Up @@ -317,6 +341,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
data.SendSumOfCounters(out, m.nflogMaintenanceTotal, "alertmanager_nflog_maintenance_total")
data.SendSumOfCounters(out, m.nflogMaintenanceErrorsTotal, "alertmanager_nflog_maintenance_errors_total")

data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
Expand All @@ -326,6 +352,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state")
data.SendSumOfCounters(out, m.silencesMaintenanceTotal, "alertmanager_silences_maintenance_total")
data.SendSumOfCounters(out, m.silencesMaintenanceErrorsTotal, "alertmanager_silences_maintenance_errors_total")

data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash")

Expand Down
85 changes: 75 additions & 10 deletions pkg/alertmanager/alertmanager_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
# HELP cortex_alertmanager_nflog_snapshot_size_bytes Size of the last notification log snapshot in bytes.
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
cortex_alertmanager_nflog_snapshot_size_bytes 111
# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
# TYPE cortex_alertmanager_nflog_maintenance_total counter
cortex_alertmanager_nflog_maintenance_total 111
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
cortex_alertmanager_nflog_maintenance_errors_total 111
# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
# TYPE cortex_alertmanager_notification_latency_seconds histogram
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
Expand Down Expand Up @@ -277,6 +283,12 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
cortex_alertmanager_silences_snapshot_size_bytes 111
# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
# TYPE cortex_alertmanager_silences_maintenance_total counter
cortex_alertmanager_silences_maintenance_total 111
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
cortex_alertmanager_silences_maintenance_errors_total 111
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
cortex_alertmanager_state_fetch_replica_state_failed_total 0
Expand Down Expand Up @@ -414,6 +426,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
cortex_alertmanager_nflog_snapshot_size_bytes 111

# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
# TYPE cortex_alertmanager_nflog_maintenance_total counter
cortex_alertmanager_nflog_maintenance_total 111
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
cortex_alertmanager_nflog_maintenance_errors_total 111

# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
# TYPE cortex_alertmanager_notification_latency_seconds histogram
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
Expand Down Expand Up @@ -598,6 +617,14 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
cortex_alertmanager_silences_snapshot_size_bytes 111

# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
# TYPE cortex_alertmanager_silences_maintenance_total counter
cortex_alertmanager_silences_maintenance_total 111
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
cortex_alertmanager_silences_maintenance_errors_total 111

# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
cortex_alertmanager_state_fetch_replica_state_failed_total 0
Expand Down Expand Up @@ -715,6 +742,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# TYPE cortex_alertmanager_nflog_snapshot_size_bytes gauge
cortex_alertmanager_nflog_snapshot_size_bytes 11

# HELP cortex_alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
# TYPE cortex_alertmanager_nflog_maintenance_total counter
cortex_alertmanager_nflog_maintenance_total 111
# HELP cortex_alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
# TYPE cortex_alertmanager_nflog_maintenance_errors_total counter
cortex_alertmanager_nflog_maintenance_errors_total 111

# HELP cortex_alertmanager_notification_latency_seconds The latency of notifications in seconds.
# TYPE cortex_alertmanager_notification_latency_seconds histogram
cortex_alertmanager_notification_latency_seconds_bucket{le="1"} 14
Expand Down Expand Up @@ -863,6 +897,13 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
cortex_alertmanager_silences_snapshot_size_bytes 11

# HELP cortex_alertmanager_silences_maintenance_total How many maintenances were executed for silences.
# TYPE cortex_alertmanager_silences_maintenance_total counter
cortex_alertmanager_silences_maintenance_total 111
# HELP cortex_alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
# TYPE cortex_alertmanager_silences_maintenance_errors_total counter
cortex_alertmanager_silences_maintenance_errors_total 111

# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
cortex_alertmanager_state_fetch_replica_state_failed_total 0
Expand Down Expand Up @@ -913,6 +954,8 @@ func populateAlertmanager(base float64) *prometheus.Registry {
s.silencesActive.Set(base)
s.silencesExpired.Set(base * 2)
s.silencesPending.Set(base * 3)
s.silencesMaintenanceTotal.Add(base)
s.silencesMaintenanceErrorsTotal.Add(base)

n := newNflogMetrics(reg)
n.gcDuration.Observe(base)
Expand All @@ -922,6 +965,8 @@ func populateAlertmanager(base float64) *prometheus.Registry {
n.queryErrorsTotal.Add(base)
n.queryDuration.Observe(base)
n.propagatedMessagesTotal.Add(base)
n.maintenanceTotal.Add(base)
n.maintenanceErrorsTotal.Add(base)

nm := newNotifyMetrics(reg)
for i, integration := range integrations {
Expand Down Expand Up @@ -967,6 +1012,8 @@ type nflogMetrics struct {
queryErrorsTotal prometheus.Counter
queryDuration prometheus.Histogram
propagatedMessagesTotal prometheus.Counter
maintenanceTotal prometheus.Counter
maintenanceErrorsTotal prometheus.Counter
}

func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
Expand Down Expand Up @@ -1002,22 +1049,32 @@ func newNflogMetrics(r prometheus.Registerer) *nflogMetrics {
Name: "alertmanager_nflog_gossip_messages_propagated_total",
Help: "Number of received gossip messages that have been further gossiped.",
})
m.maintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_maintenance_total",
Help: "How many maintenances were executed for the notification log.",
})
m.maintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_nflog_maintenance_errors_total",
Help: "How many maintenances were executed for the notification log that failed.",
})

return m
}

// silenceMetrics groups the silences metrics that newSilenceMetrics registers
// on a registry, so tests can populate them with known values.
//
// Copied from github.com/prometheus/alertmanager/silence/silence.go
//
// Each field is declared exactly once; silencesMaintenanceTotal and
// silencesMaintenanceErrorsTotal count executed and failed maintenance runs
// for silences respectively.
type silenceMetrics struct {
	gcDuration                     prometheus.Summary
	snapshotDuration               prometheus.Summary
	snapshotSize                   prometheus.Gauge
	queriesTotal                   prometheus.Counter
	queryErrorsTotal               prometheus.Counter
	queryDuration                  prometheus.Histogram
	silencesActive                 prometheus.Gauge
	silencesPending                prometheus.Gauge
	silencesExpired                prometheus.Gauge
	propagatedMessagesTotal        prometheus.Counter
	silencesMaintenanceTotal       prometheus.Counter
	silencesMaintenanceErrorsTotal prometheus.Counter
}

func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
Expand Down Expand Up @@ -1068,6 +1125,14 @@ func newSilenceMetrics(r prometheus.Registerer) *silenceMetrics {
Help: "How many silences by state.",
ConstLabels: prometheus.Labels{"state": string(types.SilenceStateExpired)},
})
m.silencesMaintenanceTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_maintenance_total",
Help: "How many maintenances were executed for silences.",
})
m.silencesMaintenanceErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_maintenance_errors_total",
Help: "How many maintenances were executed for silences that failed.",
})

return m
}
Expand Down