From 0aa370e8b305cacd2e1284af873c139e2997d289 Mon Sep 17 00:00:00 2001 From: mchtech Date: Tue, 11 Jun 2024 10:31:42 +0000 Subject: [PATCH 1/3] add IRQ PSI metrics Signed-off-by: mchtech --- collector/pressure_linux.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go index 4dbdc5393d..31f0d6f8ca 100644 --- a/collector/pressure_linux.go +++ b/collector/pressure_linux.go @@ -29,7 +29,7 @@ import ( ) var ( - psiResources = []string{"cpu", "io", "memory"} + psiResources = []string{"cpu", "io", "memory", "irq"} ) type pressureStatsCollector struct { @@ -38,6 +38,7 @@ type pressureStatsCollector struct { ioFull *prometheus.Desc mem *prometheus.Desc memFull *prometheus.Desc + irqFull *prometheus.Desc fs procfs.FS @@ -81,6 +82,11 @@ func NewPressureStatsCollector(logger log.Logger) (Collector, error) { "Total time in seconds no process could make progress due to memory congestion", nil, nil, ), + irqFull: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "irq_stalled_seconds_total"), + "Total time in seconds no process could make progress due to IRQ congestion", + nil, nil, + ), fs: fs, logger: logger, }, nil @@ -102,7 +108,9 @@ func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error { } return fmt.Errorf("failed to retrieve pressure stats: %w", err) } - if vals.Some == nil { + // IRQ pressure does not have 'some' data. + // See https://github.com/torvalds/linux/blob/v6.9/kernel/sched/psi.c#L1243 + if vals.Some == nil && res != "irq" { level.Debug(c.logger).Log("msg", "pressure information returned no 'some' data") return ErrNoData } @@ -119,6 +127,8 @@ func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error { case "memory": ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) + case "irq": + ch <- prometheus.MustNewConstMetric(c.irqFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) default: level.Debug(c.logger).Log("msg", "did not account for resource", "resource", res) } From ddaaa6ba6dfb1e5f6b02075ef0ec353de86b86b2 Mon Sep 17 00:00:00 2001 From: mchtech Date: Tue, 11 Jun 2024 10:42:57 +0000 Subject: [PATCH 2/3] change irq psi url Signed-off-by: mchtech --- collector/pressure_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go index 31f0d6f8ca..ac25d7c12c 100644 --- a/collector/pressure_linux.go +++ b/collector/pressure_linux.go @@ -109,7 +109,7 @@ func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error { return fmt.Errorf("failed to retrieve pressure stats: %w", err) } // IRQ pressure does not have 'some' data. - // See https://github.com/torvalds/linux/blob/v6.9/kernel/sched/psi.c#L1243 + // See https://github.com/torvalds/linux/blob/v6.9/include/linux/psi_types.h#L65 if vals.Some == nil && res != "irq" { level.Debug(c.logger).Log("msg", "pressure information returned no 'some' data") return ErrNoData From 459520e92803288f7859fff4f2f61a062659bff9 Mon Sep 17 00:00:00 2001 From: mchtech Date: Tue, 11 Jun 2024 11:01:17 +0000 Subject: [PATCH 3/3] add IRQ PSI test data Signed-off-by: mchtech --- collector/fixtures/e2e-64k-page-output.txt | 3 +++ collector/fixtures/e2e-output.txt | 3 +++ collector/fixtures/proc/pressure/irq | 1 + 3 files changed, 7 insertions(+) create mode 100644 collector/fixtures/proc/pressure/irq diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 99c58abaf7..d3b485710f 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -2830,6 +2830,9 @@ node_pressure_io_stalled_seconds_total 159.229614 # HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion # TYPE node_pressure_io_waiting_seconds_total counter node_pressure_io_waiting_seconds_total 159.886802 +# HELP node_pressure_irq_stalled_seconds_total Total time in seconds no process could make progress due to IRQ congestion +# TYPE node_pressure_irq_stalled_seconds_total counter +node_pressure_irq_stalled_seconds_total 0.008494 # HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion # TYPE node_pressure_memory_stalled_seconds_total counter node_pressure_memory_stalled_seconds_total 0 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index d52cb99d04..1a6448fb2d 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -2852,6 +2852,9 @@ node_pressure_io_stalled_seconds_total 159.229614 # HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion # TYPE node_pressure_io_waiting_seconds_total counter node_pressure_io_waiting_seconds_total 159.886802 +# HELP node_pressure_irq_stalled_seconds_total Total time in seconds no process could make progress due to IRQ congestion +# TYPE node_pressure_irq_stalled_seconds_total counter +node_pressure_irq_stalled_seconds_total 0.008494 # HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion # TYPE node_pressure_memory_stalled_seconds_total counter node_pressure_memory_stalled_seconds_total 0 diff --git a/collector/fixtures/proc/pressure/irq b/collector/fixtures/proc/pressure/irq new file mode 100644 index 0000000000..76059c7572 --- /dev/null +++ b/collector/fixtures/proc/pressure/irq @@ -0,0 +1 @@ +full avg10=0.00 avg60=0.00 avg300=0.00 total=8494 \ No newline at end of file