From fbde5872afd233970c082ce0eef051b1a6e74599 Mon Sep 17 00:00:00 2001 From: Namrata Namrata Date: Wed, 25 Mar 2026 09:07:21 +0000 Subject: [PATCH 1/3] Fix OTel timer metrics using Gauge instead of Histogram --- .../observability/metrics/otel_logger.py | 38 ++++++++++++++++--- .../observability/metrics/test_otel_logger.py | 27 +++++++------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py index 4b5174a3bf3ca..d855fbcdf2858 100644 --- a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py +++ b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py @@ -146,7 +146,8 @@ class _OtelTimer(Timer): """ An implementation of Stats.Timer() which records the result in the OTel Metrics Map. - OpenTelemetry does not have a native timer, we will store the values as a Gauge. + OpenTelemetry does not have a native timer; values are stored as a Histogram so that + all observations (count, sum, bucket distribution) are preserved across multiple recordings. :param name: The name of the timer. :param tags: Tags to append to the timer. @@ -161,8 +162,8 @@ def __init__(self, otel_logger: SafeOtelLogger, name: str | None, tags: Attribut def stop(self, send: bool = True) -> None: super().stop(send) if self.name and send and self.duration: - self.otel_logger.metrics_map.set_gauge_value( - full_name(prefix=self.otel_logger.prefix, name=self.name), self.duration, False, self.tags + self.otel_logger.metrics_map.record_histogram_value( + full_name(prefix=self.otel_logger.prefix, name=self.name), self.duration, self.tags ) @@ -278,11 +279,11 @@ def timing( *, tags: Attributes = None, ) -> None: - """OTel does not have a native timer, stored as a Gauge whose value is elapsed ms.""" + """Record a timing observation as a Histogram to preserve distribution information.""" if self.metrics_validator.test(stat) and name_is_otel_safe(self.prefix, stat): if isinstance(dt, datetime.timedelta): dt = dt.total_seconds() * 1000.0 - self.metrics_map.set_gauge_value(full_name(prefix=self.prefix, name=stat), float(dt), False, tags) + self.metrics_map.record_histogram_value(full_name(prefix=self.prefix, name=stat), float(dt), tags) def timer( self, @@ -314,6 +315,18 @@ def set_value(self, new_value: int | float, delta: bool): self.gauge.set(new_value, attributes=self.attributes) +class InternalHistogram: + """Stores a histogram instrument for timer/timing metrics.""" + + def __init__(self, meter, name: str): + otel_safe_name = _get_otel_safe_name(name) + self.histogram = meter.create_histogram(name=otel_safe_name, unit="ms") + log.debug("Created %s as type: %s", otel_safe_name, _type_as_str(self.histogram)) + + def record(self, value: float, tags: Attributes) -> None: + self.histogram.record(value, attributes=tags) + + class MetricsMap: """Stores Otel Instruments.""" @@ -376,6 +389,21 @@ def set_gauge_value(self, name: str, value: int | float, delta: bool, tags: Attr self.map[key].set_value(value, delta) + def record_histogram_value(self, name: str, value: float, tags: Attributes) -> None: + """ + Record a timing observation in a Histogram instrument. + + Unlike a Gauge, a Histogram accumulates all observations so that count, sum, + and bucket distribution are preserved across multiple recordings. + + :param name: The name of the histogram to record. + :param value: The timing observation in milliseconds. + :param tags: Attributes to attach to the observation. + """ + if name not in self.map: + self.map[name] = InternalHistogram(meter=self.meter, name=name) + self.map[name].record(value, tags) + def flush_otel_metrics(): provider = metrics.get_meter_provider() diff --git a/shared/observability/tests/observability/metrics/test_otel_logger.py b/shared/observability/tests/observability/metrics/test_otel_logger.py index c27c372996968..d0089663009be 100644 --- a/shared/observability/tests/observability/metrics/test_otel_logger.py +++ b/shared/observability/tests/observability/metrics/test_otel_logger.py @@ -244,25 +244,28 @@ def test_timing_new_metric(self, name): self.stats.timing(name, dt=datetime.timedelta(seconds=123)) - self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name)) - expected_value = 123000.0 - assert self.map[full_name(name)].value == expected_value + self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name), unit="ms") + self.meter.get_meter().create_histogram.return_value.record.assert_called_once_with( + 123000.0, attributes=None + ) def test_timing_new_metric_with_tags(self, name): tags = {"hello": "world"} - key = _generate_key_name(full_name(name), tags) self.stats.timing(name, dt=1, tags=tags) - self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name)) - self.map[key].attributes == tags + self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name), unit="ms") + self.meter.get_meter().create_histogram.return_value.record.assert_called_once_with( + 1.0, attributes=tags + ) def test_timing_existing_metric(self, name): self.stats.timing(name, dt=1) self.stats.timing(name, dt=2) - self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name)) - assert self.map[full_name(name)].value == 2 + # histogram created only once, but both observations are recorded + self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name), unit="ms") + assert self.meter.get_meter().create_histogram.return_value.record.call_count == 2 # For the four test_timer_foo tests below: # time.perf_count() is called once to get the starting timestamp and again @@ -277,7 +280,7 @@ def test_timer_with_name_returns_float_and_stores_value(self, mock_time, name): expected_duration = 3140.0 assert timer.duration == expected_duration assert mock_time.call_count == 2 - self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name)) + self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name), unit="ms") @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14]) def test_timer_no_name_returns_float_but_does_not_store_value(self, mock_time, name): @@ -288,7 +291,7 @@ def test_timer_no_name_returns_float_but_does_not_store_value(self, mock_time, n expected_duration = 3140.0 assert timer.duration == expected_duration assert mock_time.call_count == 2 - self.meter.get_meter().create_gauge.assert_not_called() + self.meter.get_meter().create_histogram.assert_not_called() @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14]) def test_timer_start_and_stop_manually_send_false(self, mock_time, name): @@ -301,7 +304,7 @@ def test_timer_start_and_stop_manually_send_false(self, mock_time, name): expected_value = 3140.0 assert timer.duration == expected_value assert mock_time.call_count == 2 - self.meter.get_meter().create_gauge.assert_not_called() + self.meter.get_meter().create_histogram.assert_not_called() @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14]) def test_timer_start_and_stop_manually_send_true(self, mock_time, name): @@ -314,7 +317,7 @@ def test_timer_start_and_stop_manually_send_true(self, mock_time, name): expected_value = 3140.0 assert timer.duration == expected_value assert mock_time.call_count == 2 - self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name)) + self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name), unit="ms") @pytest.mark.parametrize( ( From 7720860cdbaa2ae4ba9377058be37f6e0c0e1588 Mon Sep 17 00:00:00 2001 From: Namrata Namrata Date: Thu, 2 Apr 2026 18:09:37 +0100 Subject: [PATCH 2/3] Use ExponentialBucketHistogramAggregation for timing metrics --- .../observability/metrics/otel_logger.py | 17 +++++++++++++++ .../observability/metrics/test_otel_logger.py | 21 +++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py index d855fbcdf2858..291e300383652 100644 --- a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py +++ b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py @@ -26,10 +26,12 @@ from opentelemetry import metrics from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics._internal.aggregation import ExponentialBucketHistogramAggregation from opentelemetry.sdk.metrics._internal.export import ( ConsoleMetricExporter, PeriodicExportingMetricReader, ) +from opentelemetry.sdk.metrics.view import View from opentelemetry.sdk.resources import SERVICE_NAME, Resource from ..common import get_otel_data_exporter @@ -428,6 +430,15 @@ def get_otel_logger( stat_name_handler: Callable[[str], str] | None = None, statsd_influxdb_enabled: bool = False, ) -> SafeOtelLogger: + """ + Build and return a :class:`SafeOtelLogger` backed by a configured :class:`MeterProvider`. + + Histogram instruments (used for ``timing()`` / ``timer()`` metrics) are aggregated with + :class:`~opentelemetry.sdk.metrics._internal.aggregation.ExponentialBucketHistogramAggregation` + so that bucket boundaries adapt automatically to the observed data range. This avoids + the need to hand-tune explicit bucket boundaries for metrics that span very different + scales (milliseconds to hours). + """ otel_env_config = load_metrics_env_config() effective_service_name: str = otel_env_config.service_name or service_name or "airflow" @@ -465,6 +476,12 @@ def get_otel_logger( MeterProvider( resource=resource, metric_readers=readers, + views=[ + View( + instrument_type=metrics.Histogram, + aggregation=ExponentialBucketHistogramAggregation(), + ) + ], shutdown_on_exit=False, ), ) diff --git a/shared/observability/tests/observability/metrics/test_otel_logger.py b/shared/observability/tests/observability/metrics/test_otel_logger.py index d0089663009be..6c42ab4dd2122 100644 --- a/shared/observability/tests/observability/metrics/test_otel_logger.py +++ b/shared/observability/tests/observability/metrics/test_otel_logger.py @@ -25,6 +25,8 @@ import pytest from opentelemetry.metrics import MeterProvider +from opentelemetry.sdk.metrics._internal.aggregation import ExponentialBucketHistogramAggregation +from opentelemetry.sdk.metrics.view import View from airflow_shared.observability.common import get_otel_data_exporter from airflow_shared.observability.exceptions import InvalidStatsNameException @@ -418,6 +420,18 @@ def test_config_priorities( == f"opentelemetry.exporter.otlp.proto.{expected_exporter_module}.metric_exporter" ) + @mock.patch("airflow_shared.observability.metrics.otel_logger.metrics") + @mock.patch("airflow_shared.observability.metrics.otel_logger.MeterProvider") + def test_get_otel_logger_uses_exponential_histogram_view(self, mock_provider, mock_metrics): + get_otel_logger(host="localhost", port=4318) + + call_kwargs = mock_provider.call_args.kwargs + views = call_kwargs["views"] + assert len(views) == 1 + view = views[0] + assert isinstance(view, View) + assert isinstance(view._aggregation, ExponentialBucketHistogramAggregation) + def test_atexit_flush_on_process_exit(self): """ Run a process that initializes a logger, creates a stat and then exits. @@ -425,8 +439,11 @@ def test_atexit_flush_on_process_exit(self): The logger initialization registers an atexit hook. Test that the hook runs and flushes the created stat at shutdown. """ - test_module_name = "tests.observability.metrics.test_otel_logger" - function_call_str = f"import {test_module_name} as m; m.mock_service_run()" + function_call_str = ( + "from airflow_shared.observability.metrics.otel_logger import get_otel_logger; " + "logger = get_otel_logger(debug=True); " + "logger.incr('my_test_stat')" + ) proc = subprocess.run( [sys.executable, "-c", function_call_str], From 876d1949c11453f53774607d0ce4fae67075eeb4 Mon Sep 17 00:00:00 2001 From: Namrata Namrata Date: Mon, 20 Apr 2026 13:29:41 +0100 Subject: [PATCH 3/3] Use public API import path for ExponentialBucketHistogramAggregation and fix histogram map isolation --- airflow-core/newsfragments/64207.significant.rst | 1 + .../observability/metrics/otel_logger.py | 15 ++++++++------- .../observability/metrics/test_otel_logger.py | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) create mode 100644 airflow-core/newsfragments/64207.significant.rst diff --git a/airflow-core/newsfragments/64207.significant.rst b/airflow-core/newsfragments/64207.significant.rst new file mode 100644 index 0000000000000..3254fa20a5410 --- /dev/null +++ b/airflow-core/newsfragments/64207.significant.rst @@ -0,0 +1 @@ +OTel timer and timing metrics now use Histogram instead of Gauge, preserving count, sum, and bucket distribution across recordings. diff --git a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py index 291e300383652..ec9ee581560e2 100644 --- a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py +++ b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py @@ -26,12 +26,11 @@ from opentelemetry import metrics from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics._internal.aggregation import ExponentialBucketHistogramAggregation from opentelemetry.sdk.metrics._internal.export import ( ConsoleMetricExporter, PeriodicExportingMetricReader, ) -from opentelemetry.sdk.metrics.view import View +from opentelemetry.sdk.metrics.view import ExponentialBucketHistogramAggregation, View from opentelemetry.sdk.resources import SERVICE_NAME, Resource from ..common import get_otel_data_exporter @@ -163,7 +162,7 @@ def __init__(self, otel_logger: SafeOtelLogger, name: str | None, tags: Attribut def stop(self, send: bool = True) -> None: super().stop(send) - if self.name and send and self.duration: + if self.name and send and self.duration is not None: self.otel_logger.metrics_map.record_histogram_value( full_name(prefix=self.otel_logger.prefix, name=self.name), self.duration, self.tags ) @@ -335,9 +334,11 @@ class MetricsMap: def __init__(self, meter): self.meter = meter self.map = {} + self.histograms: dict[str, InternalHistogram] = {} def clear(self) -> None: self.map.clear() + self.histograms.clear() def _create_counter(self, name): """Create a new counter or up_down_counter for the provided name.""" @@ -402,9 +403,9 @@ def record_histogram_value(self, name: str, value: float, tags: Attributes) -> N :param value: The timing observation in milliseconds. :param tags: Attributes to attach to the observation. """ - if name not in self.map: - self.map[name] = InternalHistogram(meter=self.meter, name=name) - self.map[name].record(value, tags) + if name not in self.histograms: + self.histograms[name] = InternalHistogram(meter=self.meter, name=name) + self.histograms[name].record(value, tags) def flush_otel_metrics(): @@ -434,7 +435,7 @@ def get_otel_logger( Build and return a :class:`SafeOtelLogger` backed by a configured :class:`MeterProvider`. Histogram instruments (used for ``timing()`` / ``timer()`` metrics) are aggregated with - :class:`~opentelemetry.sdk.metrics._internal.aggregation.ExponentialBucketHistogramAggregation` + :class:`~opentelemetry.sdk.metrics.view.ExponentialBucketHistogramAggregation` so that bucket boundaries adapt automatically to the observed data range. This avoids the need to hand-tune explicit bucket boundaries for metrics that span very different scales (milliseconds to hours). diff --git a/shared/observability/tests/observability/metrics/test_otel_logger.py b/shared/observability/tests/observability/metrics/test_otel_logger.py index 6c42ab4dd2122..faa45e2e09ea8 100644 --- a/shared/observability/tests/observability/metrics/test_otel_logger.py +++ b/shared/observability/tests/observability/metrics/test_otel_logger.py @@ -25,8 +25,7 @@ import pytest from opentelemetry.metrics import MeterProvider -from opentelemetry.sdk.metrics._internal.aggregation import ExponentialBucketHistogramAggregation -from opentelemetry.sdk.metrics.view import View +from opentelemetry.sdk.metrics.view import ExponentialBucketHistogramAggregation, View from airflow_shared.observability.common import get_otel_data_exporter from airflow_shared.observability.exceptions import InvalidStatsNameException