diff --git a/airflow-core/docs/administration-and-deployment/logging-monitoring/metrics.rst b/airflow-core/docs/administration-and-deployment/logging-monitoring/metrics.rst index 49f37ab18207f..d79849c3d29c6 100644 --- a/airflow-core/docs/administration-and-deployment/logging-monitoring/metrics.rst +++ b/airflow-core/docs/administration-and-deployment/logging-monitoring/metrics.rst @@ -115,6 +115,42 @@ You need to configure the SSL certificate and key within the OpenTelemetry colle cert_file: "/path/to/cert/cert.crt" key_file: "/path/to/key/key.pem" +Histogram Metrics and Backend Requirements +------------------------------------------ + +Airflow's timing metrics (``timing()`` / ``timer()``) are emitted as OpenTelemetry +histograms aggregated with +`exponential bucket histograms <https://opentelemetry.io/docs/specs/otel/metrics/sdk/#base2-exponential-bucket-histogram-aggregation>`_, +so bucket boundaries adapt automatically to the observed range and you do not have to +hand-tune explicit buckets for metrics that span very different scales (milliseconds to +hours). + +To ingest these correctly end-to-end, the metrics backend you connect to must support +OpenTelemetry exponential histograms and (for Prometheus) their conversion to native +histograms: + +* **OpenTelemetry Collector** — use ``opentelemetry-collector-contrib`` version 0.115.0 + or above. Older versions do not translate OTLP exponential histograms into Prometheus + native histograms. +* **Prometheus** — native histograms must be enabled explicitly, and how you do that + depends on the Prometheus version: + + * **2.40 to 3.8** — start Prometheus with the ``--enable-feature=native-histograms`` + flag. + * **3.8 and above** — set ``scrape_native_histograms: true`` in the scrape + configuration (this option was added in 3.8, and from 3.9 the feature flag is a + no-op so the config setting is required): + + .. code-block:: yaml + + global: + scrape_native_histograms: true + +If the backend does not support native histograms, exponential-histogram data points may +be dropped or rendered incorrectly. 
A reference stack (Collector, Prometheus, and Grafana) +wired up for local development is available via ``breeze start-airflow --integration otel``; +see the contributor docs for details. + Allow/Block Lists ----------------- diff --git a/scripts/ci/docker-compose/integration-otel.yml b/scripts/ci/docker-compose/integration-otel.yml index 8b98699fccbfd..fd7f252c3f1a3 100644 --- a/scripts/ci/docker-compose/integration-otel.yml +++ b/scripts/ci/docker-compose/integration-otel.yml @@ -17,7 +17,7 @@ --- services: otel-collector: - image: otel/opentelemetry-collector-contrib:0.70.0 + image: otel/opentelemetry-collector-contrib:0.155.0 labels: breeze.description: "Integration required for OTEL/opentelemetry hooks." container_name: "breeze-otel-collector" @@ -29,9 +29,12 @@ services: - "28889:8889" # Prometheus exporter metrics prometheus: - image: prom/prometheus + image: prom/prometheus:v3.5.4 container_name: "breeze-prometheus" user: "0" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--enable-feature=native-histograms" ports: - "29090:9090" volumes: diff --git a/scripts/ci/docker-compose/otel-collector-config.yml b/scripts/ci/docker-compose/otel-collector-config.yml index 9c6ede5f50ca6..b22c07f736214 100644 --- a/scripts/ci/docker-compose/otel-collector-config.yml +++ b/scripts/ci/docker-compose/otel-collector-config.yml @@ -22,6 +22,7 @@ receivers: otlp: protocols: http: + endpoint: 0.0.0.0:4318 processors: batch: @@ -32,7 +33,7 @@ exporters: tls: insecure: true - logging: + debug: verbosity: detailed prometheus: endpoint: 0.0.0.0:8889 @@ -44,9 +45,9 @@ service: traces: receivers: [otlp] processors: [batch] - exporters: [logging, otlp/jaeger] + exporters: [debug, otlp/jaeger] metrics: receivers: [otlp] processors: [batch] - exporters: [logging, prometheus] + exporters: [debug, prometheus]