From 45cc8a9a3e9840607df75ea09296ceda07ff8298 Mon Sep 17 00:00:00 2001 From: Sean Ghaeli Date: Tue, 23 Jun 2026 17:05:32 +0000 Subject: [PATCH] Revert "Fetch deadline callback context via Execution API at runtime (#66608)" This reverts commit 8095abb57188f43305d9561f0f809d5e7900d6e5. --- airflow-core/docs/howto/deadline-alerts.rst | 27 +- .../core_api/datamodels/ui/deadline.py | 38 +- .../core_api/openapi/_private_ui.yaml | 14 +- .../core_api/routes/ui/deadlines.py | 10 +- .../airflow/api_fastapi/execution_api/app.py | 3 +- .../execution_api/routes/connections.py | 15 +- .../execution_api/routes/dag_runs.py | 7 +- .../execution_api/routes/variables.py | 27 +- .../api_fastapi/execution_api/security.py | 27 +- .../src/airflow/dag_processing/dagbag.py | 29 - .../src/airflow/executors/base_executor.py | 4 - .../src/airflow/executors/local_executor.py | 16 +- .../airflow/executors/workloads/callback.py | 11 - .../src/airflow/jobs/scheduler_job_runner.py | 100 +-- .../src/airflow/jobs/triggerer_job_runner.py | 188 +----- ..._3_3_0_change_deadline_interval_to_json.py | 84 +-- airflow-core/src/airflow/models/callback.py | 43 +- airflow-core/src/airflow/models/deadline.py | 90 +-- .../src/airflow/models/deadline_alert.py | 15 +- airflow-core/src/airflow/models/trigger.py | 31 +- .../src/airflow/serialization/decoders.py | 11 +- .../airflow/serialization/definitions/dag.py | 141 ++--- .../serialization/definitions/deadline.py | 27 +- airflow-core/src/airflow/triggers/callback.py | 94 +-- .../ui/openapi-gen/queries/ensureQueryData.ts | 2 +- .../ui/openapi-gen/queries/prefetch.ts | 2 +- .../airflow/ui/openapi-gen/queries/queries.ts | 2 +- .../ui/openapi-gen/queries/suspense.ts | 2 +- .../ui/openapi-gen/requests/schemas.gen.ts | 13 +- .../ui/openapi-gen/requests/services.gen.ts | 2 +- .../ui/openapi-gen/requests/types.gen.ts | 6 +- .../ui/public/i18n/locales/en/dag.json | 4 +- .../ui/src/pages/Dag/DeadlineAlertsBadge.tsx | 13 +- .../ui/src/pages/Dag/Overview/DeadlineRow.tsx | 20 +- .../ui/src/pages/Run/DeadlineStatus.tsx | 31 +- .../ui/src/pages/Run/DeadlineStatusModal.tsx | 23 +- .../core_api/routes/ui/test_deadlines.py | 83 +-- .../versions/head/test_task_instances.py | 8 +- .../tests/unit/dag_processing/test_dagbag.py | 40 -- .../unit/executors/test_local_executor.py | 4 - ...test_build_trigger_workloads_resilience.py | 143 ----- ...t_enqueue_executor_callbacks_resilience.py | 174 ------ .../tests/unit/jobs/test_triggerer_job.py | 100 +-- ...t_0117_deadline_interval_json_migration.py | 140 ++--- .../tests/unit/models/test_callback.py | 152 +---- airflow-core/tests/unit/models/test_dagrun.py | 156 ++--- .../tests/unit/models/test_deadline.py | 47 +- .../tests/unit/models/test_deadline_alert.py | 31 - .../test_deadline_callback_adversarial.py | 137 ---- ..._deadline_callback_context_construction.py | 55 -- .../test_deadline_handle_miss_reentrancy.py | 102 --- .../test_deadline_scheduler_resilience.py | 149 ----- .../tests/unit/models/test_prune_deadlines.py | 134 ---- .../tests/unit/models/test_trigger.py | 33 - .../test_deadline_reference_registry.py | 82 --- .../tests/unit/triggers/test_callback.py | 167 +---- .../airflow_shared/module_loading/__init__.py | 40 -- .../metrics/metrics_template.yaml | 12 - .../template_rendering/__init__.py | 30 - task-sdk/pyproject.toml | 1 - .../src/airflow/sdk/definitions/callback.py | 18 +- .../src/airflow/sdk/definitions/deadline.py | 51 +- .../sdk/execution_time/callback_supervisor.py | 188 +----- .../src/airflow/sdk/execution_time/context.py | 43 -- .../task_sdk/definitions/test_callback.py | 27 - .../task_sdk/definitions/test_deadline.py | 88 +-- .../test_deadline_alert_validation.py | 63 -- .../test_build_context_field_edge_cases.py | 100 --- .../test_build_context_from_dag_run.py | 129 ---- .../test_callback_supervisor.py | 591 +++++++----------- .../test_callback_supervisor_io_faults.py | 61 -- 71 files changed, 584 insertions(+), 3967 deletions(-) delete mode 100644 airflow-core/tests/unit/jobs/test_build_trigger_workloads_resilience.py delete mode 100644 airflow-core/tests/unit/jobs/test_enqueue_executor_callbacks_resilience.py delete mode 100644 airflow-core/tests/unit/models/test_deadline_callback_adversarial.py delete mode 100644 airflow-core/tests/unit/models/test_deadline_callback_context_construction.py delete mode 100644 airflow-core/tests/unit/models/test_deadline_handle_miss_reentrancy.py delete mode 100644 airflow-core/tests/unit/models/test_deadline_scheduler_resilience.py delete mode 100644 airflow-core/tests/unit/models/test_prune_deadlines.py delete mode 100644 task-sdk/tests/task_sdk/definitions/test_deadline_alert_validation.py delete mode 100644 task-sdk/tests/task_sdk/execution_time/test_build_context_field_edge_cases.py delete mode 100644 task-sdk/tests/task_sdk/execution_time/test_build_context_from_dag_run.py delete mode 100644 task-sdk/tests/task_sdk/execution_time/test_callback_supervisor_io_faults.py diff --git a/airflow-core/docs/howto/deadline-alerts.rst b/airflow-core/docs/howto/deadline-alerts.rst index 8fdfc3cc095a7..e36908009a0f1 100644 --- a/airflow-core/docs/howto/deadline-alerts.rst +++ b/airflow-core/docs/howto/deadline-alerts.rst @@ -58,7 +58,7 @@ Below is an example Dag implementation. If the Dag has not finished 15 minutes a .. code-block:: python - from datetime import datetime, timedelta, timezone + from datetime import datetime, timedelta from airflow.sdk import AsyncCallback, DAG, DeadlineAlert, DeadlineReference from airflow.providers.slack.notifications.slack_webhook import SlackWebhookNotifier from airflow.providers.standard.operators.empty import EmptyOperator @@ -165,9 +165,7 @@ Here's an example using a fixed datetime: .. code-block:: python - tomorrow_at_ten = datetime.combine( - datetime.now().date() + timedelta(days=1), time(10, 0), tzinfo=timezone.utc - ) + tomorrow_at_ten = datetime.combine(datetime.now().date() + timedelta(days=1), time(10, 0)) with DAG( dag_id="fixed_deadline_alert", @@ -367,19 +365,12 @@ A **custom asynchronous callback** might look like this: Templating and Context ^^^^^^^^^^^^^^^^^^^^^^ -A relatively simple version of the Airflow context is passed to callables, and Airflow runs -:ref:`concepts:jinja-templating` on string-valued callback ``kwargs`` using that context. String -kwargs that contain ``{{ ... }}`` are rendered before the callback runs; non-string kwargs and -strings without template markers are passed through untouched, and a template that fails to render -falls back to its raw value (logged at warning) rather than failing the callback. Templating works -identically on both the synchronous (executor) and asynchronous (triggerer) callback paths. - -The variables available for templating are those in the simplified context: the ID and the -calculated deadline time of the Deadline Alert (``{{ deadline.id }}``, ``{{ deadline.deadline_time }}``), -plus the Dag Run fields included in the ``GET`` REST API response for Dag Run (e.g. -``{{ dag_run.run_id }}``, ``{{ run_id }}``, ``{{ logical_date }}``, ``{{ ds }}``, ``{{ ts }}``). -Notifiers continue to run their own templating as part of their execution. Support for a more -comprehensive context will be added in future versions. +Currently, a relatively simple version of the Airflow context is passed to callables and Airflow does not run +:ref:`concepts:jinja-templating` on the kwargs. However, Notifiers already run templating with the +provided context as part of their execution. This means that templating can be used when using a Notifier +as long as the variables being templated are included in the simplified context. This currently includes the +ID and the calculated deadline time of the Deadline Alert as well as the data included in the ``GET`` REST API +response for Dag Run. Support for more comprehensive context and templating will be added in future versions. Deadline Calculation ^^^^^^^^^^^^^^^^^^^^ @@ -392,7 +383,7 @@ In the following examples, ``notify_team`` is either a SyncCallback or AsyncCall .. code-block:: python - next_meeting = datetime(2025, 6, 26, 9, 30, tzinfo=timezone.utc) + next_meeting = datetime(2025, 6, 26, 9, 30) DeadlineAlert( reference=DeadlineReference.FIXED_DATETIME(next_meeting), diff --git a/airflow-core/src/airflow/api_fastapi/core_api/datamodels/ui/deadline.py b/airflow-core/src/airflow/api_fastapi/core_api/datamodels/ui/deadline.py index 334d3523ac3e3..6f9402f23603d 100644 --- a/airflow-core/src/airflow/api_fastapi/core_api/datamodels/ui/deadline.py +++ b/airflow-core/src/airflow/api_fastapi/core_api/datamodels/ui/deadline.py @@ -19,10 +19,9 @@ from collections.abc import Iterable from datetime import datetime -from typing import Any from uuid import UUID -from pydantic import AliasPath, Field, field_validator +from pydantic import AliasPath, Field from airflow.api_fastapi.core_api.base import BaseModel @@ -53,42 +52,9 @@ class DeadlineAlertResponse(BaseModel): id: UUID name: str | None = None reference_type: str = Field(validation_alias=AliasPath("reference", "reference_type")) - interval: float | None = Field( - default=None, - description=( - "Interval in seconds between the reference time and the deadline. " - "Null for a dynamic interval (e.g. a VariableInterval) whose value is " - "only resolved at scheduler evaluation time." - ), - ) + interval: float = Field(description="Interval in seconds between deadline evaluations.") created_at: datetime - @field_validator("interval", mode="before") - @classmethod - def coerce_interval_to_seconds(cls, value: Any) -> float | None: - """ - Coerce the stored ``interval`` into seconds. - - ``DeadlineAlert.interval`` is a JSON column holding the Airflow-serialized form - of the SDK interval, not a plain number. A fixed ``timedelta`` serializes to - ``{"__classname__": "datetime.timedelta", "__data__": }`` and a dynamic - ``VariableInterval`` to ``{"__classname__": ".../VariableInterval", "__data__": {...}}``. - Without this coercion Pydantic cannot turn that dict into ``float`` and the - ``/ui/dags/{dag_id}/deadlineAlerts`` endpoint raises a 500, which breaks the - run-page deadline status badge. Return the seconds for a fixed interval, or - ``None`` for a dynamic one (resolved later by the scheduler). - """ - if value is None or isinstance(value, (int, float)): - return value - if isinstance(value, dict): - data = value.get("__data__") - # Fixed timedelta: __data__ is the total seconds as a number. - if isinstance(data, (int, float)): - return float(data) - # Dynamic interval (e.g. VariableInterval): no fixed seconds to report. - return None - return None - class DeadlineAlertCollectionResponse(BaseModel): """DeadlineAlert Collection serializer for responses.""" diff --git a/airflow-core/src/airflow/api_fastapi/core_api/openapi/_private_ui.yaml b/airflow-core/src/airflow/api_fastapi/core_api/openapi/_private_ui.yaml index d7f20f888f9ad..d786e259b4e77 100644 --- a/airflow-core/src/airflow/api_fastapi/core_api/openapi/_private_ui.yaml +++ b/airflow-core/src/airflow/api_fastapi/core_api/openapi/_private_ui.yaml @@ -894,12 +894,13 @@ paths: type: string description: 'Attributes to order by, multi criteria sort is supported. Prefix with `-` for descending order. Supported attributes: `id, created_at, - name`' + name, interval`' default: - created_at title: Order By description: 'Attributes to order by, multi criteria sort is supported. Prefix - with `-` for descending order. Supported attributes: `id, created_at, name`' + with `-` for descending order. Supported attributes: `id, created_at, name, + interval`' responses: '200': description: Successful Response @@ -2514,13 +2515,9 @@ components: type: string title: Reference Type interval: - anyOf: - - type: number - - type: 'null' + type: number title: Interval - description: Interval in seconds between the reference time and the deadline. - Null for a dynamic interval (e.g. a VariableInterval) whose value is only - resolved at scheduler evaluation time. + description: Interval in seconds between deadline evaluations. created_at: type: string format: date-time @@ -2529,6 +2526,7 @@ components: required: - id - reference_type + - interval - created_at title: DeadlineAlertResponse description: DeadlineAlert serializer for responses. diff --git a/airflow-core/src/airflow/api_fastapi/core_api/routes/ui/deadlines.py b/airflow-core/src/airflow/api_fastapi/core_api/routes/ui/deadlines.py index 1f7076e554e01..d9cfea10d6d94 100644 --- a/airflow-core/src/airflow/api_fastapi/core_api/routes/ui/deadlines.py +++ b/airflow-core/src/airflow/api_fastapi/core_api/routes/ui/deadlines.py @@ -165,16 +165,8 @@ def get_dag_deadline_alerts( order_by: Annotated[ SortParam, Depends( - # NOTE: ``interval`` is intentionally NOT a sortable key. ``DeadlineAlert.interval`` is a - # JSON column holding the Airflow-serialized interval — a dict such as - # ``{"__classname__": "datetime.timedelta", "__data__": 300.0}`` for a fixed interval, or a - # structurally different dict for a ``VariableInterval``. Ordering by it at the DB level - # sorts by the JSON text/structure, not the duration, so the result is arbitrary and - # misleading (e.g. a dynamic VariableInterval sorts before/after fixed intervals by shape, - # and "300" vs "3600" compare lexicographically). Meaningful sorting would need a computed - # seconds column. Allow only columns that sort correctly. SortParam( - ["id", "created_at", "name"], + ["id", "created_at", "name", "interval"], DeadlineAlert, ).dynamic_depends(default="created_at") ), diff --git a/airflow-core/src/airflow/api_fastapi/execution_api/app.py b/airflow-core/src/airflow/api_fastapi/execution_api/app.py index c800a319e2b78..449019db7998d 100644 --- a/airflow-core/src/airflow/api_fastapi/execution_api/app.py +++ b/airflow-core/src/airflow/api_fastapi/execution_api/app.py @@ -386,7 +386,7 @@ def app(self): from airflow.api_fastapi.execution_api.routes.connections import has_connection_access from airflow.api_fastapi.execution_api.routes.variables import has_variable_access from airflow.api_fastapi.execution_api.routes.xcoms import has_xcom_access - from airflow.api_fastapi.execution_api.security import _jwt_bearer, require_auth + from airflow.api_fastapi.execution_api.security import _jwt_bearer self._app = create_task_execution_api_app() @@ -403,7 +403,6 @@ async def always_allow(request: Request): return TIToken(id=ti_id, claims=claims) self._app.dependency_overrides[_jwt_bearer] = always_allow - self._app.dependency_overrides[require_auth] = always_allow self._app.dependency_overrides[has_connection_access] = always_allow self._app.dependency_overrides[has_variable_access] = always_allow self._app.dependency_overrides[has_xcom_access] = always_allow diff --git a/airflow-core/src/airflow/api_fastapi/execution_api/routes/connections.py b/airflow-core/src/airflow/api_fastapi/execution_api/routes/connections.py index d0e78c086f717..8289dcf97fb8a 100644 --- a/airflow-core/src/airflow/api_fastapi/execution_api/routes/connections.py +++ b/airflow-core/src/airflow/api_fastapi/execution_api/routes/connections.py @@ -20,15 +20,10 @@ import logging from typing import Annotated -from fastapi import APIRouter, Depends, HTTPException, Path, Security, status +from fastapi import APIRouter, Depends, HTTPException, Path, status from airflow.api_fastapi.execution_api.datamodels.connection import ConnectionResponse -from airflow.api_fastapi.execution_api.security import ( - CurrentTIToken, - ExecutionAPIRoute, - get_team_name_dep, - require_auth, -) +from airflow.api_fastapi.execution_api.security import CurrentTIToken, get_team_name_dep from airflow.exceptions import AirflowNotFoundException from airflow.models.connection import Connection @@ -54,8 +49,8 @@ async def has_connection_access( router = APIRouter( - route_class=ExecutionAPIRoute, responses={status.HTTP_404_NOT_FOUND: {"description": "Connection not found"}}, + dependencies=[Depends(has_connection_access)], ) log = logging.getLogger(__name__) @@ -63,10 +58,6 @@ async def has_connection_access( @router.get( "/{connection_id}", - dependencies=[ - Security(require_auth, scopes=["token:execution", "token:workload"]), - Depends(has_connection_access), - ], responses={ status.HTTP_401_UNAUTHORIZED: {"description": "Unauthorized"}, status.HTTP_403_FORBIDDEN: {"description": "Task does not have access to the connection"}, diff --git a/airflow-core/src/airflow/api_fastapi/execution_api/routes/dag_runs.py b/airflow-core/src/airflow/api_fastapi/execution_api/routes/dag_runs.py index 6c92a7a00dea3..31aeb96c63e72 100644 --- a/airflow-core/src/airflow/api_fastapi/execution_api/routes/dag_runs.py +++ b/airflow-core/src/airflow/api_fastapi/execution_api/routes/dag_runs.py @@ -21,7 +21,7 @@ from typing import Annotated from cadwyn import VersionedAPIRouter -from fastapi import HTTPException, Query, Security, status +from fastapi import HTTPException, Query, status from sqlalchemy import func, select from sqlalchemy.exc import NoResultFound @@ -33,7 +33,7 @@ from airflow.api_fastapi.execution_api.datamodels.dagrun import DagRunStateResponse, TriggerDAGRunPayload from airflow.api_fastapi.execution_api.datamodels.taskinstance import DagRun from airflow.api_fastapi.execution_api.datamodels.token import TIToken -from airflow.api_fastapi.execution_api.security import CurrentTIToken, ExecutionAPIRoute, require_auth +from airflow.api_fastapi.execution_api.security import CurrentTIToken from airflow.exceptions import DagNotPartitionedError, DagRunAlreadyExists, InvalidPartitionKeyError from airflow.models.dag import DagModel from airflow.models.dagrun import DagRun as DagRunModel @@ -41,7 +41,7 @@ from airflow.utils.state import DagRunState from airflow.utils.types import DagRunTriggeredByType, DagRunType -router = VersionedAPIRouter(route_class=ExecutionAPIRoute) +router = VersionedAPIRouter() log = logging.getLogger(__name__) @@ -66,7 +66,6 @@ def get_previous_dagrun_compat( @router.get( "/{dag_id}/{run_id}", - dependencies=[Security(require_auth, scopes=["token:execution", "token:workload"])], responses={status.HTTP_404_NOT_FOUND: {"description": "Dag run not found"}}, ) def get_dag_run(dag_id: str, run_id: str, session: SessionDep) -> DagRun: diff --git a/airflow-core/src/airflow/api_fastapi/execution_api/routes/variables.py b/airflow-core/src/airflow/api_fastapi/execution_api/routes/variables.py index 43613921d2eda..631ad35ded17f 100644 --- a/airflow-core/src/airflow/api_fastapi/execution_api/routes/variables.py +++ b/airflow-core/src/airflow/api_fastapi/execution_api/routes/variables.py @@ -20,7 +20,7 @@ import logging from typing import Annotated -from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request, Security, status +from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request, status from sqlalchemy import func, select from airflow.api_fastapi.common.db.common import SessionDep @@ -29,12 +29,7 @@ VariablePostBody, VariableResponse, ) -from airflow.api_fastapi.execution_api.security import ( - CurrentTIToken, - ExecutionAPIRoute, - get_team_name_dep, - require_auth, -) +from airflow.api_fastapi.execution_api.security import CurrentTIToken, get_team_name_dep from airflow.models.variable import Variable @@ -62,7 +57,7 @@ async def has_variable_access( return True -router = APIRouter(route_class=ExecutionAPIRoute) +router = APIRouter() log = logging.getLogger(__name__) @@ -73,7 +68,6 @@ async def has_variable_access( # it requires a variable_key path parameter that /keys does not have. @router.get( "/keys", - dependencies=[Security(require_auth, scopes=["token:execution", "token:workload"])], responses={ status.HTTP_401_UNAUTHORIZED: {"description": "Unauthorized"}, }, @@ -109,10 +103,7 @@ def get_variable_keys( @router.get( "/{variable_key:path}", - dependencies=[ - Security(require_auth, scopes=["token:execution", "token:workload"]), - Depends(has_variable_access), - ], + dependencies=[Depends(has_variable_access)], responses={ status.HTTP_404_NOT_FOUND: {"description": "Variable not found"}, status.HTTP_401_UNAUTHORIZED: {"description": "Unauthorized"}, @@ -140,10 +131,7 @@ def get_variable( @router.put( "/{variable_key:path}", - dependencies=[ - Security(require_auth, scopes=["token:execution"]), - Depends(has_variable_access), - ], + dependencies=[Depends(has_variable_access)], status_code=status.HTTP_201_CREATED, responses={ status.HTTP_404_NOT_FOUND: {"description": "Variable not found"}, @@ -163,10 +151,7 @@ def put_variable( @router.delete( "/{variable_key:path}", - dependencies=[ - Security(require_auth, scopes=["token:execution"]), - Depends(has_variable_access), - ], + dependencies=[Depends(has_variable_access)], status_code=status.HTTP_204_NO_CONTENT, responses={ status.HTTP_404_NOT_FOUND: {"description": "Variable not found"}, diff --git a/airflow-core/src/airflow/api_fastapi/execution_api/security.py b/airflow-core/src/airflow/api_fastapi/execution_api/security.py index 9ebcb6bfd683b..9de3493061f32 100644 --- a/airflow-core/src/airflow/api_fastapi/execution_api/security.py +++ b/airflow-core/src/airflow/api_fastapi/execution_api/security.py @@ -234,7 +234,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: async def get_team_name_dep(token=CurrentTIToken) -> str | None: - """Return the team name associated to the task or callback (if any).""" + """Return the team name associated to the task (if any).""" from airflow.configuration import conf if not conf.getboolean("core", "multi_team"): @@ -243,12 +243,7 @@ async def get_team_name_dep(token=CurrentTIToken) -> str | None: from airflow.utils.session import create_session_async async with create_session_async() as session: - team_name = await session.scalar(_team_name_for_ti_stmt(token.id)) - if team_name is not None: - return team_name - # Workload tokens use the callback UUID as sub; fall back to the - # Callback → dag_id → Team path for deadline callback subprocesses. - return await session.scalar(_team_name_for_callback_stmt(token.id)) + return await session.scalar(_team_name_for_ti_stmt(token.id)) def get_team_name_for_ti(ti_id, session) -> str | None: @@ -294,21 +289,3 @@ def _team_name_for_dag_stmt(dag_id): .join(DagBundleModel.teams) .where(DagModel.dag_id == dag_id) ) - - -def _team_name_for_callback_stmt(callback_id): - """Build the select statement resolving ``Callback.id -> dag_id -> Team.name``.""" - from airflow.models import DagModel - from airflow.models.callback import Callback - from airflow.models.dagbundle import DagBundleModel - from airflow.models.team import Team - - # Callbacks store dag_id as a JSON key in data; join via the dag_id value. - return ( - select(Team.name) - .select_from(Callback) - .join(DagModel, DagModel.dag_id == Callback.data["dag_id"].as_string()) - .join(DagBundleModel, DagBundleModel.name == DagModel.bundle_name) - .join(DagBundleModel.teams) - .where(Callback.id == callback_id) - ) diff --git a/airflow-core/src/airflow/dag_processing/dagbag.py b/airflow-core/src/airflow/dag_processing/dagbag.py index c626b18782b6a..34684810977c7 100644 --- a/airflow-core/src/airflow/dag_processing/dagbag.py +++ b/airflow-core/src/airflow/dag_processing/dagbag.py @@ -160,35 +160,6 @@ def _validate_executor_fields(dag: DAG, bundle_name: str | None = None) -> None: "executor to use one of the configured executors." ) - # Validate executors on deadline (SLA-style) callbacks too. A SyncCallback may pin an - # executor; if it names one that doesn't exist, the scheduler can't route the queued - # ExecutorCallback — it would sit PENDING forever, re-warning every loop. Catch it here at - # parse time (like tasks) so the author gets immediate, actionable feedback. AsyncCallbacks - # run on the triggerer and have no executor field, so only SyncCallbacks are checked. - deadlines = dag.deadline or [] - if not isinstance(deadlines, list): - deadlines = [deadlines] - for deadline_alert in deadlines: - callback = getattr(deadline_alert, "callback", None) - callback_executor = getattr(callback, "executor", None) - if not callback_executor: - continue - - if not _executor_exists(callback_executor, dag_team_name): - if dag_team_name: - raise UnknownExecutorException( - f"A deadline callback on DAG '{dag.dag_id}' specifies executor '{callback_executor}', " - f"which is not available for team '{dag_team_name}' or as a global executor. Make sure " - f"'{callback_executor}' is configured for team '{dag_team_name}' or globally in your " - "[core] executors configuration, or update the callback's executor to use one of the " - f"configured executors for team '{dag_team_name}' or available global executors." - ) - raise UnknownExecutorException( - f"A deadline callback on DAG '{dag.dag_id}' specifies executor '{callback_executor}', " - "which is not available. Make sure it is listed in your [core] executors configuration, " - "or update the callback's executor to use one of the configured executors." - ) - class DagBag(LoggingMixin): """ diff --git a/airflow-core/src/airflow/executors/base_executor.py b/airflow-core/src/airflow/executors/base_executor.py index 24fcb5b734c01..ef5772331108f 100644 --- a/airflow-core/src/airflow/executors/base_executor.py +++ b/airflow-core/src/airflow/executors/base_executor.py @@ -719,10 +719,6 @@ def run_workload( callback_path=workload.callback.data.get("path", ""), callback_kwargs=workload.callback.data.get("kwargs", {}), dag_rel_path=workload.dag_rel_path, - dag_id=workload.callback.data.get("dag_id"), - run_id=workload.callback.data.get("run_id"), - deadline_id=workload.callback.data.get("deadline_id"), - deadline_time=workload.callback.data.get("deadline_time"), log_path=workload.log_path, bundle_info=workload.bundle_info, token=workload.token, diff --git a/airflow-core/src/airflow/executors/local_executor.py b/airflow-core/src/airflow/executors/local_executor.py index 8917d72bbbe22..a464304240936 100644 --- a/airflow-core/src/airflow/executors/local_executor.py +++ b/airflow-core/src/airflow/executors/local_executor.py @@ -107,20 +107,8 @@ def _run_worker( ) output.put((workload.key, workload.success_state, None)) except Exception as e: - # A transient callback context-fetch failure should requeue (not terminally fail), so - # the scheduler retries it next loop — mirroring the triggerer path. The workload - # exposes ``retry_state`` (PENDING for callbacks) for exactly this case. - from airflow.sdk.execution_time.callback_supervisor import ( # noqa: SDK001 - CallbackContextFetchError, - ) - - retry_state = getattr(workload, "retry_state", None) - if isinstance(e, CallbackContextFetchError) and retry_state is not None: - log.warning("Callback context fetch failed transiently; requeueing for retry.") - output.put((workload.key, retry_state, e)) - else: - log.exception("Workload execution failed.", workload_type=type(workload).__name__) - output.put((workload.key, workload.failure_state, e)) + log.exception("Workload execution failed.", workload_type=type(workload).__name__) + output.put((workload.key, workload.failure_state, e)) class LocalExecutor(BaseExecutor): diff --git a/airflow-core/src/airflow/executors/workloads/callback.py b/airflow-core/src/airflow/executors/workloads/callback.py index 3bacb6bb93e2e..04f26b8e787c1 100644 --- a/airflow-core/src/airflow/executors/workloads/callback.py +++ b/airflow-core/src/airflow/executors/workloads/callback.py @@ -103,17 +103,6 @@ def failure_state(self) -> CallbackState: def running_state(self) -> CallbackState: return CallbackState.RUNNING - @property - def retry_state(self) -> CallbackState: - """ - State to report on a transient (retryable) failure, e.g. a context-fetch blip. - - PENDING is non-terminal, so the scheduler's PENDING-callbacks query re-picks the callback - on the next loop and retries it — instead of the terminal FAILED a real callback error - produces. Mirrors the triggerer path's re-evaluate-on-next-loop behavior. - """ - return CallbackState.PENDING - @classmethod def make( cls, diff --git a/airflow-core/src/airflow/jobs/scheduler_job_runner.py b/airflow-core/src/airflow/jobs/scheduler_job_runner.py index 620df2fd9f6e6..2f94d480eb6d1 100644 --- a/airflow-core/src/airflow/jobs/scheduler_job_runner.py +++ b/airflow-core/src/airflow/jobs/scheduler_job_runner.py @@ -1175,32 +1175,12 @@ def _enqueue_executor_callbacks(self, session: Session) -> None: self.log.debug("No available slots for callbacks; all executors at capacity") return - pending_callbacks_query = ( + pending_callbacks = session.scalars( select(ExecutorCallback) .where(ExecutorCallback.type == CallbackType.EXECUTOR) .where(ExecutorCallback.state == CallbackState.PENDING) - # Stable FIFO tiebreaker after priority: created_at then id. Without a secondary - # sort key, equal-priority callbacks (all deadline callbacks default to - # priority_weight=1) come back in DB-dependent order, so when the PENDING backlog - # exceeds max_callbacks the same subset can be picked every loop and the rest - # starve indefinitely. Ordering oldest-first guarantees forward progress for every - # callback, mirroring the task path's (-priority, logical_date, map_index) ordering. - .order_by( - ExecutorCallback.priority_weight.desc(), - ExecutorCallback.created_at, - ExecutorCallback.id, - ) + .order_by(ExecutorCallback.priority_weight.desc()) .limit(max_callbacks) - ) - - # Lock the selected rows FOR UPDATE SKIP LOCKED so concurrent HA scheduler replicas do - # not both select the same PENDING callbacks and enqueue them to their executors twice - # (a double-execution of the deadline callback). The ``state = QUEUED`` write alone is - # not enough: by the time the conflicting write is resolved at commit, both schedulers - # have already called ``executor.queue_workload`` (the side effect). This mirrors the - # task-instance critical section and the deadline-selection query in _run_scheduler_loop. - pending_callbacks = session.scalars( - with_row_locks(pending_callbacks_query, of=ExecutorCallback, session=session, skip_locked=True) ).all() if not pending_callbacks: @@ -1231,50 +1211,22 @@ def _enqueue_executor_callbacks(self, session: Session) -> None: dag_run = session.get(DagRun, dag_run_id) if dag_run is None: - # The DagRun is gone (deleted). Deletion is permanent — the executor - # callback can never run because its execution context is built from the - # DagRun (see _fetch_and_build_context). Leaving it PENDING would retry it - # every scheduler loop forever (a zombie callback + perpetual log spam), so - # terminalize it as FAILED instead. (TriggererCallbacks differ: they have a - # standalone Trigger that can still fire and terminalize them, so they are - # intentionally left to survive a DagRun deletion.) self.log.warning( - "DagRun id=%s for ExecutorCallback %s no longer exists (deleted); " - "marking the callback FAILED — it cannot run without its DagRun context.", + "Could not find DagRun with id=%s for callback %s. DagRun may have been deleted.", dag_run_id, callback.id, ) - callback.state = CallbackState.FAILED - session.add(callback) - stats.incr( - "deadline_alerts.callback_orphaned_dagrun_deleted", - tags={"dag_id": callback.data.get("dag_id", "")}, - ) continue - # Isolate each callback: a single bad ExecutorCallback (e.g. its workload - # fails to build, or the executor's queue_workload raises) must not crash - # the scheduler loop or prevent the remaining callbacks from being enqueued. - # The SAVEPOINT keeps the already-enqueued callbacks in this batch intact - # when one rolls back, and the callback is left in PENDING for retry next - # loop. This mirrors the per-deadline isolation in the handle_miss loop. - try: - with session.begin_nested(): - workload = workloads.ExecuteCallback.make( - callback=callback, - dag_run=dag_run, - generator=executor.jwt_generator, - ) + workload = workloads.ExecuteCallback.make( + callback=callback, + dag_run=dag_run, + generator=executor.jwt_generator, + ) - executor.queue_workload(workload, session=session) - callback.state = CallbackState.QUEUED - session.add(callback) - except Exception: - self.log.exception( - "Failed to enqueue ExecutorCallback %s to executor %s; leaving it PENDING for retry", - callback.id, - executor, - ) + executor.queue_workload(workload, session=session) + callback.state = CallbackState.QUEUED + session.add(callback) @staticmethod def _process_task_event_logs(log_records: deque[Log], session: Session): @@ -1369,12 +1321,7 @@ def process_executor_events( cls.logger().debug("Draining executor event with state %s for connection test %s", state, key) elif isinstance(key, CallbackKey): cls.logger().info("Received executor event with state %s for callback %s", state, key) - if state in ( - CallbackState.RUNNING, - CallbackState.FAILED, - CallbackState.SUCCESS, - CallbackState.PENDING, - ): + if state in (CallbackState.RUNNING, CallbackState.FAILED, CallbackState.SUCCESS): callback_keys_with_events.append(key) else: cls.logger().error("Unknown workload key type in event buffer: %r", key) @@ -1401,16 +1348,6 @@ def process_executor_events( callback.state = CallbackState.FAILED callback.output = str(info) if info else "Execution failed" cls.logger().error("Callback %s failed: %s", callback_id, callback.output) - elif state == CallbackState.PENDING: - # Transient failure (e.g. the executor callback could not fetch its DagRun - # context — an API blip / network partition / token expiry). Reset to PENDING so - # the next scheduler loop re-picks it from the PENDING-callbacks query and retries, - # rather than terminally failing on a recoverable error. This mirrors the triggerer - # path, which re-evaluates a callback trigger on the next loop when the fetch fails. - callback.state = CallbackState.PENDING - cls.logger().warning( - "Callback %s hit a transient failure; requeueing for retry: %s", callback_id, info - ) session.add(callback) # Return if no finished tasks @@ -1877,18 +1814,7 @@ def _run_scheduler_loop(self) -> None: key_share=False, ) ): - # Isolate each deadline: a single bad deadline (e.g. its DagRun was - # deleted between selection and handling, or its callback fails to - # queue) must not crash the scheduler loop or prevent the remaining - # overdue deadlines from being processed. The SAVEPOINT keeps the - # already-handled deadlines in this batch intact when one rolls back. - try: - with session.begin_nested(): - deadline.handle_miss(session) - except Exception: - self.log.exception( - "Failed to handle missed deadline %s; skipping it this loop", deadline.id - ) + deadline.handle_miss(session) # Route ExecutorCallback workloads to executors (similar to task routing) self._enqueue_executor_callbacks(session) diff --git a/airflow-core/src/airflow/jobs/triggerer_job_runner.py b/airflow-core/src/airflow/jobs/triggerer_job_runner.py index 4e5249fee7941..b32ea53a2cd7f 100644 --- a/airflow-core/src/airflow/jobs/triggerer_job_runner.py +++ b/airflow-core/src/airflow/jobs/triggerer_job_runner.py @@ -785,29 +785,11 @@ def _create_workload( if trigger.assets: watched_assets = {a.name: a.uri for a in trigger.assets} - # ``callback`` is an attribute of the Trigger model, not of the BaseEventTrigger - # protocol — asset-only triggers (and spec'd test mocks) may not expose it, so read - # it defensively. Only callback triggers fetch DagRun context. - callback = getattr(trigger, "callback", None) - # Only fetch DagRun data for callback triggers (not all non-task triggers). - dag_run_data = self._fetch_callback_dag_run_data(trigger, session=session) if callback else None - if callback and dag_run_data is None: - # Only skip when routing data was present but the DagRun lookup failed - # (transient DB/API issue). Old 3.2.x callbacks without dag_id/run_id - # pass through — their trigger's run() falls back to stored kwargs["context"]. - callback_data = callback.data or {} - if callback_data.get("dag_id") and callback_data.get("run_id"): - log.warning( - "Skipping callback trigger — DagRun not found for context", - trigger_id=trigger.id, - ) - return None return workloads.RunTrigger( id=trigger.id, classpath=trigger.classpath, encrypted_kwargs=trigger.encrypted_kwargs, watched_assets=watched_assets, - dag_run_data=dag_run_data, ) if not trigger.task_instance.dag_version_id: @@ -864,45 +846,6 @@ def _create_workload( timeout_after=trigger.task_instance.trigger_timeout, ) - def _fetch_callback_dag_run_data(self, trigger: Trigger, *, session: Session) -> dict | None: - """ - Fetch DagRun data for deadline callback triggers. - - When a callback trigger has dag_id/run_id stored in its associated Callback.data, - fetch the DagRun and return serialized dag_run_data so the TriggerRunner can build - a standard Context at runtime (same pattern as start_from_trigger). - """ - from airflow.models.dagrun import DagRun - - # The trigger's callback relationship stores the identifiers we need - if not trigger.callback: - return None - - callback_data = trigger.callback.data - dag_id = callback_data.get("dag_id") - run_id = callback_data.get("run_id") - if not dag_id or not run_id: - return None - - dagrun = session.scalar(select(DagRun).where(DagRun.dag_id == dag_id, DagRun.run_id == run_id)) - if not dagrun: - log.warning( - "Could not find DagRun for callback context", - dag_id=dag_id, - run_id=run_id, - trigger_id=trigger.id, - ) - return None - - data = dagrun.dag_run_data.model_dump(exclude_unset=True) - # Pass deadline metadata through so _build_context_from_dag_run_data can expose - # context["deadline"] for template rendering ({{ deadline.deadline_time }}). - deadline_id = callback_data.get("deadline_id") - deadline_time = callback_data.get("deadline_time") - if deadline_id or deadline_time: - data["_deadline"] = {"id": deadline_id, "deadline_time": deadline_time} - return data - def fetch_trigger_details(self, trigger_ids: set[int], *, session: Session) -> dict[int, Trigger]: """Fetch trigger rows by ID.""" return Trigger.bulk_fetch(trigger_ids, session=session) @@ -944,29 +887,13 @@ def build_trigger_workloads(self, new_trigger_ids: set[int]) -> list[workloads.R ) continue - # Isolate per-trigger failures: a single bad trigger (e.g. one whose - # serialized Dag fails to load, whose deadline-callback context fetch - # raises, or whose log-path rendering throws) must not abort the whole - # batch and crash the triggerer process. This mirrors the per-item - # isolation already applied to the scheduler's handle_miss and - # _enqueue_executor_callbacks loops. - try: - if workload := self._create_workload( - trigger=new_trigger_orm, - dag_bag=dag_bag, - render_log_fname=render_log_fname, - session=session, - ): - to_create.append(workload) - except Exception: - log.exception( - "Failed to build workload for trigger; skipping it so the rest of " - "the batch can still launch", - id=new_trigger_id, - ) - # _create_workload may have already registered a logger factory before - # failing; drop it so the failed trigger leaves no leaked state behind. - self.logger_cache.pop(new_trigger_id, None) + if workload := self._create_workload( + trigger=new_trigger_orm, + dag_bag=dag_bag, + render_log_fname=render_log_fname, + session=session, + ): + to_create.append(workload) return to_create @@ -1340,22 +1267,6 @@ def create_runtime_ti( ), ) - @staticmethod - def _build_context_from_dag_run_data(dag_run_data: dict) -> Context: - """ - Build a standard Context dict from serialized dag_run_data for callback triggers. - - This provides the same DagRun-level context fields (dag_run, run_id, logical_date, - ds, ts, etc.) that task-bound triggers get via RuntimeTaskInstance.get_template_context(), - but without task-specific fields since callbacks are not tied to a task. - """ - from airflow.api_fastapi.execution_api.datamodels.taskinstance import DagRun as DRDataModel - from airflow.sdk.execution_time.context import build_context_from_dag_run - - deadline_info = dag_run_data.pop("_deadline", None) - dag_run = DRDataModel(**dag_run_data) - return build_context_from_dag_run(dag_run, deadline=deadline_info) # type: ignore[return-value] - async def create_triggers(self): """Drain the to_create queue and create all new triggers that have been requested in the DB.""" while self.to_create: @@ -1402,24 +1313,8 @@ async def create_triggers(self): else: trigger_name = f"ID {trigger_id}" trigger_instance = trigger_class(**deserialised_kwargs) - # For callback triggers with dag_run_data, build a standard Context - # and set it on the trigger instance — the same pattern as - # trigger_instance.task_instance = runtime_ti for task-bound triggers. - if workload.dag_run_data: - context = self._build_context_from_dag_run_data(workload.dag_run_data) - trigger_instance._callback_context = context - except Exception as err: - # Inflating the trigger / building its callback context can fail in more ways than - # a TypeError: ``DRDataModel(**dag_run_data)`` in ``_build_context_from_dag_run_data`` - # is a strict (``extra=forbid``) Pydantic model, so a version-skewed or malformed - # ``dag_run_data`` raises ``pydantic.ValidationError`` (a ``ValueError``, NOT a - # ``TypeError``); ``smart_decode_trigger_kwargs`` / ``_decrypt_kwargs`` can likewise - # raise non-TypeError errors. Catching only ``TypeError`` let those escape to - # ``arun()``'s ``except Exception``, which sets ``self.stop = True`` and shuts down - # the WHOLE triggerer — killing every running trigger because of one bad workload. - # Fail just this trigger (mirrors the ``except BaseException`` on the class-load step - # above) so the others keep running. - self.log.error("Trigger failed to inflate", error=err, trigger_id=trigger_id) + except TypeError as err: + self.log.error("Trigger failed to inflate", error=err) self.failed_triggers.append((trigger_id, err)) continue trigger_instance.trigger_id = trigger_id @@ -1713,71 +1608,6 @@ async def run_trigger( await self.log.awarning("on_kill() raised an exception", name=name, exc_info=True) span.set_status(Status(StatusCode.OK), description=str(e)) raise - except BaseException as e: - # A BaseException (other than CancelledError) raised by user code - # inside the trigger — e.g. an async deadline callback that calls - # sys.exit(), raises KeyboardInterrupt, raises GeneratorExit, or - # raises a custom BaseException subclass. For SystemExit / - # KeyboardInterrupt asyncio re-raises these out of the event-loop - # step; other BaseException subclasses get stored on the task and - # are reaped by cleanup_finished_triggers(), but that path calls - # submit_failure() which only fails dependent task instances and - # never marks a Callback row terminal — so a callback trigger would - # be stuck in RUNNING forever. Handle every BaseException uniformly - # here: record the failure so dependent task instances are failed, - # and emit a terminal FAILED event so callback rows (which are only - # marked terminal via an event, not via submit_failure) don't get - # stuck in RUNNING forever. Do NOT re-raise — that is what keeps - # the event loop alive. asyncio.CancelledError is handled by the - # dedicated ``except asyncio.CancelledError`` clause above (which - # is evaluated first and re-raises), so cancellation is unaffected - # and is never swallowed here. - if isinstance(e, Exception): - # Ordinary exceptions keep their existing behaviour: re-raise so - # the generic ``except Exception`` path below records the failure - # via cleanup_finished_triggers(). Only true BaseException - # subclasses (SystemExit, KeyboardInterrupt, GeneratorExit, and - # custom BaseException classes) are handled inline here. - raise - from airflow.triggers.callback import CallbackTrigger - - span.set_status(Status(StatusCode.ERROR), description=str(e)) - await self.log.aexception( - "Trigger raised %s; failing it instead of crashing the triggerer", - type(e).__name__, - name=name, - trigger_id=trigger_id, - ) - if isinstance(trigger, CallbackTrigger): - # Callback rows are only marked terminal via an event - # (handle_event), never via submit_failure, so a FAILED event - # is what moves the Callback out of RUNNING. Mirror the FAILED - # event CallbackTrigger.run() emits for ordinary exceptions — - # which it cannot emit here because a BaseException tears the - # async generator down before its own except-handler runs. - from airflow.models.callback import CallbackState - from airflow.triggers.callback import PAYLOAD_BODY_KEY, PAYLOAD_STATUS_KEY - - self.triggers[trigger_id]["events"] += 1 - self.events.append( - TriggerEventEntry( - trigger_id=trigger_id, - event=TriggerEvent( - { - PAYLOAD_STATUS_KEY: CallbackState.FAILED, - PAYLOAD_BODY_KEY: f"Trigger failed with {type(e).__name__}: " - f"{''.join(format_exception(type(e), e, e.__traceback__))}", - } - ), - persist_seq=None, - ) - ) - else: - # Task-bound (and asset) triggers fail their dependents through - # the failure queue, exactly as the generic ``except Exception`` - # path does via cleanup_finished_triggers(). - self.failed_triggers.append((trigger_id, e)) - return except Exception as e: span.set_status(Status(StatusCode.ERROR), description=str(e)) raise diff --git a/airflow-core/src/airflow/migrations/versions/0117_3_3_0_change_deadline_interval_to_json.py b/airflow-core/src/airflow/migrations/versions/0117_3_3_0_change_deadline_interval_to_json.py index 28f8da026346d..9c7f5cb4a58fa 100644 --- a/airflow-core/src/airflow/migrations/versions/0117_3_3_0_change_deadline_interval_to_json.py +++ b/airflow-core/src/airflow/migrations/versions/0117_3_3_0_change_deadline_interval_to_json.py @@ -38,6 +38,21 @@ airflow_version = "3.3.0" +def _mysql_downgrade_interval_value_sql(table_name: str = "deadline_alert") -> str: + """Unwrap the serialized interval to a bare JSON number; the FLOAT cast happens at the column retype.""" + return f""" + UPDATE {table_name} + SET `interval` = + CASE + WHEN JSON_EXTRACT(`interval`, '$.__data__') IS NOT NULL + THEN JSON_EXTRACT(`interval`, '$.__data__') + WHEN JSON_EXTRACT(`interval`, '$.__classname__') IS NULL + THEN `interval` + ELSE NULL + END + """ + + def upgrade(): """Apply change deadline interval to JSON.""" conn = op.get_bind() @@ -180,16 +195,18 @@ def downgrade(): UPDATE deadline_alert SET `interval` = CASE - WHEN JSON_UNQUOTE(JSON_EXTRACT(`interval`, '$.__classname__')) = 'datetime.timedelta' - THEN CAST(JSON_EXTRACT(`interval`, '$.__data__') AS DOUBLE) + WHEN JSON_EXTRACT(`interval`, '$.__data__') IS NOT NULL + THEN JSON_EXTRACT(`interval`, '$.__data__') WHEN JSON_EXTRACT(`interval`, '$.__classname__') IS NULL THEN `interval` ELSE NULL END; - Step 2: Convert column type (NULLABLE — non-timedelta intervals downgrade to NULL) + Step 2: Convert column type. This casts the JSON numbers to FLOAT (the original + pre-upgrade type, matching the online migration). Keep NOT NULL — MODIFY COLUMN + redefines the whole column, so omitting it would drop the constraint. ALTER TABLE deadline_alert - MODIFY COLUMN `interval` DOUBLE NULL; + MODIFY COLUMN `interval` FLOAT NOT NULL; SQLite: @@ -197,7 +214,7 @@ def downgrade(): UPDATE deadline_alert SET interval = CASE - WHEN json_extract(interval, '$.__classname__') = 'datetime.timedelta' + WHEN json_extract(interval, '$.__data__') IS NOT NULL THEN CAST(json_extract(interval, '$.__data__') AS REAL) WHEN json_extract(interval, '$.__classname__') IS NULL THEN CAST(interval AS REAL) @@ -205,27 +222,11 @@ def downgrade(): END; Step 2: SQLite does not support ALTER COLUMN TYPE. - Recreate the table with interval as REAL (NULLABLE) and copy data. + Recreate the table with interval as REAL and copy data. """ ) return - # The value-conversion below intentionally maps any non-timedelta interval (e.g. a serialized - # ``VariableInterval``, which has no numeric ``__data__`` to recover) to NULL — this downgrade - # is lossy and documented as such (the 3.2 read path tolerates a missing interval; see - # ``decode_deadline_alert``: "Downgrade is not fully reversible"). Those NULLs are written by the - # UPDATE statements *before* the column type is changed, so the column must be made NULLABLE - # first — otherwise the UPDATE itself fails with a NOT NULL violation (and on table-rebuild - # dialects like SQLite the constraint is enforced on the UPDATE, not just the final ALTER). - with op.batch_alter_table("deadline_alert") as batch_op: - batch_op.alter_column( - "interval", - existing_type=sa.JSON(), - type_=sa.JSON(), - existing_nullable=False, - nullable=True, - ) - if dialect == "postgresql": op.execute(""" UPDATE deadline_alert @@ -240,22 +241,7 @@ def downgrade(): """) elif dialect == "mysql": - # Gate on the timedelta classname BEFORE extracting ``__data__``: a VariableInterval also - # has a (non-null, nested-object) ``__data__``, so checking ``__data__ IS NOT NULL`` first - # would ``CAST`` that object to a bogus ``0.0`` (silent corruption). Only timedelta payloads - # have a recoverable numeric ``__data__``; everything else maps to NULL (matches the - # PostgreSQL branch and the documented lossy-downgrade contract). - op.execute(""" - UPDATE deadline_alert - SET `interval` = - CASE - WHEN JSON_UNQUOTE(JSON_EXTRACT(`interval`, '$.__classname__')) = 'datetime.timedelta' - THEN CAST(JSON_EXTRACT(`interval`, '$.__data__') AS DOUBLE) - WHEN JSON_EXTRACT(`interval`, '$.__classname__') IS NULL - THEN CAST(`interval` AS DOUBLE) - ELSE NULL - END - """) + op.execute(_mysql_downgrade_interval_value_sql()) # Serialized VariableInterval objects do not contain a numeric "__data__" field # and therefore cannot be converted back to a float representation. @@ -271,14 +257,11 @@ def downgrade(): print("SQLite JSON functions not available, using string parsing as fallback.") if json_functions_available: - # Gate on the timedelta classname BEFORE extracting ``__data__`` (see the MySQL branch): - # a VariableInterval's ``__data__`` is a non-null nested object that would ``CAST`` to a - # bogus ``0.0``. Only timedelta payloads are recoverable; everything else maps to NULL. op.execute(""" UPDATE deadline_alert SET interval = CASE - WHEN json_extract(interval, '$.__classname__') = 'datetime.timedelta' + WHEN json_extract(interval, '$.__data__') IS NOT NULL THEN CAST(json_extract(interval, '$.__data__') AS REAL) WHEN json_extract(interval, '$.__classname__') IS NULL THEN CAST(interval AS REAL) @@ -288,18 +271,11 @@ def downgrade(): else: # NOTE: This is a best-effort fallback for environments without JSON1. # It assumes a stable JSON format and may not work for all serialized values. - # Only ``datetime.timedelta`` payloads carry a numeric ``__data__`` we can recover; - # a ``VariableInterval`` also contains the literal ``__data__`` (as a nested object), - # so gating on ``instr(interval, '__data__')`` alone would CAST its object text to a - # bogus ``0.0`` (silent corruption). Gate on the timedelta classname instead, and map - # everything else (VariableInterval, unknown shapes) to NULL — consistent with the - # JSON1 path above and the documented lossy-downgrade contract. op.execute(""" UPDATE deadline_alert SET interval = CASE - WHEN instr(interval, 'datetime.timedelta') > 0 - AND instr(interval, '__data__') > 0 + WHEN instr(interval, '__data__') > 0 THEN CAST( substr( interval, @@ -313,8 +289,6 @@ def downgrade(): END """) - # Finally change the column type JSON -> FLOAT. It stays NULLABLE (made so above) because the - # lossy conversion left NULLs for any non-timedelta interval. with op.batch_alter_table("deadline_alert") as batch_op: if dialect == "postgresql": batch_op.alter_column( @@ -330,14 +304,12 @@ def downgrade(): ELSE NULL END """, - existing_nullable=True, - nullable=True, + existing_nullable=False, ) else: batch_op.alter_column( "interval", existing_type=sa.JSON(), type_=sa.FLOAT(), - existing_nullable=True, - nullable=True, + existing_nullable=False, ) diff --git a/airflow-core/src/airflow/models/callback.py b/airflow-core/src/airflow/models/callback.py index 3fcb8fc905162..1e8e758450ded 100644 --- a/airflow-core/src/airflow/models/callback.py +++ b/airflow-core/src/airflow/models/callback.py @@ -160,8 +160,6 @@ def queue(self, *, session: Session) -> None: def get_metric_info(self, status: CallbackState, result: Any, team_name: str | None = None) -> dict: tags = {"result": result, **self.data} tags.pop("prefix", None) - for key in ("dag_id", "run_id", "deadline_id", "deadline_time", "dag_run_id"): - tags.pop(key, None) if "kwargs" in tags: # Remove the context (if exists) to keep the tags simple @@ -235,44 +233,25 @@ def __repr__(self): return f"{self.data['path']}({self.data['kwargs'] or ''}) on a triggerer" def queue(self, *, session: Session) -> None: - from airflow.models.dag import DagModel from airflow.models.trigger import Trigger from airflow.triggers.callback import CallbackTrigger team_name: str | None = None if self.bundle_name and conf.getboolean("core", "multi_team"): team_name = DagBundleModel.get_team_name(self.bundle_name, session=session) - elif dag_id := self.data.get("dag_id"): - team_name = DagModel.get_team_name(dag_id, session=session) - trigger = Trigger.from_object( + self.trigger = Trigger.from_object( CallbackTrigger( callback_path=self.data["path"], callback_kwargs=self.data["kwargs"], ) ) - trigger.team_name = team_name - self.trigger = trigger + self.trigger.team_name = team_name super().queue(session=session) def handle_event(self, event: TriggerEvent, session: Session): from airflow.triggers.callback import PAYLOAD_BODY_KEY, PAYLOAD_STATUS_KEY - # Terminal states are absorbing: once a callback is SUCCESS/FAILED it must never be - # moved again. In the normal flow CallbackTrigger yields RUNNING then a single terminal - # event in order, but at-least-once / duplicate delivery (a re-run trigger, or an HA - # triggerer replica reprocessing a stale queued event) could otherwise resurrect a - # completed callback back to an active state — or silently overwrite a FAILED outcome - # with a late SUCCESS (and vice versa). Ignore any further events once terminal. - if self.state in TERMINAL_STATES: - log.debug( - "Ignoring event for already-terminal callback %s (state=%s): %s", - self.id, - self.state, - event.payload, - ) - return - if (status := event.payload.get(PAYLOAD_STATUS_KEY)) and status in (ACTIVE_STATES | TERMINAL_STATES): self.state = status if status in TERMINAL_STATES: @@ -280,23 +259,7 @@ def handle_event(self, event: TriggerEvent, session: Session): if self.bundle_name and conf.getboolean("core", "multi_team"): team_name = DagBundleModel.get_team_name(self.bundle_name, session=session) self.trigger = None - body = event.payload.get(PAYLOAD_BODY_KEY) - if body is not None and not isinstance(body, str): - try: - body = json.dumps(body, default=str, sort_keys=True) - except Exception: - # The callback returned a value json.dumps cannot encode even - # with default=str (e.g. a circular reference, a dict with - # mutually uncomparable keys under sort_keys, or a value whose - # str()/repr() raises). handle_event runs in the triggerer - # supervisor's synchronous run loop with no surrounding guard, - # so an exception here would crash the triggerer. Fall back to - # a best-effort repr and never let coercion fail the process. - try: - body = repr(body) - except Exception: - body = f"" - self.output = body + self.output = event.payload.get(PAYLOAD_BODY_KEY) stats.incr(**self.get_metric_info(status, self.output, team_name=team_name)) session.add(self) diff --git a/airflow-core/src/airflow/models/deadline.py b/airflow-core/src/airflow/models/deadline.py index b6a7a7cac04f6..f596c66301498 100644 --- a/airflow-core/src/airflow/models/deadline.py +++ b/airflow-core/src/airflow/models/deadline.py @@ -28,7 +28,6 @@ from sqlalchemy import Boolean, ForeignKey, Index, Integer, Uuid, and_, func, inspect, select, text from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.orm import Mapped, mapped_column, relationship -from sqlalchemy.orm.attributes import flag_modified from airflow._shared.observability.metrics import stats from airflow._shared.timezones import timezone @@ -148,13 +147,8 @@ def __repr__(self): def _determine_resource() -> tuple[str, str]: """Determine the type of resource based on which values are present.""" if self.dagrun_id: - # The deadline is for a Dag run. Guard on the ``dagrun`` relationship (not just - # ``dagrun_id``): the FK can be set while the relationship resolves to None — e.g. - # the DagRun was deleted (ondelete=CASCADE) and this is a stale/expired in-memory - # Deadline. A __repr__ must never raise (it's used in logs, tracebacks, debuggers), - # so fall back to the id-only form rather than dereferencing ``self.dagrun.dag_id``. - dag_id = self.dagrun.dag_id if self.dagrun is not None else "" - return "DagRun", f"Dag: {dag_id} Run: {self.dagrun_id}" + # The deadline is for a Dag run: + return "DagRun", f"Dag: {self.dagrun.dag_id} Run: {self.dagrun_id}" return "Unknown", "" @@ -190,15 +184,8 @@ def prune_deadlines(cls, *, session: Session, conditions: dict[Mapped, Any]) -> try: # Get deadlines which match the provided conditions and their associated DagRuns. - # Exclude deadlines already marked ``missed``: once the scheduler has marked a - # deadline missed it has queued (and owns) that deadline's callback, so prune must - # never delete it (the cascade would drop the queued callback). Today this is also - # implied by the ``end_date <= deadline_time`` guard below — a missed deadline has - # ``deadline_time < now <= end_date`` so it can't satisfy that predicate — but making - # the ``~missed`` filter explicit keeps the "prune only handles on-time deadlines" - # invariant from depending on clock relationships and protects against future callers. deadline_dagrun_pairs = session.execute( - select(Deadline, DagRun).join(DagRun).where(and_(*filter_conditions)).where(~Deadline.missed) + select(Deadline, DagRun).join(DagRun).where(and_(*filter_conditions)) ).all() except AttributeError as e: @@ -242,48 +229,45 @@ def handle_miss(self, session: Session): """Handle a missed deadline by queueing the callback.""" from airflow.models.dag import DagModel # Avoids circular import - # Idempotency guard / defense-in-depth. The only caller (the scheduler loop) already - # selects ``WHERE ~Deadline.missed`` under ``FOR UPDATE SKIP LOCKED`` so a missed deadline - # is never re-handled in normal operation. But this method is otherwise unguarded: a second - # call would re-run ``callback.queue()`` — for a TriggererCallback that REPLACES - # ``self.trigger`` with a brand-new Trigger (orphaning the first) and resets the callback - # state, and for an ExecutorCallback it resets ``state`` back to PENDING — resurrecting a - # callback that may already be QUEUED/RUNNING/terminal (the same resurrection class that - # bug-fix made absorbing in ``Callback.handle_event``). Bail out early if already missed so - # the method is self-protecting regardless of how a (future) caller invokes it. - if self.missed: - logger.debug("Deadline %s already handled (missed=True); skipping re-handling", self.id) - return - - # Routing identifiers: stored at the top level of callback.data so the triggerer/executor - # can locate the DagRun and build execution context *before* invoking the callback. - # These are NOT user payload — they are consumed by infrastructure (_fetch_callback_dag_run_data - # in triggerer_job_runner and _fetch_and_build_context in callback_supervisor) and stripped - # before the callback function is called. - # TODO: A dedicated `routing_data` field on Callback would make this separation explicit - # and avoid mixing routing metadata with user kwargs in the same JSON blob. Deferred because - # changing the Callback model would require a migration and coordination with the executor path. - self.callback.data["dag_id"] = self.dagrun.dag_id - self.callback.data["run_id"] = self.dagrun.run_id - - # Deadline-specific info goes into the routing data (not user kwargs). - # The context builder (_fetch_and_build_context / _fetch_callback_dag_run_data) exposes - # these as context["deadline"] = {"id": ..., "deadline_time": ...} so templates like - # {{ deadline.deadline_time }} continue to work. - self.callback.data["deadline_id"] = str(self.id) - self.callback.data["deadline_time"] = self.deadline_time.isoformat() + def get_simple_context(): + from airflow.api_fastapi.core_api.datamodels.dag_run import DAGRunResponse + from airflow.models import DagRun + + # TODO: Use the TaskAPI from within Triggerer to fetch full context instead of sending this context + # from the scheduler + + # Fetch the DagRun from the database again to avoid errors when self.dagrun's relationship fields + # are not in the current session. + dagrun = session.get(DagRun, self.dagrun_id) + + return { + "dag_run": DAGRunResponse.model_validate(dagrun).model_dump(mode="json"), + "deadline": {"id": self.id, "deadline_time": self.deadline_time}, + } if isinstance(self.callback, TriggererCallback): + # Update the callback with context before queuing + if "kwargs" not in self.callback.data: + self.callback.data["kwargs"] = {} + self.callback.data["kwargs"] = (self.callback.data.get("kwargs") or {}) | { + "context": get_simple_context() + } + self.callback.queue(session=session) - flag_modified(self.callback, "data") session.add(self.callback) session.flush() elif isinstance(self.callback, ExecutorCallback): - # dag_run_id (integer PK) is required by the scheduler's ExecuteCallback.make() + if "kwargs" not in self.callback.data: + self.callback.data["kwargs"] = {} + self.callback.data["kwargs"] = (self.callback.data.get("kwargs") or {}) | { + "context": get_simple_context() + } + self.callback.data["deadline_id"] = str(self.id) self.callback.data["dag_run_id"] = str(self.dagrun.id) + self.callback.data["dag_id"] = self.dagrun.dag_id + self.callback.state = CallbackState.PENDING - flag_modified(self.callback, "data") session.add(self.callback) session.flush() @@ -446,14 +430,6 @@ def __post_init__(self): self.min_runs = self.max_runs if self.min_runs < 1: raise ValueError("min_runs must be at least 1") - if self.min_runs > self.max_runs: - # ``_evaluate_with`` does ``LIMIT max_runs`` then requires - # ``len(durations) >= min_runs``; with ``min_runs > max_runs`` that is never - # satisfiable, so the deadline would silently never be created. Fail fast. - raise ValueError( - f"min_runs ({self.min_runs}) cannot exceed max_runs ({self.max_runs}); " - "the deadline would require more completed runs than it ever samples." - ) @provide_session def _evaluate_with(self, *, session: Session, **kwargs: Any) -> datetime | None: diff --git a/airflow-core/src/airflow/models/deadline_alert.py b/airflow-core/src/airflow/models/deadline_alert.py index 9f029fff5c15a..20bfed459eef9 100644 --- a/airflow-core/src/airflow/models/deadline_alert.py +++ b/airflow-core/src/airflow/models/deadline_alert.py @@ -57,22 +57,11 @@ def __repr__(self): interval_seconds = None - # ``interval`` is a JSON column holding the Airflow-serialized interval: a fixed timedelta is - # ``{"__classname__": "datetime.timedelta", "__data__": }``, a dynamic - # ``VariableInterval`` is ``{"__classname__": ".../VariableInterval", "__data__": {...}}``, and - # a legacy pre-0117 blob may be a bare number. Extract seconds for the fixed/legacy cases and - # fall back to "dynamic" otherwise. (Note: the previous ``isinstance(self.interval, - # datetime.timedelta)`` branch was doubly broken — ``datetime`` here is the *class* - # ``datetime.datetime`` (``from datetime import datetime``), so ``datetime.timedelta`` raised - # ``AttributeError``, and ``interval`` is always a dict so that branch was also unreachable-by-type. - # A ``__repr__`` must never raise — it is used in logs, tracebacks, and debuggers.) if isinstance(self.interval, (int, float)): interval_seconds = int(self.interval) - elif isinstance(self.interval, dict): - data = self.interval.get("__data__") - if isinstance(data, (int, float)): - interval_seconds = int(data) + elif isinstance(self.interval, datetime.timedelta): + interval_seconds = int(self.interval.total_seconds()) if interval_seconds is None: interval_display = "dynamic" diff --git a/airflow-core/src/airflow/models/trigger.py b/airflow-core/src/airflow/models/trigger.py index 82eb9f2833392..6fecd4d825f09 100644 --- a/airflow-core/src/airflow/models/trigger.py +++ b/airflow-core/src/airflow/models/trigger.py @@ -26,7 +26,7 @@ from sqlalchemy import ForeignKey, Integer, String, Text, delete, func, or_, select, update from sqlalchemy.ext.associationproxy import association_proxy -from sqlalchemy.orm import Mapped, Session, joinedload, mapped_column, relationship, selectinload +from sqlalchemy.orm import Mapped, Session, mapped_column, relationship, selectinload from sqlalchemy.sql.functions import coalesce from airflow._shared.timezones import timezone @@ -212,8 +212,7 @@ def bulk_fetch(cls, ids: Iterable[int], *, session: Session = NEW_SESSION) -> di .options( selectinload(cls.task_instance) .joinedload(TaskInstance.trigger) - .joinedload(Trigger.triggerer_job), - joinedload(cls.callback), + .joinedload(Trigger.triggerer_job) ) ) return {obj.id: obj for obj in session.scalars(stmt)} @@ -335,32 +334,6 @@ def submit_failure(cls, trigger_id, exc=None, *, session: Session = NEW_SESSION) task_instance.state = TaskInstanceState.SCHEDULED task_instance.scheduled_dttm = timezone.utcnow() - # A deadline callback trigger has a ``callback`` row instead of a deferred TaskInstance. - # ``submit_event`` terminalises that row (``trigger.callback.handle_event``) on the success - # path, but the failure path historically only handled TaskInstances — so a CallbackTrigger - # that failed to inflate/run (landing in ``failed_triggers`` → here) left its Callback stuck - # in QUEUED/RUNNING forever (a zombie: never SUCCESS/FAILED, re-warned, never reaped, since - # callback rows terminalise ONLY via an event, never via submit_failure's TI path). Emit a - # terminal FAILED event so the callback is marked FAILED — mirroring ``submit_event`` and the - # FAILED-event the triggerer's BaseException handler emits. ``handle_event`` is terminal- - # absorbing, so this is a no-op if the callback already reached a terminal state. - trigger = session.scalars(select(cls).where(cls.id == trigger_id)).one_or_none() - if trigger is not None and trigger.callback is not None: - from airflow.triggers.base import TriggerEvent - from airflow.triggers.callback import PAYLOAD_BODY_KEY, PAYLOAD_STATUS_KEY - from airflow.utils.state import CallbackState - - if isinstance(exc, BaseException): - body = "".join(format_exception(type(exc), exc, exc.__traceback__)) - elif exc: - body = "".join(exc) if isinstance(exc, list) else str(exc) - else: - body = "Trigger failed before sending any event" - trigger.callback.handle_event( - TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.FAILED, PAYLOAD_BODY_KEY: body}), - session, - ) - @classmethod @provide_session def ids_for_triggerer( diff --git a/airflow-core/src/airflow/serialization/decoders.py b/airflow-core/src/airflow/serialization/decoders.py index dce9d8a9e5f06..890a1296a4a00 100644 --- a/airflow-core/src/airflow/serialization/decoders.py +++ b/airflow-core/src/airflow/serialization/decoders.py @@ -148,16 +148,7 @@ def decode_deadline_reference(reference_data: dict): """Decode a previously serialized deadline reference.""" ref_name = reference_data.get(SerializedReferenceModels.REFERENCE_TYPE_FIELD) - # ``__class_path`` is stamped by the encoder only for custom (non-builtin) references and is - # the authoritative discriminator. It must take precedence over the ``reference_type`` name: - # a user's custom reference may share a class name with a builtin (e.g. ``FixedDatetimeDeadline``), - # and routing by name alone would silently decode it as the builtin class — dropping the custom - # evaluation logic (or raising a spurious KeyError on builtin-only fields). - if "__class_path" in reference_data: - reference_class: type[SerializedReferenceModels.SerializedBaseDeadlineReference] = ( - SerializedReferenceModels.SerializedCustomReference - ) - elif ref_name and SerializedReferenceModels.is_builtin_reference(ref_name): + if ref_name and SerializedReferenceModels.is_builtin_reference(ref_name): reference_class = SerializedReferenceModels.get_reference_class(ref_name) else: reference_class = SerializedReferenceModels.SerializedCustomReference diff --git a/airflow-core/src/airflow/serialization/definitions/dag.py b/airflow-core/src/airflow/serialization/definitions/dag.py index e51191dac4aa4..9dc816eb9cc45 100644 --- a/airflow-core/src/airflow/serialization/definitions/dag.py +++ b/airflow-core/src/airflow/serialization/definitions/dag.py @@ -715,110 +715,51 @@ def _process_dagrun_deadline_alerts( if not deadline_alert: continue - # Isolate each deadline alert: creating a deadline is auxiliary to creating the - # DagRun itself, and must never prevent the DagRun from being created. A single bad - # alert — e.g. a ``VariableInterval`` whose backing Variable is missing / non-integer - # / <= 0 (``resolve()`` raises ``ValueError``), or a reference whose ``evaluate_with`` - # fails — would otherwise propagate out of ``create_dagrun`` and abort the whole run, - # silently stopping the DAG from scheduling. - # - # Isolation is done with a plain ``try``/``except`` and MUST NOT use - # ``session.begin_nested()``: ``create_dagrun`` runs on the scheduler session inside - # the ``prohibit_commit`` guard, and releasing a SAVEPOINT issues a commit, which trips - # that guard (``RuntimeError("UNEXPECTED COMMIT ...")``) and — because this very block - # swallows it — silently skips deadline creation for *every* scheduled DagRun. The - # try/except alone is sufficient: the only DB mutation in the loop body is the final - # ``session.add`` (everything before it is a decode, an in-memory ``resolve()``, or a - # read-only ``evaluate_with`` query), so an exception leaves no partial state to undo, - # and the pending ``Deadline`` is persisted by the caller's outer transaction. - try: - deserialized_deadline_alert = decode_deadline_alert( - { - Encoding.TYPE: DAT.DEADLINE_ALERT, - Encoding.VAR: { - DeadlineAlertFields.REFERENCE: deadline_alert.reference, - DeadlineAlertFields.INTERVAL: deadline_alert.interval, - DeadlineAlertFields.CALLBACK: deadline_alert.callback_def, - }, - } + deserialized_deadline_alert = decode_deadline_alert( + { + Encoding.TYPE: DAT.DEADLINE_ALERT, + Encoding.VAR: { + DeadlineAlertFields.REFERENCE: deadline_alert.reference, + DeadlineAlertFields.INTERVAL: deadline_alert.interval, + DeadlineAlertFields.CALLBACK: deadline_alert.callback_def, + }, + } + ) + + interval = deserialized_deadline_alert.interval + + if isinstance(interval, VariableInterval): + interval = interval.resolve() + + if isinstance(deserialized_deadline_alert.reference, SerializedReferenceModels.TYPES.DAGRUN): + deadline_time = deserialized_deadline_alert.reference.evaluate_with( + session=session, + interval=interval, + # TODO : Pretty sure we can drop these last two; verify after testing is complete + dag_id=self.dag_id, + run_id=orm_dagrun.run_id, ) - interval = deserialized_deadline_alert.interval - - if isinstance(interval, VariableInterval): - # Resolve the VariableInterval through the full secrets chain (env vars, - # configured secrets backends, then the metadata DB) -- the same resolution - # order as ``Variable.get`` -- rather than reading only the ``variable`` table. - # Reading the table directly would bypass ``AIRFLOW_VAR_*`` env vars and secrets - # backends, so a Variable that lives there resolves to ``None`` and the deadline - # is silently dropped by the per-alert ``except`` below. - # - # We must NOT call ``Variable.get`` / ``get_variable_from_secrets`` here: the - # metastore backend's ``get_variable`` is ``@provide_session`` and, given no - # session, opens the thread-local scoped session (the SAME one the scheduler - # holds) whose context-manager exit COMMITS -- tripping the ``prohibit_commit`` - # guard ``create_dagrun`` runs under. Instead iterate the backends ourselves and - # pass the scheduler's ``session`` through to the metastore backend (env/custom - # backends ignore it), so the DB read happens on our session with no commit. - from airflow._shared.secrets_backend.base import call_secrets_backend_method - from airflow.configuration import ensure_secrets_loaded - from airflow.secrets.metastore import MetastoreBackend - - var_val = None - for secrets_backend in ensure_secrets_loaded(): - kwargs = {"session": session} if isinstance(secrets_backend, MetastoreBackend) else {} - var_val = call_secrets_backend_method( - secrets_backend.get_variable, team_name=None, key=interval.key, **kwargs - ) - if var_val is not None: - break - if var_val is None: - # Fail loudly (the per-alert ``except`` isolates it): the Variable does not - # exist in any backend, so we cannot resolve the interval. Not a silent skip. - raise ValueError( - f"VariableInterval '{interval.key}' could not be resolved from any " - f"secrets backend, environment variable, or the metadata database" + if deadline_time is not None: + session.add( + Deadline( + deadline_time=deadline_time, + callback=deserialized_deadline_alert.callback, + dagrun_id=orm_dagrun.id, + deadline_alert_id=deadline_alert.id, + dag_id=orm_dagrun.dag_id, + bundle_name=orm_dagrun.dag_model.bundle_name, ) - interval = interval.coerce_to_timedelta(var_val) - - if isinstance(deserialized_deadline_alert.reference, SerializedReferenceModels.TYPES.DAGRUN): - deadline_time = deserialized_deadline_alert.reference.evaluate_with( - session=session, - interval=interval, - # TODO : Pretty sure we can drop these last two; verify after testing is complete - dag_id=self.dag_id, - run_id=orm_dagrun.run_id, ) - - if deadline_time is not None: - session.add( - Deadline( - deadline_time=deadline_time, - callback=deserialized_deadline_alert.callback, - dagrun_id=orm_dagrun.id, - deadline_alert_id=deadline_alert.id, - dag_id=orm_dagrun.dag_id, - bundle_name=orm_dagrun.dag_model.bundle_name, - ) - ) - team_name = ( - DagModel.get_team_name(self.dag_id, session=session) - if airflow_conf.getboolean("core", "multi_team") - else None - ) - stats.incr( - "deadline_alerts.deadline_created", - tags=prune_dict({"dag_id": self.dag_id, "team_name": team_name}), - ) - except Exception: - log.exception( - "Failed to create deadline for alert %s on DagRun %s (dag_id=%s); " - "skipping this deadline, the DagRun is unaffected", - getattr(deadline_alert, "id", ""), - orm_dagrun.run_id, - self.dag_id, - ) - stats.incr("deadline_alerts.deadline_creation_failed", tags={"dag_id": self.dag_id}) + team_name = ( + DagModel.get_team_name(self.dag_id, session=session) + if airflow_conf.getboolean("core", "multi_team") + else None + ) + stats.incr( + "deadline_alerts.deadline_created", + tags=prune_dict({"dag_id": self.dag_id, "team_name": team_name}), + ) @provide_session def set_task_instance_state( diff --git a/airflow-core/src/airflow/serialization/definitions/deadline.py b/airflow-core/src/airflow/serialization/definitions/deadline.py index d1275f57c8fc9..89e231cba24dc 100644 --- a/airflow-core/src/airflow/serialization/definitions/deadline.py +++ b/airflow-core/src/airflow/serialization/definitions/deadline.py @@ -201,16 +201,6 @@ def __post_init__(self): self.min_runs = self.max_runs if self.min_runs < 1: raise ValueError("min_runs must be at least 1") - if self.min_runs > self.max_runs: - # ``_evaluate_with`` samples at most ``max_runs`` rows (``LIMIT max_runs``) and - # then requires ``len(durations) >= min_runs``. With ``min_runs > max_runs`` that - # is never satisfiable, so the deadline would silently never be created no matter - # how many runs exist. Fail fast here (this also covers ``deserialize_reference``, - # which constructs via ``cls(...)``) rather than producing an inert deadline. - raise ValueError( - f"min_runs ({self.min_runs}) cannot exceed max_runs ({self.max_runs}); " - "the deadline would require more completed runs than it ever samples." - ) @provide_session def _evaluate_with(self, *, session: Session, **kwargs: Any) -> datetime | None: @@ -321,22 +311,7 @@ def serialize_reference(self) -> dict: def deserialize_reference(cls, reference_data: dict): from airflow.serialization.helpers import find_registered_custom_deadline_reference - # ``decode_deadline_reference`` also routes here for any reference it cannot otherwise - # classify — an unrecognized ``reference_type`` with no ``__class_path`` (corrupted / - # hand-edited rows, a blob from a newer version, or a legacy custom-ref whose plugin is - # gone). Without ``__class_path`` there is nothing to import, so a bare - # ``reference_data["__class_path"]`` would raise an opaque ``KeyError``. Surface a clear, - # actionable error instead (the deadline-creation isolation logs this and skips the - # alert rather than aborting the DagRun). - class_path = reference_data.get("__class_path") - if not class_path: - raise ValueError( - "Cannot deserialize deadline reference: unrecognized reference_type " - f"{reference_data.get(SerializedReferenceModels.REFERENCE_TYPE_FIELD)!r} with no " - "'__class_path' to import. The stored reference is corrupt, from a newer " - "Airflow version, or references a custom class whose plugin is no longer installed." - ) - custom_class = find_registered_custom_deadline_reference(class_path) + custom_class = find_registered_custom_deadline_reference(reference_data["__class_path"]) inner_ref = custom_class.deserialize_reference(reference_data) return cls(inner_ref) diff --git a/airflow-core/src/airflow/triggers/callback.py b/airflow-core/src/airflow/triggers/callback.py index 0f4d084f54db0..b27920fed8614 100644 --- a/airflow-core/src/airflow/triggers/callback.py +++ b/airflow-core/src/airflow/triggers/callback.py @@ -17,26 +17,14 @@ from __future__ import annotations -import asyncio import logging -import sys import traceback from collections.abc import AsyncIterator -from pathlib import Path from typing import Any -from airflow._shared.module_loading import ( - UNUSUAL_MODULE_PREFIX, - accepts_context, - import_string, - load_mangled_dag_module, - qualname, -) -from airflow._shared.template_rendering import render_callback_kwargs -from airflow.dag_processing.bundles.manager import DagBundlesManager +from airflow._shared.module_loading import accepts_context, import_string, qualname from airflow.models.callback import CallbackState from airflow.triggers.base import BaseTrigger, TriggerEvent -from airflow.utils.file import get_unique_dag_module_name log = logging.getLogger(__name__) @@ -44,53 +32,6 @@ PAYLOAD_BODY_KEY = "body" -def _ensure_bundle_module_registered(callback_path: str) -> None: - """ - Register an unusual_prefix_{hash}_{stem} module so import_string can find it. - - The triggerer event loop doesn't run the DAG processor, so DAG files with mangled - module names must be registered manually. Walks all configured bundles to find the - file, then delegates the actual loading to _load_mangled_module in callback_supervisor. - - The bundle directory is also added to ``sys.path`` so that a callback defined in a - DAG file can import sibling helper modules living in the same bundle (e.g. a - ``my_helpers.py`` next to the DAG). This mirrors the executor/sync callback path in - ``callback_supervisor`` which appends the bundle path to ``sys.path`` before running - the callback; without it, ``import my_helpers`` inside an async callback fails with - ``ModuleNotFoundError`` on the triggerer. - """ - mod_name = callback_path.split(".")[0] - if mod_name in sys.modules: - return - - # unusual_prefix_{hex40}_{stem} → {stem}.py - parts = mod_name.split("_", 3) - if len(parts) < 4: - return - stem = parts[3] - - try: - for bundle in DagBundlesManager().get_all_dag_bundles(): - try: - bundle.initialize() - # Walk all .py files — stem may differ from filename due to sanitization - # (get_unique_dag_module_name replaces . and - with _ in the stem) - for file_path in Path(bundle.path).rglob("*.py"): - if get_unique_dag_module_name(str(file_path)) == mod_name: - # Put the bundle dir on sys.path so the callback can import sibling - # helper modules from the same bundle (parity with the sync path). - bundle_path = str(bundle.path) - if bundle_path not in sys.path: - sys.path.append(bundle_path) - if load_mangled_dag_module(mod_name, str(file_path)): - return - except Exception: - log.debug("Bundle %s did not contain module stem %s", bundle.name, stem) - continue - except Exception: - log.warning("Failed to register unusual_prefix module %s", mod_name, exc_info=True) - - class CallbackTrigger(BaseTrigger): """Trigger that executes a callback function asynchronously.""" @@ -100,10 +41,6 @@ def __init__(self, callback_path: str, callback_kwargs: dict[str, Any] | None = super().__init__() self.callback_path = callback_path self.callback_kwargs = callback_kwargs or {} - #: Set externally by TriggerRunner from workload dag_run_data before run() is called, - #: the same pattern as task_instance is set for task-bound triggers. Not serialized; - #: always None on a freshly deserialized trigger. - self._callback_context: dict | None = None def serialize(self) -> tuple[str, dict[str, Any]]: return ( @@ -114,35 +51,14 @@ def serialize(self) -> tuple[str, dict[str, Any]]: async def run(self) -> AsyncIterator[TriggerEvent]: try: yield TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.RUNNING}) - if self.callback_path.startswith(UNUSUAL_MODULE_PREFIX): - await asyncio.to_thread(_ensure_bundle_module_registered, self.callback_path) callback = import_string(self.callback_path) - - kwargs = dict(self.callback_kwargs) - - # ALWAYS strip any "context" stored in callback_kwargs (3.2.x back-compat, where the - # context was serialized inside kwargs) so it is never double-passed alongside the - # explicit ``context=`` below. ``pop`` must run unconditionally: a previous - # ``self._callback_context or kwargs.pop(...)`` short-circuited the pop whenever the - # runtime context was set, so a 3.2.x-serialized trigger that ALSO got a runtime - # ``_callback_context`` (the TriggerRunner sets it unconditionally for callback triggers - # with dag_run_data) kept the stale key and crashed with - # "got multiple values for keyword argument 'context'". Pop first, then prefer the - # runtime context over the stored one. - stored_context = kwargs.pop("context", None) - context = self._callback_context or stored_context - - # Render Jinja in string kwargs (e.g. ``"{{ dag_run.run_id }}"``) using the same - # shared helper as the synchronous executor path, so async and sync callbacks render - # identically. Without this the triggerer path passed kwargs through verbatim while the - # executor path rendered them — a silent sync/async divergence. - if context is not None: - kwargs = render_callback_kwargs(kwargs, context) + # TODO: get full context and run template rendering. Right now, a simple context is included in `callback_kwargs` + context = self.callback_kwargs.pop("context", None) if accepts_context(callback) and context is not None: - result = await callback(**kwargs, context=context) + result = await callback(**self.callback_kwargs, context=context) else: - result = await callback(**kwargs) + result = await callback(**self.callback_kwargs) yield TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.SUCCESS, PAYLOAD_BODY_KEY: result}) diff --git a/airflow-core/src/airflow/ui/openapi-gen/queries/ensureQueryData.ts b/airflow-core/src/airflow/ui/openapi-gen/queries/ensureQueryData.ts index 0cdbadae04f01..74a833b2ccfdc 100644 --- a/airflow-core/src/airflow/ui/openapi-gen/queries/ensureQueryData.ts +++ b/airflow-core/src/airflow/ui/openapi-gen/queries/ensureQueryData.ts @@ -1896,7 +1896,7 @@ export const ensureUseDeadlinesServiceGetDeadlinesData = (queryClient: QueryClie * @param data.dagId * @param data.limit * @param data.offset -* @param data.orderBy Attributes to order by, multi criteria sort is supported. Prefix with `-` for descending order. Supported attributes: `id, created_at, name` +* @param data.orderBy Attributes to order by, multi criteria sort is supported. Prefix with `-` for descending order. Supported attributes: `id, created_at, name, interval` * @returns DeadlineAlertCollectionResponse Successful Response * @throws ApiError */ diff --git a/airflow-core/src/airflow/ui/openapi-gen/queries/prefetch.ts b/airflow-core/src/airflow/ui/openapi-gen/queries/prefetch.ts index d935c4f6d0767..9db58a9676cc6 100644 --- a/airflow-core/src/airflow/ui/openapi-gen/queries/prefetch.ts +++ b/airflow-core/src/airflow/ui/openapi-gen/queries/prefetch.ts @@ -1896,7 +1896,7 @@ export const prefetchUseDeadlinesServiceGetDeadlines = (queryClient: QueryClient * @param data.dagId * @param data.limit * @param data.offset -* @param data.orderBy Attributes to order by, multi criteria sort is supported. Prefix with `-` for descending order. Supported attributes: `id, created_at, name` +* @param data.orderBy Attributes to order by, multi criteria sort is supported. Prefix with `-` for descending order. Supported attributes: `id, created_at, name, interval` * @returns DeadlineAlertCollectionResponse Successful Response * @throws ApiError */ diff --git a/airflow-core/src/airflow/ui/openapi-gen/queries/queries.ts b/airflow-core/src/airflow/ui/openapi-gen/queries/queries.ts index 37c04b705eeb9..c10c963570653 100644 --- a/airflow-core/src/airflow/ui/openapi-gen/queries/queries.ts +++ b/airflow-core/src/airflow/ui/openapi-gen/queries/queries.ts @@ -1896,7 +1896,7 @@ export const useDeadlinesServiceGetDeadlines = ; }; diff --git a/airflow-core/src/airflow/ui/public/i18n/locales/en/dag.json b/airflow-core/src/airflow/ui/public/i18n/locales/en/dag.json index fd9ac555e7caa..b51add8d5ec09 100644 --- a/airflow-core/src/airflow/ui/public/i18n/locales/en/dag.json +++ b/airflow-core/src/airflow/ui/public/i18n/locales/en/dag.json @@ -42,14 +42,12 @@ }, "deadlineAlerts": { "completionRule": "Must complete within {{interval}} of {{reference}}", - "completionRuleDynamic": "Must complete within a dynamic interval of {{reference}}", "count_one": "{{count}} deadline", "count_other": "{{count}} deadlines", "referenceType": { "AverageRuntimeDeadline": "average runtime", "DagRunLogicalDateDeadline": "logical date", - "DagRunQueuedAtDeadline": "queue time", - "FixedDatetimeDeadline": "fixed datetime" + "DagRunQueuedAtDeadline": "queue time" } }, "deadlineStatus": { diff --git a/airflow-core/src/airflow/ui/src/pages/Dag/DeadlineAlertsBadge.tsx b/airflow-core/src/airflow/ui/src/pages/Dag/DeadlineAlertsBadge.tsx index 716140a19ddd4..5ee03b8aff855 100644 --- a/airflow-core/src/airflow/ui/src/pages/Dag/DeadlineAlertsBadge.tsx +++ b/airflow-core/src/airflow/ui/src/pages/Dag/DeadlineAlertsBadge.tsx @@ -35,21 +35,12 @@ const AlertRow = ({ alert }: { readonly alert: DeadlineAlertResponse }) => { const reference = translate(`deadlineAlerts.referenceType.${alert.reference_type}`, { defaultValue: alert.reference_type, }); - // A fixed interval reports its seconds; a dynamic interval (e.g. VariableInterval) comes back - // as null, since its value is only resolved when the scheduler evaluates the deadline. Render a - // "dynamic interval" phrasing rather than a misleading "a few seconds" from dayjs.duration(null). - const completionRule = - alert.interval === null || alert.interval === undefined - ? translate("deadlineAlerts.completionRuleDynamic", { reference }) - : translate("deadlineAlerts.completionRule", { - interval: dayjs.duration(alert.interval, "seconds").humanize(), - reference, - }); + const interval = dayjs.duration(alert.interval, "seconds").humanize(); return ( - {completionRule} + {translate("deadlineAlerts.completionRule", { interval, reference })} {Boolean(alert.name) && ( {" "} diff --git a/airflow-core/src/airflow/ui/src/pages/Dag/Overview/DeadlineRow.tsx b/airflow-core/src/airflow/ui/src/pages/Dag/Overview/DeadlineRow.tsx index 8f9f98d3b082b..3e47861c90d82 100644 --- a/airflow-core/src/airflow/ui/src/pages/Dag/Overview/DeadlineRow.tsx +++ b/airflow-core/src/airflow/ui/src/pages/Dag/Overview/DeadlineRow.tsx @@ -43,19 +43,7 @@ export const DeadlineRow = ({ alert, deadline }: DeadlineRowProps) => { defaultValue: alert.reference_type, }) : undefined; - // A fixed interval reports its seconds; a dynamic interval (e.g. VariableInterval) comes back as - // null, resolved only at scheduler evaluation time — render a "dynamic interval" phrasing rather - // than a misleading "a few seconds" from dayjs.duration(null). When there is no alert at all, - // completionRule stays undefined so the rule line is omitted (preserving prior behavior). - const completionRule = - alert === undefined - ? undefined - : alert.interval === null || alert.interval === undefined - ? translate("deadlineAlerts.completionRuleDynamic", { reference }) - : translate("deadlineAlerts.completionRule", { - interval: dayjs.duration(alert.interval, "seconds").humanize(), - reference, - }); + const interval = alert ? dayjs.duration(alert.interval, "seconds").humanize() : undefined; return ( @@ -73,11 +61,11 @@ export const DeadlineRow = ({ alert, deadline }: DeadlineRowProps) => { {deadline.dag_run_id} - {completionRule === undefined ? undefined : ( + {reference !== undefined && interval !== undefined ? ( - {completionRule} + {translate("deadlineAlerts.completionRule", { interval, reference })} - )} + ) : undefined} diff --git a/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatus.tsx b/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatus.tsx index 8f13477fb0ab6..eff38d4a78e71 100644 --- a/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatus.tsx +++ b/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatus.tsx @@ -45,23 +45,6 @@ export const DeadlineStatus = ({ dagId, dagRunId, endDate }: DeadlineStatusProps const { t: translate } = useTranslation("dag"); const [isModalOpen, setIsModalOpen] = useState(false); - // A fixed interval reports its seconds; a dynamic interval (e.g. VariableInterval) comes back - // as null, since its value is only resolved when the scheduler evaluates the deadline. Render - // a "dynamic interval" phrasing in that case rather than a misleading "a few seconds". - const completionRule = (alertItem: DeadlineAlertResponse) => - alertItem.interval === null || alertItem.interval === undefined - ? translate("deadlineAlerts.completionRuleDynamic", { - reference: translate(`deadlineAlerts.referenceType.${alertItem.reference_type}`, { - defaultValue: alertItem.reference_type, - }), - }) - : translate("deadlineAlerts.completionRule", { - interval: dayjs.duration(alertItem.interval, "seconds").humanize(), - reference: translate(`deadlineAlerts.referenceType.${alertItem.reference_type}`, { - defaultValue: alertItem.reference_type, - }), - }); - const { data: deadlineData, isLoading: isLoadingDeadlines } = useDeadlinesServiceGetDeadlines({ dagId, dagRunId, @@ -101,7 +84,12 @@ export const DeadlineStatus = ({ dagId, dagRunId, endDate }: DeadlineStatusProps {(alertData?.deadline_alerts ?? []).map((deadlineAlert) => ( - {completionRule(deadlineAlert)} + {translate("deadlineAlerts.completionRule", { + interval: dayjs.duration(deadlineAlert.interval, "seconds").humanize(), + reference: translate(`deadlineAlerts.referenceType.${deadlineAlert.reference_type}`, { + defaultValue: deadlineAlert.reference_type, + }), + })} ))} @@ -199,7 +187,12 @@ export const DeadlineStatus = ({ dagId, dagRunId, endDate }: DeadlineStatusProps {alert === undefined ? undefined : ( - {completionRule(alert)} + {translate("deadlineAlerts.completionRule", { + interval: dayjs.duration(alert.interval, "seconds").humanize(), + reference: translate(`deadlineAlerts.referenceType.${alert.reference_type}`, { + defaultValue: alert.reference_type, + }), + })} )} diff --git a/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatusModal.tsx b/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatusModal.tsx index c0306ae984b39..968f1210d9e82 100644 --- a/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatusModal.tsx +++ b/airflow-core/src/airflow/ui/src/pages/Run/DeadlineStatusModal.tsx @@ -58,22 +58,6 @@ export const DeadlineStatusModal = ({ const [page, setPage] = useState(1); const offset = (page - 1) * PAGE_LIMIT; - // A dynamic interval (e.g. VariableInterval) comes back as null — render a "dynamic interval" - // phrasing rather than a misleading "a few seconds" from dayjs.duration(null, ...). - const completionRule = (alert: DeadlineAlertResponse) => - alert.interval === null || alert.interval === undefined - ? translate("deadlineAlerts.completionRuleDynamic", { - reference: translate(`deadlineAlerts.referenceType.${alert.reference_type}`, { - defaultValue: alert.reference_type, - }), - }) - : translate("deadlineAlerts.completionRule", { - interval: dayjs.duration(alert.interval, "seconds").humanize(), - reference: translate(`deadlineAlerts.referenceType.${alert.reference_type}`, { - defaultValue: alert.reference_type, - }), - }); - const { data, error, isLoading } = useDeadlinesServiceGetDeadlines( { dagId, @@ -146,7 +130,12 @@ export const DeadlineStatusModal = ({ {alert === undefined ? undefined : ( - {completionRule(alert)} + {translate("deadlineAlerts.completionRule", { + interval: dayjs.duration(alert.interval, "seconds").humanize(), + reference: translate(`deadlineAlerts.referenceType.${alert.reference_type}`, { + defaultValue: alert.reference_type, + }), + })} )} diff --git a/airflow-core/tests/unit/api_fastapi/core_api/routes/ui/test_deadlines.py b/airflow-core/tests/unit/api_fastapi/core_api/routes/ui/test_deadlines.py index 6c83044e17c3f..acadab3a6b16f 100644 --- a/airflow-core/tests/unit/api_fastapi/core_api/routes/ui/test_deadlines.py +++ b/airflow-core/tests/unit/api_fastapi/core_api/routes/ui/test_deadlines.py @@ -17,8 +17,6 @@ from __future__ import annotations -from datetime import timedelta - import pytest from sqlalchemy import select @@ -28,7 +26,7 @@ from airflow.models.serialized_dag import SerializedDagModel from airflow.providers.standard.operators.empty import EmptyOperator from airflow.sdk.definitions.callback import AsyncCallback -from airflow.sdk.definitions.deadline import DeadlineReference, VariableInterval +from airflow.sdk.definitions.deadline import DeadlineReference from airflow.utils.state import DagRunState from airflow.utils.types import DagRunTriggeredByType, DagRunType @@ -509,17 +507,6 @@ def test_should_response_404_for_nonexistent_dag(self, test_client): response = test_client.get("/dags/nonexistent_dag/deadlineAlerts") assert response.status_code == 404 - @pytest.mark.parametrize("order_by", ["interval", "-interval"]) - def test_order_by_interval_is_rejected(self, test_client, order_by): - """``interval`` is a serialized-JSON column (timedelta/VariableInterval dict), so DB-level - ordering by it is meaningless (sorts by JSON structure, not duration). It must NOT be an - allowed sort key — the endpoint should reject it with a 400 rather than silently returning - an arbitrarily-ordered list. - """ - response = test_client.get(f"/dags/{DAG_ID}/deadlineAlerts", params={"order_by": order_by}) - assert response.status_code == 400 - assert "interval" in response.json()["detail"] - def test_should_response_401(self, unauthenticated_test_client): response = unauthenticated_test_client.get(f"/dags/{DAG_ID}/deadlineAlerts") assert response.status_code == 401 @@ -527,71 +514,3 @@ def test_should_response_401(self, unauthenticated_test_client): def test_should_response_403(self, unauthorized_test_client): response = unauthorized_test_client.get(f"/dags/{DAG_ID}/deadlineAlerts") assert response.status_code == 403 - - -class TestDeadlineAlertsIntervalSerialization: - """Regression tests for the ``interval`` column shape on the deadlineAlerts endpoint. - - ``DeadlineAlert.interval`` is a JSON column that, in production, holds the - Airflow-*serialized* interval — ``encode_deadline_alert`` stores - ``serialize(self.interval)``, not a plain number. A fixed ``timedelta`` becomes - ``{"__classname__": "datetime.timedelta", "__version__": 2, "__data__": }`` - and a dynamic ``VariableInterval`` becomes - ``{"__classname__": ".../VariableInterval", "__data__": {"key": ...}}``. - - The response model originally typed ``interval`` as a bare ``float``, so Pydantic - raised on that dict and the endpoint returned 500 — which broke the run-page - ``DeadlineStatus`` badge. The existing fixtures masked this by seeding a bare float - (``interval=3600.0``), a value the real creation path never produces. These tests - seed the realistic serialized forms. - """ - - @pytest.fixture - def dag_with_serialized_intervals(self, dag_maker, session): - from airflow.sdk.serde import serialize - - dag_id = "dag_serialized_interval" - with dag_maker(dag_id, serialized=True, session=session): - EmptyOperator(task_id="task") - dag_maker.sync_dagbag_to_db() - session.commit() - - serialized_dag = session.scalar(select(SerializedDagModel).where(SerializedDagModel.dag_id == dag_id)) - # Fixed interval: stored as the serialized timedelta dict, exactly as - # encode_deadline_alert would persist it. - session.add( - DeadlineAlert( - serialized_dag_id=serialized_dag.id, - name="fixed_interval_alert", - reference=DeadlineReference.DAGRUN_QUEUED_AT.serialize_reference(), - interval=serialize(timedelta(seconds=300)), - callback_def={"path": _CALLBACK_PATH}, - ) - ) - # Dynamic interval: a VariableInterval serializes to a dict with no fixed - # seconds — the value is only resolved at scheduler evaluation time. - session.add( - DeadlineAlert( - serialized_dag_id=serialized_dag.id, - name="dynamic_interval_alert", - reference=DeadlineReference.DAGRUN_QUEUED_AT.serialize_reference(), - interval=serialize(VariableInterval("deadline_seconds")), - callback_def={"path": _CALLBACK_PATH}, - ) - ) - session.commit() - return dag_id - - def test_serialized_timedelta_interval_does_not_500(self, test_client, dag_with_serialized_intervals): - """A fixed timedelta interval is coerced to its seconds value (not a 500).""" - response = test_client.get(f"/dags/{dag_with_serialized_intervals}/deadlineAlerts") - assert response.status_code == 200 - alerts = {a["name"]: a for a in response.json()["deadline_alerts"]} - assert alerts["fixed_interval_alert"]["interval"] == 300.0 - - def test_dynamic_variable_interval_serializes_as_null(self, test_client, dag_with_serialized_intervals): - """A dynamic VariableInterval has no fixed seconds, so interval is null (not a 500).""" - response = test_client.get(f"/dags/{dag_with_serialized_intervals}/deadlineAlerts") - assert response.status_code == 200 - alerts = {a["name"]: a for a in response.json()["deadline_alerts"]} - assert alerts["dynamic_interval_alert"]["interval"] is None diff --git a/airflow-core/tests/unit/api_fastapi/execution_api/versions/head/test_task_instances.py b/airflow-core/tests/unit/api_fastapi/execution_api/versions/head/test_task_instances.py index dfbbb2e0e3cfe..90325bb9c8542 100644 --- a/airflow-core/tests/unit/api_fastapi/execution_api/versions/head/test_task_instances.py +++ b/airflow-core/tests/unit/api_fastapi/execution_api/versions/head/test_task_instances.py @@ -4050,16 +4050,16 @@ def test_workload_scope_rejected_on_state_endpoint(self, client, session, create assert resp.status_code == 403 assert "Token type 'workload' not allowed" in resp.json()["detail"] - def test_workload_scope_accepted_on_connections_endpoint(self, client, session, create_task_instance): - """Workload scoped tokens are accepted on GET /connections for deadline callback subprocesses.""" + def test_workload_scope_rejected_on_connections_endpoint(self, client, session, create_task_instance): + """Workload scoped tokens should be rejected on GET /connections (different router).""" ti = create_task_instance(task_id="test_workload_conn", state=State.RUNNING) session.commit() self._register_scoped_validator(ti.id, "workload") resp = client.get("/execution/connections/test_conn") - # Workload tokens are now accepted; 404 because the connection doesn't exist in the test DB. - assert resp.status_code == 404 + assert resp.status_code == 403 + assert "Token type 'workload' not allowed" in resp.json()["detail"] def test_execution_scope_accepted_on_all_endpoints(self, client, session, create_task_instance): """Execution scoped tokens should be accepted on all endpoints.""" diff --git a/airflow-core/tests/unit/dag_processing/test_dagbag.py b/airflow-core/tests/unit/dag_processing/test_dagbag.py index 43099f16985fb..99abd92a59f73 100644 --- a/airflow-core/tests/unit/dag_processing/test_dagbag.py +++ b/airflow-core/tests/unit/dag_processing/test_dagbag.py @@ -337,46 +337,6 @@ def test_validate_executor_field_executor_not_configured(): _validate_executor_fields(dag) -def _deadline_dag(callback): - from datetime import timedelta - - from airflow.sdk.definitions.deadline import DeadlineAlert, DeadlineReference - - return DAG( - "deadline-dag", - schedule=None, - deadline=DeadlineAlert( - reference=DeadlineReference.DAGRUN_QUEUED_AT, - interval=timedelta(seconds=1), - callback=callback, - ), - ) - - -def _deadline_sync_callback(executor=None): - from airflow.sdk.definitions.callback import SyncCallback - - def _cb(**kwargs): - pass - - return SyncCallback(_cb, executor=executor) - - -def test_validate_executor_field_deadline_callback_invalid_executor(): - """A deadline SyncCallback pinned to an unknown executor is rejected at parse time. - - Without this validation the queued ExecutorCallback would sit PENDING forever (the - scheduler can't route it to a non-existent executor), re-warning every loop. This - mirrors the task-executor validation so the author gets immediate feedback instead. - """ - dag = _deadline_dag(_deadline_sync_callback(executor="bogus.deadline.executor")) - with pytest.raises( - UnknownExecutorException, - match=re.escape("specifies executor 'bogus.deadline.executor', which is not available"), - ): - _validate_executor_fields(dag) - - def test_validate_executor_field(): with DAG("test-dag", schedule=None) as dag: BaseOperator(task_id="t1", executor="test.custom.executor") diff --git a/airflow-core/tests/unit/executors/test_local_executor.py b/airflow-core/tests/unit/executors/test_local_executor.py index bdc06b22221e4..113740bb0c333 100644 --- a/airflow-core/tests/unit/executors/test_local_executor.py +++ b/airflow-core/tests/unit/executors/test_local_executor.py @@ -478,10 +478,6 @@ def test_execute_workload_calls_supervise_callback(self, mock_supervise_callback callback_path="test.module.my_callback", callback_kwargs={"arg1": "val1"}, dag_rel_path=Path("test.py"), - dag_id=None, - run_id=None, - deadline_id=None, - deadline_time=None, log_path="test.log", bundle_info=BundleInfo(name="test_bundle", version="1.0"), token=TestLocalExecutorCallbackSupport.TEST_TOKEN, diff --git a/airflow-core/tests/unit/jobs/test_build_trigger_workloads_resilience.py b/airflow-core/tests/unit/jobs/test_build_trigger_workloads_resilience.py deleted file mode 100644 index 7acd3b9146c8c..0000000000000 --- a/airflow-core/tests/unit/jobs/test_build_trigger_workloads_resilience.py +++ /dev/null @@ -1,143 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Error-isolation tests for ``TriggerRunnerSupervisor.build_trigger_workloads``. - -``build_trigger_workloads`` runs inside the supervisor's synchronous ``run_once`` -loop (``load_triggers`` -> ``update_triggers`` -> ``build_trigger_workloads``). An -unhandled exception there propagates out of ``run()`` and kills the entire -triggerer process, taking every other deferred task down with it. A single bad -trigger -- e.g. one whose deadline-callback context fetch raises, whose serialized -Dag fails to load, or whose log-path rendering throws -- must therefore be isolated -so the rest of the batch still launches and the triggerer stays up. This mirrors the -per-item SAVEPOINT isolation already applied to the scheduler's ``handle_miss`` and -``_enqueue_executor_callbacks`` loops. -""" - -from __future__ import annotations - -import datetime -import selectors -import socket as socket_mod - -import pendulum -import psutil -import pytest -from structlog.typing import FilteringBoundLogger - -from airflow.jobs.job import Job -from airflow.jobs.triggerer_job_runner import TriggerRunnerSupervisor -from airflow.models import DagModel, DagRun, Trigger -from airflow.models.dag_version import DagVersion -from airflow.models.dagbundle import DagBundleModel -from airflow.models.serialized_dag import SerializedDagModel -from airflow.providers.standard.triggers.temporal import TimeDeltaTrigger -from airflow.sdk import DAG, BaseOperator -from airflow.serialization.serialized_objects import LazyDeserializedDAG -from airflow.utils.state import State -from airflow.utils.types import DagRunType - -from tests_common.test_utils.db import clear_db_dags, clear_db_runs -from tests_common.test_utils.taskinstance import create_task_instance - -pytestmark = pytest.mark.db_test - - -class TestBuildTriggerWorkloadsResilience: - @pytest.fixture(autouse=True) - def _clean(self): - clear_db_runs() - clear_db_dags() - yield - clear_db_runs() - clear_db_dags() - - def _make_task_trigger(self, session, idx): - bundle_name = "testing" - session.merge(DagBundleModel(name=bundle_name)) - session.flush() - date = pendulum.datetime(2023, 1, 1) - dag_id = f"build_wl_dag_{idx}" - dag_model = DagModel(dag_id=dag_id, bundle_name=bundle_name) - dag = DAG(dag_id=dag_id, schedule="@daily", start_date=date) - run = DagRun( - dag_id=dag_id, - run_id=f"run_{idx}", - logical_date=date, - data_interval=(date, date), - run_after=date, - run_type=DagRunType.MANUAL, - state=State.RUNNING, - ) - trigger_orm = Trigger.from_object(TimeDeltaTrigger(datetime.timedelta(days=7))) - operator = BaseOperator(task_id="t", dag=dag) - session.add(dag_model) - SerializedDagModel.write_dag(LazyDeserializedDAG.from_dag(dag), bundle_name=bundle_name) - session.add(run) - session.add(trigger_orm) - session.flush() - dag_version = DagVersion.get_latest_version(dag_id) - ti = create_task_instance(operator, run_id=run.run_id, dag_version_id=dag_version.id) - ti.trigger_id = trigger_orm.id - session.add(ti) - session.commit() - return trigger_orm.id - - def _make_supervisor(self, job, mocker): - process = mocker.Mock(spec=psutil.Process, pid=10 * job.id + 1) - mock_stdin = mocker.Mock(spec=socket_mod.socket) - proc = TriggerRunnerSupervisor( - process_log=mocker.Mock(spec=FilteringBoundLogger), - id=job.id, - job=job, - pid=process.pid, - stdin=mock_stdin, - process=process, - capacity=10, - ) - mock_selector = mocker.Mock(spec=selectors.DefaultSelector) - mock_selector.select.return_value = [] - proc.selector = mock_selector - return proc - - def test_one_failing_trigger_does_not_abort_batch(self, session, mocker): - """A single trigger whose workload build raises must be skipped, the rest built, - and no exception must propagate out (which would crash the triggerer).""" - job = Job() - session.add(job) - session.flush() - - ids = [self._make_task_trigger(session, i) for i in range(3)] - bad_id = ids[1] - supervisor = self._make_supervisor(job, mocker) - - real_create = TriggerRunnerSupervisor._create_workload - - def flaky_create(self, trigger, dag_bag, render_log_fname, session): - if trigger.id == bad_id: - raise RuntimeError("workload build boom on one trigger") - return real_create(self, trigger, dag_bag, render_log_fname, session=session) - - mocker.patch.object(TriggerRunnerSupervisor, "_create_workload", flaky_create) - - # Must NOT raise: a single bad trigger is isolated like handle_miss / enqueue callbacks. - result = supervisor.build_trigger_workloads(set(ids)) - - built_ids = sorted(w.id for w in result) - # The two good triggers must still be built despite the bad one in the batch. - assert built_ids == sorted(i for i in ids if i != bad_id) - # The bad trigger must NOT leave a leaked logger factory behind. - assert bad_id not in supervisor.logger_cache diff --git a/airflow-core/tests/unit/jobs/test_enqueue_executor_callbacks_resilience.py b/airflow-core/tests/unit/jobs/test_enqueue_executor_callbacks_resilience.py deleted file mode 100644 index 80f1f01679f10..0000000000000 --- a/airflow-core/tests/unit/jobs/test_enqueue_executor_callbacks_resilience.py +++ /dev/null @@ -1,174 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Error-isolation tests for SchedulerJobRunner._enqueue_executor_callbacks. - -A single bad ExecutorCallback (unknown executor, workload build failure, or an -executor whose ``queue_workload`` raises) must not crash the scheduler loop or -prevent the remaining callbacks in the batch from being enqueued. This mirrors -the per-deadline SAVEPOINT isolation already applied to the -``Deadline.handle_miss`` loop. -""" - -from __future__ import annotations - -from unittest import mock - -import pytest - -from airflow.jobs.job import Job -from airflow.jobs.scheduler_job_runner import SchedulerJobRunner -from airflow.models.callback import ExecutorCallback -from airflow.models.deadline import Deadline -from airflow.sdk.definitions.callback import SyncCallback -from airflow.utils.state import CallbackState - -from tests_common.test_utils.db import clear_db_dags, clear_db_runs - -pytestmark = pytest.mark.db_test - - -def _noop_callback(): - pass - - -@pytest.mark.need_serialized_dag -class TestEnqueueExecutorCallbacksResilience: - @pytest.fixture(autouse=True) - def _clean(self): - clear_db_runs() - clear_db_dags() - yield - clear_db_runs() - clear_db_dags() - - def _make_pending_callback(self, dag_run, session, executor_name=None): - callback = Deadline( - deadline_time=dag_run.logical_date, - callback=SyncCallback(_noop_callback), - dagrun_id=dag_run.id, - deadline_alert_id=None, - ).callback - callback.state = CallbackState.PENDING - callback.data["dag_run_id"] = dag_run.id - callback.data["dag_id"] = dag_run.dag_id - if executor_name is not None: - callback.data["executor"] = executor_name - session.add(callback) - session.flush() - return callback - - def _runner_with_executor(self, queue_side_effect=None): - from tests_common.test_utils.mock_executor import MockExecutor - - executor = MockExecutor(do_update=False) - if queue_side_effect is not None: - executor.queue_workload = mock.MagicMock(side_effect=queue_side_effect) - job = Job() - runner = SchedulerJobRunner(job=job, executors=[executor]) - return runner, executor - - # ---- Scenario #3: many callbacks, one bad one must not block the rest ---- - def test_one_bad_callback_does_not_block_others(self, dag_maker, session): - with dag_maker(dag_id="mixed_batch"): - pass - dr = dag_maker.create_dagrun() - - good1 = self._make_pending_callback(dr, session, executor_name=None) - bad = self._make_pending_callback(dr, session, executor_name=None) - good2 = self._make_pending_callback(dr, session, executor_name=None) - - bad_id = bad.id - - from tests_common.test_utils.mock_executor import MockExecutor - - executor = MockExecutor(do_update=False) - executor.supports_callbacks = True - real_queue = executor.queue_workload - - def selective_queue(workload, session=None): - if str(workload.callback.id) == str(bad_id): - raise RuntimeError("bad callback boom") - return real_queue(workload, session=session) - - executor.queue_workload = mock.MagicMock(side_effect=selective_queue) - job = Job() - runner = SchedulerJobRunner(job=job, executors=[executor]) - - runner._enqueue_executor_callbacks(session) - - # The good callbacks must still be QUEUED despite the bad one in the batch. - assert session.get(ExecutorCallback, good1.id).state == CallbackState.QUEUED - assert session.get(ExecutorCallback, good2.id).state == CallbackState.QUEUED - # The bad one stays PENDING for retry. - assert session.get(ExecutorCallback, bad.id).state == CallbackState.PENDING - - # ---- Orphaned ExecutorCallback: DagRun deleted -> terminalize FAILED (no zombie PENDING) ---- - def test_executor_callback_with_deleted_dagrun_is_failed_not_stuck(self, dag_maker, session): - """An ExecutorCallback whose DagRun has been deleted can never run (its context is built - from the DagRun), so it must be marked FAILED rather than left PENDING and retried (and - re-warned) every scheduler loop forever. Unlike TriggererCallbacks — which have a - standalone Trigger that can still terminalize them — a sync ExecutorCallback has no path - to completion once its DagRun is gone. - """ - - with dag_maker(dag_id="orphaned_cb"): - pass - dr = dag_maker.create_dagrun() - cb = self._make_pending_callback(dr, session, executor_name=None) - cb_id = cb.id - # Point the callback at a DagRun id that does not exist (simulates deletion). - cb.data["dag_run_id"] = 999999 - session.add(cb) - session.flush() - - runner, _ = self._runner_with_executor() - # Must not raise, and must not leave the callback PENDING forever. - runner._enqueue_executor_callbacks(session) - - assert session.get(ExecutorCallback, cb_id).state == CallbackState.FAILED, ( - "orphaned ExecutorCallback (deleted DagRun) should be FAILED, not stuck PENDING" - ) - - # ---- Transient context-fetch failure -> requeue (PENDING), not terminal FAILED ---- - def test_transient_failure_event_requeues_callback_as_pending(self, dag_maker, session): - """A PENDING event from the executor (the callback hit a transient context-fetch failure) - must reset the callback to PENDING so the next scheduler loop retries it — NOT mark it - terminally FAILED. This keeps the executor path's transient-failure behavior in lockstep - with the triggerer path, which re-evaluates on the next loop. - """ - from airflow.models.callback import CallbackKey - - with dag_maker(dag_id="transient_cb"): - pass - dr = dag_maker.create_dagrun() - cb = self._make_pending_callback(dr, session, executor_name=None) - cb_id = cb.id - # Move it out of PENDING first (as if it had been queued/run), so the requeue is observable. - cb.state = CallbackState.RUNNING - session.add(cb) - session.flush() - - runner, executor = self._runner_with_executor() - executor.get_event_buffer = mock.MagicMock( - return_value={CallbackKey(id=str(cb_id)): (CallbackState.PENDING, "context fetch failed")} - ) - - runner._process_executor_events(executor=executor, session=session) - - assert session.get(ExecutorCallback, cb_id).state == CallbackState.PENDING, ( - "a transient (PENDING) executor event must requeue the callback, not fail it" - ) diff --git a/airflow-core/tests/unit/jobs/test_triggerer_job.py b/airflow-core/tests/unit/jobs/test_triggerer_job.py index 9332c2a2342be..4ae539600d47b 100644 --- a/airflow-core/tests/unit/jobs/test_triggerer_job.py +++ b/airflow-core/tests/unit/jobs/test_triggerer_job.py @@ -1333,43 +1333,6 @@ def fn(moment): ... assert isinstance(err, TypeError) assert "got an unexpected keyword argument 'not_exists_arg'" in str(err) - @patch( - "airflow.jobs.triggerer_job_runner.TriggerRunner.get_trigger_by_classpath", - return_value=SuccessTrigger, # no required kwargs, so inflation reaches the context-build step - ) - @patch("airflow.jobs.triggerer_job_runner.Trigger._decrypt_kwargs", return_value={}) - @patch( - "airflow.jobs.triggerer_job_runner.TriggerRunner._build_context_from_dag_run_data", - side_effect=ValueError("simulated DRDataModel ValidationError (extra=forbid / version skew)"), - ) - @pytest.mark.asyncio - async def test_callback_context_build_failure_fails_only_that_trigger( - self, mock_build_ctx, mock_decrypt, mock_get_class, session - ): - """A non-TypeError raised while inflating a callback trigger (e.g. a ``pydantic.ValidationError`` - — a ``ValueError`` — from the strict ``DRDataModel(**dag_run_data)`` on a version-skewed / - malformed ``dag_run_data``) must fail ONLY that trigger, not escape ``create_triggers`` and - shut down the whole triggerer via ``arun()``'s ``except Exception: self.stop = True``.""" - trigger_runner = TriggerRunner() - # A callback workload (ti=None) with dag_run_data routes through _build_context_from_dag_run_data. - trigger_runner.to_create.append( - workloads.RunTrigger.model_construct( - id=7, - ti=None, - classpath="airflow.triggers.callback.CallbackTrigger", - encrypted_kwargs="fake", - dag_run_data={"dag_id": "d", "run_id": "r", "_unexpected_extra_field": "x"}, - ) - ) - - # Must NOT raise (the bug: ValueError escaped except TypeError -> arun -> triggerer shutdown). - await trigger_runner.create_triggers() - - # The bad trigger is failed in isolation, not silently inflated. - assert (7, ANY) in trigger_runner.failed_triggers - assert 7 not in trigger_runner.triggers - mock_build_ctx.assert_called_once() - @pytest.mark.asyncio @patch("airflow.sdk.execution_time.task_runner.SUPERVISOR_COMMS", create=True) async def test_invalid_trigger(self, supervisor_builder): @@ -2128,7 +2091,7 @@ def test_update_triggers_prevents_duplicate_creation_queue_entries(session, supe # Verify that the trigger is not in any other tracking sets assert trigger_orm.id not in supervisor.cancelling_triggers - assert not any(entry.trigger_id == trigger_orm.id for entry in supervisor.events) + assert not any(trigger_id == trigger_orm.id for trigger_id, _ in supervisor.events) assert not any(trigger_id == trigger_orm.id for trigger_id, _ in supervisor.failed_triggers) @@ -2899,64 +2862,3 @@ async def _drive(): trigger_id, _event, seq = events[0] assert trigger_id == 1 assert seq is None - - -class TestFetchCallbackDagRunData: - """Tests for TriggerRunnerSupervisor._fetch_callback_dag_run_data (the PRODUCER of the - ``_deadline`` passthrough that ``_build_context_from_dag_run_data`` consumes). Without this, - the deadline-metadata flow on the async path was only half-covered: a producer that failed to - pack ``_deadline`` (or read the wrong callback.data keys) would silently drop the metadata and - the consumer test would still pass. - """ - - def _supervisor(self, mocker): - import psutil - - job = Job() - process = mocker.Mock(spec=psutil.Process, pid=99) - return TriggerRunnerSupervisor( - process_log=mocker.Mock(spec=FilteringBoundLogger), - id=job.id, - job=job, - pid=process.pid, - stdin=mocker.Mock(spec=socket), - process=process, - capacity=10, - ) - - def test_packs_deadline_metadata_from_callback_data(self, session, dag_maker, mocker): - """deadline_id/deadline_time stored on callback.data (by Deadline.handle_miss) must be - packed into the returned dag_run_data as ``_deadline`` for the consumer to surface.""" - from airflow.models.callback import TriggererCallback - from airflow.models.deadline import Deadline - from airflow.sdk.definitions.callback import AsyncCallback - - with dag_maker(dag_id="fetch_cb_dag", session=session): - EmptyOperator(task_id="t") - dr = dag_maker.create_dagrun() - - # Build a deadline -> TriggererCallback and run handle_miss so callback.data carries the - # routing + deadline metadata exactly as it would in production. - deadline = Deadline( - deadline_time=dr.logical_date, - callback=AsyncCallback("airflow.models.deadline.Deadline.prune_deadlines"), - dagrun_id=dr.id, - deadline_alert_id=None, - ) - session.add(deadline) - session.flush() - deadline.handle_miss(session) - session.flush() - - callback = deadline.callback - assert isinstance(callback, TriggererCallback) - trigger = callback.trigger - assert trigger is not None - - supervisor = self._supervisor(mocker) - data = supervisor._fetch_callback_dag_run_data(trigger, session=session) - - assert data is not None - assert "_deadline" in data - assert data["_deadline"]["id"] == str(deadline.id) - assert data["_deadline"]["deadline_time"] == deadline.deadline_time.isoformat() diff --git a/airflow-core/tests/unit/migrations/test_0117_deadline_interval_json_migration.py b/airflow-core/tests/unit/migrations/test_0117_deadline_interval_json_migration.py index b1922bbb7f1c8..289e83797c30b 100644 --- a/airflow-core/tests/unit/migrations/test_0117_deadline_interval_json_migration.py +++ b/airflow-core/tests/unit/migrations/test_0117_deadline_interval_json_migration.py @@ -17,32 +17,30 @@ # under the License. """ -Regression tests for migration 0117 (8812eb67b63c): change deadline_alert.interval to JSON. +Regression test for migration 0117 (8812eb67b63c) on MySQL. -The downgrade is intentionally lossy — a serialized ``VariableInterval`` has no numeric -``__data__`` to recover, so it must be converted to NULL. Two defects this guards against: - -1. The post-downgrade ``interval`` column must be NULLABLE, otherwise the ``ALTER`` fails with - a NOT NULL violation on the VariableInterval rows and the whole downgrade crashes. -2. A VariableInterval must downgrade to NULL, NOT a bogus ``0.0`` — the value-conversion must - gate on the ``datetime.timedelta`` classname before extracting ``__data__`` (a VariableInterval - also has a ``__data__`` key, but it is a nested object that would ``CAST`` to ``0.0``). +The downgrade must convert ``deadline_alert.interval`` from JSON back to FLOAT without +raising ``ER_INVALID_JSON_TEXT`` (3140). The failure only reproduces with at least one +row present, so this seeds rows and runs the migration's own conversion SQL. It is a +no-op on SQLite/PostgreSQL, which take different code paths. """ from __future__ import annotations import importlib.util -import json -import uuid from pathlib import Path -from unittest import mock +import pytest import sqlalchemy as sa -from alembic.migration import MigrationContext -from alembic.operations import Operations + +from airflow import settings from tests_common.test_utils.paths import AIRFLOW_CORE_SOURCES_PATH +pytestmark = pytest.mark.db_test + +# Migration filenames start with a digit so they cannot be imported via the +# normal import system; load the module by file path instead. _MIGRATION_PATH = ( Path(AIRFLOW_CORE_SOURCES_PATH) / "airflow/migrations/versions/0117_3_3_0_change_deadline_interval_to_json.py" @@ -51,85 +49,43 @@ _migration = importlib.util.module_from_spec(_spec) # type: ignore[arg-type] _spec.loader.exec_module(_migration) # type: ignore[union-attr] -downgrade = _migration.downgrade - -_POST_0117_DDL = """ -CREATE TABLE deadline_alert ( - id TEXT PRIMARY KEY, - serialized_dag_id TEXT, - name TEXT, - reference TEXT NOT NULL, - interval JSON NOT NULL, - callback_def TEXT NOT NULL, - created_at TEXT -) -""" - -_REF = json.dumps({"reference_type": "DagRunQueuedAtDeadline"}) -_CB = json.dumps({"path": "my.callback"}) - -_TIMEDELTA_JSON = json.dumps({"__classname__": "datetime.timedelta", "__version__": 2, "__data__": 300.0}) -_VARIABLE_INTERVAL_JSON = json.dumps( - { - "__classname__": "airflow.sdk.definitions.deadline.VariableInterval", - "__version__": 1, - "__data__": {"key": "deadline_seconds"}, - } -) - +# Isolated table so we run the real conversion SQL without seeding the live deadline_alert +# table (which has FK/NOT NULL columns). +_TABLE = "_test_deadline_interval_dg" -def _engine(ddl): - engine = sa.create_engine("sqlite:///:memory:") - with engine.connect() as conn: - conn.execute(sa.text(ddl)) - conn.commit() - return engine - - -def _run(engine, func): - with engine.begin() as conn: - with Operations.context(MigrationContext.configure(conn)): - with mock.patch.object(_migration, "context") as mock_ctx: - mock_ctx.is_offline_mode.return_value = False - func() - - -def _insert(engine, *, interval, name): - with engine.begin() as conn: - conn.execute( - sa.text( - "INSERT INTO deadline_alert (id, serialized_dag_id, name, reference, interval, callback_def) " - "VALUES (:id, :sdag, :name, :ref, :interval, :cb)" - ), - { - "id": str(uuid.uuid4()), - "sdag": str(uuid.uuid4()), - "name": name, - "ref": _REF, - "interval": interval, - "cb": _CB, - }, - ) - - -def _rows(engine): - with engine.connect() as conn: - return { - r["name"]: r["interval"] - for r in conn.execute(sa.text("SELECT name, interval FROM deadline_alert")).mappings() - } +# A serialized timedelta as written by the 0117 upgrade. +_WRAPPED_TIMEDELTA = '{"__classname__": "datetime.timedelta", "__version__": 2, "__data__": 300.0}' class TestMigration0117Downgrade: - def test_downgrade_variable_interval_becomes_null_not_zero(self): - """The core regression: a VariableInterval must downgrade to NULL, never a bogus 0.0, - and the NOT NULL constraint must not crash the downgrade.""" - engine = _engine(_POST_0117_DDL) - _insert(engine, interval=_VARIABLE_INTERVAL_JSON, name="dynamic") - _insert(engine, interval=_TIMEDELTA_JSON, name="fixed") - _run(engine, downgrade) - rows = _rows(engine) - assert rows["fixed"] == 300.0 - assert rows["dynamic"] is None, ( - "VariableInterval must downgrade to NULL (lossy), not be silently coerced to 0.0" - ) + @pytest.mark.backend("mysql") + def test_mysql_downgrade_interval_value_update_does_not_reject_on_json_column(self): + """The downgrade value-conversion UPDATE must not raise 3140, and must round-trip to FLOAT.""" + create = f"CREATE TABLE {_TABLE} (id INT PRIMARY KEY, `interval` JSON NOT NULL)" + drop = f"DROP TABLE IF EXISTS {_TABLE}" + + with settings.engine.begin() as conn: + conn.execute(sa.text(drop)) + conn.execute(sa.text(create)) + conn.execute( + sa.text(f"INSERT INTO {_TABLE} (id, `interval`) VALUES (1, :v)"), + {"v": _WRAPPED_TIMEDELTA}, + ) + conn.execute( + sa.text(f"INSERT INTO {_TABLE} (id, `interval`) VALUES (2, CAST(:v AS JSON))"), + {"v": "60.0"}, + ) + + try: + with settings.engine.begin() as conn: + # Column is still JSON here; this UPDATE must not raise 3140. The retype casts. + conn.execute(sa.text(_migration._mysql_downgrade_interval_value_sql(_TABLE))) + conn.execute(sa.text(f"ALTER TABLE {_TABLE} MODIFY `interval` FLOAT NOT NULL")) + + with settings.engine.connect() as conn: + rows = dict(conn.execute(sa.text(f"SELECT id, `interval` FROM {_TABLE}")).all()) + + assert rows == {1: 300.0, 2: 60.0} + finally: + with settings.engine.begin() as conn: + conn.execute(sa.text(drop)) diff --git a/airflow-core/tests/unit/models/test_callback.py b/airflow-core/tests/unit/models/test_callback.py index d8c777dc27c50..b2b13582710fc 100644 --- a/airflow-core/tests/unit/models/test_callback.py +++ b/airflow-core/tests/unit/models/test_callback.py @@ -114,6 +114,7 @@ def test_get_metric_info(self): "result": "0", "path": TEST_ASYNC_CALLBACK.path, "kwargs": '{"email": "test@example.com"}', + "dag_id": TEST_DAG_ID, } def test_get_metric_info_dict_values_are_stringified(self): @@ -222,25 +223,6 @@ def test_queue_skips_trigger_team_name_when_multi_team_disabled( callback = self._queue_callback(session, has_bundle=True, has_team=True) assert callback.trigger.team_name is None - def test_queue_with_dag_id_and_no_bundle_does_not_crash(self, session): - """When dag_id is in data but bundle_name is absent, queue() should not crash. - - This verifies the session=session fix on DagModel.get_team_name (Fix 4). - """ - callback = TriggererCallback(TEST_ASYNC_CALLBACK) - callback.data["dag_id"] = "nonexistent_dag" - callback.data["run_id"] = "manual__2024-01-01" - callback.bundle_name = None # no bundle — falls into the elif branch - - # Should not raise — previously could fail with session-not-provided error - callback.queue(session=session) - - # Callback was queued successfully - assert callback.state == CallbackState.QUEUED - assert callback.trigger is not None - # team_name is None because the dag doesn't exist - assert callback.trigger.team_name is None - @pytest.mark.parametrize( ("event", "terminal_state"), [ @@ -312,108 +294,6 @@ def test_handle_event_emits_team_name(self, mock_incr, multi_team, team_name, ex mock_get_team_name.assert_not_called() assert "team_name" not in kwargs["tags"] - @pytest.mark.parametrize( - ("body", "expected_output"), - [ - pytest.param({"rows": 5, "ok": True}, '{"ok": true, "rows": 5}', id="dict_body"), - pytest.param([1, 2, 3], "[1, 2, 3]", id="list_body"), - pytest.param(42, "42", id="int_body"), - pytest.param("already_a_string", "already_a_string", id="str_body_passthrough"), - pytest.param(None, None, id="none_body"), - ], - ) - def test_handle_event_coerces_non_string_output(self, session, body, expected_output): - """A SUCCESS body is the callback's raw return value and can be any JSON-serializable - object. ``output`` is a Text column, so a non-string body must be JSON-coerced before - persisting — otherwise the DB driver raises on commit and crashes the TriggererJobRunner.""" - callback = TriggererCallback(TEST_ASYNC_CALLBACK) - callback.queue(session=session) - event = TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.SUCCESS, PAYLOAD_BODY_KEY: body}) - - callback.handle_event(event, session) - - assert callback.state == CallbackState.SUCCESS - assert callback.output == expected_output - # Must be a string (or None) so it persists to the Text column without adapter errors. - assert callback.output is None or isinstance(callback.output, str) - # The row must actually commit — this is what would crash before the coercion fix. - session.commit() - - def test_handle_event_body_that_breaks_json_dumps_falls_back_to_repr(self, session): - """A SUCCESS body that ``json.dumps(..., default=str)`` cannot encode must still be - coerced to a string (via the ``repr()`` fallback) rather than crashing the triggerer's - synchronous run loop. ``handle_event`` runs with no surrounding guard, so an uncaught - exception here would tear down the TriggererJobRunner.""" - - class _NotJSONSerializable: - """default=str is called for unknown types; raising from __str__ makes even - ``json.dumps(..., default=str)`` fail, forcing the repr() fallback branch.""" - - def __str__(self): - raise RuntimeError("str() blows up") - - def __repr__(self): - return "" - - callback = TriggererCallback(TEST_ASYNC_CALLBACK) - callback.queue(session=session) - body = _NotJSONSerializable() - event = TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.SUCCESS, PAYLOAD_BODY_KEY: body}) - - # Must not raise — the json.dumps failure is caught and repr() is used instead. - callback.handle_event(event, session) - - assert callback.state == CallbackState.SUCCESS - assert callback.output == "" - assert isinstance(callback.output, str) - # The row must actually persist without an adapter error on the Text column. - session.commit() - - @pytest.mark.parametrize( - ("terminal_state", "late_event"), - [ - pytest.param( - CallbackState.SUCCESS, - TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.RUNNING}), - id="success_then_late_running", - ), - pytest.param( - CallbackState.SUCCESS, - TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.FAILED, PAYLOAD_BODY_KEY: "err"}), - id="success_then_late_failed", - ), - pytest.param( - CallbackState.FAILED, - TriggerEvent({PAYLOAD_STATUS_KEY: CallbackState.SUCCESS, PAYLOAD_BODY_KEY: "late"}), - id="failed_then_late_success", - ), - ], - ) - def test_terminal_state_is_absorbing(self, session, terminal_state, late_event): - """Once a callback is terminal (SUCCESS/FAILED), a late/duplicate event must NOT move it. - - Normal flow delivers RUNNING then a single terminal event in order, but at-least-once / - duplicate delivery (re-run trigger, HA replica reprocessing a stale event) could otherwise - resurrect a completed callback to an active state or flip its terminal outcome. Terminal - states are absorbing: handle_event ignores further events once terminal. - """ - callback = TriggererCallback(TEST_ASYNC_CALLBACK) - callback.queue(session=session) - # Drive it to the terminal state first. - first_body = "original" - callback.handle_event( - TriggerEvent({PAYLOAD_STATUS_KEY: terminal_state, PAYLOAD_BODY_KEY: first_body}), session - ) - assert callback.state == terminal_state - original_output = callback.output - - # A late event must be ignored — state and output unchanged. - callback.handle_event(late_event, session) - assert callback.state == terminal_state, "terminal callback was resurrected/flipped by a late event" - assert callback.output == original_output, ( - "terminal callback's output was overwritten by a late event" - ) - class TestExecutorCallback: def test_polymorphic_serde(self, session): @@ -439,36 +319,6 @@ def test_queue(self, session): callback.queue(session=session) assert callback.state == CallbackState.QUEUED - def test_mixed_table_polymorphic_discrimination(self, session): - """With BOTH a TriggererCallback and an ExecutorCallback in the callback table, - polymorphic deserialization must type each row correctly, and a subclass-scoped query - must return only its own rows. This is the real scheduler/triggerer scenario: - ``_enqueue_executor_callbacks`` does ``select(ExecutorCallback)`` against a table that - also holds TriggererCallbacks — a mis-typed row would route a callback to the wrong - execution path (e.g. an async callback queued to an executor, or vice versa). - """ - tc = TriggererCallback(TEST_ASYNC_CALLBACK) - ec = ExecutorCallback(TEST_SYNC_CALLBACK, fetch_method=CallbackFetchMethod.IMPORT_PATH) - session.add_all([tc, ec]) - session.commit() - tc_id, ec_id = tc.id, ec.id - session.expunge_all() # force fresh polymorphic deserialization from the DB - - # select(Callback) types each row to its correct subclass. - by_id = {c.id: c for c in session.scalars(select(Callback)).all()} - assert isinstance(by_id[tc_id], TriggererCallback) - assert isinstance(by_id[ec_id], ExecutorCallback) - - # The scheduler's query: select(ExecutorCallback) returns ONLY the executor row. - exec_ids = {c.id for c in session.scalars(select(ExecutorCallback)).all()} - assert ec_id in exec_ids - assert tc_id not in exec_ids, "TriggererCallback leaked into select(ExecutorCallback)" - - # The triggerer's query: select(TriggererCallback) returns ONLY the triggerer row. - trig_ids = {c.id for c in session.scalars(select(TriggererCallback)).all()} - assert tc_id in trig_ids - assert ec_id not in trig_ids, "ExecutorCallback leaked into select(TriggererCallback)" - def test_session_get_requires_uuid_not_str(self, session): """Filtering the UUID id column with a plain str breaks on SQLite, so callers must wrap with ``UUID(...)`` before querying.""" diff --git a/airflow-core/tests/unit/models/test_dagrun.py b/airflow-core/tests/unit/models/test_dagrun.py index e578701965648..290e616dd99ec 100644 --- a/airflow-core/tests/unit/models/test_dagrun.py +++ b/airflow-core/tests/unit/models/test_dagrun.py @@ -73,6 +73,8 @@ ) from airflow.sdk.definitions.callback import AsyncCallback from airflow.sdk.definitions.deadline import DeadlineAlert, DeadlineReference, VariableInterval +from airflow.sdk.definitions.variable import Variable +from airflow.sdk.exceptions import AirflowRuntimeError from airflow.serialization.definitions.deadline import SerializedReferenceModels from airflow.serialization.serialized_objects import LazyDeserializedDAG from airflow.settings import get_policy_plugin_manager @@ -1384,27 +1386,16 @@ def test_dag_run_dag_versions_with_null_created_dag_version(self, dag_maker, ses VariableInterval("my_key"), ], ) + @mock.patch.object(Variable, "get") @mock.patch.object(Deadline, "prune_deadlines") - def test_dagrun_success_deadline(self, _, interval, session, deadline_test_dag): + def test_dagrun_success_deadline(self, _, mock_get, interval, session, deadline_test_dag): def on_success_callable(context): assert context["dag_run"].dag_id == "test_dag" - # Fixed future date (repo standard: no datetime.now() in tests). Far enough ahead that the - # FIXED_DATETIME deadline never fires, but within MySQL's TIMESTAMP range (< 2038-01-19) so - # the deadline_time UtcDateTime column accepts it on all backends. - future_date = datetime.datetime(2037, 1, 1, tzinfo=datetime.timezone.utc) - - # Seed a REAL Variable row so the VariableInterval resolves through the actual - # secrets chain (metastore backend) on the scheduler's session -- the code under test no - # longer calls Variable.get, so mocking it would be a false green (see regression: reading - # only the variable table bypassed env/secrets resolution). - if isinstance(interval, VariableInterval): - # Seed via the metastore model (not the SDK Variable imported above, whose set() routes - # through SUPERVISOR_COMMS) so the row really lands in the variable table on this session. - from airflow.models.variable import Variable as VariableModel - - VariableModel.set(key="my_key", value="5", session=session) - session.flush() + future_date = datetime.datetime.now() + datetime.timedelta(days=365) + + # First value used during resolution + mock_get.return_value = "5" scheduler_dag = deadline_test_dag( deadline=DeadlineAlert( @@ -1514,122 +1505,71 @@ def test_dagrun_success_handles_empty_deadline_list(self, mock_prune, dag_maker, mock_prune.assert_not_called() assert dag_run.state == DagRunState.SUCCESS + @mock.patch.object(Variable, "get") @mock.patch.object(Deadline, "prune_deadlines") - def test_dagrun_deadline_variable_interval_missing_variable_is_isolated( - self, _, session, deadline_test_dag - ): - """A VariableInterval whose backing Variable is missing must NOT abort DagRun creation. - - ``VariableInterval.resolve()`` raises ``ValueError`` for a missing/invalid Variable. - That resolution happens inside ``_process_dagrun_deadline_alerts`` during - ``create_dagrun``; previously the error propagated out and aborted the whole run, - silently stopping the DAG from scheduling. The per-alert ``try``/``except`` now isolates - the failure: the DagRun is created, the bad deadline is skipped (logged), and no Deadline - row is written. (Isolation must NOT use ``begin_nested`` here — ``create_dagrun`` runs - under the scheduler ``prohibit_commit`` guard, where a SAVEPOINT release would raise - ``UNEXPECTED COMMIT`` and skip every scheduled DagRun's deadlines.) - """ - # No Variable named "missing_key" is seeded in any backend, so the scheduler-side resolver - # returns None -> ValueError(...could not be resolved...), isolated by the per-alert except. - # Fixed future date (repo standard: no datetime.now() in tests). Far enough ahead that the - # FIXED_DATETIME deadline never fires, but within MySQL's TIMESTAMP range (< 2038-01-19) so - # the deadline_time UtcDateTime column accepts it on all backends. - future_date = datetime.datetime(2037, 1, 1, tzinfo=datetime.timezone.utc) + def test_dagrun_deadline_variable_interval_stable(self, _, mock_get, session, deadline_test_dag): + future_date = datetime.datetime.now() + datetime.timedelta(days=365) + + # First value used during resolution. + mock_get.return_value = "60" scheduler_dag = deadline_test_dag( deadline=DeadlineAlert( reference=DeadlineReference.FIXED_DATETIME(future_date), - interval=VariableInterval("missing_key"), + interval=VariableInterval("my_key"), callback=AsyncCallback(empty_callback_for_deadline), ), ) - # DagRun creation must succeed despite the unresolvable VariableInterval. dag_run = self.create_dag_run( dag=scheduler_dag, - task_states={"task_1": TaskInstanceState.SUCCESS}, + task_states={"task_1": TaskInstanceState.SUCCESS, "task_2": TaskInstanceState.SUCCESS}, session=session, ) - assert dag_run is not None - - # The bad deadline was skipped — no Deadline row created. - deadline = session.execute(select(Deadline)).scalars().one_or_none() - assert deadline is None + dag_run.dag = scheduler_dag - @mock.patch.object(Deadline, "prune_deadlines") - def test_dagrun_deadline_variable_interval_resolves_from_env_var( - self, _, session, deadline_test_dag, monkeypatch - ): - """A VariableInterval backed by an ``AIRFLOW_VAR_*`` env var (no DB row) must resolve. + # First update resolve interval to "5". + dag_run.update_state(session=session) - Regression guard: the scheduler-side resolver must go through the full secrets chain - (env vars + secrets backends + metadata DB), not read only the ``variable`` table. A - table-only read returns None for an env/secrets-backed Variable, and the per-alert - ``except`` then silently drops the deadline. Here the Variable lives ONLY in the - environment, so a correct resolver creates the Deadline and a regressed one drops it. - """ - # Variable exists only as an env var — never written to the variable table. - # VariableInterval values are interpreted as SECONDS (see coerce_to_timedelta). - monkeypatch.setenv("AIRFLOW_VAR_ENV_INTERVAL_KEY", "7") - future_date = datetime.datetime(2037, 1, 1, tzinfo=datetime.timezone.utc) + deadline = session.execute(select(Deadline)).scalars().one_or_none() + first_deadline_time = deadline.deadline_time - scheduler_dag = deadline_test_dag( - deadline=DeadlineAlert( - reference=DeadlineReference.FIXED_DATETIME(future_date), - interval=VariableInterval("env_interval_key"), - callback=AsyncCallback(empty_callback_for_deadline), - ), - ) + # Change Variable value after resolution. + mock_get.return_value = "120" - dag_run = self.create_dag_run( - dag=scheduler_dag, - task_states={"task_1": TaskInstanceState.SUCCESS}, - session=session, - ) - assert dag_run is not None + # Run again (This should not change existing deadline). + dag_run.update_state(session=session) - # The env-var-backed interval resolved (7 seconds) -> a Deadline row WAS created. deadline = session.execute(select(Deadline)).scalars().one_or_none() - assert deadline is not None - assert deadline.deadline_time == future_date + datetime.timedelta(seconds=7) + assert deadline.deadline_time == first_deadline_time @mock.patch.object(Deadline, "prune_deadlines") - def test_dagrun_deadline_decode_failure_is_isolated(self, _, session, deadline_test_dag): - """A deadline alert that fails to *decode* must NOT abort DagRun creation either. - - ``decode_deadline_alert`` can raise for a malformed/legacy serialized blob (e.g. a - None interval after a partial downgrade, an invalid interval type, or a corrupt - reference dict). That decode happens inside the per-alert SAVEPOINT, so the failure is - isolated just like a resolve-time failure: the DagRun is created and the bad deadline - skipped, rather than the corrupt row taking down scheduling for the whole DAG. - """ - # Fixed future date (repo standard: no datetime.now() in tests). Far enough ahead that the - # FIXED_DATETIME deadline never fires, but within MySQL's TIMESTAMP range (< 2038-01-19) so - # the deadline_time UtcDateTime column accepts it on all backends. - future_date = datetime.datetime(2037, 1, 1, tzinfo=datetime.timezone.utc) - scheduler_dag = deadline_test_dag( - deadline=DeadlineAlert( - reference=DeadlineReference.FIXED_DATETIME(future_date), - interval=datetime.timedelta(hours=1), - callback=AsyncCallback(empty_callback_for_deadline), - ), - ) + def test_dagrun_deadline_variable_interval_missing_variable_fails(self, _, session, deadline_test_dag): + mock_err = mock.Mock() + mock_err.error.value = "MISSING_DEADLINE" + mock_err.detail = "missing deadline" - # Simulate a corrupt/legacy blob: decoding the alert raises. - with mock.patch( - "airflow.serialization.definitions.dag.decode_deadline_alert", - side_effect=ValueError("corrupt deadline alert blob"), + with mock.patch.object( + Variable, + "get", + side_effect=AirflowRuntimeError(mock_err), ): - dag_run = self.create_dag_run( - dag=scheduler_dag, - task_states={"task_1": TaskInstanceState.SUCCESS}, - session=session, + future_date = datetime.datetime.now() + datetime.timedelta(days=365) + + scheduler_dag = deadline_test_dag( + deadline=DeadlineAlert( + reference=DeadlineReference.FIXED_DATETIME(future_date), + interval=VariableInterval("missing_key"), + callback=AsyncCallback(empty_callback_for_deadline), + ), ) - assert dag_run is not None - # No Deadline row — the undecodable alert was skipped, not fatal. - deadline = session.execute(select(Deadline)).scalars().one_or_none() - assert deadline is None + with pytest.raises(ValueError, match="not found"): + self.create_dag_run( + dag=scheduler_dag, + task_states={"task_1": TaskInstanceState.SUCCESS}, + session=session, + ) @pytest.mark.parametrize( diff --git a/airflow-core/tests/unit/models/test_deadline.py b/airflow-core/tests/unit/models/test_deadline.py index f1d89215db267..3635bcadcbea5 100644 --- a/airflow-core/tests/unit/models/test_deadline.py +++ b/airflow-core/tests/unit/models/test_deadline.py @@ -26,6 +26,7 @@ from sqlalchemy import select from sqlalchemy.exc import SQLAlchemyError +from airflow.api_fastapi.core_api.datamodels.dag_run import DAGRunResponse from airflow.models import DagRun from airflow.models.deadline import Deadline, _fetch_from_db from airflow.providers.standard.operators.empty import EmptyOperator @@ -211,23 +212,6 @@ def test_repr_without_callback_kwargs(self, dagrun, session): assert f"needed by {DEFAULT_DATE}" in repr_str assert TEST_CALLBACK_PATH in repr_str - def test_repr_with_dagrun_id_but_no_dagrun_relationship(self, deadline_orm): - """__repr__ must NOT raise when dagrun_id is set but the dagrun relationship is None. - - The FK (dagrun_id) can be set while the relationship resolves to None — e.g. the DagRun - was deleted (ondelete=CASCADE) and this is a stale/expired in-memory Deadline. A __repr__ - that raised AttributeError here would break log lines, tracebacks, and debugger displays - exactly when something is already going wrong. The repr falls back to an id-only form. - """ - # Sever the relationship while keeping the FK id (simulates deleted/detached DagRun). - deadline_orm.dagrun = None - assert deadline_orm.dagrun_id is not None - - repr_str = repr(deadline_orm) # must not raise - assert "[DagRun Deadline]" in repr_str - assert f"Run: {deadline_orm.dagrun_id}" in repr_str - assert "Dag: " in repr_str - @pytest.mark.db_test def test_bundle_name_propagated_to_callback(self, dagrun, session): """The bundle name is forwarded to the callback so the triggerer can resolve its team.""" @@ -264,17 +248,13 @@ def test_handle_miss(self, dagrun, session): assert deadline_orm.missed - # DagRun identifiers are stored at the top level of callback.data for routing - assert deadline_orm.callback.data["dag_id"] == dagrun.dag_id - assert deadline_orm.callback.data["run_id"] == dagrun.run_id + callback_kwargs = deadline_orm.callback.data["kwargs"] + context = callback_kwargs.pop("context") + assert callback_kwargs == TEST_CALLBACK_KWARGS - # Deadline-specific info is in top-level data (not kwargs) — the context builder - # exposes them as context["deadline"] = {"id": ..., "deadline_time": ...} - assert deadline_orm.callback.data["deadline_id"] == str(deadline_orm.id) - assert deadline_orm.callback.data["deadline_time"] == deadline_orm.deadline_time.isoformat() - - # User kwargs remain unchanged - assert deadline_orm.callback.data["kwargs"] == TEST_CALLBACK_KWARGS + assert context["deadline"]["id"] == deadline_orm.id + assert context["deadline"]["deadline_time"].timestamp() == deadline_orm.deadline_time.timestamp() + assert context["dag_run"] == DAGRunResponse.model_validate(dagrun).model_dump(mode="json") @pytest.mark.db_test @@ -539,19 +519,6 @@ def test_average_runtime_min_runs_validation(self): with pytest.raises(ValueError, match="min_runs must be at least 1"): DeadlineReference.AVERAGE_RUNTIME(max_runs=10, min_runs=-1) - def test_average_runtime_min_runs_cannot_exceed_max_runs(self): - """``min_runs > max_runs`` is unsatisfiable: the query is ``LIMIT max_runs`` and then - requires ``len(durations) >= min_runs``, so the deadline would silently never be - created. It must be rejected at authoring/deserialize time, not produce an inert deadline. - """ - match = "cannot exceed max_runs" - - # Deserialize path (e.g. a hand-edited / legacy serialized blob) must reject it. - with pytest.raises(ValueError, match=match): - SerializedReferenceModels.AverageRuntimeDeadline.deserialize_reference( - {"max_runs": 5, "min_runs": 10} - ) - class TestDeadlineReference: """DeadlineReference lives in definitions/deadlines.py but properly testing them requires DB access.""" diff --git a/airflow-core/tests/unit/models/test_deadline_alert.py b/airflow-core/tests/unit/models/test_deadline_alert.py index 1ea54bfd69d75..a9b1854f6abce 100644 --- a/airflow-core/tests/unit/models/test_deadline_alert.py +++ b/airflow-core/tests/unit/models/test_deadline_alert.py @@ -117,37 +117,6 @@ def test_deadline_alert_repr(self, deadline_alert_orm, deadline_reference): assert "interval=1m" in repr_str assert repr(deadline_alert_orm.callback_def) in repr_str - @pytest.mark.parametrize( - ("interval", "expected"), - [ - # The PRODUCTION shape after migration 0117: ``interval`` is a JSON dict holding the - # Airflow-serialized form, NOT a bare number. A fixed timedelta serializes to - # ``{"__classname__": "datetime.timedelta", "__data__": }``. - pytest.param( - {"__classname__": "datetime.timedelta", "__data__": 7200.0}, "interval=2h", id="timedelta_2h" - ), - # A corrupted dict without ``__data__`` must still render (no raise) as dynamic. - pytest.param({"unexpected": "shape"}, "interval=dynamic", id="corrupted_dict_dynamic"), - ], - ) - def test_deadline_alert_repr_does_not_raise_on_json_dict_interval( - self, deadline_alert_orm, interval, expected - ): - """``DeadlineAlert.__repr__`` must never raise for the PRODUCTION (JSON-dict) ``interval`` shape. - - Regression for a bug where ``__repr__`` did ``isinstance(self.interval, datetime.timedelta)`` - while ``datetime`` is imported as the *class* (``from datetime import datetime``), so - ``datetime.timedelta`` raised ``AttributeError`` — and since ``interval`` is *always* a JSON - dict in production (post-migration-0117), every real ``repr()`` hit that branch and raised - (a ``__repr__`` that raises breaks logs/tracebacks/debuggers — the same class as the - ``Deadline.__repr__`` None-deref fix). The old ``test_deadline_alert_repr`` used a *bare int* - fixture (``DEADLINE_INTERVAL = 60``) which only exercised the int/float branch and masked it. - """ - deadline_alert_orm.interval = interval - repr_str = repr(deadline_alert_orm) # must not raise - assert "[DeadlineAlert]" in repr_str - assert expected in repr_str - def test_deadline_alert_matches_definition(self, session, deadline_reference): alert1 = DeadlineAlert( serialized_dag_id=SERIALIZED_DAG_ID, diff --git a/airflow-core/tests/unit/models/test_deadline_callback_adversarial.py b/airflow-core/tests/unit/models/test_deadline_callback_adversarial.py deleted file mode 100644 index d331f46f79cee..0000000000000 --- a/airflow-core/tests/unit/models/test_deadline_callback_adversarial.py +++ /dev/null @@ -1,137 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Adversarial QA wave 7 for the deadline-callback feature. - -These tests exercise the *real* models / scheduler-query / handle_event code on -SQLite, targeting ``handle_miss`` idempotency / latent fragility (duplicate -Trigger creation). -""" - -from __future__ import annotations - -import pytest -import time_machine -from sqlalchemy import select - -from airflow.models import DagRun, Trigger -from airflow.models.deadline import Deadline -from airflow.providers.standard.operators.empty import EmptyOperator -from airflow.sdk.definitions.callback import AsyncCallback -from airflow.utils.state import CallbackState, DagRunState - -from tests_common.test_utils import db -from unit.models import DEFAULT_DATE - -pytestmark = [pytest.mark.db_test] - - -async def qaw7_callback(): - """Awaitable used as the callback target for these tests.""" - return None - - -TEST_CALLBACK_PATH = f"{__name__}.{qaw7_callback.__name__}" -DAG_ID = "qaw7_dag" - - -def _clean_db(): - # Order matters for SQLite FK enforcement: a Callback may reference a Trigger - # (callback.trigger_id -> trigger.id), and clear_db_runs() deletes Trigger rows. - # Delete deadlines and callbacks first so no callback still references a trigger - # when clear_db_runs() issues DELETE FROM trigger. - db.clear_db_deadline() - db.clear_db_callbacks() - db.clear_db_triggers() - db.clear_db_runs() - db.clear_db_dags() - - -@pytest.fixture -def session(): - from airflow.utils.session import create_session - - with create_session() as session: - yield session - - -@pytest.fixture -def dagrun(session, dag_maker): - _clean_db() - with dag_maker(DAG_ID): - EmptyOperator(task_id="task") - with time_machine.travel(DEFAULT_DATE): - dag_maker.create_dagrun(state=DagRunState.QUEUED, logical_date=DEFAULT_DATE) - session.commit() - dr = session.scalars(select(DagRun)).all() - assert len(dr) == 1 - yield dr[0] - _clean_db() - - -def _make_deadline(dagrun, deadline_time, kwargs=None): - return Deadline( - deadline_time=deadline_time, - callback=AsyncCallback(TEST_CALLBACK_PATH, kwargs or {"arg": "v"}), - dagrun_id=dagrun.id, - dag_id=dagrun.dag_id, - deadline_alert_id=None, - ) - - -# --------------------------------------------------------------------------- -# 1. handle_miss idempotency / latent fragility -# --------------------------------------------------------------------------- -class TestHandleMissIdempotency: - def test_double_handle_miss_does_not_duplicate_trigger_or_orphan_first(self, dagrun, session): - """ - Calling ``handle_miss`` twice on the *same in-memory object* (simulating a - re-evaluation / retry-after-flush-before-commit) must NOT double-create a - Trigger. ``handle_miss`` carries an idempotency guard (``if self.missed: - return``): the first call flips ``missed=True`` and queues exactly one - Trigger, the second call short-circuits before re-running ``queue()``. - - Wave 6 predicted the latent fragility that an unguarded ``queue()`` would - mint a fresh ``Trigger.from_object(...)`` on every call and orphan the first; - the guard added in ``Deadline.handle_miss`` is what makes that unreachable. - """ - deadline = _make_deadline(dagrun, DEFAULT_DATE) - session.add(deadline) - session.flush() - - deadline.handle_miss(session) - session.flush() - first_trigger_id = deadline.callback.trigger_id - assert first_trigger_id is not None - assert deadline.callback.state == CallbackState.QUEUED - assert deadline.missed is True - - # Second call on the same object — the early-return guard short-circuits. - deadline.handle_miss(session) - session.flush() - second_trigger_id = deadline.callback.trigger_id - - # No new Trigger was minted; the callback still points at the original. - assert second_trigger_id is not None - assert second_trigger_id == first_trigger_id, ( - "Idempotency guard: handle_miss must not re-queue / mint a second Trigger" - ) - - all_triggers = session.scalars(select(Trigger)).all() - # Exactly one Trigger row exists — no orphan left behind. - assert len(all_triggers) == 1 - assert all_triggers[0].id == first_trigger_id diff --git a/airflow-core/tests/unit/models/test_deadline_callback_context_construction.py b/airflow-core/tests/unit/models/test_deadline_callback_context_construction.py deleted file mode 100644 index f74ab08b10536..0000000000000 --- a/airflow-core/tests/unit/models/test_deadline_callback_context_construction.py +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Tests for the triggerer/deadline callback context-construction layers (PR #66608). - -Covers ``TriggerRunner._build_context_from_dag_run_data`` ``_deadline`` handling -and SDK callback path resolution for partial / lambda / bound-method callables. -""" - -from __future__ import annotations - -from airflow.jobs.triggerer_job_runner import TriggerRunner - -# Base valid dag_run_data accepted by DRDataModel. -_BASE_DAG_RUN_DATA: dict = { - "dag_id": "example_dag", - "run_id": "manual__2024-01-01", - "logical_date": "2024-01-01T00:00:00+00:00", - "data_interval_start": None, - "data_interval_end": None, - "run_after": "2024-01-01T00:00:00+00:00", - "start_date": "2024-01-01T00:01:00+00:00", - "end_date": None, - "run_type": "manual", - "state": "running", - "conf": {}, - "consumed_asset_events": [], - "partition_key": None, -} - - -# --------------------------------------------------------------------------- -# _build_context_from_dag_run_data with the _deadline key variants -# --------------------------------------------------------------------------- -class TestDeadlineKey: - def test_deadline_popped_before_model_construction(self): - """_deadline must be popped so it does not trip extra='forbid'.""" - data = {**_BASE_DAG_RUN_DATA, "_deadline": {"id": "x", "deadline_time": "t"}} - ctx = TriggerRunner._build_context_from_dag_run_data(data) - assert ctx["deadline"] == {"id": "x", "deadline_time": "t"} - # the input dict was mutated by pop - acceptable, but verify no leak into model - assert "_deadline" not in ctx["dag_run"].model_dump() diff --git a/airflow-core/tests/unit/models/test_deadline_handle_miss_reentrancy.py b/airflow-core/tests/unit/models/test_deadline_handle_miss_reentrancy.py deleted file mode 100644 index 97ceb566a424a..0000000000000 --- a/airflow-core/tests/unit/models/test_deadline_handle_miss_reentrancy.py +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Tests for Deadline.handle_miss re-entrancy and pre-populated routing data (PR #66608).""" - -from __future__ import annotations - -import pytest -import time_machine -from sqlalchemy import select - -from airflow.models import DagRun -from airflow.models.deadline import Deadline -from airflow.providers.standard.operators.empty import EmptyOperator -from airflow.utils.state import DagRunState - -from tests_common.test_utils import db -from unit.models import DEFAULT_DATE - -DAG_ID = "handle_miss_reentrancy_dag" -TEST_CALLBACK_PATH = "unit.models.test_deadline_handle_miss_reentrancy.callback_for_deadline" -TEST_CALLBACK_KWARGS = {"arg1": "value1"} - - -async def callback_for_deadline(): - pass - - -def _clean_db(): - db.clear_db_dags() - db.clear_db_runs() - db.clear_db_deadline() - - -@pytest.fixture -def dagrun(session, dag_maker): - with dag_maker(DAG_ID): - EmptyOperator(task_id="TASK_ID") - with time_machine.travel(DEFAULT_DATE): - dag_maker.create_dagrun(state=DagRunState.QUEUED, logical_date=DEFAULT_DATE) - session.commit() - return session.scalars(select(DagRun)).all()[0] - - -@pytest.mark.db_test -class TestHandleMissReentrancy: - @staticmethod - def setup_method(): - _clean_db() - - @staticmethod - def teardown_method(): - _clean_db() - - def test_routing_data_persists_across_commit_and_refetch_executor(self, dagrun, session): - """handle_miss must ``flag_modified`` the JSON ``data`` column so the routing keys it - writes actually persist. The other tests assert only the in-memory dict (which holds the - mutation regardless of flag_modified); this one commits, expires, and RE-FETCHES the row - so it would fail if flag_modified were missing and the JSON mutation were silently dropped. - Uses an ExecutorCallback because its ``dag_run_id`` routing key is required by - ``_enqueue_executor_callbacks`` at pickup — losing it would break context-building. - """ - from airflow.models.callback import Callback, ExecutorCallback - from airflow.sdk.definitions.callback import SyncCallback - - deadline = Deadline( - deadline_time=DEFAULT_DATE, - callback=SyncCallback(TEST_CALLBACK_PATH, TEST_CALLBACK_KWARGS), - dagrun_id=dagrun.id, - dag_id=dagrun.dag_id, - deadline_alert_id=None, - ) - session.add(deadline) - session.flush() - callback_id = deadline.callback.id - assert isinstance(deadline.callback, ExecutorCallback) - - deadline.handle_miss(session) - session.commit() - session.expire_all() # drop all in-memory state — force a fresh DB read - - refetched = session.get(Callback, callback_id) - # These routing keys must have survived the JSON-column persist (flag_modified working). - assert refetched.data["dag_id"] == dagrun.dag_id - assert refetched.data["run_id"] == dagrun.run_id - assert refetched.data["deadline_id"] == str(deadline.id) - assert refetched.data["deadline_time"] == DEFAULT_DATE.isoformat() - # dag_run_id is the executor-specific routing key (consumed by _enqueue_executor_callbacks). - assert refetched.data["dag_run_id"] == str(dagrun.id) diff --git a/airflow-core/tests/unit/models/test_deadline_scheduler_resilience.py b/airflow-core/tests/unit/models/test_deadline_scheduler_resilience.py deleted file mode 100644 index e297a4aaa771e..0000000000000 --- a/airflow-core/tests/unit/models/test_deadline_scheduler_resilience.py +++ /dev/null @@ -1,149 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Resilience tests for the scheduler deadline-detection loop. - -Covers: -* A DeadlineReference evaluated against a DagRun missing the referenced field. -* Error isolation in the scheduler loop: one deadline whose ``handle_miss`` - raises must not crash the scheduler nor block the other overdue deadlines. -""" - -from __future__ import annotations - -from datetime import datetime, timedelta, timezone as dt_timezone - -import pytest - -from airflow.models.deadline import Deadline -from airflow.providers.standard.operators.empty import EmptyOperator -from airflow.sdk.definitions.callback import AsyncCallback -from airflow.utils.state import DagRunState - -from tests_common.test_utils import db -from unit.models import DEFAULT_DATE - -pytestmark = [pytest.mark.db_test] - - -async def deadline_callback(): - """Awaitable used as the callback target.""" - - -CB_PATH = f"{__name__}.{deadline_callback.__name__}" - - -def _clean(): - # Order matters: Deadline -> Callback (Callback.trigger_id FK -> trigger) -> Trigger. - db.clear_db_deadline() - db.clear_db_deadline_alert() - db.clear_db_callbacks() - db.clear_db_triggers() - db.clear_db_runs() - db.clear_db_dags() - - -@pytest.fixture(autouse=True) -def _cleanup(): - _clean() - yield - _clean() - - -# --------------------------------------------------------------------------- -# Helpers for scheduler-level scenarios. -# --------------------------------------------------------------------------- - - -def _make_dagrun(dag_maker, session, dag_id, logical_date, run_id): - with dag_maker(dag_id): - EmptyOperator(task_id="t") - dr = dag_maker.create_dagrun(state=DagRunState.RUNNING, logical_date=logical_date, run_id=run_id) - session.flush() - return dr - - -def _make_deadline(session, dagrun, deadline_time, callback=None): - dl = Deadline( - deadline_time=deadline_time, - callback=callback or AsyncCallback(CB_PATH), - dagrun_id=dagrun.id, - deadline_alert_id=None, - ) - session.add(dl) - session.flush() - return dl - - -# --------------------------------------------------------------------------- -# Scenario 6 + 4: error isolation in the REAL scheduler detection loop. -# A bad deadline (handle_miss raises) must not crash the scheduler nor prevent -# the other overdue deadlines from being processed. -# -# This drives the real SchedulerJobRunner._execute() (num_runs=1), patching the -# real Deadline.handle_miss so exactly one of three overdue deadlines raises. -# The desired behaviour: scheduler does not crash, and both good deadlines are -# still handled. -# --------------------------------------------------------------------------- - - -def test_scheduler_loop_isolates_one_bad_deadline(dag_maker, session, monkeypatch): - from airflow.jobs.job import Job - from airflow.jobs.scheduler_job_runner import SchedulerJobRunner - - try: - from airflow.executors.local_executor import LocalExecutor as _Exec - except Exception: - from tests_common.test_utils.mock_executor import MockExecutor as _Exec - - past = datetime(2025, 6, 1, tzinfo=dt_timezone.utc) - timedelta(days=4000) - # Three dagruns, each with one overdue deadline. - deadlines = [] - for i in range(3): - dr = _make_dagrun(dag_maker, session, f"dl_iso_{i}", DEFAULT_DATE + timedelta(days=i), f"run_{i}") - deadlines.append(_make_deadline(session, dr, past)) - session.commit() - bad_id = deadlines[1].id - - real_handle_miss = Deadline.handle_miss - processed: list = [] - - def patched(self, sess): - if self.id == bad_id: - raise RuntimeError("induced handle_miss failure") - processed.append(self.id) - return real_handle_miss(self, sess) - - monkeypatch.setattr(Deadline, "handle_miss", patched) - - job = Job() - runner = SchedulerJobRunner(job=job, num_runs=1, executors=[_Exec()]) - - crashed = None - try: - runner._execute() - except Exception as e: - crashed = e - - good_ids = {deadlines[0].id, deadlines[2].id} - assert crashed is None, ( - f"scheduler crashed on one bad deadline instead of isolating it: {crashed!r}. " - f"Processed before crash: {processed}." - ) - assert good_ids.issubset(set(processed)), ( - f"good deadlines not all processed: processed={processed}, expected superset of {good_ids}" - ) diff --git a/airflow-core/tests/unit/models/test_prune_deadlines.py b/airflow-core/tests/unit/models/test_prune_deadlines.py deleted file mode 100644 index b0368434072a5..0000000000000 --- a/airflow-core/tests/unit/models/test_prune_deadlines.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Adversarial QA coverage for ``Deadline.prune_deadlines``. - -``prune_deadlines`` is the batch-delete path the scheduler invokes (via -``DagRun.update_state`` -> ``dagrun.py:1237``) when a DagRun completes on time: -deadlines that no longer need to fire are removed. These tests drill into the -on-time/overdue/pending selection logic, the (lack of) batching, callback -cascade behaviour, and concurrent-mutation edge cases against a real DB. -""" - -from __future__ import annotations - -from datetime import timedelta -from typing import TYPE_CHECKING - -import pytest -import time_machine -from sqlalchemy import select - -from airflow.models import DagRun -from airflow.models.callback import Callback -from airflow.models.deadline import Deadline -from airflow.providers.standard.operators.empty import EmptyOperator -from airflow.sdk.definitions.callback import AsyncCallback -from airflow.utils.state import DagRunState - -from tests_common.test_utils import db -from unit.models import DEFAULT_DATE - -if TYPE_CHECKING: - from sqlalchemy.orm import Session - -DAG_ID = "qaw18_prune_dag" - - -async def _qaw18_callback(): - pass - - -CALLBACK_PATH = f"{__name__}.{_qaw18_callback.__name__}" - - -def _clean_db(): - db.clear_db_dags() - db.clear_db_runs() - db.clear_db_deadline() - - -@pytest.fixture -def dagrun(session, dag_maker): - with dag_maker(DAG_ID): - EmptyOperator(task_id="task_id") - with time_machine.travel(DEFAULT_DATE): - dag_maker.create_dagrun(state=DagRunState.QUEUED, logical_date=DEFAULT_DATE) - session.commit() - return session.scalars(select(DagRun)).one() - - -def _make_deadline(session: Session, *, dagrun_id: int, deadline_time, state=None) -> Deadline: - deadline = Deadline( - deadline_time=deadline_time, - callback=AsyncCallback(CALLBACK_PATH), - dagrun_id=dagrun_id, - dag_id=DAG_ID, - deadline_alert_id=None, - ) - session.add(deadline) - session.flush() - if state is not None: - deadline.callback.state = state - session.add(deadline.callback) - session.flush() - return deadline - - -@pytest.mark.db_test -class TestPruneDeadlines: - @staticmethod - def setup_method(): - _clean_db() - - @staticmethod - def teardown_method(): - _clean_db() - - def test_handle_miss_then_prune_does_not_delete_missed(self, dagrun, session): - """ - Inverse race: handle_miss marks the deadline missed and queues the callback - BEFORE the on-time prune runs. prune must NOT delete a deadline already marked - ``missed`` — its callback is owned by the scheduler/triggerer, and cascade-deleting - the deadline would silently drop that queued callback (a lost-callback window). - - prune_deadlines explicitly filters ``~Deadline.missed`` so a missed deadline (and its - queued callback) survives even if the DagRun's end_date would otherwise match the - on-time predicate. (In the real scheduler a missed deadline always has - ``deadline_time < now <= end_date`` so it can't match the on-time predicate anyway; - the explicit filter makes the invariant robust to future callers and clock skew.) - """ - d = _make_deadline(session, dagrun_id=dagrun.id, deadline_time=DEFAULT_DATE + timedelta(hours=1)) - d.handle_miss(session) - session.flush() - assert d.missed is True - deadline_id = d.id - callback_id = d.callback.id - - # DagRun reports on-time completion (end_date <= deadline_time) — the exact condition - # that would have pruned the row before the ~missed guard was added. - dagrun.end_date = DEFAULT_DATE - session.add(dagrun) - session.flush() - - deleted = Deadline.prune_deadlines(session=session, conditions={DagRun.id: dagrun.id}) - session.flush() - - # The missed deadline and its queued callback both survive. - assert deleted == 0 - assert session.get(Deadline, deadline_id) is not None - assert session.get(Callback, callback_id) is not None diff --git a/airflow-core/tests/unit/models/test_trigger.py b/airflow-core/tests/unit/models/test_trigger.py index 12a0d67978a5f..1243a9112f97a 100644 --- a/airflow-core/tests/unit/models/test_trigger.py +++ b/airflow-core/tests/unit/models/test_trigger.py @@ -254,39 +254,6 @@ def test_submit_failure(session, create_task_instance): assert updated_task_instance.next_method == "__fail__" -def test_submit_failure_terminalizes_callback(session): - """A CallbackTrigger that fails (lands in failed_triggers → submit_failure) must terminalize - its Callback row as FAILED. ``submit_failure`` previously only handled deferred TaskInstances, - so a callback trigger's row was left stuck QUEUED/RUNNING forever (a zombie — callback rows - terminalise only via an event, never via the TI failure path). This mirrors ``submit_event``'s - callback handling on the success side.""" - from airflow.utils.state import CallbackState - - trigger = Trigger(classpath="airflow.triggers.testing.SuccessTrigger", kwargs={}) - session.add(trigger) - callback = TriggererCallback(callback_def=AsyncCallback("classpath.callback")) - callback.trigger = trigger - callback.state = CallbackState.QUEUED # state handle_miss leaves it in - session.add(callback) - session.commit() - - Trigger.submit_failure(trigger.id, exc=["boom\n"], session=session) - session.flush() - session.refresh(callback) - - assert callback.state == CallbackState.FAILED, ( - "submit_failure must mark a CallbackTrigger's callback FAILED, not leave it stuck QUEUED" - ) - assert callback.output is not None - assert "boom" in callback.output - - # Terminal-absorbing: a second submit_failure must NOT flip it away from FAILED. - Trigger.submit_failure(trigger.id, exc=["again\n"], session=session) - session.flush() - session.refresh(callback) - assert callback.state == CallbackState.FAILED - - @pytest.mark.parametrize( ("event_cls", "expected"), [ diff --git a/airflow-core/tests/unit/serialization/test_deadline_reference_registry.py b/airflow-core/tests/unit/serialization/test_deadline_reference_registry.py index ff54a6b2569a8..feae74bf98043 100644 --- a/airflow-core/tests/unit/serialization/test_deadline_reference_registry.py +++ b/airflow-core/tests/unit/serialization/test_deadline_reference_registry.py @@ -98,85 +98,3 @@ def test_serialized_custom_reference_rejects_unregistered(monkeypatch): SerializedReferenceModels.SerializedCustomReference.deserialize_reference( {"__class_path": "some.other.module.UnregisteredReference"} ) - - -@pytest.mark.parametrize( - "reference_data", - [ - pytest.param({"reference_type": "TotallyUnknownReference"}, id="unknown_type_no_class_path"), - pytest.param({"reference_type": "X", "__class_path": ""}, id="empty_class_path"), - ], -) -def test_serialized_custom_reference_missing_class_path_raises_clear_error(reference_data): - """A reference routed to SerializedCustomReference but lacking a usable ``__class_path`` - (corrupt / hand-edited row, blob from a newer version, or a custom ref whose plugin is gone) - must raise a clear ValueError — NOT a bare ``KeyError: '__class_path'``.""" - with pytest.raises(ValueError, match="unrecognized reference_type"): - SerializedReferenceModels.SerializedCustomReference.deserialize_reference(reference_data) - - -class FixedDatetimeDeadline(ReferenceModels.BaseDeadlineReference): - """Custom reference whose class name deliberately collides with a builtin reference name.""" - - required_kwargs: set[str] = set() - - def serialize_reference(self) -> dict: - return {"reference_type": self.reference_name, "marker": "i-am-custom"} - - def _evaluate_with(self, *, session, **kwargs): - raise AssertionError("custom evaluate should not be exercised in this test") - - -class DagRunLogicalDateDeadline(ReferenceModels.BaseDeadlineReference): - """Custom reference colliding with a builtin that has no required deserialize fields.""" - - required_kwargs: set[str] = set() - - def serialize_reference(self) -> dict: - return {"reference_type": self.reference_name} - - def _evaluate_with(self, *, session, **kwargs): - raise AssertionError("custom evaluate should not be exercised in this test") - - -_COLLIDING_REFS = { - f"{FixedDatetimeDeadline.__module__}.FixedDatetimeDeadline": FixedDatetimeDeadline, - f"{DagRunLogicalDateDeadline.__module__}.DagRunLogicalDateDeadline": DagRunLogicalDateDeadline, -} - - -@pytest.fixture -def colliding_plugin_registry(monkeypatch): - """Advertise custom references whose names collide with builtin reference names.""" - monkeypatch.setattr( - plugins_manager, - "get_deadline_references_plugins", - lambda: _COLLIDING_REFS, - ) - return _COLLIDING_REFS - - -@pytest.mark.parametrize( - "custom_cls", - [FixedDatetimeDeadline], -) -def test_custom_reference_name_collision_routes_to_custom(colliding_plugin_registry, custom_cls): - """ - A custom reference whose class name collides with a builtin must round-trip as the custom - class, not silently decode as the builtin (which loses the user's evaluation logic or raises - a spurious KeyError on builtin-only fields). - - Regression test: ``decode_deadline_reference`` previously routed solely by the - ``reference_type`` name string, ignoring the authoritative ``__class_path`` key. - """ - from airflow.serialization.decoders import decode_deadline_reference - from airflow.serialization.encoders import encode_deadline_reference - - encoded = encode_deadline_reference(custom_cls()) - # The encoder stamps __class_path for custom references regardless of name collision. - assert encoded["__class_path"] == f"{custom_cls.__module__}.{custom_cls.__name__}" - - decoded = decode_deadline_reference(encoded) - - assert isinstance(decoded, SerializedReferenceModels.SerializedCustomReference) - assert isinstance(decoded.inner_ref, custom_cls) diff --git a/airflow-core/tests/unit/triggers/test_callback.py b/airflow-core/tests/unit/triggers/test_callback.py index 1b48bcee7157b..99eca603323bb 100644 --- a/airflow-core/tests/unit/triggers/test_callback.py +++ b/airflow-core/tests/unit/triggers/test_callback.py @@ -17,8 +17,6 @@ from __future__ import annotations -import sys -from collections import deque from unittest import mock import pytest @@ -29,8 +27,7 @@ TEST_MESSAGE = "test_message" TEST_CALLBACK_PATH = "classpath.test_callback" -TEST_CALLBACK_KWARGS = {"message": TEST_MESSAGE} -TEST_CALLBACK_CONTEXT = {"dag_run": "test"} +TEST_CALLBACK_KWARGS = {"message": TEST_MESSAGE, "context": {"dag_run": "test"}} class ExampleAsyncNotifier(BaseNotifier): @@ -51,13 +48,10 @@ class TestCallbackTrigger: @pytest.fixture def trigger(self): """Create a fresh trigger per test to avoid shared mutable state.""" - trigger = CallbackTrigger( + return CallbackTrigger( callback_path=TEST_CALLBACK_PATH, callback_kwargs=dict(TEST_CALLBACK_KWARGS), ) - # Simulate the TriggerRunner setting context (built from dag_run_data) - trigger._callback_context = TEST_CALLBACK_CONTEXT - return trigger @pytest.fixture def mock_import_string(self): @@ -99,33 +93,10 @@ async def test_run_success_with_async_function(self, trigger, mock_import_string success_event = await anext(trigger_gen) mock_import_string.assert_called_once_with(TEST_CALLBACK_PATH) # AsyncMock accepts **kwargs, so _accepts_context returns True and context is passed through - mock_callback.assert_called_once_with(**TEST_CALLBACK_KWARGS, context=TEST_CALLBACK_CONTEXT) + mock_callback.assert_called_once_with(**TEST_CALLBACK_KWARGS) assert success_event.payload[PAYLOAD_STATUS_KEY] == CallbackState.SUCCESS assert success_event.payload[PAYLOAD_BODY_KEY] == callback_return_value - @pytest.mark.asyncio - async def test_run_renders_jinja_kwargs(self, mock_import_string): - """String kwargs containing Jinja are rendered against the context before the callback is - called — matching the synchronous executor path (regression test for the async path - previously passing kwargs through verbatim).""" - mock_callback = mock.AsyncMock(return_value="ok") - mock_import_string.return_value = mock_callback - - trigger = CallbackTrigger( - callback_path=TEST_CALLBACK_PATH, - callback_kwargs={"rendered": "run={{ dag_run }}", "plain": "no-jinja", "n": 42}, - ) - trigger._callback_context = TEST_CALLBACK_CONTEXT # {"dag_run": "test"} - - trigger_gen = trigger.run() - await anext(trigger_gen) # RUNNING - await anext(trigger_gen) # SUCCESS - - # "{{ dag_run }}" -> "test"; non-jinja string and non-string kwargs pass through untouched. - mock_callback.assert_called_once_with( - rendered="run=test", plain="no-jinja", n=42, context=TEST_CALLBACK_CONTEXT - ) - @pytest.mark.asyncio async def test_run_success_with_notifier(self, trigger, mock_import_string): """Test trigger handles async notifier classes correctly.""" @@ -158,136 +129,6 @@ async def test_run_failure(self, trigger, mock_import_string): failure_event = await anext(trigger_gen) mock_import_string.assert_called_once_with(TEST_CALLBACK_PATH) # AsyncMock accepts **kwargs, so _accepts_context returns True and context is passed through - mock_callback.assert_called_once_with(**TEST_CALLBACK_KWARGS, context=TEST_CALLBACK_CONTEXT) + mock_callback.assert_called_once_with(**TEST_CALLBACK_KWARGS) assert failure_event.payload[PAYLOAD_STATUS_KEY] == CallbackState.FAILED assert all(s in failure_event.payload[PAYLOAD_BODY_KEY] for s in ["raise", "RuntimeError", exc_msg]) - - @pytest.mark.asyncio - async def test_run_with_both_runtime_and_stale_kwargs_context(self, mock_import_string): - """A 3.2.x-serialized callback trigger has a stale ``context`` in ``callback_kwargs``; a 3.3 - TriggerRunner ALSO sets ``_callback_context`` (it does so unconditionally for callback - triggers with dag_run_data — triggerer_job_runner.py:1360). The runtime context must WIN, - and the stale kwargs ``context`` MUST be stripped so it is not double-passed (which would - raise ``TypeError: got multiple values for keyword argument 'context'``). - """ - mock_callback = mock.AsyncMock(return_value="ok") - mock_import_string.return_value = mock_callback - - trigger = CallbackTrigger( - callback_path=TEST_CALLBACK_PATH, - callback_kwargs={"message": TEST_MESSAGE, "context": {"dag_run": "stale_legacy"}}, - ) - # New path: the TriggerRunner injected a fresh runtime context. - trigger._callback_context = {"dag_run": "fresh_runtime"} - - trigger_gen = trigger.run() - running_event = await anext(trigger_gen) - assert running_event.payload[PAYLOAD_STATUS_KEY] == CallbackState.RUNNING - - # Must NOT raise "multiple values for keyword argument 'context'". - success_event = await anext(trigger_gen) - - # Runtime context wins; the stale kwargs context is stripped (passed exactly once). - mock_callback.assert_called_once_with(message=TEST_MESSAGE, context={"dag_run": "fresh_runtime"}) - assert success_event.payload[PAYLOAD_STATUS_KEY] == CallbackState.SUCCESS - - -class TestEnsureBundleModuleRegistered: - """Tests for _ensure_bundle_module_registered.""" - - def test_adds_bundle_path_to_sys_path_for_sibling_imports(self, tmp_path): - """The bundle dir must be put on sys.path so callbacks can import sibling modules. - - Parity with the executor/sync path (callback_supervisor) which appends the bundle - path to sys.path; without it, ``import sibling_helper`` inside an async callback - fails with ModuleNotFoundError on the triggerer. - """ - from airflow.triggers.callback import _ensure_bundle_module_registered - from airflow.utils.file import get_unique_dag_module_name - - stem = "dag_with_sibling" - (tmp_path / f"{stem}.py").write_text("LOADED = True\n") - (tmp_path / "sibling_helper.py").write_text("HELPER = 'ok'\n") - mod_name = get_unique_dag_module_name(str(tmp_path / f"{stem}.py")) - - fake_bundle = mock.Mock() - fake_bundle.name = "test-bundle" - fake_bundle.path = tmp_path - - bundle_path = str(tmp_path) - original_sys_path = list(sys.path) - try: - with mock.patch("airflow.triggers.callback.DagBundlesManager") as mock_mgr: - mock_mgr.return_value.get_all_dag_bundles.return_value = [fake_bundle] - _ensure_bundle_module_registered(f"{mod_name}.my_func") - - assert bundle_path in sys.path - # The sibling module is now importable because its directory is on sys.path. - import importlib - - sibling = importlib.import_module("sibling_helper") - assert sibling.HELPER == "ok" - finally: - sys.modules.pop(mod_name, None) - sys.modules.pop("sibling_helper", None) - sys.path[:] = original_sys_path - - -async def _callback_raising_system_exit(**kwargs): - """An async callback that raises a BaseException (not Exception) subclass. - - ``CallbackTrigger.run()`` only catches ``Exception``, so a ``SystemExit`` raised - here escapes ``run()`` and propagates into ``TriggerRunner.run_trigger``'s - ``async for`` loop — exactly the path the BaseException handler guards. - """ - raise SystemExit(2) - - -class TestCallbackTriggerBaseExceptionHandling: - """Regression tests for the triggerer BaseException handler (fix #3). - - A callback that raises a ``BaseException`` subclass (``SystemExit``, - ``KeyboardInterrupt``, ``GeneratorExit``, or a custom ``BaseException``) must NOT - crash the triggerer event loop or leave the Callback row stuck in ``RUNNING`` - forever. ``run_trigger`` must instead emit a terminal ``FAILED`` event for the - callback and return without re-raising. - """ - - def test_callback_trigger_system_exit_emits_failed_event_and_does_not_propagate(self, monkeypatch): - """A SystemExit from a CallbackTrigger callback yields a FAILED event, not a crash.""" - import asyncio - - from airflow.jobs.triggerer_job_runner import TriggerRunner - - # Disable the greenback portal so run_trigger doesn't require the supervisor's - # event-loop setup (parity with how the inline run_trigger tests drive it). - monkeypatch.setenv("AIRFLOW_DISABLE_GREENBACK_PORTAL", "true") - - callback_path = f"{__name__}._callback_raising_system_exit" - trigger = CallbackTrigger(callback_path=callback_path, callback_kwargs={}) - - trigger_runner = TriggerRunner() - trigger_runner.triggers = { - 1: {"task": mock.MagicMock(spec=asyncio.Task), "is_watcher": False, "name": "cb", "events": 0} - } - - # Must NOT raise SystemExit out of run_trigger — that is what would tear down - # the triggerer event loop. The handler swallows it and returns cleanly. - asyncio.run(trigger_runner.run_trigger(1, trigger)) - - # The RUNNING event is emitted by CallbackTrigger.run() before the callback is - # invoked; the terminal FAILED event is emitted by the BaseException handler. - statuses = [entry.event.payload[PAYLOAD_STATUS_KEY] for entry in trigger_runner.events] - assert CallbackState.FAILED in statuses, ( - "BaseException handler must emit a terminal FAILED event so the Callback " - "row is not stuck in RUNNING forever" - ) - # Callback triggers are terminalised via an event, never via the failure queue. - assert trigger_runner.failed_triggers == deque() - # The FAILED body names the BaseException type so operators can diagnose it. - failed_bodies = [ - entry.event.payload.get(PAYLOAD_BODY_KEY) - for entry in trigger_runner.events - if entry.event.payload[PAYLOAD_STATUS_KEY] == CallbackState.FAILED - ] - assert any("SystemExit" in (body or "") for body in failed_bodies) diff --git a/shared/module_loading/src/airflow_shared/module_loading/__init__.py b/shared/module_loading/src/airflow_shared/module_loading/__init__.py index 5cccce6c1420a..238506ae52dae 100644 --- a/shared/module_loading/src/airflow_shared/module_loading/__init__.py +++ b/shared/module_loading/src/airflow_shared/module_loading/__init__.py @@ -18,8 +18,6 @@ from __future__ import annotations import functools -import importlib.machinery -import importlib.util import inspect import logging import pkgutil @@ -27,7 +25,6 @@ from collections import defaultdict from collections.abc import Callable, Iterator from importlib import import_module -from pathlib import Path from typing import TYPE_CHECKING from .dag_file import ( @@ -138,43 +135,6 @@ def qualname(o: object | Callable, use_qualname: bool = False, exclude_module: b return name -def load_mangled_dag_module(mod_name: str, file_path: str) -> bool: - """ - Load a DAG bundle file into sys.modules under its mangled unusual_prefix_{hash}_{stem} name. - - The DAG processor assigns each DAG file a unique module name - (unusual_prefix_{sha1_of_filepath}_{stem}) to prevent import collisions between bundles. - Callback paths are serialized using that mangled name, so both the executor subprocess - and the triggerer must register the file under exactly that name before calling - import_string(). - - Returns True if the module was registered, False if the file was not found or load failed. - """ - path = Path(file_path) - if not path.exists(): - log.warning("Cannot load mangled DAG module %s: file not found at %s", mod_name, file_path) - return False - - if mod_name in sys.modules: - return True - - try: - loader = importlib.machinery.SourceFileLoader(mod_name, str(path)) - spec = importlib.util.spec_from_loader(mod_name, loader) - module = importlib.util.module_from_spec(spec) # type: ignore[arg-type] - sys.modules[mod_name] = module - try: - loader.exec_module(module) - except Exception: - sys.modules.pop(mod_name, None) - raise - log.debug("Loaded mangled DAG module %s from %s", mod_name, file_path) - return True - except Exception: - log.warning("Failed to load mangled DAG module %s from %s", mod_name, file_path, exc_info=True) - return False - - def iter_namespace(ns: ModuleType): return pkgutil.iter_modules(ns.__path__, ns.__name__ + ".") diff --git a/shared/observability/src/airflow_shared/observability/metrics/metrics_template.yaml b/shared/observability/src/airflow_shared/observability/metrics/metrics_template.yaml index fa26e0a4c481f..6c51e32ff7798 100644 --- a/shared/observability/src/airflow_shared/observability/metrics/metrics_template.yaml +++ b/shared/observability/src/airflow_shared/observability/metrics/metrics_template.yaml @@ -321,18 +321,6 @@ metrics: legacy_name: "-" name_variables: [] - - name: "deadline_alerts.deadline_creation_failed" - description: "Number of deadline alerts that could not be created because reference evaluation raised" - type: "counter" - legacy_name: "-" - name_variables: [] - - - name: "deadline_alerts.callback_orphaned_dagrun_deleted" - description: "Number of orphaned deadline callbacks deleted because their Dag run no longer exists" - type: "counter" - legacy_name: "-" - name_variables: [] - - name: "ol.emit.failed" description: "Number of failed OpenLineage event emit attempts" type: "counter" diff --git a/shared/template_rendering/src/airflow_shared/template_rendering/__init__.py b/shared/template_rendering/src/airflow_shared/template_rendering/__init__.py index 7da5b3b611f12..ddf01a88098f8 100644 --- a/shared/template_rendering/src/airflow_shared/template_rendering/__init__.py +++ b/shared/template_rendering/src/airflow_shared/template_rendering/__init__.py @@ -78,39 +78,9 @@ def truncate_rendered_value(rendered: str, max_length: int) -> str: return result -def render_callback_kwargs(kwargs: dict, context: dict) -> dict: - """ - Render Jinja templates in string-valued deadline-callback kwargs using ``context``. - - Only plain string values containing ``{{`` are rendered. Non-string values, the - ``context`` key itself, and strings without template markers are passed through - unchanged. A template that fails to render falls back to the raw value (logged at - warning), so a bad template never aborts the callback. - - Shared between the synchronous executor path (``callback_supervisor``) and the - asynchronous triggerer path (``airflow.triggers.callback.CallbackTrigger``) so the two - render kwargs identically — keeping them in lockstep instead of one path silently - skipping rendering. - """ - from jinja2 import Template - - rendered = {} - for key, val in kwargs.items(): - if key == "context" or not isinstance(val, str) or "{{" not in val: - rendered[key] = val - continue - try: - rendered[key] = Template(val).render(context) - except Exception: - log.warning("Failed to render Jinja template in kwarg %s, using raw value", key) - rendered[key] = val - return rendered - - __all__ = [ "TRUNCATE_MIN_CONTENT_LENGTH", "TRUNCATE_PREFIX", "TRUNCATE_SUFFIX", - "render_callback_kwargs", "truncate_rendered_value", ] diff --git a/task-sdk/pyproject.toml b/task-sdk/pyproject.toml index 776852cf54ecd..8b40daca744f2 100644 --- a/task-sdk/pyproject.toml +++ b/task-sdk/pyproject.toml @@ -324,5 +324,4 @@ shared_distributions = [ "apache-airflow-shared-observability", "apache-airflow-shared-plugins-manager", "apache-airflow-shared-providers-discovery", - "apache-airflow-shared-template-rendering", ] diff --git a/task-sdk/src/airflow/sdk/definitions/callback.py b/task-sdk/src/airflow/sdk/definitions/callback.py index a559efecff4f6..4280a43513d24 100644 --- a/task-sdk/src/airflow/sdk/definitions/callback.py +++ b/task-sdk/src/airflow/sdk/definitions/callback.py @@ -17,7 +17,6 @@ from __future__ import annotations import inspect -import json from abc import ABC from collections.abc import Callable from typing import Any @@ -111,15 +110,14 @@ def __eq__(self, other): return self.serialize() == other.serialize() def __hash__(self): - # ``serialize()`` is JSON-serializable by contract (it is persisted to a JSON column), - # and ``kwargs`` may legally contain NESTED dicts/lists (e.g. ``kwargs={"config": {...}}`` - # or ``kwargs={"tags": [...]}``). The previous implementation only flattened the top level - # (``tuple(sorted(v.items()))``), so a nested dict/list value remained unhashable and - # ``hash(callback)`` — and therefore ``hash(DeadlineAlert(...))`` and any ``set``/dict use — - # raised ``TypeError: unhashable type``. Hash a canonical JSON encoding instead: it handles - # arbitrary nesting, and ``sort_keys=True`` keeps it consistent with ``__eq__`` (which - # compares ``serialize()``) regardless of key insertion order. - return hash(json.dumps(self.serialize(), sort_keys=True, default=str)) + serialized = self.serialize() + hashable_items = [] + for k, v in serialized.items(): + if isinstance(v, dict): + hashable_items.append((k, tuple(sorted(v.items())))) + else: + hashable_items.append((k, v)) + return hash(tuple(sorted(hashable_items))) class AsyncCallback(Callback): diff --git a/task-sdk/src/airflow/sdk/definitions/deadline.py b/task-sdk/src/airflow/sdk/definitions/deadline.py index 63a6badefbd77..a9da3dd3d3ea2 100644 --- a/task-sdk/src/airflow/sdk/definitions/deadline.py +++ b/task-sdk/src/airflow/sdk/definitions/deadline.py @@ -120,16 +120,6 @@ def __post_init__(self): self.min_runs = self.max_runs if self.min_runs < 1: raise ValueError("min_runs must be at least 1") - if self.min_runs > self.max_runs: - # The evaluation query is ``LIMIT max_runs``, so it samples at most ``max_runs`` - # completed runs and then requires ``len(durations) >= min_runs`` before computing - # an average. With ``min_runs > max_runs`` that condition can never be met, so the - # deadline would silently never be created no matter how many runs exist. Fail fast - # at authoring time instead of producing a permanently inert deadline. - raise ValueError( - f"min_runs ({self.min_runs}) cannot exceed max_runs ({self.max_runs}); " - "the deadline would require more completed runs than it ever samples." - ) def serialize_reference(self) -> dict[str, Any]: return { @@ -162,13 +152,6 @@ def __init__( name: str | None = None, ): self.reference = reference - - if not isinstance(interval, (timedelta, VariableInterval)): - raise ValueError( - f"interval must be a timedelta or VariableInterval, got {type(interval).__name__}. " - "A non-timedelta interval is accepted silently at authoring time but fails later when " - "the scheduler evaluates the deadline (reference datetime + interval)." - ) self.interval = interval self.name = name @@ -179,15 +162,8 @@ def __init__( def __eq__(self, other: object) -> bool: if not isinstance(other, DeadlineAlert): return NotImplemented - # Compare reference by EXACT type, not ``isinstance``. ``isinstance(self.reference, - # type(other.reference))`` is asymmetric for subclasses — a custom reference subclassing a - # builtin (``register_custom_reference`` permits any ``BaseDeadlineReference`` subclass) made - # ``alert(SubRef) == alert(BaseRef)`` True but the reverse False (violating equality - # symmetry), AND disagreed with ``__hash__`` (which keys on ``type(...).__name__``), so two - # "equal" alerts could hash differently — breaking the eq/hash invariant and set/dict use. - # Exact-type matches ``__hash__`` and is symmetric. return ( - type(self.reference) is type(other.reference) + isinstance(self.reference, type(other.reference)) and self.interval == other.interval and self.callback == other.callback ) @@ -413,19 +389,9 @@ def resolve(self) -> timedelta: value = Variable.get(self.key) except AirflowRuntimeError as e: raise ValueError(f"VariableInterval '{self.key}' not found") from e - return self.coerce_to_timedelta(value) - def coerce_to_timedelta(self, value: str | int | float | None) -> timedelta: - """ - Validate a raw Variable value and convert it into a ``timedelta``. - - Split out from :meth:`resolve` so callers that already hold the Variable value (e.g. a - scheduler-side reader that must fetch it on its own session — see - ``DAG._process_dagrun_deadline_alerts`` — to avoid committing inside ``prohibit_commit``) - can reuse the exact same validation without going through ``Variable.get``. - """ try: - seconds = int(value) # type: ignore[arg-type] # None/non-numeric handled by the except below + seconds = int(value) except (TypeError, ValueError) as e: raise ValueError( f"VariableInterval '{self.key}' must be an integer (seconds), got: {value!r}" @@ -434,15 +400,4 @@ def coerce_to_timedelta(self, value: str | int | float | None) -> timedelta: if seconds <= 0: raise ValueError(f"VariableInterval '{self.key}' must be > 0, got: {seconds}") - try: - return timedelta(seconds=seconds) - except OverflowError as e: - # ``timedelta`` caps at ~999999999 days; a wildly out-of-range value (e.g. a - # fat-fingered ``deadline_seconds`` meant as ms) raises OverflowError, which is NOT - # a ValueError subclass and would otherwise escape this method's documented - # "raises ValueError on bad input" contract with a cryptic C-int message. Translate - # it to the same clean ValueError so callers (and the deadline-creation isolation) - # get a consistent, actionable error. - raise ValueError( - f"VariableInterval '{self.key}' is too large to be a valid interval: {seconds} seconds" - ) from e + return timedelta(seconds=seconds) diff --git a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py index 3e62a90f23539..12f86dec36bff 100644 --- a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py +++ b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py @@ -32,19 +32,11 @@ import structlog from pydantic import Field, TypeAdapter -from airflow.sdk._shared.module_loading import ( - UNUSUAL_MODULE_PREFIX, - accepts_context, - accepts_keyword_args, - load_mangled_dag_module, -) -from airflow.sdk._shared.template_rendering import render_callback_kwargs +from airflow.sdk._shared.module_loading import UNUSUAL_MODULE_PREFIX, accepts_context, accepts_keyword_args from airflow.sdk.exceptions import ErrorType from airflow.sdk.execution_time.comms import ( - DagRunResult, ErrorResponse, GetConnection, - GetDagRun, GetVariable, GetVariableKeys, MaskSecret, @@ -62,7 +54,6 @@ _ensure_client, _make_process_nondumpable, ) -from airflow.utils.file import get_unique_dag_module_name if TYPE_CHECKING: from pydantic import BaseModel @@ -78,35 +69,16 @@ class _BundleInfoLike(Protocol): version: str | None -__all__ = ["CallbackSubprocess", "CallbackContextFetchError", "supervise_callback"] +__all__ = ["CallbackSubprocess", "supervise_callback"] log: FilteringBoundLogger = structlog.get_logger(logger_name="callback_supervisor") -# Distinct subprocess exit code for "could not fetch the DagRun context" — a transient, -# retryable failure (API blip, network partition, token expiry), as opposed to a genuine -# callback failure (any other non-zero exit). The supervisor maps this code to -# ``CallbackContextFetchError`` so the executor can requeue rather than terminally FAIL, -# matching the triggerer path (which re-evaluates on the next loop). 75 is the conventional -# EX_TEMPFAIL from sysexits.h ("temporary failure; user is invited to retry"). -CALLBACK_CONTEXT_FETCH_EXIT_CODE = 75 - - -class CallbackContextFetchError(RuntimeError): - """ - Raised when a deadline callback subprocess could not fetch its DagRun context. - - Distinct from a generic callback failure so the executor can treat it as transient and - requeue the callback (PENDING) instead of marking it terminally FAILED — keeping the - executor path's transient-failure behavior in lockstep with the triggerer path. - """ - # The set of messages that a callback subprocess can send to the supervisor. # This is a minimal subset of ToSupervisor: read-only access to Connections -# and Variables, plus MaskSecret for the secrets masker, plus GetDagRun for -# building context from DagRun identifiers. +# and Variables, plus MaskSecret for the secrets masker. CallbackToSupervisor = Annotated[ - GetConnection | GetDagRun | GetVariable | GetVariableKeys | MaskSecret, + GetConnection | GetVariable | GetVariableKeys | MaskSecret, Field(discriminator="type"), ] @@ -149,21 +121,18 @@ def execute_callback( # If the callback is defined within the Dag module, the module path is modified during DAG serialization. # Attempt to import it using the path of the Dag file. if module_path.startswith(UNUSUAL_MODULE_PREFIX): - if module_path in sys.modules: - module = sys.modules[module_path] - elif not dag_rel_path: + if not dag_rel_path: return False, "Dag relative path not found." - elif not bundle_path: + if not bundle_path: return False, "Bundle path not found." - else: - abs_path = Path(bundle_path) / Path(dag_rel_path) - spec = spec_from_file_location(module_path, abs_path) - if spec is None: - return False, f"Could not create module spec for {module_path}" - if spec.loader is None: - return False, f"Module spec has no loader for {module_path}" - module = module_from_spec(spec) - spec.loader.exec_module(module) + abs_path = Path(bundle_path) / Path(dag_rel_path) + spec = spec_from_file_location(module_path, abs_path) + if spec is None: + return False, f"Could not create module spec for {module_path}" + if spec.loader is None: + return False, f"Module spec has no loader for {module_path}" + module = module_from_spec(spec) + spec.loader.exec_module(module) else: module = import_module(module_path) callback_callable = getattr(module, function_name) @@ -194,14 +163,7 @@ def execute_callback( log.info("Callback executed successfully", callback_path=callback_path) return True, None - except BaseException as e: - # Catch BaseException, not just Exception: a callback that raises SystemExit or - # KeyboardInterrupt (directly, or indirectly via a library that calls sys.exit()) - # would otherwise propagate out of the forked _target and terminate the child with - # SystemExit's own code. A SystemExit(0) in particular makes the child exit 0, which - # supervise_callback reads as SUCCESS — silently recording an aborted callback as - # completed. Treating any BaseException as a failure keeps the success state honest. - # (Mirrors the BaseException handling on the triggerer/async path.) + except Exception as e: error_msg = f"Callback execution failed: {type(e).__name__}: {str(e)}" log.exception( "Callback execution failed", @@ -236,11 +198,7 @@ def start( # type: ignore[override] id: str, callback_path: str, callback_kwargs: dict, - dag_rel_path: os.PathLike[str] | str = "", - dag_id: str | None = None, - run_id: str | None = None, - deadline_id: str | None = None, - deadline_time: str | None = None, + dag_rel_path: os.PathLike[str], bundle_info: _BundleInfoLike | None = None, client: Client, logger: FilteringBoundLogger | None = None, @@ -276,12 +234,6 @@ def _target(): _log.debug( "Added bundle path to sys.path", bundle_name=bundle_info.name, path=bundle_path ) - # DAG processor loads bundle files with a mangled module name - # (unusual_prefix_{hash}_{stem}) to avoid collisions. The callback path - # was serialized using that mangled name. Register the module under that - # name so import_string can find it in the subprocess. - if callback_path and callback_path.startswith(UNUSUAL_MODULE_PREFIX): - _register_unusual_prefix_module(callback_path, bundle.path, _log) except Exception: _log.warning( "Failed to initialize DAG bundle for callback", @@ -289,33 +241,9 @@ def _target(): exc_info=True, ) - # When DagRun identifiers are provided, fetch the DagRun via SUPERVISOR_COMMS - # and build a context dict to pass to the callback function. - effective_kwargs = dict(callback_kwargs) - if dag_id and run_id: - deadline = ( - {"id": deadline_id, "deadline_time": deadline_time} - if (deadline_id or deadline_time) - else None - ) - context = _fetch_and_build_context( - task_runner.SUPERVISOR_COMMS, dag_id, run_id, _log, deadline=deadline - ) - if context is None: - _log.error( - "Cannot fetch DagRun context for callback — exiting with the transient " - "context-fetch code so the executor requeues rather than failing terminally " - "(a transient API/network blip should not permanently drop the deadline callback)", - dag_id=dag_id, - run_id=run_id, - ) - sys.exit(CALLBACK_CONTEXT_FETCH_EXIT_CODE) - effective_kwargs["context"] = context - effective_kwargs = render_callback_kwargs(effective_kwargs, context) - success, error_msg = execute_callback( callback_path=callback_path, - callback_kwargs=effective_kwargs, + callback_kwargs=callback_kwargs, dag_rel_path=dag_rel_path, bundle_path=bundle_path, log=_log, @@ -428,9 +356,6 @@ def _handle_request(self, msg: CallbackToSupervisor, log: FilteringBoundLogger, if isinstance(msg, GetConnection): resp, dump_opts = handle_get_connection(self.client, msg) - elif isinstance(msg, GetDagRun): - dr_resp = self.client.dag_runs.get_detail(msg.dag_id, msg.run_id) - resp = DagRunResult.from_api_response(dr_resp) elif isinstance(msg, GetVariable): resp, dump_opts = handle_get_variable(self.client, msg) elif isinstance(msg, GetVariableKeys): @@ -452,28 +377,6 @@ def _handle_request(self, msg: CallbackToSupervisor, log: FilteringBoundLogger, self.send_msg(resp, request_id=req_id, error=None, **dump_opts) -def _register_unusual_prefix_module(callback_path: str, bundle_path, _log) -> None: - """ - Register a DAG-bundle callback module under its unusual_prefix_{hash}_{stem} name. - - Resolves the stem from the mangled module name, walks the bundle tree to find - the file (handling nested directories), then delegates to load_mangled_dag_module. - """ - mod_name = callback_path.split(".")[0] - if mod_name in sys.modules: - return - - # Validate this is a mangled name: unusual_prefix_{hex40}_{stem} - if len(mod_name.split("_", 3)) < 4: - return - - for file_path in Path(bundle_path).rglob("*.py"): - if get_unique_dag_module_name(str(file_path)) == mod_name: - load_mangled_dag_module(mod_name, str(file_path)) - return - _log.debug("Could not find matching file for module %s in bundle", mod_name) - - def _configure_logging(log_path: str, client: Client) -> tuple[FilteringBoundLogger, BinaryIO]: """Configure file-based logging for the callback subprocess.""" from airflow.sdk.execution_time.supervisor import _remote_logging_conn @@ -489,52 +392,12 @@ def _configure_logging(log_path: str, client: Client) -> tuple[FilteringBoundLog return logger, log_file_descriptor -def _fetch_and_build_context( - comms, - dag_id: str, - run_id: str, - _log, - deadline: dict | None = None, -) -> dict | None: - """ - Fetch DagRun via SUPERVISOR_COMMS and build a standard context dict. - - Called inside the forked subprocess when DagRun identifiers are available. - Returns a context dict with dag_run, run_id, logical_date, ds, ts, etc. - Task-specific fields are absent since callbacks are not tied to a task. - """ - try: - from airflow.sdk.execution_time.context import build_context_from_dag_run - - response = comms.send(GetDagRun(dag_id=dag_id, run_id=run_id)) - if not isinstance(response, DagRunResult): - _log.warning( - "Unexpected response when fetching DagRun for callback context", - response_type=type(response).__name__, - ) - return None - - return build_context_from_dag_run(response, deadline=deadline) - except Exception: - _log.warning( - "Failed to fetch DagRun for callback context", - dag_id=dag_id, - run_id=run_id, - exc_info=True, - ) - return None - - def supervise_callback( *, id: str, callback_path: str, callback_kwargs: dict, - dag_rel_path: os.PathLike[str] | str = "", - dag_id: str | None = None, - run_id: str | None = None, - deadline_id: str | None = None, - deadline_time: str | None = None, + dag_rel_path: os.PathLike[str], log_path: str | None = None, bundle_info: _BundleInfoLike | None = None, token: str = "", @@ -548,10 +411,6 @@ def supervise_callback( :param callback_path: Dot-separated import path to the callback function or class. :param callback_kwargs: Keyword arguments to pass to the callback. :param dag_rel_path: Relative path to the DAG file. - :param dag_id: DAG ID for fetching DagRun context (optional, for deadline callbacks). - :param run_id: Run ID for fetching DagRun context (optional, for deadline callbacks). - :param deadline_id: Deadline ID to include in context["deadline"] (optional). - :param deadline_time: ISO deadline time to include in context["deadline"] (optional). :param log_path: Path to write logs, if required. :param bundle_info: When provided, the bundle's path is added to sys.path so callbacks in Dag Bundles are importable. :param token: Authentication token for the API client. @@ -578,10 +437,6 @@ def supervise_callback( callback_path=callback_path, callback_kwargs=callback_kwargs, dag_rel_path=dag_rel_path, - dag_id=dag_id, - run_id=run_id, - deadline_id=deadline_id, - deadline_time=deadline_time, bundle_info=bundle_info, client=client, logger=logger, @@ -597,13 +452,6 @@ def supervise_callback( exit_code=exit_code, duration=end - start, ) - if exit_code == CALLBACK_CONTEXT_FETCH_EXIT_CODE: - # Transient: the subprocess could not fetch the DagRun context. Signal the - # executor to requeue (not terminally fail) so a transient blip retries, mirroring - # the triggerer path's re-evaluate-next-loop behavior. - raise CallbackContextFetchError( - f"Callback subprocess could not fetch DagRun context (exit code {exit_code})" - ) if exit_code != 0: raise RuntimeError(f"Callback subprocess exited with code {exit_code}") return exit_code diff --git a/task-sdk/src/airflow/sdk/execution_time/context.py b/task-sdk/src/airflow/sdk/execution_time/context.py index 8d14f0b63b22b..d41306009eac7 100644 --- a/task-sdk/src/airflow/sdk/execution_time/context.py +++ b/task-sdk/src/airflow/sdk/execution_time/context.py @@ -1412,46 +1412,3 @@ def context_get_outlet_events(context: Context) -> OutletEventAccessorsProtocol: except KeyError: outlet_events = context["outlet_events"] = OutletEventAccessors() return outlet_events - - -def build_context_from_dag_run(dag_run, deadline: dict | None = None) -> dict: - """ - Build a standard callback Context dict from a DagRun-like object. - - Accepts any object with logical_date, run_id, data_interval_start, data_interval_end - attributes (e.g. DRDataModel from the execution API or DagRunResult from comms). - - Returns a context dict with dag_run, run_id, logical_date, ds, ts, etc. - Task-specific fields are absent since callbacks are not tied to a task. - - :param deadline: Optional ``{"id": ..., "deadline_time": ...}`` dict exposed as - ``context["deadline"]`` (for templates such as ``{{ deadline.deadline_time }}``). - Assembled here so the executor and triggerer callback paths produce identical context. - """ - from airflow.sdk.timezone import coerce_datetime - - context: dict = {"dag_run": dag_run} - - if logical_date := coerce_datetime(dag_run.logical_date): - ds = logical_date.strftime("%Y-%m-%d") - ts = logical_date.isoformat() - context.update( - { - "logical_date": logical_date, - "run_id": dag_run.run_id, - "ds": ds, - "ds_nodash": ds.replace("-", ""), - "ts": ts, - "ts_nodash": logical_date.strftime("%Y%m%dT%H%M%S"), - "ts_nodash_with_tz": ts.replace("-", "").replace(":", ""), - "data_interval_start": coerce_datetime(dag_run.data_interval_start), - "data_interval_end": coerce_datetime(dag_run.data_interval_end), - } - ) - else: - context["run_id"] = dag_run.run_id - - if deadline: - context["deadline"] = deadline - - return context diff --git a/task-sdk/tests/task_sdk/definitions/test_callback.py b/task-sdk/tests/task_sdk/definitions/test_callback.py index 7e3c61716b4f3..8b2bdb1bdc2c1 100644 --- a/task-sdk/tests/task_sdk/definitions/test_callback.py +++ b/task-sdk/tests/task_sdk/definitions/test_callback.py @@ -163,39 +163,12 @@ def test_callback_equality(self, callback1_args, callback2_args, should_equal): True, id="async_no_kwargs", ), - # Nested-dict kwargs must hash (and equal-hash) — the old top-level-only flattening - # raised TypeError: unhashable type: 'dict' on these. - pytest.param( - AsyncCallback, - (TEST_CALLBACK_PATH, {"config": {"retries": 3, "timeout": 30}}), - (TEST_CALLBACK_PATH, {"config": {"timeout": 30, "retries": 3}}), - True, - id="async_nested_dict_kwargs_order_independent", - ), - # List-valued kwargs must hash too (old code raised on the list value). - pytest.param( - SyncCallback, - (TEST_CALLBACK_PATH, {"tags": [1, 2, 3]}), - (TEST_CALLBACK_PATH, {"tags": [1, 2, 3]}), - True, - id="sync_list_kwargs", - ), - pytest.param( - SyncCallback, - (TEST_CALLBACK_PATH, {"config": {"retries": 3}}), - (TEST_CALLBACK_PATH, {"config": {"retries": 5}}), - False, - id="sync_nested_dict_different_values", - ), ], ) def test_callback_hash_and_set_behavior(self, callback_class, args1, args2, should_be_same_hash): callback1 = callback_class(*args1) callback2 = callback_class(*args2) - # Must not raise (the bug: nested dict/list kwargs were unhashable). assert (hash(callback1) == hash(callback2)) == should_be_same_hash - # Hashable means usable in a set/dict — exercise that too. - assert len({callback1, callback2}) == (1 if should_be_same_hash and callback1 == callback2 else 2) class TestAsyncCallback: diff --git a/task-sdk/tests/task_sdk/definitions/test_deadline.py b/task-sdk/tests/task_sdk/definitions/test_deadline.py index d75161f6afd7a..b104980e4c986 100644 --- a/task-sdk/tests/task_sdk/definitions/test_deadline.py +++ b/task-sdk/tests/task_sdk/definitions/test_deadline.py @@ -23,12 +23,7 @@ from task_sdk.definitions.test_callback import TEST_CALLBACK_KWARGS, TEST_CALLBACK_PATH, UNIMPORTABLE_DOT_PATH from airflow.sdk.definitions.callback import AsyncCallback, SyncCallback -from airflow.sdk.definitions.deadline import ( - DeadlineAlert, - DeadlineReference, - FixedDatetimeDeadline, - VariableInterval, -) +from airflow.sdk.definitions.deadline import DeadlineAlert, DeadlineReference, VariableInterval from airflow.sdk.definitions.variable import Variable from airflow.sdk.exceptions import AirflowRuntimeError @@ -47,15 +42,6 @@ TEST_DEADLINE_CALLBACK = AsyncCallback(TEST_CALLBACK_PATH, kwargs=TEST_CALLBACK_KWARGS) -class TestAverageRuntimeReference: - def test_min_runs_cannot_exceed_max_runs(self): - """``min_runs > max_runs`` is unsatisfiable (the evaluator samples at most ``max_runs`` - rows then requires ``min_runs`` of them), so it must be rejected at authoring time rather - than producing a deadline that silently never fires.""" - with pytest.raises(ValueError, match="cannot exceed max_runs"): - DeadlineReference.AVERAGE_RUNTIME(max_runs=5, min_runs=10) - - class TestDeadlineAlert: @pytest.mark.parametrize( ("test_alert", "should_equal"), @@ -136,35 +122,6 @@ def test_deadline_alert_hash(self): assert hash(alert1) == hash(alert1) assert hash(alert1) == hash(alert2) - def test_deadline_alert_eq_is_symmetric_and_hash_consistent_for_subclassed_reference(self): - """``__eq__`` must compare the reference by EXACT type, not ``isinstance``. A custom - reference subclassing a builtin (``register_custom_reference`` permits any - ``BaseDeadlineReference`` subclass) otherwise made equality ASYMMETRIC - (``alert(SubRef) == alert(BaseRef)`` True but the reverse False) and INCONSISTENT with - ``__hash__`` (which keys on ``type(...).__name__``) — two "equal" alerts hashing - differently, breaking the eq/hash invariant and set/dict use. - """ - import datetime - - class _MyFixedRef(FixedDatetimeDeadline): - pass - - dt = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc) - common = dict(interval=timedelta(hours=1), callback=AsyncCallback(TEST_CALLBACK_PATH)) - a_sub = DeadlineAlert(reference=_MyFixedRef(dt), **common) - a_base = DeadlineAlert(reference=FixedDatetimeDeadline(dt), **common) - - # A subclass-vs-builtin reference are DISTINCT types → not equal, and equality is symmetric. - assert (a_sub == a_base) == (a_base == a_sub) # symmetric - assert a_sub != a_base # exact-type: different reference types are not equal - # Two alerts with the SAME exact subclass reference are equal AND hash-equal. - a_sub2 = DeadlineAlert(reference=_MyFixedRef(dt), **common) - assert a_sub == a_sub2 - assert hash(a_sub) == hash(a_sub2) - # eq/hash invariant: if (unequal) they may differ; the broken case was equal-but-unequal-hash. - if a_sub == a_base: - assert hash(a_sub) == hash(a_base) - def test_deadline_alert_in_set(self): std_interval = timedelta(hours=1) std_callback = TEST_CALLBACK_PATH @@ -184,24 +141,6 @@ def test_deadline_alert_in_set(self): alert_set = {alert1, alert2} assert len(alert_set) == 1 - def test_deadline_alert_hash_with_nested_callback_kwargs(self): - """A DeadlineAlert whose callback has NESTED dict/list kwargs must be hashable. Previously - Callback.__hash__ only flattened the top level, so hashing the alert raised - TypeError: unhashable type: 'dict'/'list' on a perfectly legal (JSON-serializable) config.""" - nested_kwargs = {"config": {"retries": 3, "endpoints": ["a", "b"]}, "flat": "x"} - - def _make(): - return DeadlineAlert( - reference=DeadlineReference.DAGRUN_QUEUED_AT, - interval=timedelta(hours=1), - callback=AsyncCallback(TEST_CALLBACK_PATH, kwargs=nested_kwargs), - ) - - a1, a2 = _make(), _make() - # Must not raise, and equal alerts must collapse in a set. - assert hash(a1) == hash(a2) - assert len({a1, a2}) == 1 - @pytest.mark.parametrize( ("callback_class"), [ @@ -233,6 +172,7 @@ class TestVariableInterval: ("value", "expected"), [ ("3", timedelta(seconds=3)), + ("10", timedelta(seconds=10)), ("05", timedelta(seconds=5)), # leading zero ], ) @@ -247,11 +187,10 @@ def test_resolve_valid(self, mocker, value, expected): ("value", "raise_runtime", "match"), [ (None, True, "not found"), - # Out-of-range: timedelta caps at ~999999999 days, so a wildly large value raises - # OverflowError (NOT a ValueError subclass) and would escape the method's contract. - # It must be translated to the same clean ValueError as every other bad input. - ("99999999999999", False, "too large to be a valid interval"), - (str(10**30), False, "too large to be a valid interval"), + ("abc", False, "must be an integer"), + ("", False, "must be an integer"), + ("0", False, "must be > 0"), + ("-5", False, "must be > 0"), ], ) def test_resolve_invalid(self, mocker, value, raise_runtime, match): @@ -273,18 +212,3 @@ def test_resolve_invalid(self, mocker, value, raise_runtime, match): with pytest.raises(ValueError, match=match): interval.resolve() - - @pytest.mark.parametrize( - ("value", "match"), - [ - ("abc", "must be an integer"), - ("", "must be an integer"), - (None, "must be an integer"), - ("0", "must be > 0"), - ("-5", "must be > 0"), - ("99999999999999", "too large to be a valid interval"), - ], - ) - def test_coerce_to_timedelta_invalid(self, value, match): - with pytest.raises(ValueError, match=match): - VariableInterval(key="k").coerce_to_timedelta(value) diff --git a/task-sdk/tests/task_sdk/definitions/test_deadline_alert_validation.py b/task-sdk/tests/task_sdk/definitions/test_deadline_alert_validation.py deleted file mode 100644 index c747eb7481587..0000000000000 --- a/task-sdk/tests/task_sdk/definitions/test_deadline_alert_validation.py +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Authoring-time validation tests for the DeadlineAlert / DeadlineReference / Callback SDK surface. - -These tests exercise the *DAG authoring* surface (``airflow.sdk.definitions``), where a user -constructs a deadline in their DAG file. The intent is that bad input is rejected cleanly at -parse time, rather than being accepted silently and blowing up later when the scheduler evaluates -the deadline. -""" - -from __future__ import annotations - -import pytest - -from airflow.sdk.definitions.callback import AsyncCallback -from airflow.sdk.definitions.deadline import DeadlineAlert, DeadlineReference - - -async def _async_callback(): - pass - - -VALID_ASYNC_CB = AsyncCallback(_async_callback) - - -class TestDeadlineAlertIntervalValidation: - """Scenario 1: the ``interval`` argument must be a timedelta or VariableInterval.""" - - @pytest.mark.parametrize( - "interval", - [ - pytest.param(5, id="int"), - pytest.param(None, id="none"), - ], - ) - def test_non_timedelta_interval_rejected_at_authoring(self, interval): - """ - A non-timedelta interval must raise at construction time. - - Regression test: previously accepted silently and produced a broken deadline that - crashed at scheduler runtime with ``TypeError`` when computing ``base_time + interval``. - """ - with pytest.raises(ValueError, match="interval must be a timedelta or VariableInterval"): - DeadlineAlert( - reference=DeadlineReference.DAGRUN_QUEUED_AT, - interval=interval, - callback=VALID_ASYNC_CB, - ) diff --git a/task-sdk/tests/task_sdk/execution_time/test_build_context_field_edge_cases.py b/task-sdk/tests/task_sdk/execution_time/test_build_context_field_edge_cases.py deleted file mode 100644 index a82fb11fcebbf..0000000000000 --- a/task-sdk/tests/task_sdk/execution_time/test_build_context_field_edge_cases.py +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Edge-case tests for build_context_from_dag_run callback context construction (PR #66608). - -Covers missing/None DagRun attributes, context key completeness (including the -absent top-level ``conf`` key), datetime formatting edges, deeply nested payloads, -and the supervisor ``_fetch_and_build_context`` path with partial DagRun fields. -""" - -from __future__ import annotations - -import pendulum -import structlog - -from airflow.sdk.execution_time.context import build_context_from_dag_run - - -class MockDagRun: - """Bare attribute holder; only attributes explicitly set exist.""" - - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - -def _full_kwargs(): - return dict( - dag_id="d", - run_id="r", - logical_date=pendulum.datetime(2024, 1, 15, 10, 30, 0), - data_interval_start=pendulum.datetime(2024, 1, 14, 0, 0, 0), - data_interval_end=pendulum.datetime(2024, 1, 15, 0, 0, 0), - run_after=pendulum.datetime(2024, 1, 15, 0, 0, 0), - ) - - -# --------------------------------------------------------------------------- -# DagRun missing each optional attr one at a time -# --------------------------------------------------------------------------- -class TestMissingOptionalAttrs: - """Probe build_context_from_dag_run with each optional attr removed.""" - - def test_missing_run_after_is_tolerated(self): - # run_after is never read by build_context_from_dag_run, so its absence - # must NOT affect the built context. - kw = _full_kwargs() - del kw["run_after"] - dr = MockDagRun(**kw) - ctx = build_context_from_dag_run(dr) - assert ctx["run_id"] == "r" - assert ctx["ds"] == "2024-01-15" - - -# --------------------------------------------------------------------------- -# _fetch_and_build_context with null/partial DagRun fields -# --------------------------------------------------------------------------- -class TestFetchPartialFields: - def _result(self, **overrides): - from airflow.sdk.execution_time.comms import DagRunResult - - base = dict( - dag_id="test_dag", - run_id="test_run", - run_after=pendulum.datetime(2024, 1, 1, 0, 0, 0), - run_type="manual", - state="running", - consumed_asset_events=[], - ) - base.update(overrides) - return DagRunResult(**base) - - def test_full_logical_date_builds_full_context(self, mocker): - from airflow.sdk.execution_time.callback_supervisor import _fetch_and_build_context - - comms = mocker.Mock() - comms.send.return_value = self._result( - logical_date=pendulum.datetime(2024, 1, 15, 0, 0, 0), - data_interval_start=None, - data_interval_end=None, - ) - ctx = _fetch_and_build_context(comms, "test_dag", "test_run", structlog.get_logger()) - assert ctx is not None - assert ctx["ds"] == "2024-01-15" - # partial interval fields => coerce_datetime(None) is None, still keyed - assert ctx["data_interval_start"] is None - assert ctx["data_interval_end"] is None diff --git a/task-sdk/tests/task_sdk/execution_time/test_build_context_from_dag_run.py b/task-sdk/tests/task_sdk/execution_time/test_build_context_from_dag_run.py deleted file mode 100644 index 26c6545f1f7ff..0000000000000 --- a/task-sdk/tests/task_sdk/execution_time/test_build_context_from_dag_run.py +++ /dev/null @@ -1,129 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""QA tests for build_context_from_dag_run (PR #66608).""" - -from __future__ import annotations - -import pendulum - -from airflow.sdk.api.datamodels._generated import DagRunState, DagRunType -from airflow.sdk.execution_time.comms import DagRunResult -from airflow.sdk.execution_time.context import build_context_from_dag_run - - -def make_dag_run_result( - *, - dag_id="test_dag", - run_id="scheduled__2024-01-15", - logical_date=None, - data_interval_start=None, - data_interval_end=None, -): - """Helper to build a DagRunResult for tests.""" - return DagRunResult( - dag_id=dag_id, - run_id=run_id, - logical_date=logical_date, - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - run_after=pendulum.datetime(2024, 1, 15, 0, 0, 0), - run_type=DagRunType.SCHEDULED, - state=DagRunState.RUNNING, - consumed_asset_events=[], - ) - - -class TestBuildContextFromDagRun: - """ - QA tests for PR #66608 - build_context_from_dag_run helper. - - Tests cover: - - All expected context fields are present with correct values - - logical_date=None case (asset-triggered runs) - - DagRunResult (comms model) as input - """ - - def test_all_context_fields_present_with_correct_values(self): - """TEST 1: All expected context fields are populated correctly.""" - logical_date = pendulum.datetime(2024, 1, 15, 10, 30, 0) - data_interval_start = pendulum.datetime(2024, 1, 14, 0, 0, 0) - data_interval_end = pendulum.datetime(2024, 1, 15, 0, 0, 0) - - dag_run = make_dag_run_result( - run_id="manual__2024-01-15T10:30:00+00:00", - logical_date=logical_date, - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - ) - - context = build_context_from_dag_run(dag_run) - - # Core run fields - assert context["run_id"] == "manual__2024-01-15T10:30:00+00:00" - assert context["dag_run"] is dag_run - - # Date fields - assert context["ds"] == "2024-01-15", f"Expected ds=2024-01-15, got {context['ds']}" - assert context["ds_nodash"] == "20240115" - assert "ts" in context - assert "ts_nodash" in context - assert "ts_nodash_with_tz" in context - - # logical_date - assert context["logical_date"] is not None - - # Interval fields - assert context["data_interval_start"] is not None - assert context["data_interval_end"] is not None - - def test_logical_date_none_returns_minimal_context(self): - """TEST: When logical_date is None (asset-triggered), context has only run_id and dag_run.""" - dag_run = make_dag_run_result( - run_id="asset__2024-01-15", - logical_date=None, - data_interval_start=None, - data_interval_end=None, - ) - - context = build_context_from_dag_run(dag_run) - - # Must have these - assert context["run_id"] == "asset__2024-01-15" - assert context["dag_run"] is dag_run - - # Must NOT have date-derived fields - assert "ds" not in context, "ds must not be present when logical_date is None" - assert "ts" not in context, "ts must not be present when logical_date is None" - assert "logical_date" not in context - assert "data_interval_start" not in context - assert "data_interval_end" not in context - - def test_deadline_dict_is_exposed_when_passed(self): - """When a deadline dict is supplied it is exposed as ``context["deadline"]``. - - Both callback paths (executor + triggerer) pass the deadline through this single helper, - so the resulting context is identical by construction. - """ - dag_run = make_dag_run_result( - run_id="test_run", - logical_date=pendulum.datetime(2024, 1, 15, 0, 0, 0), - ) - deadline = {"id": "dl-1", "deadline_time": "2024-01-15T01:00:00+00:00"} - - context = build_context_from_dag_run(dag_run, deadline=deadline) - - assert context["deadline"] == deadline diff --git a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py b/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py index 647bda8ea87f8..c33299b313c81 100644 --- a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py +++ b/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py @@ -24,60 +24,49 @@ import uuid from dataclasses import dataclass from operator import attrgetter -from types import SimpleNamespace from typing import Any from unittest.mock import ANY, Mock, patch import pytest import structlog -from airflow.sdk._shared.template_rendering import render_callback_kwargs -from airflow.sdk._shared.timezones import timezone -from airflow.sdk.api.datamodels._generated import DagRun, DagRunState, DagRunType -from airflow.sdk.execution_time.callback_supervisor import ( - CALLBACK_CONTEXT_FETCH_EXIT_CODE, - CallbackContextFetchError, - CallbackSubprocess, - Path, - execute_callback, - supervise_callback, -) +from airflow.sdk.execution_time.callback_supervisor import CallbackSubprocess, Path, execute_callback from airflow.sdk.execution_time.comms import ( - ErrorResponse, - GetDagRun, + BundleInfo, + ConnectionResult, + GetConnection, + GetVariable, + GetVariableKeys, + MaskSecret, + VariableKeysResult, + VariableResult, _RequestFrame, ) -# A minimal DagRun instance used in the GetDagRun test case. -_MOCK_DAG_RUN = DagRun( - dag_id="test_dag", - run_id="test_run", - run_after=timezone.parse("2024-01-01T00:00:00+00:00"), - run_type=DagRunType.MANUAL, - state=DagRunState.RUNNING, - consumed_asset_events=[], -) - def callback_no_args(): """A simple callback that takes no arguments.""" return "ok" +def callback_with_kwargs(arg1, arg2): + """A callback that accepts keyword arguments.""" + return f"{arg1}-{arg2}" + + def callback_that_raises(): """A callback that always raises.""" raise ValueError("something went wrong") -def callback_that_system_exits_zero(): - """A callback that raises SystemExit(0). +class CallableClass: + """A class that returns a callable instance (like BaseNotifier).""" - A library called inside a user callback may call ``sys.exit()`` on a soft condition. - SystemExit is a BaseException, not an Exception, so an ``except Exception`` handler would - let it propagate out of the forked callback target — exiting the child with code 0, which - the supervisor reads as SUCCESS, silently recording an aborted callback as completed. - """ - raise SystemExit(0) + def __init__(self, **kwargs): + self.kwargs = kwargs + + def __call__(self, context): + return "notified" class TestExecuteCallback: @@ -93,6 +82,24 @@ class TestExecuteCallback: None, id="successful_no_args", ), + pytest.param( + f"{__name__}.callback_with_kwargs", + {"arg1": "hello", "arg2": "world"}, + Path("test.py"), + Path("bundle/path"), + True, + None, + id="successful_with_kwargs", + ), + pytest.param( + f"{__name__}.CallableClass", + {"msg": "alert"}, + Path("test.py"), + Path("bundle/path"), + True, + None, + id="callable_class_pattern", + ), pytest.param( "", {}, @@ -102,6 +109,15 @@ class TestExecuteCallback: "Callback path not found", id="empty_path", ), + pytest.param( + "nonexistent.module.function", + {}, + Path("test.py"), + Path("bundle/path"), + False, + "ModuleNotFoundError", + id="import_error", + ), pytest.param( f"{__name__}.callback_that_raises", {}, @@ -111,16 +127,14 @@ class TestExecuteCallback: "ValueError", id="execution_error", ), - # BaseException variants must be caught and reported as a failure, not allowed to - # propagate — otherwise SystemExit(0) makes the forked child exit 0 (false success). pytest.param( - f"{__name__}.callback_that_system_exits_zero", + f"{__name__}.nonexistent_function_xyz", {}, Path("test.py"), Path("bundle/path"), False, - "SystemExit", - id="system_exit_zero_is_failure", + "AttributeError", + id="attribute_error", ), ], ) @@ -164,14 +178,47 @@ class RequestCase: REQUEST_CASES = [ RequestCase( - message=GetDagRun(dag_id="test_dag", run_id="test_run"), - test_id="get_dag_run", + message=GetConnection(conn_id="test_conn"), + test_id="get_connection", + client_mock=ClientMock( + method_path="connections.get", + args=("test_conn",), + response=ConnectionResult(conn_id="test_conn", conn_type="mysql"), + ), + ), + RequestCase( + message=GetConnection(conn_id="test_conn"), + test_id="get_connection_with_password", + client_mock=ClientMock( + method_path="connections.get", + args=("test_conn",), + response=ConnectionResult(conn_id="test_conn", conn_type="mysql", password="secret"), + ), + mask_secret_args=("secret",), + ), + RequestCase( + message=GetVariable(key="test_key"), + test_id="get_variable", + client_mock=ClientMock( + method_path="variables.get", + args=("test_key",), + response=VariableResult(key="test_key", value="test_value"), + ), + ), + RequestCase( + message=GetVariableKeys(prefix="test_"), + test_id="get_variable_keys", client_mock=ClientMock( - method_path="dag_runs.get_detail", - args=("test_dag", "test_run"), - response=_MOCK_DAG_RUN, + method_path="variables.keys", + kwargs={"prefix": "test_", "limit": 1000, "offset": 0}, + response=VariableKeysResult(keys=["test_key"], total_entries=1), ), ), + RequestCase( + message=MaskSecret(value="super_secret", name="api_key"), + test_id="mask_secret", + mask_secret_args=("super_secret", "api_key"), + ), ] @pytest.fixture @@ -217,6 +264,85 @@ def test_handle_requests( mock_client_method.assert_called_once_with(*client_mock.args, **client_mock.kwargs) +class TestConfigureLogging: + """Tests for _configure_logging remote logging connection setup.""" + + def test_configure_logging_uses_remote_logging_conn(self, tmp_path, mocker): + """Verify that _remote_logging_conn is invoked with the client during logging setup.""" + from airflow.sdk.execution_time.callback_supervisor import _configure_logging + + mock_client = mocker.Mock() + log_path = str(tmp_path / "callback.log") + + mock_remote_conn = mocker.patch( + "airflow.sdk.execution_time.supervisor._remote_logging_conn", + ) + + logger, fd = _configure_logging(log_path, mock_client) + fd.close() + + mock_remote_conn.assert_called_once_with(mock_client) + + +class TestUploadLogs: + """Tests for CallbackSubprocess._upload_logs.""" + + @pytest.fixture + def callback_subprocess(self, mocker): + read_end, write_end = socket.socketpair() + proc = CallbackSubprocess( + process_log=mocker.MagicMock(), + id="12345678-1234-5678-1234-567812345678", + pid=12345, + stdin=write_end, + client=mocker.Mock(), + process=mocker.Mock(), + ) + yield proc + read_end.close() + write_end.close() + + def test_wait_calls_upload_logs_after_subprocess_completes(self, callback_subprocess, mocker): + """wait() should call _upload_logs() after the subprocess finishes.""" + mock_upload = mocker.patch( + "airflow.sdk.execution_time.callback_supervisor.CallbackSubprocess._upload_logs" + ) + mocker.patch("airflow.sdk.execution_time.callback_supervisor.CallbackSubprocess._monitor_subprocess") + mocker.patch.object(callback_subprocess, "selector") + + callback_subprocess.wait() + + mock_upload.assert_called_once() + + def test_upload_logs_delegates_to_upload_to_remote(self, callback_subprocess, mocker): + """_upload_logs calls upload_to_remote with the process logger and no ti.""" + mock_upload = mocker.patch("airflow.sdk.log.upload_to_remote") + mocker.patch("airflow.sdk.execution_time.supervisor._remote_logging_conn") + + callback_subprocess._upload_logs() + + mock_upload.assert_called_once_with(callback_subprocess.process_log) + + def test_upload_logs_failure_is_swallowed(self, callback_subprocess, mocker): + """Upload failures must not propagate — callback exit code should still be returned.""" + mocker.patch( + "airflow.sdk.log.upload_to_remote", + side_effect=RuntimeError("S3 unreachable"), + ) + mocker.patch("airflow.sdk.execution_time.supervisor._remote_logging_conn") + + callback_subprocess._upload_logs() + + def test_upload_logs_no_remote_logging_configured(self, callback_subprocess, mocker): + """When remote logging is not configured, _upload_logs completes without error.""" + mock_load_handler = mocker.patch("airflow.sdk.log.load_remote_log_handler", return_value=None) + mocker.patch("airflow.sdk.execution_time.supervisor._remote_logging_conn") + + callback_subprocess._upload_logs() + + mock_load_handler.assert_called_once() + + class TestCallbackExecutionTimeout: """Tests for the callback_execution_timeout config enforcement.""" @@ -278,34 +404,6 @@ def test_timeout_kills_long_running_subprocess(self, callback_subprocess, mocker mock_kill.assert_called_once_with(proc, signal.SIGTERM, escalation_delay=5.0, force=True) - def test_wait_reports_failure_when_timeout_kill_leaves_exit_code_none(self, callback_subprocess, mocker): - """End-to-end: a timed-out callback must surface as a non-zero exit code. - - When the timeout branch fires and kill() returns without _exit_code being set - (e.g. the kill is still escalating, or the process record is gone), wait() must - default _exit_code to 1 so the callback is recorded as FAILURE, never SUCCESS. - """ - proc = callback_subprocess - - time_values = iter([100.0, 100.0, 106.0]) - mocker.patch("airflow.sdk.execution_time.callback_supervisor.time.monotonic", side_effect=time_values) - mocker.patch.object(CallbackSubprocess, "_service_subprocess", autospec=True, return_value=None) - mocker.patch.object( - CallbackSubprocess, "_get_callback_execution_timeout", autospec=True, return_value=5 - ) - # kill() that does NOT set _exit_code, mimicking a SIGTERM still in flight. - mock_kill = mocker.patch.object(CallbackSubprocess, "kill", autospec=True) - mock_upload = mocker.patch.object(CallbackSubprocess, "_upload_logs", autospec=True) - mocker.patch.object(proc, "selector") - - exit_code = proc.wait() - - mock_kill.assert_called_once_with(proc, signal.SIGTERM, escalation_delay=5.0, force=True) - assert exit_code == 1 - assert proc._exit_code == 1 - # Logs are still uploaded even on the timeout/failure path. - mock_upload.assert_called_once() - class TestCallbackSubprocessStart: """Verify that CallbackSubprocess.start() properly initializes and executes the callback target.""" @@ -342,6 +440,26 @@ def base_mocks_setup(self, mock_supervisor_comms): self.mock_execute_callback = mock_execute yield + @pytest.fixture + def mock_bundle_setup(self): + """Setup bundle-related mocks.""" + with patch("airflow.dag_processing.bundles.manager.DagBundlesManager") as mock_manager_class: + mock_bundle = Mock() + bundle_path = Path("/path/to/bundle") + mock_bundle.path = bundle_path + mock_bundle.name = "test-bundle" + + mock_bundle_manager = Mock() + mock_manager_class.return_value = mock_bundle_manager + mock_bundle_manager.get_bundle.return_value = mock_bundle + + yield { + "manager_class": mock_manager_class, + "manager": mock_bundle_manager, + "bundle": mock_bundle, + "bundle_path": bundle_path, + } + def test_execute_callback_receives_correct_parameters(self, base_start_kwargs): """Test that execute_callback receives the correct parameters.""" CallbackSubprocess.start(**base_start_kwargs) @@ -359,311 +477,54 @@ def test_execute_callback_receives_correct_parameters(self, base_start_kwargs): log=ANY, ) - def test_context_includes_deadline_metadata(self, base_start_kwargs): - """When dag_id/run_id are provided, the fetched context is passed to the callback, and - deadline_id/deadline_time are injected as context['deadline'] — the core deliverable of - the context-fetch PR (giving a deadline callback access to its own deadline metadata).""" - deadline_id = str(uuid.uuid4()) - deadline_time = "2026-06-13T00:00:00+00:00" - adjusted = { - **base_start_kwargs, - "dag_id": "my_dag", - "run_id": "my_run", - "deadline_id": deadline_id, - "deadline_time": deadline_time, - } - - with patch( - "airflow.sdk.execution_time.callback_supervisor._fetch_and_build_context", - return_value={"dag_run": Mock(), "run_id": "my_run"}, - ) as mock_fetch: - CallbackSubprocess.start(**adjusted) - self.mock_super_start.call_args.kwargs["target"]() - - # The deadline dict is assembled by the caller and threaded into - # _fetch_and_build_context (which forwards it to the shared build_context_from_dag_run), - # so both the executor and triggerer paths produce context["deadline"] identically. - mock_fetch.assert_called_once() - assert mock_fetch.call_args.kwargs["deadline"] == { - "id": deadline_id, - "deadline_time": deadline_time, - } - passed_kwargs = self.mock_execute_callback.call_args.kwargs["callback_kwargs"] - assert "context" in passed_kwargs - - def test_context_has_no_deadline_key_when_metadata_absent(self, base_start_kwargs): - """With dag_id/run_id but no deadline_id/deadline_time, the context is still passed but - carries no 'deadline' key (a non-deadline callback that just needs DagRun context).""" - adjusted = {**base_start_kwargs, "dag_id": "my_dag", "run_id": "my_run"} - - with patch( - "airflow.sdk.execution_time.callback_supervisor._fetch_and_build_context", - return_value={"dag_run": Mock(), "run_id": "my_run"}, - ) as mock_fetch: - CallbackSubprocess.start(**adjusted) - self.mock_super_start.call_args.kwargs["target"]() - - # No deadline metadata -> the caller passes deadline=None, so context carries no deadline key. - assert mock_fetch.call_args.kwargs["deadline"] is None - passed_kwargs = self.mock_execute_callback.call_args.kwargs["callback_kwargs"] - assert "context" in passed_kwargs - assert "deadline" not in passed_kwargs["context"] - - -class TestFetchAndBuildContext: - """Tests for _fetch_and_build_context.""" + def test_execute_callback_with_bundle_info_should_pass_correct_parameters( + self, base_start_kwargs, mock_bundle_setup + ): + """Test that execute_callback receives the correct parameters when bundle_info is provided.""" + bundle_info = BundleInfo(name="test-bundle", version="1.0") + adjusted_kwargs = {**base_start_kwargs, "bundle_info": bundle_info} - def test_returns_none_on_unexpected_response(self, mocker): - """When the comms channel returns a non-DagRunResult, context should be None.""" - from airflow.sdk.execution_time.callback_supervisor import _fetch_and_build_context + CallbackSubprocess.start(**adjusted_kwargs) + self.mock_super_start.call_args.kwargs["target"]() - mock_comms = mocker.Mock() - mock_comms.send.return_value = ErrorResponse( - error="API_SERVER_ERROR", - detail={"status_code": 404, "message": "DagRun not found"}, + self.mock_super_start.assert_called_with( + id=uuid.UUID(adjusted_kwargs["id"]), client=adjusted_kwargs["client"], target=ANY, logger=None ) - log = structlog.get_logger() - - result = _fetch_and_build_context(mock_comms, "missing_dag", "missing_run", log) - - assert result is None - def test_returns_none_on_exception(self, mocker): - """When the comms channel raises an exception, context should be None.""" - from airflow.sdk.execution_time.callback_supervisor import _fetch_and_build_context - - mock_comms = mocker.Mock() - mock_comms.send.side_effect = RuntimeError("connection lost") - log = structlog.get_logger() - - result = _fetch_and_build_context(mock_comms, "dag_1", "run_1", log) - - assert result is None - - def test_returns_context_dict_on_success(self, mocker): - """When DagRunResult is returned, a context dict is built.""" - from airflow.sdk.execution_time.callback_supervisor import _fetch_and_build_context - from airflow.sdk.execution_time.comms import DagRunResult - - mock_comms = mocker.Mock() - mock_comms.send.return_value = DagRunResult( - dag_id="test_dag", - run_id="test_run", - run_after=timezone.parse("2024-01-01T00:00:00+00:00"), - run_type="manual", - state="running", - consumed_asset_events=[], + mock_bundle_setup["manager"].get_bundle.assert_called_once_with( + name=bundle_info.name, + version=bundle_info.version, ) - log = structlog.get_logger() - - result = _fetch_and_build_context(mock_comms, "test_dag", "test_run", log) - - assert result is not None - assert result["dag_run"].dag_id == "test_dag" - assert result["dag_run"].run_id == "test_run" + mock_bundle_setup["bundle"].initialize.assert_called_once() + self.mock_execute_callback.assert_called_with( + bundle_path=str(mock_bundle_setup["bundle_path"]), + callback_kwargs=adjusted_kwargs["callback_kwargs"], + callback_path=adjusted_kwargs["callback_path"], + dag_rel_path=adjusted_kwargs["dag_rel_path"], + log=ANY, + ) -class TestLoadMangledModule: - """Tests for _load_mangled_module.""" - - def test_skips_registration_when_already_in_sys_modules(self, tmp_path, mocker): - import sys - - from airflow.sdk.execution_time.callback_supervisor import _register_unusual_prefix_module - from airflow.utils.file import get_unique_dag_module_name - - stem = "cached_dag" - (tmp_path / f"{stem}.py").write_text("def fn(): return 'cached'\n") - mod_name = get_unique_dag_module_name(str(tmp_path / f"{stem}.py")) - - log = mocker.Mock() - _register_unusual_prefix_module(f"{mod_name}.fn", tmp_path, log) - assert mod_name in sys.modules - - # Second call should be a no-op; module should not be re-loaded - (tmp_path / f"{stem}.py").write_text("def fn(): return 'new'\n") - _register_unusual_prefix_module(f"{mod_name}.fn", tmp_path, log) - - assert sys.modules[mod_name].fn() == "cached" - sys.modules.pop(mod_name) - - def test_finds_file_with_dashes_and_dots_in_name(self, tmp_path, mocker): - """File with dashes/dots (my-dag.file.py) is found despite sanitized stem.""" - import sys - - from airflow.sdk.execution_time.callback_supervisor import _register_unusual_prefix_module - from airflow.utils.file import get_unique_dag_module_name - - # File with dashes and dots: stem "my-dag.file" sanitizes to "my_dag_file" - dag_file = tmp_path / "my-dag.file.py" - dag_file.write_text("def callback(): return 'dashed'\n") - mod_name = get_unique_dag_module_name(str(dag_file)) - - # Verify the stem was sanitized - assert "my_dag_file" in mod_name - - log = mocker.Mock() - _register_unusual_prefix_module(f"{mod_name}.callback", tmp_path, log) - - assert mod_name in sys.modules - assert sys.modules[mod_name].callback() == "dashed" - sys.modules.pop(mod_name) - - -class TestRenderCallbackKwargsAdversarial: - """Adversarial QA-loop tests for render_callback_kwargs (mutation/SSTI/missing-key/guard edges).""" - - @pytest.fixture - def context(self): - return { - "dag_run": SimpleNamespace(run_id="manual__2026-06-10"), - "ds": "2026-06-10", - } - - def test_renders_string_kwarg_using_context(self, context): - """A string kwarg containing {{ }} is rendered against the context.""" - result = render_callback_kwargs({"msg": "Deadline {{ dag_run.run_id }} missed"}, context) - assert result["msg"] == "Deadline manual__2026-06-10 missed" - - @pytest.mark.parametrize( - "value", - [42, 3.14, True, None, {"nested": "{{ dag_run.run_id }}"}, ["{{ ds }}"], ("{{ ds }}",)], - ) - def test_non_string_values_pass_through_untouched(self, context, value): - """Non-string kwargs (int/float/bool/None/dict/list/tuple) are returned unchanged.""" - result = render_callback_kwargs({"arg": value}, context) - assert result["arg"] == value + def test_callback_supervisor_with_bundle_info_should_adjust_sys_path( + self, base_start_kwargs, mock_bundle_setup + ): + """Test that bundle_path is added to sys.path when bundle path is provided.""" + with patch("sys.path", new_callable=list) as mock_sys_path: + bundle_info = BundleInfo(name="test-bundle", version="1.0") + adjusted_kwargs = {**base_start_kwargs, "bundle_info": bundle_info} - def test_string_without_markers_passes_through(self, context): - """A plain string with no {{ markers is returned verbatim, never rendered.""" - result = render_callback_kwargs({"msg": "no templates here"}, context) - assert result["msg"] == "no templates here" + CallbackSubprocess.start(**adjusted_kwargs) + self.mock_super_start.call_args.kwargs["target"]() - @pytest.mark.parametrize( - "broken", - ["{{ undefined.attr.fails() }}", "{{ 1/0 }}", "{{ dag_run.does_not_exist.boom }}"], - ) - def test_broken_template_does_not_raise_and_returns_raw(self, context, broken): - """A template that errors at render time returns the raw value and logs a warning.""" - with patch("airflow.sdk._shared.template_rendering.log") as mock_log: - result = render_callback_kwargs({"msg": broken}, context) - assert result["msg"] == broken - mock_log.warning.assert_called_once() - - def test_does_not_mutate_input_dict(self, context): - """Rendering builds a new dict and leaves the caller's kwargs untouched.""" - original = {"msg": "Run {{ dag_run.run_id }}"} - result = render_callback_kwargs(original, context) - assert original == {"msg": "Run {{ dag_run.run_id }}"} - assert result is not original - assert result["msg"] == "Run manual__2026-06-10" - - -class TestJinjaContextIntegration: - """ - Integration tests coupling the two newly-added features: - - 1. ``build_context_from_dag_run`` (context.py) which constructs the real context dict. - 2. ``render_callback_kwargs`` (callback_supervisor.py) which Jinja-renders kwargs. - - These verify the ACTUAL fields produced by ``build_context_from_dag_run`` render - correctly when a deadline-callback kwarg references them, rather than rendering - against a hand-rolled stub dict (which is what the unit tests above do). - """ + assert str(mock_bundle_setup["bundle_path"]) in mock_sys_path - @pytest.fixture - def logical_date(self): - # Aware datetime so coerce_datetime keeps the tz; a non-midnight time so we can - # distinguish ds (date only) from ts (full isoformat) and the _nodash variants. - return timezone.parse("2026-06-01T07:30:45+00:00") + def test_callback_supervisor_should_exit_on_error(self, base_start_kwargs): + """Test that callback supervisor exits if execute_callback returns an error.""" + self.mock_execute_callback.return_value = (False, "Some error occurred") - @pytest.fixture - def real_context(self, logical_date): - """Build a real context via build_context_from_dag_run, then inject deadline.""" - from airflow.sdk.execution_time.context import build_context_from_dag_run - - dag_run = SimpleNamespace( - run_id="manual__2026-06-01T07:30:45+00:00", - logical_date=logical_date, - data_interval_start=logical_date, - data_interval_end=timezone.parse("2026-06-01T08:30:45+00:00"), - dag_id="deadline_dag", - ) - context = build_context_from_dag_run(dag_run) - context["deadline"] = {"id": "dl-1", "deadline_time": "2026-06-01T07:00:00Z"} - return context - - def test_real_context_fields_render(self, real_context): - """run_id, ds, and the injected deadline_time all render from a real context.""" - kwargs = { - "msg": "run={{ dag_run.run_id }} ds={{ ds }} dl={{ deadline.deadline_time }}", - "context": real_context, - } - result = render_callback_kwargs(kwargs, real_context) + CallbackSubprocess.start(**base_start_kwargs) - assert result["msg"] == ( - "run=manual__2026-06-01T07:30:45+00:00 ds=2026-06-01 dl=2026-06-01T07:00:00Z" - ) - # The reserved context key is passed through untouched. - assert result["context"] is real_context - - def test_logical_date_none_branch_renders_empty_for_missing_keys(self): - """ - When logical_date is None, build_context_from_dag_run only emits dag_run + run_id. - - A template referencing {{ ds }} must render to empty string (Jinja Undefined), - NOT raise -- mirroring the behavior callers depend on. - """ - from airflow.sdk.execution_time.context import build_context_from_dag_run - - dag_run = SimpleNamespace( - run_id="asset_triggered_run", - logical_date=None, - data_interval_start=None, - data_interval_end=None, - dag_id="asset_dag", - ) - context = build_context_from_dag_run(dag_run) - # Confirm the builder really omitted the date-derived keys. - assert "ds" not in context - assert "ts" not in context - assert set(context) == {"dag_run", "run_id"} - - kwargs = {"msg": "run={{ dag_run.run_id }} ds=[{{ ds }}] ts=[{{ ts }}]"} - result = render_callback_kwargs(kwargs, context) - - assert result["msg"] == "run=asset_triggered_run ds=[] ts=[]" - - -class TestSuperviseCallbackExitMapping: - """supervise_callback maps the subprocess exit code to the right exception so the executor can - distinguish a transient context-fetch failure (retryable) from a real callback failure.""" - - def _run(self, mocker, exit_code): - proc = mocker.Mock() - proc.wait.return_value = exit_code - mocker.patch.object(CallbackSubprocess, "start", return_value=proc) - return supervise_callback( - id=str(uuid.uuid4()), - callback_path="some.module.cb", - callback_kwargs={}, - dag_id="d", - run_id="r", - client=mocker.Mock(), - ) + with pytest.raises(SystemExit) as exc_info: + self.mock_super_start.call_args.kwargs["target"]() - def test_context_fetch_exit_code_raises_transient_error(self, mocker): - """The sentinel context-fetch exit code -> CallbackContextFetchError (retryable).""" - with pytest.raises(CallbackContextFetchError): - self._run(mocker, CALLBACK_CONTEXT_FETCH_EXIT_CODE) - - def test_other_nonzero_exit_raises_generic_runtime_error(self, mocker): - """A real callback failure (any other non-zero exit) -> generic RuntimeError, NOT the - transient subclass, so the executor still marks it terminally FAILED.""" - with pytest.raises(RuntimeError) as exc_info: - self._run(mocker, 1) - assert not isinstance(exc_info.value, CallbackContextFetchError) - - def test_zero_exit_returns_without_raising(self, mocker): - """A clean exit returns the code and raises nothing.""" - assert self._run(mocker, 0) == 0 + assert exc_info.value.code == 1 diff --git a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor_io_faults.py b/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor_io_faults.py deleted file mode 100644 index d7d776d2e3599..0000000000000 --- a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor_io_faults.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -IO-boundary fault-injection tests for the callback supervisor. - -Each test injects a fault at the IO boundary (socket recv/send, file open, fork, -comms decode) into the REAL code path under test, NOT by replacing the logic, -to verify the deadline-callback runtime surfaces faults cleanly rather than -hanging or crashing the supervisor. -""" - -from __future__ import annotations - -from unittest import mock - -import structlog - -from airflow.sdk.execution_time.callback_supervisor import _fetch_and_build_context - -log = structlog.get_logger() - - -# --------------------------------------------------------------------------- -# Scenario 1: comms send/recv raises mid-callback (socket error / broken pipe) -# --------------------------------------------------------------------------- -class TestScenario1CommsFailureMidCallback: - """SUPERVISOR_COMMS.send raises a socket error during the callback.""" - - def test_fetch_and_build_context_swallows_comms_failure(self): - """_fetch_and_build_context must return None (not raise) when comms.send raises.""" - comms = mock.MagicMock() - comms.send.side_effect = BrokenPipeError("pipe gone") - result = _fetch_and_build_context(comms, "d", "r", log) - assert result is None - - -# --------------------------------------------------------------------------- -# Scenario 2: GetDagRun comms returns malformed / partial / wrong-type payload -# --------------------------------------------------------------------------- -class TestScenario2MalformedDagRunPayload: - """_fetch_and_build_context fed a bad response from comms.send.""" - - def test_wrong_response_type_returns_none(self): - comms = mock.MagicMock() - comms.send.return_value = {"not": "a DagRunResult"} - assert _fetch_and_build_context(comms, "d", "r", log) is None