From 7dda5b59e076866de1a6c0a4426303d0e81ec762 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 09:23:44 -0600
Subject: [PATCH 01/18] RuntimeOptimizerExec SLT: red baseline for hash-join
 build-side swap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Establishes the failing test for an AQE primitive native to DataFusion's
streaming model. Scenario: `GROUP BY` over a column whose distinct
cardinality the planner can't predict (column stat = Absent), joined
against a small dimension table.

Concretely: `big` has 100K rows with `group_key = id % 5` (actual
distinct count = 5, planner sees Absent). After aggregation:
  actual rows: 5
  planner estimate: Inexact, upper-bounded by input row count = 100K

JoinSelection compares aggregated_big (100K est) vs small (100 Exact)
and picks `small` as the build side. Runtime reality: aggregated_big is
20x smaller than small. Build side is inverted from optimal.

Pencil-in target plan wraps the join in RuntimeOptimizerExec and shows
the swap: `on=[(group_key@0, id@0)]` (target) vs `on=[(id@0,
group_key@0)]` (actual today). Build side switches from small to
aggregated_big.

GROUP BY is the canonical pipeline breaker the AQE primitive needs
anyway — the AggregateExec's build phase IS the natural barrier where
RuntimeOptimizerExec reads runtime stats and re-plans the upper subplan.

Same red-then-green discipline as the parallel-window-cumulative PoC.
---
 .../test_files/runtime_optimizer.slt          | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 datafusion/sqllogictest/test_files/runtime_optimizer.slt

diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
new file mode 100644
index 000000000000..4480e52b7817
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -0,0 +1,150 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# ===========================================================================
+# RuntimeOptimizerExec — adaptive query execution primitive.
+#
+# DataFusion is in-process and streaming. Adaptive optimization here should
+# be in-process and streaming too. Pipeline breakers (SortExec, the build
+# phase of AggregateExec and HashJoinExec) already have a natural barrier
+# between their build and emit phases — they buffer their input and know
+# its true row count by the time downstream starts pulling. The cost of
+# that buffering is already paid; surfacing runtime stats from it to the
+# planner is essentially free.
+#
+# `RuntimeOptimizerExec` is inserted at static plan time above pipeline
+# breakers. When execution reaches its child's barrier, it reads the
+# child's true runtime stats, calls the optimizer with those stats to
+# rewrite its (statically-guessed) upper subplan, then executes the
+# rewritten subplan.
+#
+# Same primitive generalizes to partition coalescing, skew handling,
+# parallel window functions, and the rest of AQE. This SLT exercises the
+# first concrete use case that lands the foundation.
+# ===========================================================================
+
+# ---------------------------------------------------------------------------
+# Scenario: GROUP BY on a key whose distinct cardinality the planner can't
+# predict, joined against a small dimension.
+#
+# `big` has 100,000 rows. `group_key = id % 5` so the *actual* distinct
+# cardinality is 5, but DataFusion doesn't propagate distinct_count through
+# arithmetic — column stat is Absent. After GROUP BY group_key:
+#   actual row count: 5
+#   planner estimate: Inexact, upper-bounded by input row count = 100,000
+#
+# `small` is a 100-row dimension table with Exact stats.
+#
+# JoinSelection sees `aggregated_big` ≈ 100K (Inexact) vs `small` = 100
+# (Exact), so it picks `small` as the build side. That's exactly inverted
+# from what runtime would tell us: `aggregated_big` is only 5 rows, would
+# be the 20× cheaper build side.
+# ---------------------------------------------------------------------------
+
+statement ok
+set datafusion.execution.target_partitions = 4;
+
+statement ok
+CREATE TABLE big AS
+SELECT
+    value AS id,
+    value % 5 AS group_key,
+    1 AS payload
+FROM generate_series(0, 99999);
+
+statement ok
+CREATE TABLE small AS
+SELECT value AS id, value * 10 AS payload
+FROM generate_series(0, 99);
+
+# Sanity-check the base sizes.
+query I
+SELECT count(*) FROM big;
+----
+100000
+
+query I
+SELECT count(*) FROM small;
+----
+100
+
+# Sanity-check that aggregated_big has only 5 actual rows even though the
+# planner can't know that.
+query I
+SELECT count(*) FROM (SELECT group_key FROM big GROUP BY group_key) t;
+----
+5
+
+# ---------------------------------------------------------------------------
+# EXPLAIN — RED today. Without RuntimeOptimizerExec, JoinSelection trusts
+# the (badly-estimated) aggregated_big size and picks `small` as the build
+# side. We're penciling in the target plan that RuntimeOptimizerExec will
+# produce: the join with `aggregated_big` as the left/build child and
+# `small` as the right/probe child, wrapped in a RuntimeOptimizerExec.
+#
+# Statistics decoration is intentionally minimal; an update-mode pass once
+# RuntimeOptimizerExec renders itself will fill in exact text.
+# ---------------------------------------------------------------------------
+query TT
+EXPLAIN SELECT bg.group_key, bg.sum_payload, s.payload
+FROM (
+  SELECT group_key, SUM(payload) AS sum_payload FROM big GROUP BY group_key
+) bg
+JOIN small s ON bg.group_key = s.id;
+----
+logical_plan
+01)Projection: bg.group_key, bg.sum_payload, s.payload
+02)--Inner Join: bg.group_key = s.id
+03)----SubqueryAlias: bg
+04)------Projection: big.group_key, sum(big.payload) AS sum_payload
+05)--------Aggregate: groupBy=[[big.group_key]], aggr=[[sum(big.payload)]]
+06)----------TableScan: big projection=[group_key, payload]
+07)----SubqueryAlias: s
+08)------TableScan: small projection=[id, payload]
+physical_plan
+01)RuntimeOptimizerExec
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(group_key@0, id@0)], projection=[group_key@0, sum_payload@1, payload@3]
+03)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
+04)------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+05)--------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+07)------------DataSourceExec
+08)----CoalescePartitionsExec
+09)------DataSourceExec
+
+# ---------------------------------------------------------------------------
+# Result — GREEN regardless of build-side choice (correctness is plan-
+# agnostic; only performance changes). After RuntimeOptimizerExec lands,
+# this stays green; only EXPLAIN moves.
+#
+# big rows match small ids 0..4 (the only group_keys that exist after
+# aggregation). Each group has 20000 rows of payload=1, so sum_payload=20000
+# for every group. small.payload = id * 10.
+# ---------------------------------------------------------------------------
+query III
+SELECT bg.group_key, bg.sum_payload, s.payload
+FROM (
+  SELECT group_key, SUM(payload) AS sum_payload FROM big GROUP BY group_key
+) bg
+JOIN small s ON bg.group_key = s.id
+ORDER BY bg.group_key;
+----
+0 20000 0
+1 20000 10
+2 20000 20
+3 20000 30
+4 20000 40

From 6c4068cc812cc4b3fab7f8ed2de20411c394296e Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 09:55:37 -0600
Subject: [PATCH 02/18] RuntimeOptimizerExec passthrough + insertion rule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the operator (passthrough stub — execute() just forwards) and a
PhysicalOptimizerRule that wraps the plan root in it. The rule runs at
the end of the optimizer chain so the wrapper sits above the final
plan.

EXPLAIN now shows RuntimeOptimizerExec at line 01 of the physical plan.
Behavior unchanged — no buffers, no rules, no synchronization yet. The
SLT stays red for the right reason: the build-side swap that
RuntimeOptimizerExec exists to perform hasn't been wired.

Re-wrap guard in InsertRuntimeOptimizer protects against multi-pass
optimization loops accidentally nesting wrappers.

Subsequent commits add PipelineBreakerBuffer (the two-flag
synchronization primitive), the RuntimeRule trait, and the
SwapBuildSideIfInverted rule that flips HashJoinExec children via a
typed `flip_sides()` method on the operator itself.
---
 datafusion/physical-optimizer/src/lib.rs      |   1 +
 .../physical-optimizer/src/optimizer.rs       |   6 +
 .../src/runtime_optimizer.rs                  |  64 +++++++++++
 datafusion/physical-plan/src/lib.rs           |   1 +
 .../physical-plan/src/runtime_optimizer.rs    | 108 ++++++++++++++++++
 5 files changed, 180 insertions(+)
 create mode 100644 datafusion/physical-optimizer/src/runtime_optimizer.rs
 create mode 100644 datafusion/physical-plan/src/runtime_optimizer.rs

diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs
index b9eb248f6e84..4d220a85840e 100644
--- a/datafusion/physical-optimizer/src/lib.rs
+++ b/datafusion/physical-optimizer/src/lib.rs
@@ -44,6 +44,7 @@ pub mod projection_pushdown;
 pub use datafusion_pruning as pruning;
 pub mod hash_join_buffering;
 pub mod pushdown_sort;
+pub mod runtime_optimizer;
 pub mod sanity_checker;
 pub mod topk_aggregation;
 pub mod topk_repartition;
diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs
index 0f81512b61c8..7ab25425da34 100644
--- a/datafusion/physical-optimizer/src/optimizer.rs
+++ b/datafusion/physical-optimizer/src/optimizer.rs
@@ -38,6 +38,7 @@ use crate::update_aggr_exprs::OptimizeAggregateOrder;
 use crate::hash_join_buffering::HashJoinBuffering;
 use crate::limit_pushdown_past_window::LimitPushPastWindows;
 use crate::pushdown_sort::PushdownSort;
+use crate::runtime_optimizer::InsertRuntimeOptimizer;
 use crate::window_topn::WindowTopN;
 use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
@@ -247,6 +248,11 @@ impl PhysicalOptimizer {
             // given query plan; i.e. it only acts as a final
             // gatekeeping rule.
             Arc::new(SanityCheckPlan::new()),
+            // InsertRuntimeOptimizer wraps the (now-final) plan root in
+            // a RuntimeOptimizerExec. Runs last so the wrapper sits above
+            // everything else; subsequent commits use it to coordinate
+            // adaptive optimization at runtime.
+            Arc::new(InsertRuntimeOptimizer::new()),
         ];
 
         Self::with_rules(rules)
diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
new file mode 100644
index 000000000000..eca3e259a9e8
--- /dev/null
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`InsertRuntimeOptimizer`] wraps the root of the physical plan in a
+//! [`RuntimeOptimizerExec`]. Future commits add `PipelineBreakerBuffer`s
+//! beneath pipeline-breaking operators and a `Vec<RuntimeRule>` that the
+//! root coordinator runs once those buffers signal ready.
+//!
+//! Today the wrapper is a passthrough; this rule exists so the operator
+//! has a stable insertion point that subsequent commits can build on.
+
+use std::sync::Arc;
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec;
+
+#[derive(Default, Debug)]
+pub struct InsertRuntimeOptimizer;
+
+impl InsertRuntimeOptimizer {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
+    fn name(&self) -> &str {
+        "InsertRuntimeOptimizer"
+    }
+
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Don't re-wrap if we've already been inserted (e.g. multi-pass
+        // optimization loops).
+        if plan.downcast_ref::<RuntimeOptimizerExec>().is_some() {
+            return Ok(plan);
+        }
+        Ok(Arc::new(RuntimeOptimizerExec::new(plan)))
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 6cc6e44c32cc..bd92d4c021a4 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -86,6 +86,7 @@ pub mod placeholder_row;
 pub mod projection;
 pub mod recursive_query;
 pub mod repartition;
+pub mod runtime_optimizer;
 pub mod scalar_subquery;
 pub mod sort_pushdown;
 pub mod sorts;
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
new file mode 100644
index 000000000000..01fbea13cd4a
--- /dev/null
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Adaptive query execution primitive: a root-of-plan wrapper that runs
+//! `RuntimeRule`s against the live plan once `PipelineBreakerBuffer`
+//! children signal they have absorbed their first batch from every
+//! partition.
+//!
+//! Stub: currently a passthrough. Real coordinator behavior — wait for
+//! buffers, run rules, mutate adaptive operators in place via their
+//! typed methods, then release the buffers — lands in follow-up commits.
+//! Exists today to anchor the plan shape so the insertion rule has
+//! somewhere to put a wrapper.
+//!
+//! Design notes:
+//! - `RuntimeOptimizerExec` sits at the root of the plan tree. It owns
+//!   the entire subplan and naturally receives every `poll()`. No
+//!   ownership inversion, no back-pointers, no plan-tree mutation.
+//! - Adaptive optimizations are expressed as `RuntimeRule`s that mutate
+//!   operator internal state via typed methods (e.g.
+//!   `HashJoinExec::flip_sides`). The plan tree shape stays static
+//!   after planning; only operator config gets late-finalized.
+//! - Buffer coordination uses two flags: `is_ready` (mechanical, set by
+//!   the buffer when all input partitions have produced ≥1 batch) and
+//!   `go_ahead` (semantic, set by `RuntimeOptimizerExec` after rules
+//!   have run). Default is permissive — buffers release once ready
+//!   unless a rule explicitly holds them back.
+
+use std::sync::Arc;
+
+use datafusion_common::Result;
+use datafusion_execution::TaskContext;
+
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    SendableRecordBatchStream,
+};
+
+#[derive(Debug)]
+pub struct RuntimeOptimizerExec {
+    input: Arc<dyn ExecutionPlan>,
+    cache: Arc<PlanProperties>,
+}
+
+impl RuntimeOptimizerExec {
+    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+        let cache = Arc::clone(input.properties());
+        Self { input, cache }
+    }
+}
+
+impl DisplayAs for RuntimeOptimizerExec {
+    fn fmt_as(
+        &self,
+        _t: DisplayFormatType,
+        f: &mut std::fmt::Formatter,
+    ) -> std::fmt::Result {
+        write!(f, "RuntimeOptimizerExec")
+    }
+}
+
+impl ExecutionPlan for RuntimeOptimizerExec {
+    fn name(&self) -> &'static str {
+        "RuntimeOptimizerExec"
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self::new(children.swap_remove(0))))
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![true]
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Passthrough stub. Real adaptive body lands in follow-up commits.
+        self.input.execute(partition, context)
+    }
+}

From 4fd62c72b280432f057d880d8b5b04b4db36ba20 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 10:02:16 -0600
Subject: [PATCH 03/18] PipelineBreakerBuffer passthrough + insertion above
 breakers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the synchronization-point operator (passthrough stub — execute()
just forwards) and extends InsertRuntimeOptimizer to wrap every
pipeline-breaking operator with one.

`is_pipeline_breaker` currently matches AggregateExec and SortExec — the
canonical input-absorbing operators. Other candidates (HashJoinExec's
build side specifically; window aggregates with unbounded frames) can
be added as their adaptive rules need them.

EXPLAIN now shows PipelineBreakerBuffer above each AggregateExec in the
plan. SLT diff is purely the build-side swap remaining — buffer
wrapping is correct in both expected and actual.

Two-flag synchronization (is_ready / go_ahead) and the RuntimeRule
trait come next. After that, SwapBuildSideIfInverted + HashJoinExec's
flip_sides() lands the swap and the test flips green.
---
 .../src/runtime_optimizer.rs                  | 53 +++++++++--
 datafusion/physical-plan/src/lib.rs           |  1 +
 .../src/pipeline_breaker_buffer.rs            | 95 +++++++++++++++++++
 .../test_files/runtime_optimizer.slt          | 14 +--
 4 files changed, 148 insertions(+), 15 deletions(-)
 create mode 100644 datafusion/physical-plan/src/pipeline_breaker_buffer.rs

diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index eca3e259a9e8..1cc338942a3a 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -15,21 +15,32 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`InsertRuntimeOptimizer`] wraps the root of the physical plan in a
-//! [`RuntimeOptimizerExec`]. Future commits add `PipelineBreakerBuffer`s
-//! beneath pipeline-breaking operators and a `Vec<RuntimeRule>` that the
-//! root coordinator runs once those buffers signal ready.
+//! [`InsertRuntimeOptimizer`] does two things in one pass:
 //!
-//! Today the wrapper is a passthrough; this rule exists so the operator
-//! has a stable insertion point that subsequent commits can build on.
+//! 1. Walks the plan tree, wrapping every pipeline-breaking operator
+//!    (currently `AggregateExec` and `SortExec`) in a
+//!    [`PipelineBreakerBuffer`]. The buffer is the synchronization point
+//!    where runtime stats become observable.
+//! 2. Wraps the resulting plan root in a [`RuntimeOptimizerExec`]. The
+//!    root operator coordinates: once all buffers signal ready it runs a
+//!    `Vec<RuntimeRule>` over the plan, mutates adaptive operators in
+//!    place via their typed methods, and releases the buffers.
+//!
+//! Today both wrappers are passthrough — this rule only installs them so
+//! the structural shape is visible in EXPLAIN. Follow-up commits add the
+//! synchronization protocol and the first runtime rule.
 
 use std::sync::Arc;
 
 use crate::PhysicalOptimizerRule;
 use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::pipeline_breaker_buffer::PipelineBreakerBuffer;
 use datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
 
 #[derive(Default, Debug)]
 pub struct InsertRuntimeOptimizer;
@@ -50,15 +61,39 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        // Don't re-wrap if we've already been inserted (e.g. multi-pass
-        // optimization loops).
+        // Don't re-wrap if we've already been inserted (multi-pass loops).
         if plan.downcast_ref::<RuntimeOptimizerExec>().is_some() {
             return Ok(plan);
         }
-        Ok(Arc::new(RuntimeOptimizerExec::new(plan)))
+
+        // Phase 1: wrap each pipeline breaker in a PipelineBreakerBuffer.
+        let with_buffers = plan
+            .transform_up(|node| {
+                if is_pipeline_breaker(&node)
+                    && node.downcast_ref::<PipelineBreakerBuffer>().is_none()
+                {
+                    let buffered: Arc<dyn ExecutionPlan> =
+                        Arc::new(PipelineBreakerBuffer::new(node));
+                    Ok(Transformed::yes(buffered))
+                } else {
+                    Ok(Transformed::no(node))
+                }
+            })?
+            .data;
+
+        // Phase 2: wrap the root.
+        Ok(Arc::new(RuntimeOptimizerExec::new(with_buffers)))
     }
 
     fn schema_check(&self) -> bool {
         true
     }
 }
+
+/// Returns true for operators that absorb their entire input before
+/// emitting (the canonical "pipeline breaker" definition). Start with
+/// the obvious cases; extend as more rules need other breakers.
+fn is_pipeline_breaker(plan: &Arc<dyn ExecutionPlan>) -> bool {
+    plan.downcast_ref::<AggregateExec>().is_some()
+        || plan.downcast_ref::<SortExec>().is_some()
+}
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index bd92d4c021a4..48187755a649 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -82,6 +82,7 @@ pub mod limit;
 pub mod memory;
 pub mod metrics;
 pub mod operator_statistics;
+pub mod pipeline_breaker_buffer;
 pub mod placeholder_row;
 pub mod projection;
 pub mod recursive_query;
diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
new file mode 100644
index 000000000000..0fb0f82842eb
--- /dev/null
+++ b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Synchronization wrapper above a pipeline-breaking operator. Reports
+//! when the breaker has finished its barrier (one batch absorbed from
+//! every input partition) and holds its output until [`RuntimeOptimizerExec`]
+//! signals `go_ahead`.
+//!
+//! Stub: currently a passthrough. The two-flag synchronization protocol
+//! (`is_ready` set mechanically when all input partitions have produced
+//! ≥1 batch; `go_ahead` set semantically by the root coordinator after
+//! `Vec<RuntimeRule>` has run) lands in a follow-up commit. Exists today
+//! to anchor the plan shape so the insertion rule has a stable wrapper
+//! to put above each breaker.
+
+use std::sync::Arc;
+
+use datafusion_common::Result;
+use datafusion_execution::TaskContext;
+
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    SendableRecordBatchStream,
+};
+
+#[derive(Debug)]
+pub struct PipelineBreakerBuffer {
+    input: Arc<dyn ExecutionPlan>,
+    cache: Arc<PlanProperties>,
+}
+
+impl PipelineBreakerBuffer {
+    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+        let cache = Arc::clone(input.properties());
+        Self { input, cache }
+    }
+}
+
+impl DisplayAs for PipelineBreakerBuffer {
+    fn fmt_as(
+        &self,
+        _t: DisplayFormatType,
+        f: &mut std::fmt::Formatter,
+    ) -> std::fmt::Result {
+        write!(f, "PipelineBreakerBuffer")
+    }
+}
+
+impl ExecutionPlan for PipelineBreakerBuffer {
+    fn name(&self) -> &'static str {
+        "PipelineBreakerBuffer"
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self::new(children.swap_remove(0))))
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![true]
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Passthrough stub. Real two-flag synchronization lands next.
+        self.input.execute(partition, context)
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
index 4480e52b7817..e660bbe1cdf8 100644
--- a/datafusion/sqllogictest/test_files/runtime_optimizer.slt
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -119,12 +119,14 @@ physical_plan
 01)RuntimeOptimizerExec
 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(group_key@0, id@0)], projection=[group_key@0, sum_payload@1, payload@3]
 03)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
-04)------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-05)--------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
-06)----------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-07)------------DataSourceExec
-08)----CoalescePartitionsExec
-09)------DataSourceExec
+04)------PipelineBreakerBuffer
+05)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+06)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
+07)------------PipelineBreakerBuffer
+08)--------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+09)----------------DataSourceExec
+10)----CoalescePartitionsExec
+11)------DataSourceExec
 
 # ---------------------------------------------------------------------------
 # Result — GREEN regardless of build-side choice (correctness is plan-

From e4f25503967837e271df757fa439b792a8f9983f Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 11:40:14 -0600
Subject: [PATCH 04/18] PipelineBreakerBuffer: load-bearing + AtomicWaker
 coordinator wakeup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PipelineBreakerBuffer now actually buffers: holds one batch per input
partition, sets is_ready when all partitions have produced their first
poll result, returns Pending until set_go_ahead is called by the
coordinator. State machine: NeedFirstBatch -> WaitForGoAhead ->
EmitHeld -> Streaming.

Cross-task wake propagation uses a shared AtomicWaker, not
cx.waker().wake_by_ref. The latter is task-local: when a buffer's
poll runs inside a spawned task (e.g. one of RepartitionExec's
internals), waking cx only wakes the spawned task, never reaching
the top-of-plan task that polls RuntimeOptimizerExec. The shared
AtomicWaker bypasses that boundary — buffers wake it on is_ready
flip, RTO registers cx.waker() on it on each poll_next entry.

RTO::poll_next walks its subtree and calls set_go_ahead on any
buffer whose is_ready is set. Base case: permissive release, no
rules yet. Future commits introduce Vec<RuntimeRule> that can hold
specific buffers or mutate adaptive operators (HashJoinExec::flip_sides)
before releasing.

Register-before-walk in RTO::poll_next closes a race where a buffer
flipping is_ready *after* the walk but *before* we return Pending
would lose its wake — AtomicWaker.wake() with no registered waker
is a no-op.

InsertRuntimeOptimizer rule constructs the shared AtomicWaker and
threads clones into every buffer + the wrapping RTO. futures crate
added to physical-optimizer for AtomicWaker.

SLT runs to completion; only the EXPLAIN diff (build-side swap)
remains red.
---
 Cargo.lock                                    |   1 +
 datafusion/physical-optimizer/Cargo.toml      |   1 +
 .../src/runtime_optimizer.rs                  |  30 ++-
 .../src/pipeline_breaker_buffer.rs            | 211 ++++++++++++++++--
 .../physical-plan/src/runtime_optimizer.rs    | 110 ++++++---
 5 files changed, 298 insertions(+), 55 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1b84a6b9792c..bda0e6023d46 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2432,6 +2432,7 @@ dependencies = [
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-pruning",
+ "futures",
  "insta",
  "itertools 0.14.0",
  "recursive",
diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml
index 38c8a7c37211..4104acac71ce 100644
--- a/datafusion/physical-optimizer/Cargo.toml
+++ b/datafusion/physical-optimizer/Cargo.toml
@@ -50,6 +50,7 @@ datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-pruning = { workspace = true }
+futures = { workspace = true }
 itertools = { workspace = true }
 recursive = { workspace = true, optional = true }
 
diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index 1cc338942a3a..0318006210d2 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -21,14 +21,16 @@
 //!    (currently `AggregateExec` and `SortExec`) in a
 //!    [`PipelineBreakerBuffer`]. The buffer is the synchronization point
 //!    where runtime stats become observable.
-//! 2. Wraps the resulting plan root in a [`RuntimeOptimizerExec`]. The
-//!    root operator coordinates: once all buffers signal ready it runs a
-//!    `Vec<RuntimeRule>` over the plan, mutates adaptive operators in
-//!    place via their typed methods, and releases the buffers.
+//! 2. Wraps the resulting plan root in a [`RuntimeOptimizerExec`], which
+//!    walks the subtree on each `poll_next` and releases ready buffers.
 //!
-//! Today both wrappers are passthrough — this rule only installs them so
-//! the structural shape is visible in EXPLAIN. Follow-up commits add the
-//! synchronization protocol and the first runtime rule.
+//! A shared [`AtomicWaker`] is constructed here and threaded into both
+//! the buffers and the wrapping RTO so buffers can wake the coordinator
+//! from inside spawned subtasks (e.g. `RepartitionExec` internals). The
+//! default is permissive release — the next commit will add a
+//! `Vec<RuntimeRule>` that runs between "all ready" and "release," able
+//! to mutate adaptive operators (`HashJoinExec::flip_sides`) or hold
+//! specific buffers.
 
 use std::sync::Arc;
 
@@ -41,6 +43,7 @@ use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::pipeline_breaker_buffer::PipelineBreakerBuffer;
 use datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec;
 use datafusion_physical_plan::sorts::sort::SortExec;
+use futures::task::AtomicWaker;
 
 #[derive(Default, Debug)]
 pub struct InsertRuntimeOptimizer;
@@ -66,14 +69,21 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
             return Ok(plan);
         }
 
+        // Shared coordinator-wake. Every buffer in this subplan and the
+        // wrapping RTO hold a clone — buffers wake it, RTO registers on
+        // it. Per-poll cx.waker() doesn't reach across spawned-task
+        // boundaries (e.g. RepartitionExec's internals), this does.
+        let rto_waker = Arc::new(AtomicWaker::new());
+
         // Phase 1: wrap each pipeline breaker in a PipelineBreakerBuffer.
         let with_buffers = plan
             .transform_up(|node| {
                 if is_pipeline_breaker(&node)
                     && node.downcast_ref::<PipelineBreakerBuffer>().is_none()
                 {
-                    let buffered: Arc<dyn ExecutionPlan> =
-                        Arc::new(PipelineBreakerBuffer::new(node));
+                    let buffered: Arc<dyn ExecutionPlan> = Arc::new(
+                        PipelineBreakerBuffer::new(node, Arc::clone(&rto_waker)),
+                    );
                     Ok(Transformed::yes(buffered))
                 } else {
                     Ok(Transformed::no(node))
@@ -82,7 +92,7 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
             .data;
 
         // Phase 2: wrap the root.
-        Ok(Arc::new(RuntimeOptimizerExec::new(with_buffers)))
+        Ok(Arc::new(RuntimeOptimizerExec::new(with_buffers, rto_waker)))
     }
 
     fn schema_check(&self) -> bool {
diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
index 0fb0f82842eb..da8ec6d8a469 100644
--- a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
+++ b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
@@ -15,25 +15,43 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Synchronization wrapper above a pipeline-breaking operator. Reports
-//! when the breaker has finished its barrier (one batch absorbed from
-//! every input partition) and holds its output until [`RuntimeOptimizerExec`]
-//! signals `go_ahead`.
+//! Synchronization wrapper above a pipeline-breaking operator. Holds the
+//! breaker's first batch per partition so that runtime stats become
+//! observable to the coordinator
+//! ([`crate::runtime_optimizer::RuntimeOptimizerExec`]) before downstream
+//! consumers see any data.
 //!
-//! Stub: currently a passthrough. The two-flag synchronization protocol
-//! (`is_ready` set mechanically when all input partitions have produced
-//! ≥1 batch; `go_ahead` set semantically by the root coordinator after
-//! `Vec<RuntimeRule>` has run) lands in a follow-up commit. Exists today
-//! to anchor the plan shape so the insertion rule has a stable wrapper
-//! to put above each breaker.
+//! Two flags govern behavior:
+//! - `is_ready` (mechanical): set automatically when every input partition
+//!   has produced its first poll result (a batch or termination). Signals
+//!   that runtime stats from the breaker are now derivable.
+//! - `go_ahead` (semantic): set externally by the coordinator after rules
+//!   have evaluated. While false, per-partition streams hold their
+//!   absorbed batches and return `Pending`, storing the caller's waker so
+//!   `set_go_ahead` can wake them.
+//!
+//! Coordination uses a shared [`AtomicWaker`] (`rto_waker`) populated at
+//! plan time by [`crate::runtime_optimizer::RuntimeOptimizerExec`]. The
+//! buffer wakes it when `is_ready` flips; the coordinator is registered
+//! on it via its own `poll_next`. We use this side-channel rather than
+//! `cx.waker()` because the latter is task-local — inside a spawned task
+//! (e.g. one of `RepartitionExec`'s internals) it never reaches the
+//! top-of-plan task.
 
-use std::sync::Arc;
+use std::collections::{HashMap, HashSet};
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::task::{Context, Poll, Waker};
 
+use arrow::array::RecordBatch;
 use datafusion_common::Result;
 use datafusion_execution::TaskContext;
+use futures::task::AtomicWaker;
+use futures::{Stream, StreamExt};
 
+use crate::stream::RecordBatchStreamAdapter;
 use crate::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
     SendableRecordBatchStream,
 };
 
@@ -41,12 +59,68 @@ use crate::{
 pub struct PipelineBreakerBuffer {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
+    state: Arc<Mutex<BufferState>>,
+    /// Shared with `RuntimeOptimizerExec`. Buffer wakes this whenever
+    /// `is_ready` flips so the coordinator (which may live above a
+    /// task-spawning operator like `RepartitionExec`) gets re-polled.
+    /// Per-partition `cx.waker()` alone is insufficient — it's the
+    /// local task's waker, which doesn't propagate across the spawned
+    /// task boundary that `RepartitionExec` introduces.
+    rto_waker: Arc<AtomicWaker>,
+}
+
+#[derive(Debug)]
+struct BufferState {
+    /// First batch per input partition. Absent for partitions whose input
+    /// terminated empty.
+    held: HashMap<usize, RecordBatch>,
+    /// Partitions whose first poll has completed (with or without a batch).
+    seen: HashSet<usize>,
+    is_ready: bool,
+    go_ahead: bool,
+    num_partitions: usize,
+    /// Wakers stashed by per-partition streams while awaiting `go_ahead`.
+    wakers: HashMap<usize, Waker>,
 }
 
 impl PipelineBreakerBuffer {
-    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+    pub fn new(input: Arc<dyn ExecutionPlan>, rto_waker: Arc<AtomicWaker>) -> Self {
+        let num_partitions = input.output_partitioning().partition_count();
         let cache = Arc::clone(input.properties());
-        Self { input, cache }
+        Self {
+            input,
+            cache,
+            state: Arc::new(Mutex::new(BufferState {
+                held: HashMap::new(),
+                seen: HashSet::new(),
+                is_ready: false,
+                go_ahead: false,
+                num_partitions,
+                wakers: HashMap::new(),
+            })),
+            rto_waker,
+        }
+    }
+
+    /// True once every input partition has produced its first poll result.
+    pub fn is_ready(&self) -> bool {
+        self.state.lock().unwrap().is_ready
+    }
+
+    /// Release per-partition streams: emit held batches and resume
+    /// forwarding. Idempotent.
+    pub fn set_go_ahead(&self) {
+        let wakers = {
+            let mut state = self.state.lock().unwrap();
+            if state.go_ahead {
+                return;
+            }
+            state.go_ahead = true;
+            state.wakers.drain().map(|(_, w)| w).collect::<Vec<_>>()
+        };
+        for w in wakers {
+            w.wake();
+        }
     }
 }
 
@@ -77,7 +151,10 @@ impl ExecutionPlan for PipelineBreakerBuffer {
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(Self::new(children.swap_remove(0))))
+        Ok(Arc::new(Self::new(
+            children.swap_remove(0),
+            Arc::clone(&self.rto_waker),
+        )))
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -89,7 +166,107 @@ impl ExecutionPlan for PipelineBreakerBuffer {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        // Passthrough stub. Real two-flag synchronization lands next.
-        self.input.execute(partition, context)
+        let input = self.input.execute(partition, context)?;
+        let schema = self.schema();
+        let stream = BufferStream {
+            phase: Phase::NeedFirstBatch,
+            input,
+            partition,
+            state: Arc::clone(&self.state),
+            rto_waker: Arc::clone(&self.rto_waker),
+        };
+        Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
+    }
+}
+
+#[derive(Debug)]
+enum Phase {
+    /// Haven't yet pulled the first batch from this input partition.
+    NeedFirstBatch,
+    /// First batch absorbed (or input was empty). Holding until `go_ahead`.
+    WaitForGoAhead,
+    /// `go_ahead` set; emit our held batch (if any) then transition to
+    /// streaming.
+    EmitHeld,
+    /// Pass through input batches as they arrive.
+    Streaming,
+}
+
+struct BufferStream {
+    phase: Phase,
+    input: SendableRecordBatchStream,
+    partition: usize,
+    rto_waker: Arc<AtomicWaker>,
+    state: Arc<Mutex<BufferState>>,
+}
+
+impl Stream for BufferStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let this = self.as_mut().get_mut();
+        loop {
+            match this.phase {
+                Phase::NeedFirstBatch => {
+                    let next = match this.input.poll_next_unpin(cx) {
+                        Poll::Ready(x) => x,
+                        Poll::Pending => return Poll::Pending,
+                    };
+                    let became_ready = {
+                        let mut state = this.state.lock().unwrap();
+                        state.seen.insert(this.partition);
+                        match next {
+                            Some(Ok(batch)) => {
+                                state.held.insert(this.partition, batch);
+                            }
+                            Some(Err(e)) => {
+                                return Poll::Ready(Some(Err(e)));
+                            }
+                            None => { /* empty partition; nothing to hold */ }
+                        }
+                        if state.seen.len() == state.num_partitions && !state.is_ready {
+                            state.is_ready = true;
+                            true
+                        } else {
+                            false
+                        }
+                    };
+                    if became_ready {
+                        // Wake the coordinator. `cx.waker()` alone is
+                        // task-local — if we're inside a spawned task
+                        // (e.g. one of RepartitionExec's internals), it
+                        // never reaches RTO. The shared AtomicWaker is
+                        // populated by RTO on each of its own polls and
+                        // wakes the actual top-of-plan task.
+                        this.rto_waker.wake();
+                    }
+                    this.phase = Phase::WaitForGoAhead;
+                }
+                Phase::WaitForGoAhead => {
+                    let mut state = this.state.lock().unwrap();
+                    if state.go_ahead {
+                        drop(state);
+                        this.phase = Phase::EmitHeld;
+                        continue;
+                    }
+                    state.wakers.insert(this.partition, cx.waker().clone());
+                    return Poll::Pending;
+                }
+                Phase::EmitHeld => {
+                    let held = this.state.lock().unwrap().held.remove(&this.partition);
+                    this.phase = Phase::Streaming;
+                    if let Some(batch) = held {
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    // Empty-partition case: fall through to streaming.
+                }
+                Phase::Streaming => {
+                    return this.input.poll_next_unpin(cx);
+                }
+            }
+        }
     }
 }
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 01fbea13cd4a..4ccf478a5e6e 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -15,36 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Adaptive query execution primitive: a root-of-plan wrapper that runs
-//! `RuntimeRule`s against the live plan once `PipelineBreakerBuffer`
-//! children signal they have absorbed their first batch from every
-//! partition.
+//! Coordinator wrapper at the root of the plan. On every poll, walks its
+//! subtree and releases any [`PipelineBreakerBuffer`] whose `is_ready`
+//! flag has flipped. Buffers schedule re-polls via `cx.waker().wake_by_ref()`
+//! when they become ready, so the wrapper sees the state change without
+//! any explicit notification channel.
 //!
-//! Stub: currently a passthrough. Real coordinator behavior — wait for
-//! buffers, run rules, mutate adaptive operators in place via their
-//! typed methods, then release the buffers — lands in follow-up commits.
-//! Exists today to anchor the plan shape so the insertion rule has
-//! somewhere to put a wrapper.
-//!
-//! Design notes:
-//! - `RuntimeOptimizerExec` sits at the root of the plan tree. It owns
-//!   the entire subplan and naturally receives every `poll()`. No
-//!   ownership inversion, no back-pointers, no plan-tree mutation.
-//! - Adaptive optimizations are expressed as `RuntimeRule`s that mutate
-//!   operator internal state via typed methods (e.g.
-//!   `HashJoinExec::flip_sides`). The plan tree shape stays static
-//!   after planning; only operator config gets late-finalized.
-//! - Buffer coordination uses two flags: `is_ready` (mechanical, set by
-//!   the buffer when all input partitions have produced ≥1 batch) and
-//!   `go_ahead` (semantic, set by `RuntimeOptimizerExec` after rules
-//!   have run). Default is permissive — buffers release once ready
-//!   unless a rule explicitly holds them back.
+//! Base case today is unconditional release: as soon as a buffer reports
+//! ready, `set_go_ahead` is called. A future `Vec<RuntimeRule>` will run
+//! between "all ready" and "release," allowing rules to mutate adaptive
+//! operators (e.g. `HashJoinExec::flip_sides`) or hold specific buffers
+//! by leaving `go_ahead` false.
 
+use std::pin::Pin;
 use std::sync::Arc;
+use std::task::{Context, Poll};
 
+use arrow::array::RecordBatch;
 use datafusion_common::Result;
 use datafusion_execution::TaskContext;
+use futures::task::AtomicWaker;
+use futures::{Stream, StreamExt};
 
+use crate::pipeline_breaker_buffer::PipelineBreakerBuffer;
+use crate::stream::RecordBatchStreamAdapter;
 use crate::{
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
     SendableRecordBatchStream,
@@ -54,12 +48,23 @@ use crate::{
 pub struct RuntimeOptimizerExec {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
+    /// Shared with every `PipelineBreakerBuffer` in this subplan.
+    /// Buffers wake this AtomicWaker when `is_ready` flips; we register
+    /// the current task's waker on it during each `poll_next` so a
+    /// wake-up from inside a spawned subtask (e.g. one of
+    /// `RepartitionExec`'s internals) reaches the actual top-of-plan
+    /// task — `cx.waker()` propagation alone can't cross that boundary.
+    rto_waker: Arc<AtomicWaker>,
 }
 
 impl RuntimeOptimizerExec {
-    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+    pub fn new(input: Arc<dyn ExecutionPlan>, rto_waker: Arc<AtomicWaker>) -> Self {
         let cache = Arc::clone(input.properties());
-        Self { input, cache }
+        Self {
+            input,
+            cache,
+            rto_waker,
+        }
     }
 }
 
@@ -90,7 +95,10 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(Self::new(children.swap_remove(0))))
+        Ok(Arc::new(Self::new(
+            children.swap_remove(0),
+            Arc::clone(&self.rto_waker),
+        )))
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -102,7 +110,53 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        // Passthrough stub. Real adaptive body lands in follow-up commits.
-        self.input.execute(partition, context)
+        let child = self.input.execute(partition, context)?;
+        let schema = self.schema();
+        let stream = CoordinatorStream {
+            child,
+            plan: Arc::clone(&self.input),
+            rto_waker: Arc::clone(&self.rto_waker),
+        };
+        Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
+    }
+}
+
+/// Wraps the child stream. Each poll registers our task waker with the
+/// shared `rto_waker` (so any buffer in the subtree can pull us out of
+/// `Pending` when its `is_ready` flips), walks the subtree releasing
+/// ready buffers, then forwards the child poll.
+struct CoordinatorStream {
+    child: SendableRecordBatchStream,
+    plan: Arc<dyn ExecutionPlan>,
+    rto_waker: Arc<AtomicWaker>,
+}
+
+impl Stream for CoordinatorStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let this = self.as_mut().get_mut();
+        // Register before walking so a buffer flipping is_ready *after*
+        // we walked but *before* we return Pending still wakes us.
+        this.rto_waker.register(cx.waker());
+        release_ready_buffers(&this.plan);
+        this.child.poll_next_unpin(cx)
+    }
+}
+
+/// Walk the subtree; for every `PipelineBreakerBuffer` whose `is_ready`
+/// flag is set, call `set_go_ahead`. Idempotent — released buffers
+/// short-circuit on the next call.
+fn release_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
+    if let Some(buffer) = plan.downcast_ref::<PipelineBreakerBuffer>()
+        && buffer.is_ready()
+    {
+        buffer.set_go_ahead();
+    }
+    for child in plan.children() {
+        release_ready_buffers(child);
     }
 }

From 7529611a73026d50e93bd2111157c9ee90b7b878 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 12:53:55 -0600
Subject: [PATCH 05/18] RuntimeRule trait + SwapBuildSideIfInverted (log-only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces the rule layer of the adaptive-execution framework:

- `ExecutionPlan::runtime_row_count(partition)`: new trait method
  (default `None`) exposing post-barrier output cardinality.
  Implemented on `AggregateExec` via its existing OutputRows metric.

- `RuntimeRule` trait: cheap, idempotent visitors invoked by RTO on
  each poll. Rules observe runtime stats and may mutate adaptive
  operators in place or veto specific buffers.

- Two-phase release on `PipelineBreakerBuffer`:
  - `streaming_enabled` (rule-controllable proposal): RTO resets to
    `is_ready` as the permissive default each cycle; rules can flip
    to `false` to veto.
  - `streaming_started` (actual emission control): only RTO flips
    via `start_streaming`, which also wakes per-partition wakers.

- `RuntimeOptimizerExec::poll_next` runs a three-phase coordinator
  cycle: propose (default permissive), evaluate rules (may veto/
  mutate), commit (start_streaming on still-enabled buffers).

- `SwapBuildSideIfInverted`: first concrete rule. Detects when a
  `HashJoinExec`'s left/build side is larger at runtime than its
  right/probe side and would call `HashJoinExec::flip_sides()` —
  but that method doesn't exist yet, so the rule only logs intent
  (`RUST_LOG=info` visible). flip_sides + HashJoin build-phase
  deferral lands in a follow-up.

The SLT exercises detection end-to-end: small (100 rows static) joined
against GROUP BY of big (5 distinct groups, ~20 partial rows actual,
~100K estimated). The static planner picks small as build; the rule
fires once the FinalPartitioned aggregate emits and runtime row count
is observable.
---
 .../src/runtime_optimizer.rs                  |  14 +-
 .../physical-plan/src/aggregates/mod.rs       |  28 ++-
 .../physical-plan/src/execution_plan.rs       |  23 ++
 .../src/pipeline_breaker_buffer.rs            |  74 +++++--
 .../physical-plan/src/runtime_optimizer.rs    | 208 +++++++++++++++---
 .../test_files/runtime_optimizer.slt          |  37 ++--
 6 files changed, 313 insertions(+), 71 deletions(-)

diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index 0318006210d2..491c23a45a1d 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -41,7 +41,9 @@ use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::pipeline_breaker_buffer::PipelineBreakerBuffer;
-use datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec;
+use datafusion_physical_plan::runtime_optimizer::{
+    RuntimeOptimizerExec, RuntimeRule, SwapBuildSideIfInverted,
+};
 use datafusion_physical_plan::sorts::sort::SortExec;
 use futures::task::AtomicWaker;
 
@@ -91,8 +93,14 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
             })?
             .data;
 
-        // Phase 2: wrap the root.
-        Ok(Arc::new(RuntimeOptimizerExec::new(with_buffers, rto_waker)))
+        // Phase 2: wrap the root with the default rule set.
+        let rules: Vec<Arc<dyn RuntimeRule>> =
+            vec![Arc::new(SwapBuildSideIfInverted::new())];
+        Ok(Arc::new(RuntimeOptimizerExec::new(
+            with_buffers,
+            rto_waker,
+            rules,
+        )))
     }
 
     fn schema_check(&self) -> bool {
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 08468bffc0dd..0c035013c7fb 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -32,7 +32,7 @@ use crate::filter_pushdown::{
     ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPhase,
     FilterPushdownPropagation, PushedDownPredicate,
 };
-use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::metrics::{ExecutionPlanMetricsSet, MetricValue, MetricsSet};
 use crate::statistics::StatisticsArgs;
 use crate::{
     DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
@@ -1765,6 +1765,32 @@ impl ExecutionPlan for AggregateExec {
         Some(self.metrics.clone_inner())
     }
 
+    /// Reads the `OutputRows` metric for the given partition. Returns
+    /// `None` until that partition has emitted at least one batch.
+    ///
+    /// For `Final` / `FinalPartitioned` modes the build phase produces
+    /// all groups in a single batch per output partition, so once any
+    /// output has been pulled the count equals the true post-build
+    /// cardinality for that partition. That's what downstream adaptive
+    /// rules consume.
+    ///
+    /// For `Partial` modes the count is also per-partition truth —
+    /// each input partition independently knows how many partial-group
+    /// rows it has produced.
+    fn runtime_row_count(&self, partition: usize) -> Option<usize> {
+        let metrics = self.metrics.clone_inner();
+        let rows: usize = metrics
+            .iter()
+            .filter_map(|m| match (m.partition(), m.value()) {
+                (Some(p), MetricValue::OutputRows(count)) if p == partition => {
+                    Some(count.value())
+                }
+                _ => None,
+            })
+            .sum();
+        if rows > 0 { Some(rows) } else { None }
+    }
+
     fn statistics_with_args(&self, args: &StatisticsArgs) -> Result<Arc<Statistics>> {
         let child_statistics =
             args.compute_child_statistics(&self.input, args.partition())?;
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index 76abf73e0ebb..4877bcfd689f 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -496,6 +496,29 @@ pub trait ExecutionPlan: Any + Debug + DisplayAs + Send + Sync {
         None
     }
 
+    /// Returns the number of rows this operator **will emit** on the given
+    /// output partition, if that count is knowable at the moment of the
+    /// call.
+    ///
+    /// Pipeline-breaking operators that absorb all input before emitting
+    /// (`AggregateExec`, `SortExec` once sorted, the build side of
+    /// `HashJoinExec` once built) know their true per-partition output
+    /// cardinality the moment their build phase completes — *before* any
+    /// output batch has been pulled. That is precisely the runtime stat
+    /// downstream adaptive-execution rules (build-side swap, partition
+    /// coalescing, skew handling) need.
+    ///
+    /// Callers that want a cross-partition total sum across all
+    /// `partition` indices.
+    ///
+    /// Streaming operators never have a meaningful answer and should
+    /// leave this as the default `None`. Pipeline-breaking operators
+    /// should return `None` *before* their barrier completes and
+    /// `Some(rows)` once it has.
+    fn runtime_row_count(&self, _partition: usize) -> Option<usize> {
+        None
+    }
+
     /// Returns statistics for a specific partition of this `ExecutionPlan` node.
     ///
     /// Deprecated: use [`Self::statistics_with_args`] instead,
diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
index da8ec6d8a469..6230b214b946 100644
--- a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
+++ b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
@@ -21,14 +21,17 @@
 //! ([`crate::runtime_optimizer::RuntimeOptimizerExec`]) before downstream
 //! consumers see any data.
 //!
-//! Two flags govern behavior:
+//! Three flags govern behavior:
 //! - `is_ready` (mechanical): set automatically when every input partition
-//!   has produced its first poll result (a batch or termination). Signals
-//!   that runtime stats from the breaker are now derivable.
-//! - `go_ahead` (semantic): set externally by the coordinator after rules
-//!   have evaluated. While false, per-partition streams hold their
-//!   absorbed batches and return `Pending`, storing the caller's waker so
-//!   `set_go_ahead` can wake them.
+//!   has produced its first poll result. Runtime stats from the breaker
+//!   become derivable.
+//! - `streaming_enabled` (rule-controlled proposal): reset each poll cycle
+//!   by RTO — set to `is_ready` as the permissive default. Rules can
+//!   flip it back to `false` to veto release. No side effects.
+//! - `streaming_started` (actual emission control): only RTO flips this,
+//!   via `start_streaming()`, which also wakes per-partition wakers.
+//!   Once true, partition streams emit their held batches and continue
+//!   forwarding.
 //!
 //! Coordination uses a shared [`AtomicWaker`] (`rto_waker`) populated at
 //! plan time by [`crate::runtime_optimizer::RuntimeOptimizerExec`]. The
@@ -77,9 +80,17 @@ struct BufferState {
     /// Partitions whose first poll has completed (with or without a batch).
     seen: HashSet<usize>,
     is_ready: bool,
-    go_ahead: bool,
+    /// Rule-controllable proposal. Reset by RTO each poll cycle: set
+    /// to `is_ready` as the permissive default, then rules may flip it
+    /// to false to veto.
+    streaming_enabled: bool,
+    /// Actual emission control. Only RTO flips this via `start_streaming`.
+    /// Per-partition streams check this; while false, they hold their
+    /// first batch and return Pending.
+    streaming_started: bool,
     num_partitions: usize,
-    /// Wakers stashed by per-partition streams while awaiting `go_ahead`.
+    /// Wakers stashed by per-partition streams while awaiting
+    /// `streaming_started`.
     wakers: HashMap<usize, Waker>,
 }
 
@@ -94,7 +105,8 @@ impl PipelineBreakerBuffer {
                 held: HashMap::new(),
                 seen: HashSet::new(),
                 is_ready: false,
-                go_ahead: false,
+                streaming_enabled: false,
+                streaming_started: false,
                 num_partitions,
                 wakers: HashMap::new(),
             })),
@@ -107,15 +119,32 @@ impl PipelineBreakerBuffer {
         self.state.lock().unwrap().is_ready
     }
 
-    /// Release per-partition streams: emit held batches and resume
-    /// forwarding. Idempotent.
-    pub fn set_go_ahead(&self) {
+    /// Rule-controllable proposal flag. RTO resets this to `is_ready` at
+    /// the start of each poll cycle as the permissive default; rules may
+    /// then flip it to false to veto release this cycle. No side effects.
+    pub fn streaming_enabled(&self) -> bool {
+        self.state.lock().unwrap().streaming_enabled
+    }
+
+    pub fn set_streaming_enabled(&self, enabled: bool) {
+        self.state.lock().unwrap().streaming_enabled = enabled;
+    }
+
+    /// True once `start_streaming` has been called. After this, per-
+    /// partition streams emit their held batches and resume forwarding.
+    pub fn streaming_started(&self) -> bool {
+        self.state.lock().unwrap().streaming_started
+    }
+
+    /// Start actual emission: flips `streaming_started` and wakes all
+    /// per-partition wakers. Idempotent. Only RTO should call this.
+    pub fn start_streaming(&self) {
         let wakers = {
             let mut state = self.state.lock().unwrap();
-            if state.go_ahead {
+            if state.streaming_started {
                 return;
             }
-            state.go_ahead = true;
+            state.streaming_started = true;
             state.wakers.drain().map(|(_, w)| w).collect::<Vec<_>>()
         };
         for w in wakers {
@@ -183,10 +212,11 @@ impl ExecutionPlan for PipelineBreakerBuffer {
 enum Phase {
     /// Haven't yet pulled the first batch from this input partition.
     NeedFirstBatch,
-    /// First batch absorbed (or input was empty). Holding until `go_ahead`.
-    WaitForGoAhead,
-    /// `go_ahead` set; emit our held batch (if any) then transition to
-    /// streaming.
+    /// First batch absorbed (or input was empty). Holding until RTO calls
+    /// `start_streaming`.
+    WaitForStreaming,
+    /// `streaming_started` is set; emit our held batch (if any) then
+    /// transition to streaming.
     EmitHeld,
     /// Pass through input batches as they arrive.
     Streaming,
@@ -243,11 +273,11 @@ impl Stream for BufferStream {
                         // wakes the actual top-of-plan task.
                         this.rto_waker.wake();
                     }
-                    this.phase = Phase::WaitForGoAhead;
+                    this.phase = Phase::WaitForStreaming;
                 }
-                Phase::WaitForGoAhead => {
+                Phase::WaitForStreaming => {
                     let mut state = this.state.lock().unwrap();
-                    if state.go_ahead {
+                    if state.streaming_started {
                         drop(state);
                         this.phase = Phase::EmitHeld;
                         continue;
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 4ccf478a5e6e..18ee60f8ece8 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -15,20 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Coordinator wrapper at the root of the plan. On every poll, walks its
-//! subtree and releases any [`PipelineBreakerBuffer`] whose `is_ready`
-//! flag has flipped. Buffers schedule re-polls via `cx.waker().wake_by_ref()`
-//! when they become ready, so the wrapper sees the state change without
-//! any explicit notification channel.
+//! Coordinator wrapper at the root of the plan. On every poll, it walks
+//! its subtree to release any [`PipelineBreakerBuffer`] whose `is_ready`
+//! flag has flipped, then runs each registered [`RuntimeRule`] over the
+//! plan. Rules observe runtime stats (via `runtime_row_count` and
+//! similar) and mutate adaptive operators in place (e.g.
+//! `HashJoinExec::flip_sides`).
 //!
-//! Base case today is unconditional release: as soon as a buffer reports
-//! ready, `set_go_ahead` is called. A future `Vec<RuntimeRule>` will run
-//! between "all ready" and "release," allowing rules to mutate adaptive
-//! operators (e.g. `HashJoinExec::flip_sides`) or hold specific buffers
-//! by leaving `go_ahead` false.
+//! Buffers notify us via a shared [`AtomicWaker`] (`rto_waker`) that we
+//! register on each `poll_next` — `cx.waker()` is task-local and won't
+//! cross a spawned-subtask boundary like `RepartitionExec`'s internals,
+//! but `AtomicWaker.wake()` does.
 
 use std::pin::Pin;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::task::{Context, Poll};
 
 use arrow::array::RecordBatch;
@@ -36,14 +37,28 @@ use datafusion_common::Result;
 use datafusion_execution::TaskContext;
 use futures::task::AtomicWaker;
 use futures::{Stream, StreamExt};
+use log::info;
 
+use crate::joins::HashJoinExec;
 use crate::pipeline_breaker_buffer::PipelineBreakerBuffer;
+use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
     SendableRecordBatchStream,
 };
 
+/// A runtime adaptive-execution rule that may inspect the plan tree and
+/// mutate adaptive operators in place. Rules are called from
+/// [`RuntimeOptimizerExec`] on every poll, *after* ready buffers have
+/// been released. They are expected to be cheap and idempotent (track
+/// their own "already fired" state if they should only run once per
+/// query).
+pub trait RuntimeRule: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> &str;
+    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>);
+}
+
 #[derive(Debug)]
 pub struct RuntimeOptimizerExec {
     input: Arc<dyn ExecutionPlan>,
@@ -51,19 +66,24 @@ pub struct RuntimeOptimizerExec {
     /// Shared with every `PipelineBreakerBuffer` in this subplan.
     /// Buffers wake this AtomicWaker when `is_ready` flips; we register
     /// the current task's waker on it during each `poll_next` so a
-    /// wake-up from inside a spawned subtask (e.g. one of
-    /// `RepartitionExec`'s internals) reaches the actual top-of-plan
-    /// task — `cx.waker()` propagation alone can't cross that boundary.
+    /// wake-up from inside a spawned subtask reaches the actual
+    /// top-of-plan task.
     rto_waker: Arc<AtomicWaker>,
+    rules: Vec<Arc<dyn RuntimeRule>>,
 }
 
 impl RuntimeOptimizerExec {
-    pub fn new(input: Arc<dyn ExecutionPlan>, rto_waker: Arc<AtomicWaker>) -> Self {
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        rto_waker: Arc<AtomicWaker>,
+        rules: Vec<Arc<dyn RuntimeRule>>,
+    ) -> Self {
         let cache = Arc::clone(input.properties());
         Self {
             input,
             cache,
             rto_waker,
+            rules,
         }
     }
 }
@@ -98,6 +118,7 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         Ok(Arc::new(Self::new(
             children.swap_remove(0),
             Arc::clone(&self.rto_waker),
+            self.rules.clone(),
         )))
     }
 
@@ -116,19 +137,17 @@ impl ExecutionPlan for RuntimeOptimizerExec {
             child,
             plan: Arc::clone(&self.input),
             rto_waker: Arc::clone(&self.rto_waker),
+            rules: self.rules.clone(),
         };
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
     }
 }
 
-/// Wraps the child stream. Each poll registers our task waker with the
-/// shared `rto_waker` (so any buffer in the subtree can pull us out of
-/// `Pending` when its `is_ready` flips), walks the subtree releasing
-/// ready buffers, then forwards the child poll.
 struct CoordinatorStream {
     child: SendableRecordBatchStream,
     plan: Arc<dyn ExecutionPlan>,
     rto_waker: Arc<AtomicWaker>,
+    rules: Vec<Arc<dyn RuntimeRule>>,
 }
 
 impl Stream for CoordinatorStream {
@@ -142,21 +161,156 @@ impl Stream for CoordinatorStream {
         // Register before walking so a buffer flipping is_ready *after*
         // we walked but *before* we return Pending still wakes us.
         this.rto_waker.register(cx.waker());
-        release_ready_buffers(&this.plan);
+
+        // Phase 1: set the permissive default — every ready buffer is
+        // proposed for release.
+        propose_release_for_ready_buffers(&this.plan);
+
+        // Phase 2: rules may veto specific buffers (set
+        // streaming_enabled=false) or mutate adaptive operators.
+        for rule in &this.rules {
+            rule.evaluate(&this.plan);
+        }
+
+        // Phase 3: commit — actually start streaming on any buffer that
+        // is still enabled.
+        start_streaming_on_enabled_buffers(&this.plan);
+
         this.child.poll_next_unpin(cx)
     }
 }
 
-/// Walk the subtree; for every `PipelineBreakerBuffer` whose `is_ready`
-/// flag is set, call `set_go_ahead`. Idempotent — released buffers
-/// short-circuit on the next call.
-fn release_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
+fn propose_release_for_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
+    if let Some(buffer) = plan.downcast_ref::<PipelineBreakerBuffer>() {
+        buffer.set_streaming_enabled(buffer.is_ready());
+    }
+    for child in plan.children() {
+        propose_release_for_ready_buffers(child);
+    }
+}
+
+fn start_streaming_on_enabled_buffers(plan: &Arc<dyn ExecutionPlan>) {
     if let Some(buffer) = plan.downcast_ref::<PipelineBreakerBuffer>()
-        && buffer.is_ready()
+        && buffer.streaming_enabled()
     {
-        buffer.set_go_ahead();
+        buffer.start_streaming();
     }
     for child in plan.children() {
-        release_ready_buffers(child);
+        start_streaming_on_enabled_buffers(child);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// SwapBuildSideIfInverted — first concrete RuntimeRule.
+//
+// When a HashJoinExec's current build side (the LEFT child under
+// `mode=CollectLeft`) ends up larger at runtime than the probe side,
+// the static planner made the wrong choice — it picked build based on
+// (Inexact) estimates. The fix is `HashJoinExec::flip_sides()`, which
+// isn't implemented yet; for now this rule only logs intent so we can
+// verify the detection logic end-to-end.
+// ---------------------------------------------------------------------------
+
+#[derive(Debug)]
+pub struct SwapBuildSideIfInverted {
+    fired: AtomicBool,
+}
+
+impl SwapBuildSideIfInverted {
+    pub fn new() -> Self {
+        Self {
+            fired: AtomicBool::new(false),
+        }
+    }
+}
+
+impl Default for SwapBuildSideIfInverted {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RuntimeRule for SwapBuildSideIfInverted {
+    fn name(&self) -> &str {
+        "SwapBuildSideIfInverted"
+    }
+
+    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>) {
+        if self.fired.load(Ordering::Relaxed) {
+            return;
+        }
+        self.walk_for_swap(plan);
+    }
+}
+
+impl SwapBuildSideIfInverted {
+    fn walk_for_swap(&self, plan: &Arc<dyn ExecutionPlan>) {
+        if let Some(join) = plan.downcast_ref::<HashJoinExec>() {
+            let children = join.children();
+            if children.len() == 2 {
+                // Current HashJoinExec: LEFT child is the build side
+                // under `mode=CollectLeft`.
+                let left = side_runtime_rows(children[0]);
+                let right = side_runtime_rows(children[1]);
+                if let (Some(l), Some(r)) = (left, right)
+                    && l > r
+                    && self
+                        .fired
+                        .compare_exchange(
+                            false,
+                            true,
+                            Ordering::AcqRel,
+                            Ordering::Relaxed,
+                        )
+                        .is_ok()
+                {
+                    info!(
+                        "SwapBuildSideIfInverted: would flip HashJoinExec — \
+                         current build (left) = {l} rows, probe (right) = {r} \
+                         rows. flip_sides() not yet implemented; logging \
+                         intent only."
+                    );
+                }
+            }
+            return;
+        }
+        for child in plan.children() {
+            self.walk_for_swap(child);
+        }
+    }
+}
+
+/// Row count of a join input's subtree. Prefers runtime stats from the
+/// deepest node that reports `runtime_row_count` (typically an
+/// `AggregateExec` once its build phase has begun emitting). Falls back
+/// to plan-time `statistics()` for pure static-source subtrees (e.g. a
+/// small in-memory table behind a `CoalescePartitionsExec`).
+fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+    if let Some(rows) = sum_runtime_rows_in_subtree(plan) {
+        return Some(rows);
+    }
+    plan.statistics_with_args(&StatisticsArgs::new())
+        .ok()
+        .and_then(|s| s.num_rows.get_value().copied())
+}
+
+fn sum_runtime_rows_in_subtree(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+    let n = plan.output_partitioning().partition_count();
+    let mut total: usize = 0;
+    let mut found_any = false;
+    for p in 0..n {
+        if let Some(rows) = plan.runtime_row_count(p) {
+            total += rows;
+            found_any = true;
+        }
+    }
+    if found_any {
+        return Some(total);
+    }
+    for child in plan.children() {
+        if let Some(rows) = sum_runtime_rows_in_subtree(child) {
+            return Some(rows);
+        }
     }
+    None
 }
diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
index e660bbe1cdf8..4180b35d4b36 100644
--- a/datafusion/sqllogictest/test_files/runtime_optimizer.slt
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -90,14 +90,15 @@ SELECT count(*) FROM (SELECT group_key FROM big GROUP BY group_key) t;
 5
 
 # ---------------------------------------------------------------------------
-# EXPLAIN — RED today. Without RuntimeOptimizerExec, JoinSelection trusts
-# the (badly-estimated) aggregated_big size and picks `small` as the build
-# side. We're penciling in the target plan that RuntimeOptimizerExec will
-# produce: the join with `aggregated_big` as the left/build child and
-# `small` as the right/probe child, wrapped in a RuntimeOptimizerExec.
-#
-# Statistics decoration is intentionally minimal; an update-mode pass once
-# RuntimeOptimizerExec renders itself will fill in exact text.
+# EXPLAIN — shows the *static* plan with AQE primitives wrapped in
+# (RuntimeOptimizerExec at root; PipelineBreakerBuffer above each
+# pipeline breaker). JoinSelection picked `small` as build statically
+# because aggregated_big's row-count estimate (Inexact, ~100K) overstates
+# the actual (5). RuntimeOptimizerExec detects this inversion at runtime
+# and would call `HashJoinExec::flip_sides()` to swap — but flip_sides
+# isn't implemented yet, so for now the rule only logs intent. EXPLAIN
+# stays the static plan; the swap is verified via log output and (later)
+# via metrics.
 # ---------------------------------------------------------------------------
 query TT
 EXPLAIN SELECT bg.group_key, bg.sum_payload, s.payload
@@ -117,16 +118,16 @@ logical_plan
 08)------TableScan: small projection=[id, payload]
 physical_plan
 01)RuntimeOptimizerExec
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(group_key@0, id@0)], projection=[group_key@0, sum_payload@1, payload@3]
-03)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
-04)------PipelineBreakerBuffer
-05)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-06)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
-07)------------PipelineBreakerBuffer
-08)--------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-09)----------------DataSourceExec
-10)----CoalescePartitionsExec
-11)------DataSourceExec
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, group_key@0)], projection=[group_key@2, sum_payload@3, payload@1]
+03)----CoalescePartitionsExec
+04)------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
+05)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
+06)------PipelineBreakerBuffer
+07)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+08)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
+09)------------PipelineBreakerBuffer
+10)--------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+11)----------------DataSourceExec: partitions=4, partition_sizes=[4, 3, 3, 3]
 
 # ---------------------------------------------------------------------------
 # Result — GREEN regardless of build-side choice (correctness is plan-

From 9317cabc36a76529af3bca68cf94674cc47eef82 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 13:18:30 -0600
Subject: [PATCH 06/18] runtime_row_count: gate via buffer is_ready,
 passthrough ProjectionExec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The walk now relies on `runtime_row_count` propagating correctly via
operator-level passthrough rather than recursing into the subtree.
Three changes:

1. `PipelineBreakerBuffer::runtime_row_count` is a *gated passthrough*:
   returns `None` until `is_ready` flips, then delegates to its input.
   This is the natural stats-availability gate — rules can't observe
   the underlying breaker's row count until the breaker is actually
   done.

2. `ProjectionExec::runtime_row_count` is a plain passthrough — a
   projection doesn't change row count.

3. `AggregateExec::runtime_row_count` no longer drops `Some(0)` to
   `None`. An empty partition (zero distinct groups) should report 0,
   not "not yet started." The previous behavior killed all-must-report
   sums whenever any output partition was empty — exactly the case we
   hit with 5 groups hash-split across 4 partitions.

The walk loses its recursion: `sum_runtime_rows_across_partitions`
requires every output partition to report (`?` on `runtime_row_count`).
With the buffer gate, this is the right semantics: we only act once
the entire breaker is done.

End result: the SwapBuildSideIfInverted rule now fires with the
correct row count — `right = 5` (the true post-aggregate cardinality)
instead of the previous `right = 20` (Partial's per-partition output
inadvertently summed).
---
 .../physical-plan/src/aggregates/mod.rs       | 12 +++---
 .../src/pipeline_breaker_buffer.rs            | 11 ++++++
 datafusion/physical-plan/src/projection.rs    |  6 +++
 .../physical-plan/src/runtime_optimizer.rs    | 37 ++++++++-----------
 4 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 0c035013c7fb..e9437724adb0 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -1778,17 +1778,19 @@ impl ExecutionPlan for AggregateExec {
     /// each input partition independently knows how many partial-group
     /// rows it has produced.
     fn runtime_row_count(&self, partition: usize) -> Option<usize> {
-        let metrics = self.metrics.clone_inner();
-        let rows: usize = metrics
+        // Find the OutputRows metric registered for this partition.
+        // Returning `Some(0)` for an empty partition is correct — None
+        // means "this partition hasn't been started yet" (the metric
+        // doesn't exist).
+        self.metrics
+            .clone_inner()
             .iter()
-            .filter_map(|m| match (m.partition(), m.value()) {
+            .find_map(|m| match (m.partition(), m.value()) {
                 (Some(p), MetricValue::OutputRows(count)) if p == partition => {
                     Some(count.value())
                 }
                 _ => None,
             })
-            .sum();
-        if rows > 0 { Some(rows) } else { None }
     }
 
     fn statistics_with_args(&self, args: &StatisticsArgs) -> Result<Arc<Statistics>> {
diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
index 6230b214b946..9c416eb4867d 100644
--- a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
+++ b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
@@ -190,6 +190,17 @@ impl ExecutionPlan for PipelineBreakerBuffer {
         vec![true]
     }
 
+    /// Gated passthrough: rules only see runtime stats once the underlying
+    /// breaker is actually done (signalled by `is_ready`). Before that the
+    /// child's per-partition counts are partial / not meaningful for
+    /// adaptive decisions.
+    fn runtime_row_count(&self, partition: usize) -> Option<usize> {
+        if !self.is_ready() {
+            return None;
+        }
+        self.input.runtime_row_count(partition)
+    }
+
     fn execute(
         &self,
         partition: usize,
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index 16b0a5ad7e4b..067717774316 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -349,6 +349,12 @@ impl ExecutionPlan for ProjectionExec {
         Some(self.metrics.clone_inner())
     }
 
+    /// Passthrough: a projection doesn't change row count, so its
+    /// runtime row count equals its input's.
+    fn runtime_row_count(&self, partition: usize) -> Option<usize> {
+        self.input.runtime_row_count(partition)
+    }
+
     fn statistics_with_args(&self, args: &StatisticsArgs) -> Result<Arc<Statistics>> {
         let input_stats = Arc::unwrap_or_clone(
             args.compute_child_statistics(&self.input, args.partition())?,
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 18ee60f8ece8..9be2d2ee1694 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -280,13 +280,18 @@ impl SwapBuildSideIfInverted {
     }
 }
 
-/// Row count of a join input's subtree. Prefers runtime stats from the
-/// deepest node that reports `runtime_row_count` (typically an
-/// `AggregateExec` once its build phase has begun emitting). Falls back
-/// to plan-time `statistics()` for pure static-source subtrees (e.g. a
-/// small in-memory table behind a `CoalescePartitionsExec`).
+/// Row count of a join input's subtree. Trusts `runtime_row_count`
+/// to propagate correctly through passthrough operators (Projection,
+/// PipelineBreakerBuffer-when-ready, etc.); no recursive descent. The
+/// PipelineBreakerBuffer in the chain returns `None` until its own
+/// `is_ready`, which is the natural gate — rules only see runtime
+/// stats once the underlying breaker is actually done.
+///
+/// Falls back to plan-time `statistics()` for pure static-source
+/// subtrees (e.g. a small in-memory table behind a
+/// `CoalescePartitionsExec`).
 fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
-    if let Some(rows) = sum_runtime_rows_in_subtree(plan) {
+    if let Some(rows) = sum_runtime_rows_across_partitions(plan) {
         return Some(rows);
     }
     plan.statistics_with_args(&StatisticsArgs::new())
@@ -294,23 +299,13 @@ fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
         .and_then(|s| s.num_rows.get_value().copied())
 }
 
-fn sum_runtime_rows_in_subtree(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+fn sum_runtime_rows_across_partitions(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
     let n = plan.output_partitioning().partition_count();
     let mut total: usize = 0;
-    let mut found_any = false;
     for p in 0..n {
-        if let Some(rows) = plan.runtime_row_count(p) {
-            total += rows;
-            found_any = true;
-        }
-    }
-    if found_any {
-        return Some(total);
-    }
-    for child in plan.children() {
-        if let Some(rows) = sum_runtime_rows_in_subtree(child) {
-            return Some(rows);
-        }
+        // Require every partition to report — partial sums are not
+        // meaningful for adaptive decisions.
+        total += plan.runtime_row_count(p)?;
     }
-    None
+    Some(total)
 }

From 09a6abccf203223f69b5d6b714f206557381527c Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Wed, 24 Jun 2026 13:31:56 -0600
Subject: [PATCH 07/18] Document why pipeline_behavior() isn't suitable for
 breaker detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tried `pipeline_behavior() == EmissionType::Final` to replace the
hardcoded match. It's the wrong abstraction here — that flag describes
an operator's output *emission semantics* and is inherited from
descendants. Projection, Repartition, and HashJoin sitting above a
Final-emitting AggregateExec all report Final too, so the rule wraps
every such ancestor.

The transition-point filter (`Final && all children != Final`) gets
closer but still misses cascading breakers: AggregateExec(FinalPartitioned)
has children that are themselves Final (because of the Partial below),
so it fails the "all children non-Final" predicate even though it's
itself a genuine breaker.

Keeping the hardcoded match against AggregateExec / SortExec with a
comment explaining the trade-off. A future `is_pipeline_breaker()`
trait method (the API that existed before #13823 split execution_mode
into emission_type + boundedness) would be the principled fix and a
candidate upstream contribution.
---
 .../src/runtime_optimizer.rs                   | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index 491c23a45a1d..fe8698394577 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -109,8 +109,22 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
 }
 
 /// Returns true for operators that absorb their entire input before
-/// emitting (the canonical "pipeline breaker" definition). Start with
-/// the obvious cases; extend as more rules need other breakers.
+/// emitting any output — the canonical "pipeline breaker" definition.
+///
+/// We can't use `pipeline_behavior() == EmissionType::Final` here even
+/// though it sounds equivalent. That flag describes an operator's output
+/// emission semantics and is inherited from descendants — so `Projection`,
+/// `Repartition`, and `HashJoin` above a Final-emitting `AggregateExec`
+/// all report `Final` too. We need the *originator* of the pipeline
+/// break, not every operator downstream of one.
+///
+/// `EmissionType::Final && all children != Final` (the transition-point
+/// filter) is closer but still misses cascading breakers like
+/// `AggregateExec(FinalPartitioned)` whose children are themselves
+/// Final because of the `Partial` aggregate below.
+///
+/// So: hardcoded match against the operators we want to instrument.
+/// Extend as more rules need other breakers.
 fn is_pipeline_breaker(plan: &Arc<dyn ExecutionPlan>) -> bool {
     plan.downcast_ref::<AggregateExec>().is_some()
         || plan.downcast_ref::<SortExec>().is_some()

From 2154a9121c1aebe23995f7f52cd04e3e2993948b Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 10:17:18 -0600
Subject: [PATCH 08/18] PipelineBreakerBuffer: passthrough statistics_with_args
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default impl at execution_plan.rs:528 returns Statistics::new_unknown.
Without this override, wrapping a subtree in a buffer blackholes its static
stats — breaking the static fallback in side_runtime_rows when runtime
stats aren't yet available (e.g. the left side of a HashJoin with a small
dim table that has Exact stats statically).

Prerequisite for shifting the insertion rule onto HashJoin children.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/pipeline_breaker_buffer.rs     | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
index 9c416eb4867d..3f003dbe090d 100644
--- a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
+++ b/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
@@ -52,10 +52,11 @@ use datafusion_execution::TaskContext;
 use futures::task::AtomicWaker;
 use futures::{Stream, StreamExt};
 
+use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{
     DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
-    SendableRecordBatchStream,
+    SendableRecordBatchStream, Statistics,
 };
 
 #[derive(Debug)]
@@ -201,6 +202,15 @@ impl ExecutionPlan for PipelineBreakerBuffer {
         self.input.runtime_row_count(partition)
     }
 
+    /// Passthrough: the buffer doesn't change row counts or column stats.
+    /// Without this override, the default impl at execution_plan.rs:528
+    /// returns Statistics::new_unknown — so wrapping any subtree in a
+    /// buffer would blackhole the static stats `side_runtime_rows`
+    /// falls back to when runtime stats aren't yet available.
+    fn statistics_with_args(&self, args: &StatisticsArgs) -> Result<Arc<Statistics>> {
+        args.compute_child_statistics(&self.input, args.partition())
+    }
+
     fn execute(
         &self,
         partition: usize,

From 4fe53aa6022a0ebb0783b4e0f15635889807d6d4 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 10:25:03 -0600
Subject: [PATCH 09/18] Rename PipelineBreakerBuffer to StageBoundaryBuffer,
 add stage field

The buffer is becoming a stage boundary: with materialization (next
commit) it owns its input's drain and IS the breaker, not a passthrough
above one. New name reflects that. Display string carries the stage
number ("StageBoundaryBuffer: stage=N") so EXPLAIN and logs can talk
about stages explicitly.

All buffers currently get stage=0; bottom-up stage numbering comes with
the insertion-rule shift to HashJoin children.

Pure rename + plumbing. No behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/runtime_optimizer.rs                  | 21 +++++-----
 datafusion/physical-plan/src/lib.rs           |  2 +-
 .../physical-plan/src/runtime_optimizer.rs    | 14 +++----
 ...ker_buffer.rs => stage_boundary_buffer.rs} | 38 +++++++++++++------
 .../test_files/runtime_optimizer.slt          |  6 +--
 5 files changed, 48 insertions(+), 33 deletions(-)
 rename datafusion/physical-plan/src/{pipeline_breaker_buffer.rs => stage_boundary_buffer.rs} (91%)

diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index fe8698394577..f1da9efa5091 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -19,18 +19,15 @@
 //!
 //! 1. Walks the plan tree, wrapping every pipeline-breaking operator
 //!    (currently `AggregateExec` and `SortExec`) in a
-//!    [`PipelineBreakerBuffer`]. The buffer is the synchronization point
+//!    [`StageBoundaryBuffer`]. The boundary is the synchronization point
 //!    where runtime stats become observable.
 //! 2. Wraps the resulting plan root in a [`RuntimeOptimizerExec`], which
 //!    walks the subtree on each `poll_next` and releases ready buffers.
 //!
 //! A shared [`AtomicWaker`] is constructed here and threaded into both
-//! the buffers and the wrapping RTO so buffers can wake the coordinator
-//! from inside spawned subtasks (e.g. `RepartitionExec` internals). The
-//! default is permissive release — the next commit will add a
-//! `Vec<RuntimeRule>` that runs between "all ready" and "release," able
-//! to mutate adaptive operators (`HashJoinExec::flip_sides`) or hold
-//! specific buffers.
+//! the boundaries and the wrapping RTO so boundaries can wake the
+//! coordinator from inside spawned subtasks (e.g. `RepartitionExec`
+//! internals).
 
 use std::sync::Arc;
 
@@ -40,11 +37,11 @@ use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::AggregateExec;
-use datafusion_physical_plan::pipeline_breaker_buffer::PipelineBreakerBuffer;
 use datafusion_physical_plan::runtime_optimizer::{
     RuntimeOptimizerExec, RuntimeRule, SwapBuildSideIfInverted,
 };
 use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::stage_boundary_buffer::StageBoundaryBuffer;
 use futures::task::AtomicWaker;
 
 #[derive(Default, Debug)]
@@ -77,14 +74,16 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
         // boundaries (e.g. RepartitionExec's internals), this does.
         let rto_waker = Arc::new(AtomicWaker::new());
 
-        // Phase 1: wrap each pipeline breaker in a PipelineBreakerBuffer.
+        // Phase 1: wrap each pipeline breaker in a StageBoundaryBuffer.
+        // Stage numbering is uniform (0) for now; the next commit assigns
+        // real bottom-up stage numbers.
         let with_buffers = plan
             .transform_up(|node| {
                 if is_pipeline_breaker(&node)
-                    && node.downcast_ref::<PipelineBreakerBuffer>().is_none()
+                    && node.downcast_ref::<StageBoundaryBuffer>().is_none()
                 {
                     let buffered: Arc<dyn ExecutionPlan> = Arc::new(
-                        PipelineBreakerBuffer::new(node, Arc::clone(&rto_waker)),
+                        StageBoundaryBuffer::new(node, 0, Arc::clone(&rto_waker)),
                     );
                     Ok(Transformed::yes(buffered))
                 } else {
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 48187755a649..59c730e21265 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -82,7 +82,6 @@ pub mod limit;
 pub mod memory;
 pub mod metrics;
 pub mod operator_statistics;
-pub mod pipeline_breaker_buffer;
 pub mod placeholder_row;
 pub mod projection;
 pub mod recursive_query;
@@ -92,6 +91,7 @@ pub mod scalar_subquery;
 pub mod sort_pushdown;
 pub mod sorts;
 pub mod spill;
+pub mod stage_boundary_buffer;
 pub mod statistics;
 pub mod stream;
 pub mod streaming;
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 9be2d2ee1694..dcab5a039eeb 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 //! Coordinator wrapper at the root of the plan. On every poll, it walks
-//! its subtree to release any [`PipelineBreakerBuffer`] whose `is_ready`
+//! its subtree to release any [`StageBoundaryBuffer`] whose `is_ready`
 //! flag has flipped, then runs each registered [`RuntimeRule`] over the
 //! plan. Rules observe runtime stats (via `runtime_row_count` and
 //! similar) and mutate adaptive operators in place (e.g.
@@ -40,7 +40,7 @@ use futures::{Stream, StreamExt};
 use log::info;
 
 use crate::joins::HashJoinExec;
-use crate::pipeline_breaker_buffer::PipelineBreakerBuffer;
+use crate::stage_boundary_buffer::StageBoundaryBuffer;
 use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{
@@ -63,7 +63,7 @@ pub trait RuntimeRule: Send + Sync + std::fmt::Debug {
 pub struct RuntimeOptimizerExec {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
-    /// Shared with every `PipelineBreakerBuffer` in this subplan.
+    /// Shared with every `StageBoundaryBuffer` in this subplan.
     /// Buffers wake this AtomicWaker when `is_ready` flips; we register
     /// the current task's waker on it during each `poll_next` so a
     /// wake-up from inside a spawned subtask reaches the actual
@@ -181,7 +181,7 @@ impl Stream for CoordinatorStream {
 }
 
 fn propose_release_for_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
-    if let Some(buffer) = plan.downcast_ref::<PipelineBreakerBuffer>() {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
         buffer.set_streaming_enabled(buffer.is_ready());
     }
     for child in plan.children() {
@@ -190,7 +190,7 @@ fn propose_release_for_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
 }
 
 fn start_streaming_on_enabled_buffers(plan: &Arc<dyn ExecutionPlan>) {
-    if let Some(buffer) = plan.downcast_ref::<PipelineBreakerBuffer>()
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>()
         && buffer.streaming_enabled()
     {
         buffer.start_streaming();
@@ -282,8 +282,8 @@ impl SwapBuildSideIfInverted {
 
 /// Row count of a join input's subtree. Trusts `runtime_row_count`
 /// to propagate correctly through passthrough operators (Projection,
-/// PipelineBreakerBuffer-when-ready, etc.); no recursive descent. The
-/// PipelineBreakerBuffer in the chain returns `None` until its own
+/// StageBoundaryBuffer-when-ready, etc.); no recursive descent. The
+/// StageBoundaryBuffer in the chain returns `None` until its own
 /// `is_ready`, which is the natural gate — rules only see runtime
 /// stats once the underlying breaker is actually done.
 ///
diff --git a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs b/datafusion/physical-plan/src/stage_boundary_buffer.rs
similarity index 91%
rename from datafusion/physical-plan/src/pipeline_breaker_buffer.rs
rename to datafusion/physical-plan/src/stage_boundary_buffer.rs
index 3f003dbe090d..aa203f97c6a6 100644
--- a/datafusion/physical-plan/src/pipeline_breaker_buffer.rs
+++ b/datafusion/physical-plan/src/stage_boundary_buffer.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Synchronization wrapper above a pipeline-breaking operator. Holds the
-//! breaker's first batch per partition so that runtime stats become
-//! observable to the coordinator
+//! Stage-boundary buffer between an input subtree and its consumer.
+//! Holds the input's first batch per partition so that runtime stats
+//! become observable to the coordinator
 //! ([`crate::runtime_optimizer::RuntimeOptimizerExec`]) before downstream
-//! consumers see any data.
+//! consumers see any data. Each boundary carries a `stage` number assigned
+//! by the optimizer at insertion time so RTO and EXPLAIN can talk about
+//! stages explicitly.
 //!
 //! Three flags govern behavior:
 //! - `is_ready` (mechanical): set automatically when every input partition
@@ -60,10 +62,14 @@ use crate::{
 };
 
 #[derive(Debug)]
-pub struct PipelineBreakerBuffer {
+pub struct StageBoundaryBuffer {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
     state: Arc<Mutex<BufferState>>,
+    /// Stage number assigned by the inserting optimizer rule. Lowest
+    /// stage runs first; once released, its consumers (next stage up)
+    /// can prime. Used by EXPLAIN, logs, and (eventually) RTO ordering.
+    stage: usize,
     /// Shared with `RuntimeOptimizerExec`. Buffer wakes this whenever
     /// `is_ready` flips so the coordinator (which may live above a
     /// task-spawning operator like `RepartitionExec`) gets re-polled.
@@ -95,8 +101,12 @@ struct BufferState {
     wakers: HashMap<usize, Waker>,
 }
 
-impl PipelineBreakerBuffer {
-    pub fn new(input: Arc<dyn ExecutionPlan>, rto_waker: Arc<AtomicWaker>) -> Self {
+impl StageBoundaryBuffer {
+    pub fn new(
+        input: Arc<dyn ExecutionPlan>,
+        stage: usize,
+        rto_waker: Arc<AtomicWaker>,
+    ) -> Self {
         let num_partitions = input.output_partitioning().partition_count();
         let cache = Arc::clone(input.properties());
         Self {
@@ -111,10 +121,15 @@ impl PipelineBreakerBuffer {
                 num_partitions,
                 wakers: HashMap::new(),
             })),
+            stage,
             rto_waker,
         }
     }
 
+    pub fn stage(&self) -> usize {
+        self.stage
+    }
+
     /// True once every input partition has produced its first poll result.
     pub fn is_ready(&self) -> bool {
         self.state.lock().unwrap().is_ready
@@ -154,19 +169,19 @@ impl PipelineBreakerBuffer {
     }
 }
 
-impl DisplayAs for PipelineBreakerBuffer {
+impl DisplayAs for StageBoundaryBuffer {
     fn fmt_as(
         &self,
         _t: DisplayFormatType,
         f: &mut std::fmt::Formatter,
     ) -> std::fmt::Result {
-        write!(f, "PipelineBreakerBuffer")
+        write!(f, "StageBoundaryBuffer: stage={}", self.stage)
     }
 }
 
-impl ExecutionPlan for PipelineBreakerBuffer {
+impl ExecutionPlan for StageBoundaryBuffer {
     fn name(&self) -> &'static str {
-        "PipelineBreakerBuffer"
+        "StageBoundaryBuffer"
     }
 
     fn properties(&self) -> &Arc<PlanProperties> {
@@ -183,6 +198,7 @@ impl ExecutionPlan for PipelineBreakerBuffer {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         Ok(Arc::new(Self::new(
             children.swap_remove(0),
+            self.stage,
             Arc::clone(&self.rto_waker),
         )))
     }
diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
index 4180b35d4b36..10402592654e 100644
--- a/datafusion/sqllogictest/test_files/runtime_optimizer.slt
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -91,7 +91,7 @@ SELECT count(*) FROM (SELECT group_key FROM big GROUP BY group_key) t;
 
 # ---------------------------------------------------------------------------
 # EXPLAIN — shows the *static* plan with AQE primitives wrapped in
-# (RuntimeOptimizerExec at root; PipelineBreakerBuffer above each
+# (RuntimeOptimizerExec at root; StageBoundaryBuffer above each
 # pipeline breaker). JoinSelection picked `small` as build statically
 # because aggregated_big's row-count estimate (Inexact, ~100K) overstates
 # the actual (5). RuntimeOptimizerExec detects this inversion at runtime
@@ -122,10 +122,10 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
 05)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
-06)------PipelineBreakerBuffer
+06)------StageBoundaryBuffer: stage=0
 07)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
 08)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
-09)------------PipelineBreakerBuffer
+09)------------StageBoundaryBuffer: stage=0
 10)--------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
 11)----------------DataSourceExec: partitions=4, partition_sizes=[4, 3, 3, 3]
 

From eec37dcdf376fb7a5b0d4cb78e13ab7561da2c8b Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 11:08:58 -0600
Subject: [PATCH 10/18] StageBoundaryBuffer: full-drain materialization +
 prime(ctx)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces first-batch holding with per-partition full materialization.
The boundary IS the breaker now, not a synchronizer above one.

Mechanics:
- prime(ctx) (called by RTO) spawns one SpawnedTask per input partition,
  each pulling input.execute(p, ctx) to EOF and ferrying batches through
  an unbounded mpsc. SpawnedTask auto-aborts on Drop → query cancel
  cleans up cleanly.
- execute(p, _) hands back the per-partition receiver wrapped in a
  ConsumerStream that gates on streaming_started, then forwards.
  Never touches input.execute itself.
- is_ready flips when every drain task has reached EOF.

Per-partition tx and rx are separately taken (independent Options) so
prime and execute can arrive in either order without one starving the
other (caught a deadlock first when execute took the whole ChannelEnds
before prime ran).

RTO's execute() now walks the subtree and primes every StageBoundaryBuffer
it finds. Without this, consumers above buffers would sit Pending forever
(HashJoin in CollectLeft never polls probe until build completes — but
build itself is now gated behind a buffer). prime is idempotent across
the multiple execute() calls a multi-partition RTO may receive.

Memory: every primed buffer holds its full input until release. Spill is
a follow-up; OomGuard catches genuine OOM in the meantime. TODO comment
above the channel type aliases makes the spill plan visible to readers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/runtime_optimizer.rs    |  22 +-
 .../src/stage_boundary_buffer.rs              | 309 ++++++++++--------
 2 files changed, 197 insertions(+), 134 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index dcab5a039eeb..ce1ea6dee9cc 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -131,7 +131,14 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        let child = self.input.execute(partition, context)?;
+        let child = self.input.execute(partition, Arc::clone(&context))?;
+        // Side-channel drive every StageBoundaryBuffer in the subtree.
+        // Buffers don't pull from their inputs until primed, so without
+        // this the consumers above them would sit Pending forever (a
+        // HashJoin in CollectLeft mode never polls its probe side until
+        // build completes — but build itself is gated behind a buffer).
+        // prime() is idempotent across partitions.
+        prime_all_buffers(&self.input, &context)?;
         let schema = self.schema();
         let stream = CoordinatorStream {
             child,
@@ -143,6 +150,19 @@ impl ExecutionPlan for RuntimeOptimizerExec {
     }
 }
 
+fn prime_all_buffers(
+    plan: &Arc<dyn ExecutionPlan>,
+    ctx: &Arc<TaskContext>,
+) -> Result<()> {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
+        buffer.prime(ctx)?;
+    }
+    for child in plan.children() {
+        prime_all_buffers(child, ctx)?;
+    }
+    Ok(())
+}
+
 struct CoordinatorStream {
     child: SendableRecordBatchStream,
     plan: Arc<dyn ExecutionPlan>,
diff --git a/datafusion/physical-plan/src/stage_boundary_buffer.rs b/datafusion/physical-plan/src/stage_boundary_buffer.rs
index aa203f97c6a6..cfb18efbb402 100644
--- a/datafusion/physical-plan/src/stage_boundary_buffer.rs
+++ b/datafusion/physical-plan/src/stage_boundary_buffer.rs
@@ -15,44 +15,54 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Stage-boundary buffer between an input subtree and its consumer.
-//! Holds the input's first batch per partition so that runtime stats
-//! become observable to the coordinator
-//! ([`crate::runtime_optimizer::RuntimeOptimizerExec`]) before downstream
-//! consumers see any data. Each boundary carries a `stage` number assigned
-//! by the optimizer at insertion time so RTO and EXPLAIN can talk about
-//! stages explicitly.
+//! Stage-boundary buffer between an input subtree and its consumer. The
+//! boundary IS a pipeline breaker: it drains its input fully via tasks
+//! spawned by [`StageBoundaryBuffer::prime`], stages the result, and
+//! releases to the consumer only when
+//! [`crate::runtime_optimizer::RuntimeOptimizerExec`] (RTO) decides.
+//! Each boundary carries a `stage` number assigned by the optimizer at
+//! insertion time so RTO and EXPLAIN can talk about stages explicitly.
 //!
-//! Three flags govern behavior:
-//! - `is_ready` (mechanical): set automatically when every input partition
-//!   has produced its first poll result. Runtime stats from the breaker
-//!   become derivable.
-//! - `streaming_enabled` (rule-controlled proposal): reset each poll cycle
-//!   by RTO — set to `is_ready` as the permissive default. Rules can
-//!   flip it back to `false` to veto release. No side effects.
+//! Lifecycle, separating fill from drain:
+//! - `prime(ctx)` (called by RTO): spawns one drain task per input
+//!   partition; each pulls until EOF, ferrying batches through an
+//!   unbounded mpsc per partition. Idempotent.
+//! - `execute(p, _)` (called by the consumer): hands back the
+//!   per-partition receiver wrapped in a gated stream. Never touches
+//!   `input.execute`; that's prime's job.
+//!
+//! Three flags govern release:
+//! - `is_ready` (mechanical): set automatically when every drain task
+//!   has hit EOF. Runtime stats from the input become derivable.
+//! - `streaming_enabled` (rule-controlled proposal): reset each poll
+//!   cycle by RTO — set to `is_ready` as the permissive default. Rules
+//!   can flip it to false to veto release. No side effects.
 //! - `streaming_started` (actual emission control): only RTO flips this,
-//!   via `start_streaming()`, which also wakes per-partition wakers.
-//!   Once true, partition streams emit their held batches and continue
-//!   forwarding.
+//!   via `start_streaming()`. Once true, the gated consumer streams
+//!   start forwarding from their receivers.
 //!
 //! Coordination uses a shared [`AtomicWaker`] (`rto_waker`) populated at
-//! plan time by [`crate::runtime_optimizer::RuntimeOptimizerExec`]. The
-//! buffer wakes it when `is_ready` flips; the coordinator is registered
-//! on it via its own `poll_next`. We use this side-channel rather than
-//! `cx.waker()` because the latter is task-local — inside a spawned task
-//! (e.g. one of `RepartitionExec`'s internals) it never reaches the
-//! top-of-plan task.
+//! plan time by RTO. The buffer wakes it when `is_ready` flips; the
+//! coordinator is registered on it via its own `poll_next`. Side-channel
+//! instead of `cx.waker()` because the latter is task-local — inside a
+//! spawned task (e.g. one of `RepartitionExec`'s internals) it never
+//! reaches the top-of-plan task.
+//!
+//! Memory: every primed buffer holds its full input until release.
+//! Spill is a follow-up; OomGuard catches genuine OOM in the meantime.
 
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll, Waker};
 
 use arrow::array::RecordBatch;
 use datafusion_common::Result;
+use datafusion_common_runtime::SpawnedTask;
 use datafusion_execution::TaskContext;
 use futures::task::AtomicWaker;
 use futures::{Stream, StreamExt};
+use tokio::sync::mpsc;
 
 use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
@@ -61,11 +71,17 @@ use crate::{
     SendableRecordBatchStream, Statistics,
 };
 
+// TODO(spill): the per-partition channels below currently hold the full
+// materialized input in memory. A follow-up will swap each channel for a
+// spillable buffer (writing to disk under MemoryPool pressure) so that
+// large stage boundaries don't OOM. OomGuard is the safety net until then.
+type PartitionTxs = Vec<Option<mpsc::UnboundedSender<Result<RecordBatch>>>>;
+type PartitionRxs = Vec<Option<mpsc::UnboundedReceiver<Result<RecordBatch>>>>;
+
 #[derive(Debug)]
 pub struct StageBoundaryBuffer {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
-    state: Arc<Mutex<BufferState>>,
     /// Stage number assigned by the inserting optimizer rule. Lowest
     /// stage runs first; once released, its consumers (next stage up)
     /// can prime. Used by EXPLAIN, logs, and (eventually) RTO ordering.
@@ -73,31 +89,35 @@ pub struct StageBoundaryBuffer {
     /// Shared with `RuntimeOptimizerExec`. Buffer wakes this whenever
     /// `is_ready` flips so the coordinator (which may live above a
     /// task-spawning operator like `RepartitionExec`) gets re-polled.
-    /// Per-partition `cx.waker()` alone is insufficient — it's the
-    /// local task's waker, which doesn't propagate across the spawned
-    /// task boundary that `RepartitionExec` introduces.
     rto_waker: Arc<AtomicWaker>,
+    /// Gate flags + drain progress + consumer-side wakers.
+    state: Arc<Mutex<BufferState>>,
+    /// Per-partition sender. Moved out (`Option::take`) into the drain
+    /// task at `prime()`. Independent from `rxs` because `prime` and
+    /// `execute` arrive in unspecified order.
+    txs: Arc<Mutex<PartitionTxs>>,
+    /// Per-partition receiver. Moved out into the consumer stream at
+    /// `execute()`.
+    rxs: Arc<Mutex<PartitionRxs>>,
+    /// Handles to spawned drain tasks. Auto-abort on Drop via
+    /// `SpawnedTask::Drop`, so query cancellation cleans up cleanly.
+    drain_tasks: Arc<Mutex<Vec<SpawnedTask<()>>>>,
 }
 
 #[derive(Debug)]
 struct BufferState {
-    /// First batch per input partition. Absent for partitions whose input
-    /// terminated empty.
-    held: HashMap<usize, RecordBatch>,
-    /// Partitions whose first poll has completed (with or without a batch).
-    seen: HashSet<usize>,
+    /// Partitions whose drain task reached EOF (success or error). When
+    /// `drained_count == num_partitions`, `is_ready` flips.
+    drained_count: usize,
     is_ready: bool,
-    /// Rule-controllable proposal. Reset by RTO each poll cycle: set
-    /// to `is_ready` as the permissive default, then rules may flip it
+    /// Rule-controllable proposal. Reset by RTO each poll cycle: set to
+    /// `is_ready` as the permissive default, then rules may flip it
     /// to false to veto.
     streaming_enabled: bool,
     /// Actual emission control. Only RTO flips this via `start_streaming`.
-    /// Per-partition streams check this; while false, they hold their
-    /// first batch and return Pending.
     streaming_started: bool,
     num_partitions: usize,
-    /// Wakers stashed by per-partition streams while awaiting
-    /// `streaming_started`.
+    /// Wakers stashed by consumer streams while gated on `streaming_started`.
     wakers: HashMap<usize, Waker>,
 }
 
@@ -109,20 +129,28 @@ impl StageBoundaryBuffer {
     ) -> Self {
         let num_partitions = input.output_partitioning().partition_count();
         let cache = Arc::clone(input.properties());
+        let (txs, rxs): (Vec<_>, Vec<_>) = (0..num_partitions)
+            .map(|_| {
+                let (tx, rx) = mpsc::unbounded_channel();
+                (Some(tx), Some(rx))
+            })
+            .unzip();
         Self {
             input,
             cache,
+            stage,
+            rto_waker,
             state: Arc::new(Mutex::new(BufferState {
-                held: HashMap::new(),
-                seen: HashSet::new(),
+                drained_count: 0,
                 is_ready: false,
                 streaming_enabled: false,
                 streaming_started: false,
                 num_partitions,
                 wakers: HashMap::new(),
             })),
-            stage,
-            rto_waker,
+            txs: Arc::new(Mutex::new(txs)),
+            rxs: Arc::new(Mutex::new(rxs)),
+            drain_tasks: Arc::new(Mutex::new(Vec::new())),
         }
     }
 
@@ -130,7 +158,32 @@ impl StageBoundaryBuffer {
         self.stage
     }
 
-    /// True once every input partition has produced its first poll result.
+    /// Spawn drain tasks for every input partition. Each task pulls
+    /// `input.execute(p, ctx)` to EOF, ferrying batches and errors
+    /// through the per-partition channel; once EOF is reached, marks
+    /// the partition drained and wakes RTO if this was the last one.
+    /// Idempotent. Only RTO should call this.
+    pub fn prime(&self, ctx: &Arc<TaskContext>) -> Result<()> {
+        let mut tasks = self.drain_tasks.lock().unwrap();
+        if !tasks.is_empty() {
+            return Ok(());
+        }
+        let mut txs = self.txs.lock().unwrap();
+        for (partition, slot) in txs.iter_mut().enumerate() {
+            let Some(tx) = slot.take() else {
+                continue;
+            };
+            let stream = self.input.execute(partition, Arc::clone(ctx))?;
+            let state = Arc::clone(&self.state);
+            let rto_waker = Arc::clone(&self.rto_waker);
+            tasks.push(SpawnedTask::spawn(drain_partition(
+                partition, stream, tx, state, rto_waker,
+            )));
+        }
+        Ok(())
+    }
+
+    /// True once every drain task has reached EOF.
     pub fn is_ready(&self) -> bool {
         self.state.lock().unwrap().is_ready
     }
@@ -146,14 +199,13 @@ impl StageBoundaryBuffer {
         self.state.lock().unwrap().streaming_enabled = enabled;
     }
 
-    /// True once `start_streaming` has been called. After this, per-
-    /// partition streams emit their held batches and resume forwarding.
+    /// True once `start_streaming` has been called.
     pub fn streaming_started(&self) -> bool {
         self.state.lock().unwrap().streaming_started
     }
 
     /// Start actual emission: flips `streaming_started` and wakes all
-    /// per-partition wakers. Idempotent. Only RTO should call this.
+    /// per-partition consumer wakers. Idempotent. Only RTO should call this.
     pub fn start_streaming(&self) {
         let wakers = {
             let mut state = self.state.lock().unwrap();
@@ -169,6 +221,53 @@ impl StageBoundaryBuffer {
     }
 }
 
+async fn drain_partition(
+    partition: usize,
+    mut stream: SendableRecordBatchStream,
+    tx: mpsc::UnboundedSender<Result<RecordBatch>>,
+    state: Arc<Mutex<BufferState>>,
+    rto_waker: Arc<AtomicWaker>,
+) {
+    while let Some(item) = stream.next().await {
+        // tx.send returning Err means the consumer dropped its receiver
+        // (query cancelled while we were mid-drain). Stop pulling.
+        let is_err = item.is_err();
+        if tx.send(item).is_err() {
+            break;
+        }
+        if is_err {
+            break;
+        }
+    }
+    let became_ready = {
+        let mut state = state.lock().unwrap();
+        state.drained_count += 1;
+        if state.drained_count == state.num_partitions && !state.is_ready {
+            state.is_ready = true;
+            true
+        } else {
+            false
+        }
+    };
+    if became_ready {
+        // cx.waker() inside this spawned task is task-local and won't
+        // reach the top-of-plan task. The shared AtomicWaker bridges.
+        rto_waker.wake();
+    }
+    let _ = partition; // reserved for future per-partition diagnostics
+}
+
+impl Drop for StageBoundaryBuffer {
+    fn drop(&mut self) {
+        // SpawnedTask aborts on Drop. Clearing the Vec triggers that
+        // for every drain task. Safe to ignore poisoned lock — at Drop
+        // time we just want to release resources.
+        if let Ok(mut tasks) = self.drain_tasks.lock() {
+            tasks.clear();
+        }
+    }
+}
+
 impl DisplayAs for StageBoundaryBuffer {
     fn fmt_as(
         &self,
@@ -207,8 +306,8 @@ impl ExecutionPlan for StageBoundaryBuffer {
         vec![true]
     }
 
-    /// Gated passthrough: rules only see runtime stats once the underlying
-    /// breaker is actually done (signalled by `is_ready`). Before that the
+    /// Gated passthrough: rules only see runtime stats once every drain
+    /// task has reached EOF (signalled by `is_ready`). Before that the
     /// child's per-partition counts are partial / not meaningful for
     /// adaptive decisions.
     fn runtime_row_count(&self, partition: usize) -> Option<usize> {
@@ -230,44 +329,39 @@ impl ExecutionPlan for StageBoundaryBuffer {
     fn execute(
         &self,
         partition: usize,
-        context: Arc<TaskContext>,
+        _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        let input = self.input.execute(partition, context)?;
-        let schema = self.schema();
-        let stream = BufferStream {
-            phase: Phase::NeedFirstBatch,
-            input,
+        // Take ownership of this partition's receiver. prime() owns the
+        // sender side; we just gate-and-forward here.
+        let rx = self
+            .rxs
+            .lock()
+            .unwrap()
+            .get_mut(partition)
+            .and_then(|slot| slot.take())
+            .ok_or_else(|| {
+                datafusion_common::DataFusionError::Internal(format!(
+                    "StageBoundaryBuffer::execute called twice (or partition \
+                     {partition} out of range)"
+                ))
+            })?;
+        let stream = ConsumerStream {
             partition,
+            rx,
             state: Arc::clone(&self.state),
-            rto_waker: Arc::clone(&self.rto_waker),
         };
+        let schema = self.schema();
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
     }
 }
 
-#[derive(Debug)]
-enum Phase {
-    /// Haven't yet pulled the first batch from this input partition.
-    NeedFirstBatch,
-    /// First batch absorbed (or input was empty). Holding until RTO calls
-    /// `start_streaming`.
-    WaitForStreaming,
-    /// `streaming_started` is set; emit our held batch (if any) then
-    /// transition to streaming.
-    EmitHeld,
-    /// Pass through input batches as they arrive.
-    Streaming,
-}
-
-struct BufferStream {
-    phase: Phase,
-    input: SendableRecordBatchStream,
+struct ConsumerStream {
     partition: usize,
-    rto_waker: Arc<AtomicWaker>,
+    rx: mpsc::UnboundedReceiver<Result<RecordBatch>>,
     state: Arc<Mutex<BufferState>>,
 }
 
-impl Stream for BufferStream {
+impl Stream for ConsumerStream {
     type Item = Result<RecordBatch>;
 
     fn poll_next(
@@ -275,65 +369,14 @@ impl Stream for BufferStream {
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
         let this = self.as_mut().get_mut();
-        loop {
-            match this.phase {
-                Phase::NeedFirstBatch => {
-                    let next = match this.input.poll_next_unpin(cx) {
-                        Poll::Ready(x) => x,
-                        Poll::Pending => return Poll::Pending,
-                    };
-                    let became_ready = {
-                        let mut state = this.state.lock().unwrap();
-                        state.seen.insert(this.partition);
-                        match next {
-                            Some(Ok(batch)) => {
-                                state.held.insert(this.partition, batch);
-                            }
-                            Some(Err(e)) => {
-                                return Poll::Ready(Some(Err(e)));
-                            }
-                            None => { /* empty partition; nothing to hold */ }
-                        }
-                        if state.seen.len() == state.num_partitions && !state.is_ready {
-                            state.is_ready = true;
-                            true
-                        } else {
-                            false
-                        }
-                    };
-                    if became_ready {
-                        // Wake the coordinator. `cx.waker()` alone is
-                        // task-local — if we're inside a spawned task
-                        // (e.g. one of RepartitionExec's internals), it
-                        // never reaches RTO. The shared AtomicWaker is
-                        // populated by RTO on each of its own polls and
-                        // wakes the actual top-of-plan task.
-                        this.rto_waker.wake();
-                    }
-                    this.phase = Phase::WaitForStreaming;
-                }
-                Phase::WaitForStreaming => {
-                    let mut state = this.state.lock().unwrap();
-                    if state.streaming_started {
-                        drop(state);
-                        this.phase = Phase::EmitHeld;
-                        continue;
-                    }
-                    state.wakers.insert(this.partition, cx.waker().clone());
-                    return Poll::Pending;
-                }
-                Phase::EmitHeld => {
-                    let held = this.state.lock().unwrap().held.remove(&this.partition);
-                    this.phase = Phase::Streaming;
-                    if let Some(batch) = held {
-                        return Poll::Ready(Some(Ok(batch)));
-                    }
-                    // Empty-partition case: fall through to streaming.
-                }
-                Phase::Streaming => {
-                    return this.input.poll_next_unpin(cx);
-                }
+        // Gate: don't emit anything until RTO releases the boundary.
+        {
+            let mut state = this.state.lock().unwrap();
+            if !state.streaming_started {
+                state.wakers.insert(this.partition, cx.waker().clone());
+                return Poll::Pending;
             }
         }
+        this.rx.poll_recv(cx)
     }
 }

From ea64bdeb04ddb10920b6da4fee1bd2ae9c67e367 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 11:24:31 -0600
Subject: [PATCH 11/18] Split InsertRuntimeOptimizer into two rules; per-buffer
 wakers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before: a single optimizer rule did two jobs in one pass — wrap each
pipeline breaker in a buffer, then wrap the root in a RuntimeOptimizerExec.
A shared Arc<AtomicWaker> was constructed in that call and threaded
through both.

After: two independent PhysicalOptimizerRules.
  1. InsertStageBoundariesAtBreakers — wraps Agg/Sort in
     StageBoundaryBuffer. Same Agg/Sort target as before; the temporary
     name reflects current behavior. The next commit retargets it to
     HashJoin inputs (rename + bottom-up stage numbering).
  2. InsertRuntimeOptimizer — wraps the root in RuntimeOptimizerExec.
     Trivial now.

This is the infrastructure pattern future adaptive rules (partition
coalescing, skew handling) plug into: each adds its own targeted
boundary-insertion rule; the RTO wrapper is shared.

Decoupling required moving the AtomicWaker from "shared, constructed at
optimize time" to "each buffer owns one, RTO registers per poll":
- StageBoundaryBuffer::new no longer takes an rto_waker; constructs its
  own internally.
- StageBoundaryBuffer::register_consumer_waker(&Waker) — RTO threads the
  consumer-task waker through this each poll.
- RuntimeOptimizerExec drops its rto_waker field; CoordinatorStream walks
  the subtree per poll to register on every buffer's waker.

No behavior change. SLT and clippy clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../insert_stage_boundaries_at_breakers.rs    | 104 ++++++++++++++++++
 datafusion/physical-optimizer/src/lib.rs      |   1 +
 .../physical-optimizer/src/optimizer.rs       |  15 ++-
 .../src/runtime_optimizer.rs                  |  79 ++-----------
 .../physical-plan/src/runtime_optimizer.rs    |  41 +++----
 .../src/stage_boundary_buffer.rs              |  34 +++---
 6 files changed, 165 insertions(+), 109 deletions(-)
 create mode 100644 datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs

diff --git a/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs b/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs
new file mode 100644
index 000000000000..9bc30db61297
--- /dev/null
+++ b/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`InsertStageBoundariesAtBreakers`] wraps every pipeline-breaking
+//! operator (currently `AggregateExec` and `SortExec`) in a
+//! [`StageBoundaryBuffer`]. The boundary is where runtime stats become
+//! observable to [`RuntimeOptimizerExec`]; without one above a breaker,
+//! downstream rules see only static estimates for that subtree.
+//!
+//! This rule is the temporary, pre-HashJoin-shift form. A follow-up
+//! commit retargets it onto HashJoin inputs (and renames accordingly)
+//! so the build-side-swap rule has both sides gated. The split between
+//! "wrap root in RTO" and "insert boundaries" stays — future adaptive
+//! rules (partition coalescing, skew handling) each add their own
+//! targeted insertion rule.
+//!
+//! [`RuntimeOptimizerExec`]: datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec
+
+use std::sync::Arc;
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::stage_boundary_buffer::StageBoundaryBuffer;
+
+#[derive(Default, Debug)]
+pub struct InsertStageBoundariesAtBreakers;
+
+impl InsertStageBoundariesAtBreakers {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl PhysicalOptimizerRule for InsertStageBoundariesAtBreakers {
+    fn name(&self) -> &str {
+        "InsertStageBoundariesAtBreakers"
+    }
+
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let with_buffers = plan
+            .transform_up(|node| {
+                if is_pipeline_breaker(&node)
+                    && node.downcast_ref::<StageBoundaryBuffer>().is_none()
+                {
+                    let buffered: Arc<dyn ExecutionPlan> =
+                        Arc::new(StageBoundaryBuffer::new(node, 0));
+                    Ok(Transformed::yes(buffered))
+                } else {
+                    Ok(Transformed::no(node))
+                }
+            })?
+            .data;
+        Ok(with_buffers)
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
+
+/// Returns true for operators that absorb their entire input before
+/// emitting any output — the canonical "pipeline breaker" definition.
+///
+/// We can't use `pipeline_behavior() == EmissionType::Final` here even
+/// though it sounds equivalent. That flag describes an operator's output
+/// emission semantics and is inherited from descendants — so `Projection`,
+/// `Repartition`, and `HashJoin` above a Final-emitting `AggregateExec`
+/// all report `Final` too. We need the *originator* of the pipeline
+/// break, not every operator downstream of one.
+///
+/// `EmissionType::Final && all children != Final` (the transition-point
+/// filter) is closer but still misses cascading breakers like
+/// `AggregateExec(FinalPartitioned)` whose children are themselves
+/// Final because of the `Partial` aggregate below.
+///
+/// So: hardcoded match against the operators we want to instrument.
+/// Extend as more rules need other breakers.
+fn is_pipeline_breaker(plan: &Arc<dyn ExecutionPlan>) -> bool {
+    plan.downcast_ref::<AggregateExec>().is_some()
+        || plan.downcast_ref::<SortExec>().is_some()
+}
diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs
index 4d220a85840e..d770649b9a67 100644
--- a/datafusion/physical-optimizer/src/lib.rs
+++ b/datafusion/physical-optimizer/src/lib.rs
@@ -43,6 +43,7 @@ pub mod output_requirements;
 pub mod projection_pushdown;
 pub use datafusion_pruning as pruning;
 pub mod hash_join_buffering;
+pub mod insert_stage_boundaries_at_breakers;
 pub mod pushdown_sort;
 pub mod runtime_optimizer;
 pub mod sanity_checker;
diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs
index 7ab25425da34..ca43b46bed1a 100644
--- a/datafusion/physical-optimizer/src/optimizer.rs
+++ b/datafusion/physical-optimizer/src/optimizer.rs
@@ -36,6 +36,7 @@ use crate::topk_repartition::TopKRepartition;
 use crate::update_aggr_exprs::OptimizeAggregateOrder;
 
 use crate::hash_join_buffering::HashJoinBuffering;
+use crate::insert_stage_boundaries_at_breakers::InsertStageBoundariesAtBreakers;
 use crate::limit_pushdown_past_window::LimitPushPastWindows;
 use crate::pushdown_sort::PushdownSort;
 use crate::runtime_optimizer::InsertRuntimeOptimizer;
@@ -248,10 +249,16 @@ impl PhysicalOptimizer {
             // given query plan; i.e. it only acts as a final
             // gatekeeping rule.
             Arc::new(SanityCheckPlan::new()),
-            // InsertRuntimeOptimizer wraps the (now-final) plan root in
-            // a RuntimeOptimizerExec. Runs last so the wrapper sits above
-            // everything else; subsequent commits use it to coordinate
-            // adaptive optimization at runtime.
+            // Adaptive-execution infrastructure. Two rules, in order:
+            //   1. InsertStageBoundariesAtBreakers — wraps each pipeline
+            //      breaker in a StageBoundaryBuffer, the synchronization
+            //      point where runtime stats become observable.
+            //   2. InsertRuntimeOptimizer — wraps the plan root in a
+            //      RuntimeOptimizerExec that walks the subtree at runtime
+            //      to release ready buffers and run RuntimeRules.
+            // Split so future adaptive rules can add their own targeted
+            // boundary-insertion rules without touching the RTO wrapper.
+            Arc::new(InsertStageBoundariesAtBreakers::new()),
             Arc::new(InsertRuntimeOptimizer::new()),
         ];
 
diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index f1da9efa5091..726c7f2dfb1d 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -15,34 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`InsertRuntimeOptimizer`] does two things in one pass:
-//!
-//! 1. Walks the plan tree, wrapping every pipeline-breaking operator
-//!    (currently `AggregateExec` and `SortExec`) in a
-//!    [`StageBoundaryBuffer`]. The boundary is the synchronization point
-//!    where runtime stats become observable.
-//! 2. Wraps the resulting plan root in a [`RuntimeOptimizerExec`], which
-//!    walks the subtree on each `poll_next` and releases ready buffers.
-//!
-//! A shared [`AtomicWaker`] is constructed here and threaded into both
-//! the boundaries and the wrapping RTO so boundaries can wake the
-//! coordinator from inside spawned subtasks (e.g. `RepartitionExec`
-//! internals).
+//! [`InsertRuntimeOptimizer`] wraps the (now-final) plan root in a
+//! [`RuntimeOptimizerExec`] with the default set of runtime rules. It
+//! does nothing else — buffer insertion happens in a separate, targeted
+//! rule (today: [`InsertStageBoundariesAtBreakers`]). The split lets
+//! future adaptive optimizations (partition coalescing, skew handling)
+//! introduce their own targeted insertion rules without touching the
+//! RTO-wrapping logic.
 
 use std::sync::Arc;
 
 use crate::PhysicalOptimizerRule;
 use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_physical_plan::ExecutionPlan;
-use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::runtime_optimizer::{
     RuntimeOptimizerExec, RuntimeRule, SwapBuildSideIfInverted,
 };
-use datafusion_physical_plan::sorts::sort::SortExec;
-use datafusion_physical_plan::stage_boundary_buffer::StageBoundaryBuffer;
-use futures::task::AtomicWaker;
 
 #[derive(Default, Debug)]
 pub struct InsertRuntimeOptimizer;
@@ -67,64 +56,12 @@ impl PhysicalOptimizerRule for InsertRuntimeOptimizer {
         if plan.downcast_ref::<RuntimeOptimizerExec>().is_some() {
             return Ok(plan);
         }
-
-        // Shared coordinator-wake. Every buffer in this subplan and the
-        // wrapping RTO hold a clone — buffers wake it, RTO registers on
-        // it. Per-poll cx.waker() doesn't reach across spawned-task
-        // boundaries (e.g. RepartitionExec's internals), this does.
-        let rto_waker = Arc::new(AtomicWaker::new());
-
-        // Phase 1: wrap each pipeline breaker in a StageBoundaryBuffer.
-        // Stage numbering is uniform (0) for now; the next commit assigns
-        // real bottom-up stage numbers.
-        let with_buffers = plan
-            .transform_up(|node| {
-                if is_pipeline_breaker(&node)
-                    && node.downcast_ref::<StageBoundaryBuffer>().is_none()
-                {
-                    let buffered: Arc<dyn ExecutionPlan> = Arc::new(
-                        StageBoundaryBuffer::new(node, 0, Arc::clone(&rto_waker)),
-                    );
-                    Ok(Transformed::yes(buffered))
-                } else {
-                    Ok(Transformed::no(node))
-                }
-            })?
-            .data;
-
-        // Phase 2: wrap the root with the default rule set.
         let rules: Vec<Arc<dyn RuntimeRule>> =
             vec![Arc::new(SwapBuildSideIfInverted::new())];
-        Ok(Arc::new(RuntimeOptimizerExec::new(
-            with_buffers,
-            rto_waker,
-            rules,
-        )))
+        Ok(Arc::new(RuntimeOptimizerExec::new(plan, rules)))
     }
 
     fn schema_check(&self) -> bool {
         true
     }
 }
-
-/// Returns true for operators that absorb their entire input before
-/// emitting any output — the canonical "pipeline breaker" definition.
-///
-/// We can't use `pipeline_behavior() == EmissionType::Final` here even
-/// though it sounds equivalent. That flag describes an operator's output
-/// emission semantics and is inherited from descendants — so `Projection`,
-/// `Repartition`, and `HashJoin` above a Final-emitting `AggregateExec`
-/// all report `Final` too. We need the *originator* of the pipeline
-/// break, not every operator downstream of one.
-///
-/// `EmissionType::Final && all children != Final` (the transition-point
-/// filter) is closer but still misses cascading breakers like
-/// `AggregateExec(FinalPartitioned)` whose children are themselves
-/// Final because of the `Partial` aggregate below.
-///
-/// So: hardcoded match against the operators we want to instrument.
-/// Extend as more rules need other breakers.
-fn is_pipeline_breaker(plan: &Arc<dyn ExecutionPlan>) -> bool {
-    plan.downcast_ref::<AggregateExec>().is_some()
-        || plan.downcast_ref::<SortExec>().is_some()
-}
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index ce1ea6dee9cc..8807899412e5 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -22,20 +22,21 @@
 //! similar) and mutate adaptive operators in place (e.g.
 //! `HashJoinExec::flip_sides`).
 //!
-//! Buffers notify us via a shared [`AtomicWaker`] (`rto_waker`) that we
-//! register on each `poll_next` — `cx.waker()` is task-local and won't
-//! cross a spawned-subtask boundary like `RepartitionExec`'s internals,
-//! but `AtomicWaker.wake()` does.
+//! Each buffer owns its own AtomicWaker; RTO walks the subtree per poll
+//! and registers the consumer-task waker on each. Drain tasks wake on
+//! their own buffer's waker, which then wakes the consumer. Per-buffer
+//! wakers decouple buffer insertion from RTO insertion at planning time:
+//! `InsertStageBoundariesAtBreakers` and `InsertRuntimeOptimizer` are
+//! independent optimizer rules.
 
 use std::pin::Pin;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::task::{Context, Poll};
+use std::task::{Context, Poll, Waker};
 
 use arrow::array::RecordBatch;
 use datafusion_common::Result;
 use datafusion_execution::TaskContext;
-use futures::task::AtomicWaker;
 use futures::{Stream, StreamExt};
 use log::info;
 
@@ -63,26 +64,18 @@ pub trait RuntimeRule: Send + Sync + std::fmt::Debug {
 pub struct RuntimeOptimizerExec {
     input: Arc<dyn ExecutionPlan>,
     cache: Arc<PlanProperties>,
-    /// Shared with every `StageBoundaryBuffer` in this subplan.
-    /// Buffers wake this AtomicWaker when `is_ready` flips; we register
-    /// the current task's waker on it during each `poll_next` so a
-    /// wake-up from inside a spawned subtask reaches the actual
-    /// top-of-plan task.
-    rto_waker: Arc<AtomicWaker>,
     rules: Vec<Arc<dyn RuntimeRule>>,
 }
 
 impl RuntimeOptimizerExec {
     pub fn new(
         input: Arc<dyn ExecutionPlan>,
-        rto_waker: Arc<AtomicWaker>,
         rules: Vec<Arc<dyn RuntimeRule>>,
     ) -> Self {
         let cache = Arc::clone(input.properties());
         Self {
             input,
             cache,
-            rto_waker,
             rules,
         }
     }
@@ -117,7 +110,6 @@ impl ExecutionPlan for RuntimeOptimizerExec {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         Ok(Arc::new(Self::new(
             children.swap_remove(0),
-            Arc::clone(&self.rto_waker),
             self.rules.clone(),
         )))
     }
@@ -143,7 +135,6 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         let stream = CoordinatorStream {
             child,
             plan: Arc::clone(&self.input),
-            rto_waker: Arc::clone(&self.rto_waker),
             rules: self.rules.clone(),
         };
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
@@ -163,10 +154,18 @@ fn prime_all_buffers(
     Ok(())
 }
 
+fn register_consumer_waker_on_buffers(plan: &Arc<dyn ExecutionPlan>, waker: &Waker) {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
+        buffer.register_consumer_waker(waker);
+    }
+    for child in plan.children() {
+        register_consumer_waker_on_buffers(child, waker);
+    }
+}
+
 struct CoordinatorStream {
     child: SendableRecordBatchStream,
     plan: Arc<dyn ExecutionPlan>,
-    rto_waker: Arc<AtomicWaker>,
     rules: Vec<Arc<dyn RuntimeRule>>,
 }
 
@@ -178,9 +177,11 @@ impl Stream for CoordinatorStream {
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
         let this = self.as_mut().get_mut();
-        // Register before walking so a buffer flipping is_ready *after*
-        // we walked but *before* we return Pending still wakes us.
-        this.rto_waker.register(cx.waker());
+        // Register before walking for release/rules so a buffer flipping
+        // is_ready *after* we walked but *before* we return Pending still
+        // wakes us. Per-buffer wakers: walk the subtree and register on
+        // each — cheap downcast per node.
+        register_consumer_waker_on_buffers(&this.plan, cx.waker());
 
         // Phase 1: set the permissive default — every ready buffer is
         // proposed for release.
diff --git a/datafusion/physical-plan/src/stage_boundary_buffer.rs b/datafusion/physical-plan/src/stage_boundary_buffer.rs
index cfb18efbb402..433885554e67 100644
--- a/datafusion/physical-plan/src/stage_boundary_buffer.rs
+++ b/datafusion/physical-plan/src/stage_boundary_buffer.rs
@@ -86,9 +86,13 @@ pub struct StageBoundaryBuffer {
     /// stage runs first; once released, its consumers (next stage up)
     /// can prime. Used by EXPLAIN, logs, and (eventually) RTO ordering.
     stage: usize,
-    /// Shared with `RuntimeOptimizerExec`. Buffer wakes this whenever
-    /// `is_ready` flips so the coordinator (which may live above a
-    /// task-spawning operator like `RepartitionExec`) gets re-polled.
+    /// Each buffer owns its own AtomicWaker rather than sharing one
+    /// across the subtree. RTO registers the consumer-task waker on
+    /// this per poll cycle (cheap walk); the drain task wakes here
+    /// once `is_ready` flips. Per-buffer wakers decouple the buffer
+    /// from RTO at planning time — `InsertStageBoundariesAtBreakers`
+    /// and `InsertRuntimeOptimizer` no longer have to coordinate to
+    /// share a single waker.
     rto_waker: Arc<AtomicWaker>,
     /// Gate flags + drain progress + consumer-side wakers.
     state: Arc<Mutex<BufferState>>,
@@ -122,11 +126,7 @@ struct BufferState {
 }
 
 impl StageBoundaryBuffer {
-    pub fn new(
-        input: Arc<dyn ExecutionPlan>,
-        stage: usize,
-        rto_waker: Arc<AtomicWaker>,
-    ) -> Self {
+    pub fn new(input: Arc<dyn ExecutionPlan>, stage: usize) -> Self {
         let num_partitions = input.output_partitioning().partition_count();
         let cache = Arc::clone(input.properties());
         let (txs, rxs): (Vec<_>, Vec<_>) = (0..num_partitions)
@@ -139,7 +139,7 @@ impl StageBoundaryBuffer {
             input,
             cache,
             stage,
-            rto_waker,
+            rto_waker: Arc::new(AtomicWaker::new()),
             state: Arc::new(Mutex::new(BufferState {
                 drained_count: 0,
                 is_ready: false,
@@ -158,6 +158,13 @@ impl StageBoundaryBuffer {
         self.stage
     }
 
+    /// Register a waker to be woken when this buffer's drain reaches EOF
+    /// (i.e. `is_ready` flips). RTO calls this per `poll_next` for each
+    /// buffer in its subtree, threading its own consumer waker through.
+    pub fn register_consumer_waker(&self, waker: &Waker) {
+        self.rto_waker.register(waker);
+    }
+
     /// Spawn drain tasks for every input partition. Each task pulls
     /// `input.execute(p, ctx)` to EOF, ferrying batches and errors
     /// through the per-partition channel; once EOF is reached, marks
@@ -295,11 +302,10 @@ impl ExecutionPlan for StageBoundaryBuffer {
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(Self::new(
-            children.swap_remove(0),
-            self.stage,
-            Arc::clone(&self.rto_waker),
-        )))
+        // Fresh waker is fine: with_new_children is called at planning
+        // time before RTO registers anything, so there's nothing to
+        // preserve.
+        Ok(Arc::new(Self::new(children.swap_remove(0), self.stage)))
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {

From fc4feeed22903ac950a4d23c493a4873a067bacb Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 11:30:00 -0600
Subject: [PATCH 12/18] InsertHashJoinBoundaries: retarget rule to HashJoin
 inputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename InsertStageBoundariesAtBreakers → InsertHashJoinBoundaries and
swap the transform: instead of wrapping every Agg/Sort, wrap each input
of every HashJoinExec. Both join sides are now gated so neither has
been consumed by the join when runtime stats arrive — the precondition
for the swap rule actually flipping (rather than just logging).

Stage numbers are assigned bottom-up: a new boundary's stage is one more
than the highest stage of any boundary already in its input subtree, or
0 if there is none. transform_up visits deeper joins first, so when an
outer join's children are processed the inner-join boundaries already
exist and their stages are visible. In the SLT scenario both boundaries
are stage=0 because there is no nested HashJoin.

SLT EXPLAIN updated: buffers now sit directly above HashJoin's children
(CoalescePartitionsExec on the left, ProjectionExec on the right);
Aggregate operators are no longer wrapped. SLT result still green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/insert_hash_join_boundaries.rs        | 104 ++++++++++++++++++
 .../insert_stage_boundaries_at_breakers.rs    | 104 ------------------
 datafusion/physical-optimizer/src/lib.rs      |   2 +-
 .../physical-optimizer/src/optimizer.rs       |  12 +-
 .../test_files/runtime_optimizer.slt          |  34 +++---
 5 files changed, 129 insertions(+), 127 deletions(-)
 create mode 100644 datafusion/physical-optimizer/src/insert_hash_join_boundaries.rs
 delete mode 100644 datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs

diff --git a/datafusion/physical-optimizer/src/insert_hash_join_boundaries.rs b/datafusion/physical-optimizer/src/insert_hash_join_boundaries.rs
new file mode 100644
index 000000000000..695a08ba4d23
--- /dev/null
+++ b/datafusion/physical-optimizer/src/insert_hash_join_boundaries.rs
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`InsertHashJoinBoundaries`] wraps each input of every `HashJoinExec`
+//! in a [`StageBoundaryBuffer`]. Both sides are gated so that neither
+//! has been consumed by the join by the time runtime stats arrive — the
+//! precondition for the build-side-swap rule actually swapping (rather
+//! than just logging intent). Memory cost is the size of both inputs;
+//! spill is a follow-up, OomGuard catches genuine OOM in the meantime.
+//!
+//! Stage numbers are assigned bottom-up: a new boundary's stage is one
+//! more than the highest stage of any boundary already in its input
+//! subtree, or 0 if there is none. Lowest stage releases first;
+//! [`RuntimeOptimizerExec`] walks the subtree at runtime and primes the
+//! next stage as the previous one's boundaries become ready.
+//!
+//! `transform_up` ensures deeper joins are visited before outer joins,
+//! so when an outer-join boundary is computed the inner-join boundaries
+//! already exist and their stage numbers are visible.
+//!
+//! [`RuntimeOptimizerExec`]: datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec
+
+use std::sync::Arc;
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::joins::HashJoinExec;
+use datafusion_physical_plan::stage_boundary_buffer::StageBoundaryBuffer;
+
+#[derive(Default, Debug)]
+pub struct InsertHashJoinBoundaries;
+
+impl InsertHashJoinBoundaries {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl PhysicalOptimizerRule for InsertHashJoinBoundaries {
+    fn name(&self) -> &str {
+        "InsertHashJoinBoundaries"
+    }
+
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let with_buffers = plan
+            .transform_up(|node| {
+                if node.downcast_ref::<HashJoinExec>().is_none() {
+                    return Ok(Transformed::no(node));
+                }
+                let mut new_children: Vec<Arc<dyn ExecutionPlan>> =
+                    Vec::with_capacity(node.children().len());
+                for child in node.children() {
+                    if child.downcast_ref::<StageBoundaryBuffer>().is_some() {
+                        new_children.push(Arc::clone(child));
+                        continue;
+                    }
+                    let stage =
+                        max_buffer_stage_in_subtree(child).map_or(0, |max| max + 1);
+                    new_children.push(Arc::new(StageBoundaryBuffer::new(
+                        Arc::clone(child),
+                        stage,
+                    )));
+                }
+                Ok(Transformed::yes(node.with_new_children(new_children)?))
+            })?
+            .data;
+        Ok(with_buffers)
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
+
+fn max_buffer_stage_in_subtree(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+    let mut max_stage = plan.downcast_ref::<StageBoundaryBuffer>().map(|b| b.stage());
+    for child in plan.children() {
+        if let Some(child_max) = max_buffer_stage_in_subtree(child) {
+            max_stage = Some(max_stage.map_or(child_max, |m| m.max(child_max)));
+        }
+    }
+    max_stage
+}
diff --git a/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs b/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs
deleted file mode 100644
index 9bc30db61297..000000000000
--- a/datafusion/physical-optimizer/src/insert_stage_boundaries_at_breakers.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! [`InsertStageBoundariesAtBreakers`] wraps every pipeline-breaking
-//! operator (currently `AggregateExec` and `SortExec`) in a
-//! [`StageBoundaryBuffer`]. The boundary is where runtime stats become
-//! observable to [`RuntimeOptimizerExec`]; without one above a breaker,
-//! downstream rules see only static estimates for that subtree.
-//!
-//! This rule is the temporary, pre-HashJoin-shift form. A follow-up
-//! commit retargets it onto HashJoin inputs (and renames accordingly)
-//! so the build-side-swap rule has both sides gated. The split between
-//! "wrap root in RTO" and "insert boundaries" stays — future adaptive
-//! rules (partition coalescing, skew handling) each add their own
-//! targeted insertion rule.
-//!
-//! [`RuntimeOptimizerExec`]: datafusion_physical_plan::runtime_optimizer::RuntimeOptimizerExec
-
-use std::sync::Arc;
-
-use crate::PhysicalOptimizerRule;
-use datafusion_common::Result;
-use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_physical_plan::ExecutionPlan;
-use datafusion_physical_plan::aggregates::AggregateExec;
-use datafusion_physical_plan::sorts::sort::SortExec;
-use datafusion_physical_plan::stage_boundary_buffer::StageBoundaryBuffer;
-
-#[derive(Default, Debug)]
-pub struct InsertStageBoundariesAtBreakers;
-
-impl InsertStageBoundariesAtBreakers {
-    pub fn new() -> Self {
-        Self
-    }
-}
-
-impl PhysicalOptimizerRule for InsertStageBoundariesAtBreakers {
-    fn name(&self) -> &str {
-        "InsertStageBoundariesAtBreakers"
-    }
-
-    fn optimize(
-        &self,
-        plan: Arc<dyn ExecutionPlan>,
-        _config: &ConfigOptions,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        let with_buffers = plan
-            .transform_up(|node| {
-                if is_pipeline_breaker(&node)
-                    && node.downcast_ref::<StageBoundaryBuffer>().is_none()
-                {
-                    let buffered: Arc<dyn ExecutionPlan> =
-                        Arc::new(StageBoundaryBuffer::new(node, 0));
-                    Ok(Transformed::yes(buffered))
-                } else {
-                    Ok(Transformed::no(node))
-                }
-            })?
-            .data;
-        Ok(with_buffers)
-    }
-
-    fn schema_check(&self) -> bool {
-        true
-    }
-}
-
-/// Returns true for operators that absorb their entire input before
-/// emitting any output — the canonical "pipeline breaker" definition.
-///
-/// We can't use `pipeline_behavior() == EmissionType::Final` here even
-/// though it sounds equivalent. That flag describes an operator's output
-/// emission semantics and is inherited from descendants — so `Projection`,
-/// `Repartition`, and `HashJoin` above a Final-emitting `AggregateExec`
-/// all report `Final` too. We need the *originator* of the pipeline
-/// break, not every operator downstream of one.
-///
-/// `EmissionType::Final && all children != Final` (the transition-point
-/// filter) is closer but still misses cascading breakers like
-/// `AggregateExec(FinalPartitioned)` whose children are themselves
-/// Final because of the `Partial` aggregate below.
-///
-/// So: hardcoded match against the operators we want to instrument.
-/// Extend as more rules need other breakers.
-fn is_pipeline_breaker(plan: &Arc<dyn ExecutionPlan>) -> bool {
-    plan.downcast_ref::<AggregateExec>().is_some()
-        || plan.downcast_ref::<SortExec>().is_some()
-}
diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs
index d770649b9a67..20b170722e81 100644
--- a/datafusion/physical-optimizer/src/lib.rs
+++ b/datafusion/physical-optimizer/src/lib.rs
@@ -43,7 +43,7 @@ pub mod output_requirements;
 pub mod projection_pushdown;
 pub use datafusion_pruning as pruning;
 pub mod hash_join_buffering;
-pub mod insert_stage_boundaries_at_breakers;
+pub mod insert_hash_join_boundaries;
 pub mod pushdown_sort;
 pub mod runtime_optimizer;
 pub mod sanity_checker;
diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs
index ca43b46bed1a..036527172498 100644
--- a/datafusion/physical-optimizer/src/optimizer.rs
+++ b/datafusion/physical-optimizer/src/optimizer.rs
@@ -36,7 +36,7 @@ use crate::topk_repartition::TopKRepartition;
 use crate::update_aggr_exprs::OptimizeAggregateOrder;
 
 use crate::hash_join_buffering::HashJoinBuffering;
-use crate::insert_stage_boundaries_at_breakers::InsertStageBoundariesAtBreakers;
+use crate::insert_hash_join_boundaries::InsertHashJoinBoundaries;
 use crate::limit_pushdown_past_window::LimitPushPastWindows;
 use crate::pushdown_sort::PushdownSort;
 use crate::runtime_optimizer::InsertRuntimeOptimizer;
@@ -250,15 +250,17 @@ impl PhysicalOptimizer {
             // gatekeeping rule.
             Arc::new(SanityCheckPlan::new()),
             // Adaptive-execution infrastructure. Two rules, in order:
-            //   1. InsertStageBoundariesAtBreakers — wraps each pipeline
-            //      breaker in a StageBoundaryBuffer, the synchronization
-            //      point where runtime stats become observable.
+            //   1. InsertHashJoinBoundaries — wraps each HashJoin input
+            //      in a StageBoundaryBuffer with a bottom-up stage
+            //      number; the boundaries are where runtime stats become
+            //      observable and where the build/probe sides are gated
+            //      until a swap decision can be made.
             //   2. InsertRuntimeOptimizer — wraps the plan root in a
             //      RuntimeOptimizerExec that walks the subtree at runtime
             //      to release ready buffers and run RuntimeRules.
             // Split so future adaptive rules can add their own targeted
             // boundary-insertion rules without touching the RTO wrapper.
-            Arc::new(InsertStageBoundariesAtBreakers::new()),
+            Arc::new(InsertHashJoinBoundaries::new()),
             Arc::new(InsertRuntimeOptimizer::new()),
         ];
 
diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
index 10402592654e..ed9a0bf0451d 100644
--- a/datafusion/sqllogictest/test_files/runtime_optimizer.slt
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -91,14 +91,14 @@ SELECT count(*) FROM (SELECT group_key FROM big GROUP BY group_key) t;
 
 # ---------------------------------------------------------------------------
 # EXPLAIN — shows the *static* plan with AQE primitives wrapped in
-# (RuntimeOptimizerExec at root; StageBoundaryBuffer above each
-# pipeline breaker). JoinSelection picked `small` as build statically
-# because aggregated_big's row-count estimate (Inexact, ~100K) overstates
-# the actual (5). RuntimeOptimizerExec detects this inversion at runtime
-# and would call `HashJoinExec::flip_sides()` to swap — but flip_sides
-# isn't implemented yet, so for now the rule only logs intent. EXPLAIN
-# stays the static plan; the swap is verified via log output and (later)
-# via metrics.
+# (RuntimeOptimizerExec at root; StageBoundaryBuffer on each HashJoin
+# input). JoinSelection picked `small` as build statically because
+# aggregated_big's row-count estimate (Inexact, ~100K) overstates the
+# actual (5). RuntimeOptimizerExec detects this inversion at runtime and
+# would call `HashJoinExec::flip_sides()` to swap — but flip_sides isn't
+# implemented yet, so for now the rule only logs intent. EXPLAIN stays
+# the static plan; the swap is verified via log output and (later) via
+# metrics.
 # ---------------------------------------------------------------------------
 query TT
 EXPLAIN SELECT bg.group_key, bg.sum_payload, s.payload
@@ -119,15 +119,15 @@ logical_plan
 physical_plan
 01)RuntimeOptimizerExec
 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, group_key@0)], projection=[group_key@2, sum_payload@3, payload@1]
-03)----CoalescePartitionsExec
-04)------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
-05)----ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
-06)------StageBoundaryBuffer: stage=0
-07)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-08)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
-09)------------StageBoundaryBuffer: stage=0
-10)--------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
-11)----------------DataSourceExec: partitions=4, partition_sizes=[4, 3, 3, 3]
+03)----StageBoundaryBuffer: stage=0
+04)------CoalescePartitionsExec
+05)--------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]
+06)----StageBoundaryBuffer: stage=0
+07)------ProjectionExec: expr=[group_key@0 as group_key, sum(big.payload)@1 as sum_payload]
+08)--------AggregateExec: mode=FinalPartitioned, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+09)----------RepartitionExec: partitioning=Hash([group_key@0], 4), input_partitions=4
+10)------------AggregateExec: mode=Partial, gby=[group_key@0 as group_key], aggr=[sum(big.payload)]
+11)--------------DataSourceExec: partitions=4, partition_sizes=[4, 3, 3, 3]
 
 # ---------------------------------------------------------------------------
 # Result — GREEN regardless of build-side choice (correctness is plan-

From 5eb08aac07c4e246e394fa539d3a366a49f628a5 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 11:50:28 -0600
Subject: [PATCH 13/18] RTO: event-driven stage completion drives rule firing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the propose/veto/commit triphase per-poll walk with a single
event: find the lowest stage whose boundaries are all ready but none
released yet, fire each RuntimeRule once for that stage, then release.
Stage K+1's boundaries can't fill until stage K is released, so stages
naturally serialize without an explicit veto mechanism — the
streaming_enabled field (and its rule-controllable proposal flag) is
gone.

Rule trait gains a completed_stage parameter. SwapBuildSideIfInverted
checks both HashJoin children are StageBoundaryBuffers at the
just-completed stage before acting; nested joins higher up wait for
their own stage. Solves the previous-design problem where the LEFT
buffer (small dim, static stats) could release independently before
the RIGHT was ready, leaving the swap rule no time to act.

Observability: RTO logs "stage K ready" before firing rules and
"stage K released" after. With the existing SwapBuildSideIfInverted
log line, RUST_LOG=info shows the ordering:
  RTO: stage 0 ready (2 boundaries); firing 1 rule(s) before release
  SwapBuildSideIfInverted: would flip HashJoinExec — current build
    (left) = 100 rows, probe (right) = 5 rows. ...
  RTO: stage 0 released; downstream consumers can now drain ...

That ordering proves (a) the rule fires and (b) it fires before any
data flows into the join (release is what unblocks the ConsumerStreams
HashJoin reads from).

Spill TODO expanded to "spill-or-stream": follow-up will give boundaries
two escape hatches under MemoryPool pressure — spill to disk OR
stream-through (set has_overflowed, skip the swap decision, behave
exactly as the pre-AQE baseline). The narrative becomes: "buffers fit →
adaptive optimization; buffers don't fit → query still runs exactly as
it does today."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/runtime_optimizer.rs    | 133 +++++++++++-------
 .../src/stage_boundary_buffer.rs              |  48 +++----
 2 files changed, 103 insertions(+), 78 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 8807899412e5..f6e357c90565 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -15,12 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Coordinator wrapper at the root of the plan. On every poll, it walks
-//! its subtree to release any [`StageBoundaryBuffer`] whose `is_ready`
-//! flag has flipped, then runs each registered [`RuntimeRule`] over the
-//! plan. Rules observe runtime stats (via `runtime_row_count` and
-//! similar) and mutate adaptive operators in place (e.g.
-//! `HashJoinExec::flip_sides`).
+//! Coordinator wrapper at the root of the plan. On every poll it walks
+//! its subtree to find the lowest stage whose [`StageBoundaryBuffer`]s
+//! are all ready but haven't started streaming yet — the "just completed"
+//! stage. When that exists, it fires each registered [`RuntimeRule`]
+//! exactly once for that stage, then releases the stage's boundaries.
+//! Stage K+1's boundaries can't fill until stage K is released, so
+//! stages naturally serialize without an explicit veto mechanism.
 //!
 //! Each buffer owns its own AtomicWaker; RTO walks the subtree per poll
 //! and registers the consumer-task waker on each. Drain tasks wake on
@@ -29,6 +30,7 @@
 //! `InsertStageBoundariesAtBreakers` and `InsertRuntimeOptimizer` are
 //! independent optimizer rules.
 
+use std::collections::BTreeMap;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -49,15 +51,14 @@ use crate::{
     SendableRecordBatchStream,
 };
 
-/// A runtime adaptive-execution rule that may inspect the plan tree and
-/// mutate adaptive operators in place. Rules are called from
-/// [`RuntimeOptimizerExec`] on every poll, *after* ready buffers have
-/// been released. They are expected to be cheap and idempotent (track
-/// their own "already fired" state if they should only run once per
-/// query).
+/// A runtime adaptive-execution rule. RTO fires `evaluate` once per
+/// stage completion, passing the just-completed stage number so rules
+/// can scope their work to operators whose inputs are at that stage
+/// (e.g. a HashJoinExec whose children are both StageBoundaryBuffers
+/// with `stage == completed_stage`).
 pub trait RuntimeRule: Send + Sync + std::fmt::Debug {
     fn name(&self) -> &str;
-    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>);
+    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize);
 }
 
 #[derive(Debug)]
@@ -68,10 +69,7 @@ pub struct RuntimeOptimizerExec {
 }
 
 impl RuntimeOptimizerExec {
-    pub fn new(
-        input: Arc<dyn ExecutionPlan>,
-        rules: Vec<Arc<dyn RuntimeRule>>,
-    ) -> Self {
+    pub fn new(input: Arc<dyn ExecutionPlan>, rules: Vec<Arc<dyn RuntimeRule>>) -> Self {
         let cache = Arc::clone(input.properties());
         Self {
             input,
@@ -177,47 +175,73 @@ impl Stream for CoordinatorStream {
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
         let this = self.as_mut().get_mut();
-        // Register before walking for release/rules so a buffer flipping
-        // is_ready *after* we walked but *before* we return Pending still
-        // wakes us. Per-buffer wakers: walk the subtree and register on
-        // each — cheap downcast per node.
+        // Register before walking so a buffer flipping is_ready *after*
+        // we walked but *before* we return Pending still wakes us.
         register_consumer_waker_on_buffers(&this.plan, cx.waker());
 
-        // Phase 1: set the permissive default — every ready buffer is
-        // proposed for release.
-        propose_release_for_ready_buffers(&this.plan);
-
-        // Phase 2: rules may veto specific buffers (set
-        // streaming_enabled=false) or mutate adaptive operators.
-        for rule in &this.rules {
-            rule.evaluate(&this.plan);
+        // Find the lowest stage whose boundaries are all ready and none
+        // released — the "just completed" stage. Rules fire once for
+        // that stage, then we release. Higher stages wait their turn
+        // naturally because they can't drain until lower stages release.
+        if let Some((stage, boundaries)) = find_just_completed_stage(&this.plan) {
+            info!(
+                "RTO: stage {stage} ready ({} boundaries); firing {} rule(s) \
+                 before release",
+                boundaries.len(),
+                this.rules.len(),
+            );
+            for rule in &this.rules {
+                rule.evaluate(&this.plan, stage);
+            }
+            for buffer in &boundaries {
+                buffer
+                    .downcast_ref::<StageBoundaryBuffer>()
+                    .expect(
+                        "find_just_completed_stage only returns StageBoundaryBuffer Arcs",
+                    )
+                    .start_streaming();
+            }
+            info!(
+                "RTO: stage {stage} released; downstream consumers can now \
+                 drain the buffered data"
+            );
         }
 
-        // Phase 3: commit — actually start streaming on any buffer that
-        // is still enabled.
-        start_streaming_on_enabled_buffers(&this.plan);
-
         this.child.poll_next_unpin(cx)
     }
 }
 
-fn propose_release_for_ready_buffers(plan: &Arc<dyn ExecutionPlan>) {
-    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
-        buffer.set_streaming_enabled(buffer.is_ready());
-    }
-    for child in plan.children() {
-        propose_release_for_ready_buffers(child);
-    }
+/// Walks the plan tree, groups every `StageBoundaryBuffer` by stage,
+/// and returns the lowest stage where every boundary is ready and none
+/// has started streaming yet. Returns `None` if no such stage exists
+/// (either nothing is ready, or every ready stage has already fired).
+fn find_just_completed_stage(
+    plan: &Arc<dyn ExecutionPlan>,
+) -> Option<(usize, Vec<Arc<dyn ExecutionPlan>>)> {
+    let mut by_stage: BTreeMap<usize, Vec<Arc<dyn ExecutionPlan>>> = BTreeMap::new();
+    collect_boundaries_by_stage(plan, &mut by_stage);
+    by_stage.into_iter().find(|(_, bufs)| {
+        bufs.iter().all(|b| {
+            let buffer = b.downcast_ref::<StageBoundaryBuffer>().expect(
+                "find_just_completed_stage only inserts StageBoundaryBuffer Arcs",
+            );
+            buffer.is_ready() && !buffer.streaming_started()
+        })
+    })
 }
 
-fn start_streaming_on_enabled_buffers(plan: &Arc<dyn ExecutionPlan>) {
-    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>()
-        && buffer.streaming_enabled()
-    {
-        buffer.start_streaming();
+fn collect_boundaries_by_stage(
+    plan: &Arc<dyn ExecutionPlan>,
+    out: &mut BTreeMap<usize, Vec<Arc<dyn ExecutionPlan>>>,
+) {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
+        out.entry(buffer.stage())
+            .or_default()
+            .push(Arc::clone(plan));
+        return;
     }
     for child in plan.children() {
-        start_streaming_on_enabled_buffers(child);
+        collect_boundaries_by_stage(child, out);
     }
 }
 
@@ -256,19 +280,26 @@ impl RuntimeRule for SwapBuildSideIfInverted {
         "SwapBuildSideIfInverted"
     }
 
-    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>) {
+    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize) {
         if self.fired.load(Ordering::Relaxed) {
             return;
         }
-        self.walk_for_swap(plan);
+        self.walk_for_swap(plan, completed_stage);
     }
 }
 
 impl SwapBuildSideIfInverted {
-    fn walk_for_swap(&self, plan: &Arc<dyn ExecutionPlan>) {
+    fn walk_for_swap(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize) {
         if let Some(join) = plan.downcast_ref::<HashJoinExec>() {
             let children = join.children();
-            if children.len() == 2 {
+            // Only act on joins whose inputs are at the just-completed
+            // stage; nested joins higher up wait for their own stage.
+            let both_at_completed_stage = children.len() == 2
+                && [&children[0], &children[1]].iter().all(|c| {
+                    c.downcast_ref::<StageBoundaryBuffer>()
+                        .is_some_and(|b| b.stage() == completed_stage)
+                });
+            if both_at_completed_stage {
                 // Current HashJoinExec: LEFT child is the build side
                 // under `mode=CollectLeft`.
                 let left = side_runtime_rows(children[0]);
@@ -296,7 +327,7 @@ impl SwapBuildSideIfInverted {
             return;
         }
         for child in plan.children() {
-            self.walk_for_swap(child);
+            self.walk_for_swap(child, completed_stage);
         }
     }
 }
diff --git a/datafusion/physical-plan/src/stage_boundary_buffer.rs b/datafusion/physical-plan/src/stage_boundary_buffer.rs
index 433885554e67..e48da20cf6e1 100644
--- a/datafusion/physical-plan/src/stage_boundary_buffer.rs
+++ b/datafusion/physical-plan/src/stage_boundary_buffer.rs
@@ -31,15 +31,13 @@
 //!   per-partition receiver wrapped in a gated stream. Never touches
 //!   `input.execute`; that's prime's job.
 //!
-//! Three flags govern release:
+//! Two flags govern release:
 //! - `is_ready` (mechanical): set automatically when every drain task
 //!   has hit EOF. Runtime stats from the input become derivable.
-//! - `streaming_enabled` (rule-controlled proposal): reset each poll
-//!   cycle by RTO — set to `is_ready` as the permissive default. Rules
-//!   can flip it to false to veto release. No side effects.
 //! - `streaming_started` (actual emission control): only RTO flips this,
-//!   via `start_streaming()`. Once true, the gated consumer streams
-//!   start forwarding from their receivers.
+//!   via `start_streaming()`, and only as part of releasing a whole
+//!   stage. Once true, the gated consumer streams start forwarding from
+//!   their receivers.
 //!
 //! Coordination uses a shared [`AtomicWaker`] (`rto_waker`) populated at
 //! plan time by RTO. The buffer wakes it when `is_ready` flips; the
@@ -71,10 +69,20 @@ use crate::{
     SendableRecordBatchStream, Statistics,
 };
 
-// TODO(spill): the per-partition channels below currently hold the full
-// materialized input in memory. A follow-up will swap each channel for a
-// spillable buffer (writing to disk under MemoryPool pressure) so that
-// large stage boundaries don't OOM. OomGuard is the safety net until then.
+// TODO(spill-or-stream): the per-partition channels below currently hold
+// the full materialized input in memory. Inserting a StageBoundaryBuffer
+// turns a streaming subtree into a pipeline breaker — that's the cost of
+// getting runtime stats. A follow-up will give each boundary TWO escape
+// hatches under MemoryPool pressure:
+//   1. Spill: page batches to disk and keep buffering. Stats remain
+//      reliable; the swap decision still happens.
+//   2. Stream-through (`has_overflowed`): give up on materialization,
+//      release the boundary immediately so downstream consumers stream
+//      just like they would have before this rule existed. The flip
+//      rule sees `has_overflowed() == true` and skips its decision —
+//      we lose the adaptive optimization but the query still runs and
+//      we haven't made things worse than the pre-AQE baseline.
+// OomGuard catches genuine OOM in the meantime.
 type PartitionTxs = Vec<Option<mpsc::UnboundedSender<Result<RecordBatch>>>>;
 type PartitionRxs = Vec<Option<mpsc::UnboundedReceiver<Result<RecordBatch>>>>;
 
@@ -114,11 +122,9 @@ struct BufferState {
     /// `drained_count == num_partitions`, `is_ready` flips.
     drained_count: usize,
     is_ready: bool,
-    /// Rule-controllable proposal. Reset by RTO each poll cycle: set to
-    /// `is_ready` as the permissive default, then rules may flip it
-    /// to false to veto.
-    streaming_enabled: bool,
-    /// Actual emission control. Only RTO flips this via `start_streaming`.
+    /// Emission control. Only RTO flips this via `start_streaming`,
+    /// and only as part of releasing the entire stage this boundary
+    /// belongs to.
     streaming_started: bool,
     num_partitions: usize,
     /// Wakers stashed by consumer streams while gated on `streaming_started`.
@@ -143,7 +149,6 @@ impl StageBoundaryBuffer {
             state: Arc::new(Mutex::new(BufferState {
                 drained_count: 0,
                 is_ready: false,
-                streaming_enabled: false,
                 streaming_started: false,
                 num_partitions,
                 wakers: HashMap::new(),
@@ -195,17 +200,6 @@ impl StageBoundaryBuffer {
         self.state.lock().unwrap().is_ready
     }
 
-    /// Rule-controllable proposal flag. RTO resets this to `is_ready` at
-    /// the start of each poll cycle as the permissive default; rules may
-    /// then flip it to false to veto release this cycle. No side effects.
-    pub fn streaming_enabled(&self) -> bool {
-        self.state.lock().unwrap().streaming_enabled
-    }
-
-    pub fn set_streaming_enabled(&self, enabled: bool) {
-        self.state.lock().unwrap().streaming_enabled = enabled;
-    }
-
     /// True once `start_streaming` has been called.
     pub fn streaming_started(&self) -> bool {
         self.state.lock().unwrap().streaming_started

From f1ae00b6b97686e05437b416792ebd8b459456c1 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 12:10:17 -0600
Subject: [PATCH 14/18] RuntimeRule: reshape to PhysicalOptimizerRule signature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The trait now has the same shape as
`datafusion_physical_optimizer::PhysicalOptimizerRule::optimize`:

  fn optimize(
      &self,
      plan: Arc<dyn ExecutionPlan>,
      config: &ConfigOptions,
  ) -> Result<Arc<dyn ExecutionPlan>>;

The trait still lives in `physical-plan` because `physical-plan` cannot
depend on `physical-optimizer` (the dependency runs the other way). The
dual shape is the migration story: any static `PhysicalOptimizerRule`
can be made runtime-aware by reading state from `StageBoundaryBuffer`s
in the plan tree, and a future upstream unification of the two traits
requires no change to call sites.

Rules identify the just-completed stage by inspecting buffer state
(`is_ready && !streaming_started`) rather than receiving a stage
number — that lets the same trait serve both static and runtime
optimization without an extra parameter.

SwapBuildSideIfInverted rewritten as a transform_up walk: drops the
AtomicBool `fired` field (buffer-state check is now the gate; once a
stage's buffers are released, `streaming_started` is true and the rule
naturally skips them) and matches the new signature. Still log-only;
the actual `HashJoinExec::swap_inputs` call lands in the next commit.

RTO threads `Arc<TaskContext>` into CoordinatorStream so rules see the
session config (target_partitions, etc.) the same way static
`PhysicalOptimizerRule`s do; the returned plan replaces RTO's plan
in place.

SLT green, log ordering unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/runtime_optimizer.rs    | 168 ++++++++++--------
 1 file changed, 95 insertions(+), 73 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index f6e357c90565..49cfd0d118ed 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -33,11 +33,12 @@
 use std::collections::BTreeMap;
 use std::pin::Pin;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::task::{Context, Poll, Waker};
 
 use arrow::array::RecordBatch;
 use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_execution::TaskContext;
 use futures::{Stream, StreamExt};
 use log::info;
@@ -51,14 +52,28 @@ use crate::{
     SendableRecordBatchStream,
 };
 
-/// A runtime adaptive-execution rule. RTO fires `evaluate` once per
-/// stage completion, passing the just-completed stage number so rules
-/// can scope their work to operators whose inputs are at that stage
-/// (e.g. a HashJoinExec whose children are both StageBoundaryBuffers
-/// with `stage == completed_stage`).
+/// A runtime adaptive-execution rule. Shape is identical to
+/// `datafusion_physical_optimizer::PhysicalOptimizerRule::optimize` —
+/// the trait lives in `physical-plan` rather than reusing the upstream
+/// trait directly only because `physical-plan` cannot depend on
+/// `physical-optimizer` (the dependency runs the other way). The dual
+/// shape is the migration story: any static `PhysicalOptimizerRule`
+/// can be made runtime-aware by reading state from
+/// `StageBoundaryBuffer`s in the plan tree it receives, and a future
+/// upstream unification of the two traits requires no change to call
+/// sites.
+///
+/// RTO invokes `optimize` exactly once per stage-completion event with
+/// its current plan; the returned plan replaces RTO's plan. Rules
+/// identify the just-completed stage by walking the plan and finding
+/// `StageBoundaryBuffer`s where `is_ready() && !streaming_started()`.
 pub trait RuntimeRule: Send + Sync + std::fmt::Debug {
     fn name(&self) -> &str;
-    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize);
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>>;
 }
 
 #[derive(Debug)]
@@ -134,6 +149,7 @@ impl ExecutionPlan for RuntimeOptimizerExec {
             child,
             plan: Arc::clone(&self.input),
             rules: self.rules.clone(),
+            context: Arc::clone(&context),
         };
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
     }
@@ -165,6 +181,10 @@ struct CoordinatorStream {
     child: SendableRecordBatchStream,
     plan: Arc<dyn ExecutionPlan>,
     rules: Vec<Arc<dyn RuntimeRule>>,
+    /// Captured at execute() time; threaded into RuntimeRule::optimize so
+    /// rules see the session config (target_partitions, etc.) the same
+    /// way static `PhysicalOptimizerRule`s do.
+    context: Arc<TaskContext>,
 }
 
 impl Stream for CoordinatorStream {
@@ -190,9 +210,15 @@ impl Stream for CoordinatorStream {
                 boundaries.len(),
                 this.rules.len(),
             );
+            let config = this.context.session_config().options();
+            let mut current_plan = Arc::clone(&this.plan);
             for rule in &this.rules {
-                rule.evaluate(&this.plan, stage);
+                current_plan = match rule.optimize(Arc::clone(&current_plan), config) {
+                    Ok(p) => p,
+                    Err(e) => return Poll::Ready(Some(Err(e))),
+                };
             }
+            this.plan = current_plan;
             for buffer in &boundaries {
                 buffer
                     .downcast_ref::<StageBoundaryBuffer>()
@@ -248,30 +274,21 @@ fn collect_boundaries_by_stage(
 // ---------------------------------------------------------------------------
 // SwapBuildSideIfInverted — first concrete RuntimeRule.
 //
-// When a HashJoinExec's current build side (the LEFT child under
-// `mode=CollectLeft`) ends up larger at runtime than the probe side,
-// the static planner made the wrong choice — it picked build based on
-// (Inexact) estimates. The fix is `HashJoinExec::flip_sides()`, which
-// isn't implemented yet; for now this rule only logs intent so we can
-// verify the detection logic end-to-end.
+// When a HashJoinExec's current build side ends up larger at runtime
+// than the probe side, the static planner made the wrong choice — it
+// picked build based on (Inexact) estimates. This rule walks the plan
+// looking for joins whose children are `StageBoundaryBuffer`s in the
+// just-completed state (`is_ready && !streaming_started`); if l > r,
+// it logs intent. The actual `HashJoinExec::swap_inputs` call lands in
+// the next commit (#14).
 // ---------------------------------------------------------------------------
 
-#[derive(Debug)]
-pub struct SwapBuildSideIfInverted {
-    fired: AtomicBool,
-}
+#[derive(Default, Debug)]
+pub struct SwapBuildSideIfInverted;
 
 impl SwapBuildSideIfInverted {
     pub fn new() -> Self {
-        Self {
-            fired: AtomicBool::new(false),
-        }
-    }
-}
-
-impl Default for SwapBuildSideIfInverted {
-    fn default() -> Self {
-        Self::new()
+        Self
     }
 }
 
@@ -280,55 +297,60 @@ impl RuntimeRule for SwapBuildSideIfInverted {
         "SwapBuildSideIfInverted"
     }
 
-    fn evaluate(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize) {
-        if self.fired.load(Ordering::Relaxed) {
-            return;
-        }
-        self.walk_for_swap(plan, completed_stage);
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        plan.transform_up(|node| {
+            let Some(join) = node.downcast_ref::<HashJoinExec>() else {
+                return Ok(Transformed::no(node));
+            };
+            if just_completed_stage_of_join(join).is_none() {
+                return Ok(Transformed::no(node));
+            }
+            let children = join.children();
+            // Current HashJoinExec: LEFT child is the build side under
+            // `mode=CollectLeft`.
+            let left = side_runtime_rows(children[0]);
+            let right = side_runtime_rows(children[1]);
+            if let (Some(l), Some(r)) = (left, right)
+                && l > r
+            {
+                info!(
+                    "SwapBuildSideIfInverted: would flip HashJoinExec — \
+                     current build (left) = {l} rows, probe (right) = {r} \
+                     rows. swap_inputs() wiring lands in the next commit; \
+                     logging intent only."
+                );
+            }
+            Ok(Transformed::no(node))
+        })
+        .map(|t| t.data)
     }
 }
 
-impl SwapBuildSideIfInverted {
-    fn walk_for_swap(&self, plan: &Arc<dyn ExecutionPlan>, completed_stage: usize) {
-        if let Some(join) = plan.downcast_ref::<HashJoinExec>() {
-            let children = join.children();
-            // Only act on joins whose inputs are at the just-completed
-            // stage; nested joins higher up wait for their own stage.
-            let both_at_completed_stage = children.len() == 2
-                && [&children[0], &children[1]].iter().all(|c| {
-                    c.downcast_ref::<StageBoundaryBuffer>()
-                        .is_some_and(|b| b.stage() == completed_stage)
-                });
-            if both_at_completed_stage {
-                // Current HashJoinExec: LEFT child is the build side
-                // under `mode=CollectLeft`.
-                let left = side_runtime_rows(children[0]);
-                let right = side_runtime_rows(children[1]);
-                if let (Some(l), Some(r)) = (left, right)
-                    && l > r
-                    && self
-                        .fired
-                        .compare_exchange(
-                            false,
-                            true,
-                            Ordering::AcqRel,
-                            Ordering::Relaxed,
-                        )
-                        .is_ok()
-                {
-                    info!(
-                        "SwapBuildSideIfInverted: would flip HashJoinExec — \
-                         current build (left) = {l} rows, probe (right) = {r} \
-                         rows. flip_sides() not yet implemented; logging \
-                         intent only."
-                    );
-                }
-            }
-            return;
-        }
-        for child in plan.children() {
-            self.walk_for_swap(child, completed_stage);
-        }
+/// Returns `Some(stage)` if `join`'s two children are both
+/// `StageBoundaryBuffer`s at the same stage in the just-completed state
+/// (`is_ready && !streaming_started`). Otherwise `None`.
+fn just_completed_stage_of_join(join: &HashJoinExec) -> Option<usize> {
+    let children = join.children();
+    if children.len() != 2 {
+        return None;
+    }
+    let left = children[0].downcast_ref::<StageBoundaryBuffer>()?;
+    let right = children[1].downcast_ref::<StageBoundaryBuffer>()?;
+    if left.stage() != right.stage() {
+        return None;
+    }
+    if left.is_ready()
+        && !left.streaming_started()
+        && right.is_ready()
+        && !right.streaming_started()
+    {
+        Some(left.stage())
+    } else {
+        None
     }
 }
 

From e7eb5ce973d1ba21d339c4bd3560db5a388e0094 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 12:21:30 -0600
Subject: [PATCH 15/18] Real swap: SwapBuildSideIfInverted calls swap_inputs,
 RTO defers execute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2 of the AQE-lite primitive: the runtime swap actually happens.

SwapBuildSideIfInverted.optimize now:
1. Walks the plan via transform_up.
2. At each HashJoinExec whose children are StageBoundaryBuffers in the
   just-completed state, compares left/right runtime row counts.
3. When l > r, calls join.swap_inputs(*join.partition_mode()) and
   returns Transformed::yes with the new join.
4. Runs the swapped plan through ensure_collect_left_single_partition,
   which wraps the new left in a CoalescePartitionsExec if it reports
   more than one output partition. (Required: CollectLeft mode asserts
   the left child has exactly one partition, but the post-swap left
   was the original right — often multi-partition behind a
   RepartitionExec / Agg(FinalPartitioned) chain.)

For the runtime swap to produce a working join, RTO now DEFERS
self.input.execute. Previously it executed self.input upfront, which
recursively called buffer.execute on each StageBoundaryBuffer's
consumer side — taking the rxs into the OLD HashJoin's stream chain.
After replan, the new join's execute would try to take the rxs again
and fail.

The new flow:
  - RTO.execute primes drains (calls buffer.input.execute via
    SpawnedTask, NOT buffer.execute) and returns a CoordinatorStream
    with child=None.
  - CoordinatorStream.poll_next handles stage completion, fires rules,
    releases boundaries. While any boundary remains gated, defers
    execution by returning Pending.
  - Once all boundaries have started streaming, lazily calls
    self.plan.execute(partition, ctx) to create the child stream.
    All rxs are still intact at this point, so the (possibly
    replanned) join takes them cleanly.

Single-stage queries (the SLT scenario) work end-to-end. Multi-stage
replan — where an outer-stage drain task transitively executes an
inner-stage HashJoin's children before the inner stage completes — is
deferred; see task notes.

SLT result remains correct (plan-agnostic correctness); EXPLAIN still
shows the static plan. The runtime swap is observable via RUST_LOG=info:

  RTO: stage 0 ready (2 boundaries); firing 1 rule(s) before release
  SwapBuildSideIfInverted: flipping HashJoinExec — current build
    (left) = 100 rows, probe (right) = 5 rows. Calling swap_inputs ...
  RTO: stage 0 released; downstream consumers can now drain the
    buffered data

That ordering proves the rule fires before any data flows into the
join — release is what unblocks the ConsumerStreams the join reads
from.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/runtime_optimizer.rs    | 116 ++++++++++++++----
 .../test_files/runtime_optimizer.slt          |  22 +++-
 2 files changed, 109 insertions(+), 29 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 49cfd0d118ed..bc47a8a891fa 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -43,7 +43,8 @@ use datafusion_execution::TaskContext;
 use futures::{Stream, StreamExt};
 use log::info;
 
-use crate::joins::HashJoinExec;
+use crate::coalesce_partitions::CoalescePartitionsExec;
+use crate::joins::{HashJoinExec, PartitionMode};
 use crate::stage_boundary_buffer::StageBoundaryBuffer;
 use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
@@ -136,20 +137,20 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        let child = self.input.execute(partition, Arc::clone(&context))?;
-        // Side-channel drive every StageBoundaryBuffer in the subtree.
-        // Buffers don't pull from their inputs until primed, so without
-        // this the consumers above them would sit Pending forever (a
-        // HashJoin in CollectLeft mode never polls its probe side until
-        // build completes — but build itself is gated behind a buffer).
-        // prime() is idempotent across partitions.
+        // Prime drains side-channel; do NOT execute self.input yet.
+        // buffer.prime() calls buffer.input.execute() (the subtree below
+        // the boundary) but leaves the boundary's consumer-side rx
+        // untouched, so HashJoin (or a possibly-replanned new HashJoin)
+        // can take it later when CoordinatorStream lazily executes
+        // self.plan after the first stage completes.
         prime_all_buffers(&self.input, &context)?;
         let schema = self.schema();
         let stream = CoordinatorStream {
-            child,
+            child: None,
             plan: Arc::clone(&self.input),
             rules: self.rules.clone(),
             context: Arc::clone(&context),
+            partition,
         };
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
     }
@@ -178,13 +179,20 @@ fn register_consumer_waker_on_buffers(plan: &Arc<dyn ExecutionPlan>, waker: &Wak
 }
 
 struct CoordinatorStream {
-    child: SendableRecordBatchStream,
+    /// Lazily created. `None` until every stage has been processed (rules
+    /// fired, boundaries released). Then `self.plan.execute(...)` is called
+    /// — at which point all `StageBoundaryBuffer` rxs are still available
+    /// (prime() left them untouched), so a possibly-replanned join can
+    /// take them.
+    child: Option<SendableRecordBatchStream>,
     plan: Arc<dyn ExecutionPlan>,
     rules: Vec<Arc<dyn RuntimeRule>>,
     /// Captured at execute() time; threaded into RuntimeRule::optimize so
     /// rules see the session config (target_partitions, etc.) the same
-    /// way static `PhysicalOptimizerRule`s do.
+    /// way static `PhysicalOptimizerRule`s do, and used to lazily execute
+    /// the final plan.
     context: Arc<TaskContext>,
+    partition: usize,
 }
 
 impl Stream for CoordinatorStream {
@@ -233,10 +241,35 @@ impl Stream for CoordinatorStream {
             );
         }
 
-        this.child.poll_next_unpin(cx)
+        // Lazily execute the plan once all stages have been processed.
+        // While any boundary is still gated, defer execution so the
+        // post-replan plan can take the consumer-side rxs intact.
+        if this.child.is_none() {
+            if any_buffer_pending(&this.plan) {
+                return Poll::Pending;
+            }
+            match this.plan.execute(this.partition, Arc::clone(&this.context)) {
+                Ok(stream) => this.child = Some(stream),
+                Err(e) => return Poll::Ready(Some(Err(e))),
+            }
+        }
+
+        this.child.as_mut().unwrap().poll_next_unpin(cx)
     }
 }
 
+/// True if any `StageBoundaryBuffer` in the subtree has not yet started
+/// streaming. RTO uses this to gate lazy execution of `self.plan` until
+/// all stages have been processed (rules run, boundaries released).
+fn any_buffer_pending(plan: &Arc<dyn ExecutionPlan>) -> bool {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>()
+        && !buffer.streaming_started()
+    {
+        return true;
+    }
+    plan.children().iter().any(|c| any_buffer_pending(c))
+}
+
 /// Walks the plan tree, groups every `StageBoundaryBuffer` by stage,
 /// and returns the lowest stage where every boundary is ready and none
 /// has started streaming yet. Returns `None` if no such stage exists
@@ -310,26 +343,61 @@ impl RuntimeRule for SwapBuildSideIfInverted {
                 return Ok(Transformed::no(node));
             }
             let children = join.children();
-            // Current HashJoinExec: LEFT child is the build side under
-            // `mode=CollectLeft`.
+            // Current HashJoinExec: LEFT child is the build side.
             let left = side_runtime_rows(children[0]);
             let right = side_runtime_rows(children[1]);
-            if let (Some(l), Some(r)) = (left, right)
-                && l > r
-            {
-                info!(
-                    "SwapBuildSideIfInverted: would flip HashJoinExec — \
-                     current build (left) = {l} rows, probe (right) = {r} \
-                     rows. swap_inputs() wiring lands in the next commit; \
-                     logging intent only."
-                );
+            let (Some(l), Some(r)) = (left, right) else {
+                return Ok(Transformed::no(node));
+            };
+            if l <= r {
+                return Ok(Transformed::no(node));
             }
-            Ok(Transformed::no(node))
+            info!(
+                "SwapBuildSideIfInverted: flipping HashJoinExec — current \
+                 build (left) = {l} rows, probe (right) = {r} rows. \
+                 Calling swap_inputs to make the smaller side the new build."
+            );
+            let mode = *join.partition_mode();
+            let swapped = join.swap_inputs(mode)?;
+            let swapped = ensure_collect_left_single_partition(swapped)?;
+            Ok(Transformed::yes(swapped))
         })
         .map(|t| t.data)
     }
 }
 
+/// Ensures the (possibly already-coalesced) plan satisfies HashJoin's
+/// CollectLeft invariant: under `PartitionMode::CollectLeft`, the left
+/// child must report exactly one output partition. After
+/// `swap_inputs`, the new left side is whatever used to be on the
+/// right — frequently multi-partition (e.g. behind a RepartitionExec).
+/// Wrap it in `CoalescePartitionsExec` when needed.
+///
+/// `swap_inputs` may also have wrapped the result in a
+/// `ProjectionExec` to preserve output column order; in that case the
+/// HashJoinExec is one level down. We walk through that.
+fn ensure_collect_left_single_partition(
+    plan: Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    plan.transform_up(|node| {
+        let Some(join) = node.downcast_ref::<HashJoinExec>() else {
+            return Ok(Transformed::no(node));
+        };
+        if *join.partition_mode() != PartitionMode::CollectLeft {
+            return Ok(Transformed::no(node));
+        }
+        let children = join.children();
+        if children[0].output_partitioning().partition_count() == 1 {
+            return Ok(Transformed::no(node));
+        }
+        let coalesced: Arc<dyn ExecutionPlan> =
+            Arc::new(CoalescePartitionsExec::new(Arc::clone(children[0])));
+        let new_children = vec![coalesced, Arc::clone(children[1])];
+        Ok(Transformed::yes(node.with_new_children(new_children)?))
+    })
+    .map(|t| t.data)
+}
+
 /// Returns `Some(stage)` if `join`'s two children are both
 /// `StageBoundaryBuffer`s at the same stage in the just-completed state
 /// (`is_ready && !streaming_started`). Otherwise `None`.
diff --git a/datafusion/sqllogictest/test_files/runtime_optimizer.slt b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
index ed9a0bf0451d..5697a225a79a 100644
--- a/datafusion/sqllogictest/test_files/runtime_optimizer.slt
+++ b/datafusion/sqllogictest/test_files/runtime_optimizer.slt
@@ -94,11 +94,23 @@ SELECT count(*) FROM (SELECT group_key FROM big GROUP BY group_key) t;
 # (RuntimeOptimizerExec at root; StageBoundaryBuffer on each HashJoin
 # input). JoinSelection picked `small` as build statically because
 # aggregated_big's row-count estimate (Inexact, ~100K) overstates the
-# actual (5). RuntimeOptimizerExec detects this inversion at runtime and
-# would call `HashJoinExec::flip_sides()` to swap — but flip_sides isn't
-# implemented yet, so for now the rule only logs intent. EXPLAIN stays
-# the static plan; the swap is verified via log output and (later) via
-# metrics.
+# actual (5).
+#
+# At runtime, RuntimeOptimizerExec detects this inversion and calls
+# `HashJoinExec::swap_inputs` — the join below executes with build and
+# probe swapped, putting the 5-row side on build. EXPLAIN stays the
+# static plan; the runtime swap is observable via RUST_LOG=info, which
+# emits the following ordered log lines:
+#
+#   RTO: stage 0 ready (2 boundaries); firing 1 rule(s) before release
+#   SwapBuildSideIfInverted: flipping HashJoinExec — current build
+#     (left) = 100 rows, probe (right) = 5 rows. Calling swap_inputs ...
+#   RTO: stage 0 released; downstream consumers can now drain the
+#     buffered data
+#
+# That ordering proves the rule fires before the boundaries release —
+# so neither side of the join has been consumed by HashJoin when the
+# swap decision is made.
 # ---------------------------------------------------------------------------
 query TT
 EXPLAIN SELECT bg.group_key, bg.sum_payload, s.payload

From 3b929a9a8a59a413bb42ffa64c9d0b027ce8c8a9 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 12:41:17 -0600
Subject: [PATCH 16/18] RTO: lazy stage-by-stage priming for multi-stage replan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously RTO eagerly primed every StageBoundaryBuffer in the subtree
at execute() time. For single-stage queries that's fine, but for nested
HashJoins it breaks replan: stage-1's drain task immediately executes
the InnerJoin (taking stage-0's rxs), so by the time stage 0 completes
and the rule wants to swap the InnerJoin, the OLD InnerJoin is already
running. The post-replan plan in self.plan has a new InnerJoin, but
the actual execution uses the OLD layout — replan happens in the tree
but not in the live execution.

Lazy priming fixes this:
- RTO.execute primes only stage 0.
- After releasing stage K's boundaries, CoordinatorStream primes stage
  K+1's boundaries in the (possibly replanned) plan. Boundaries
  rebuilt by the rule's transform_up are freshly constructed; they
  get their drain tasks here, against the post-replan subtree.

This also tightens the contract on `CoordinatorStream.child`'s doc
comment — "all rxs are still available" now genuinely holds for
multi-stage plans, not just single-stage ones.

`buffer.prime()` is idempotent, so the helper is safe to call
repeatedly. `prime_all_buffers` is gone; `prime_buffers_at_stage`
takes a stage filter and is used in both call sites.

SLT (single-stage) unchanged; swap still fires once, logs and result
identical.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../physical-plan/src/runtime_optimizer.rs    | 60 +++++++++++++------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index bc47a8a891fa..737c36f9faa0 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -19,9 +19,12 @@
 //! its subtree to find the lowest stage whose [`StageBoundaryBuffer`]s
 //! are all ready but haven't started streaming yet — the "just completed"
 //! stage. When that exists, it fires each registered [`RuntimeRule`]
-//! exactly once for that stage, then releases the stage's boundaries.
-//! Stage K+1's boundaries can't fill until stage K is released, so
-//! stages naturally serialize without an explicit veto mechanism.
+//! exactly once for that stage, releases the stage's boundaries, then
+//! primes the next stage's boundaries against the (possibly replanned)
+//! plan. Stage 0 is primed at execute() time; stages 1+ are primed
+//! lazily after their predecessor releases — that's what lets a replan
+//! at stage K rebuild stage-(K+1)'s input subtree and have the drain
+//! task run against the new subtree.
 //!
 //! Each buffer owns its own AtomicWaker; RTO walks the subtree per poll
 //! and registers the consumer-task waker on each. Drain tasks wake on
@@ -137,13 +140,17 @@ impl ExecutionPlan for RuntimeOptimizerExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        // Prime drains side-channel; do NOT execute self.input yet.
-        // buffer.prime() calls buffer.input.execute() (the subtree below
-        // the boundary) but leaves the boundary's consumer-side rx
-        // untouched, so HashJoin (or a possibly-replanned new HashJoin)
-        // can take it later when CoordinatorStream lazily executes
-        // self.plan after the first stage completes.
-        prime_all_buffers(&self.input, &context)?;
+        // Prime stage 0 only. Higher stages are primed lazily by
+        // CoordinatorStream after their predecessor releases — that's
+        // what lets a replan that rebuilds stage-K's input subtree
+        // (e.g. swapping a nested HashJoin) take effect: the freshly
+        // built boundaries on the new plan are primed *after* replan,
+        // so the drain task runs against the post-replan subtree.
+        // buffer.prime() touches buffer.input.execute() (the subtree
+        // below the boundary) but leaves the boundary's consumer-side
+        // rx untouched, so HashJoin can take it later via the lazy
+        // execute in CoordinatorStream.
+        prime_buffers_at_stage(&self.input, 0, &context)?;
         let schema = self.schema();
         let stream = CoordinatorStream {
             child: None,
@@ -156,15 +163,23 @@ impl ExecutionPlan for RuntimeOptimizerExec {
     }
 }
 
-fn prime_all_buffers(
+/// Walks the subtree and primes every `StageBoundaryBuffer` whose
+/// `stage()` matches `stage`. Other boundaries are skipped — they're
+/// either already running (lower stage) or waiting their turn (higher
+/// stage). `buffer.prime()` is idempotent, so calling it on an already
+/// primed boundary is harmless.
+fn prime_buffers_at_stage(
     plan: &Arc<dyn ExecutionPlan>,
+    stage: usize,
     ctx: &Arc<TaskContext>,
 ) -> Result<()> {
-    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>() {
+    if let Some(buffer) = plan.downcast_ref::<StageBoundaryBuffer>()
+        && buffer.stage() == stage
+    {
         buffer.prime(ctx)?;
     }
     for child in plan.children() {
-        prime_all_buffers(child, ctx)?;
+        prime_buffers_at_stage(child, stage, ctx)?;
     }
     Ok(())
 }
@@ -180,10 +195,12 @@ fn register_consumer_waker_on_buffers(plan: &Arc<dyn ExecutionPlan>, waker: &Wak
 
 struct CoordinatorStream {
     /// Lazily created. `None` until every stage has been processed (rules
-    /// fired, boundaries released). Then `self.plan.execute(...)` is called
-    /// — at which point all `StageBoundaryBuffer` rxs are still available
-    /// (prime() left them untouched), so a possibly-replanned join can
-    /// take them.
+    /// fired, boundaries released, next stage primed) and no boundary
+    /// remains gated. Then `self.plan.execute(...)` is called — all
+    /// `StageBoundaryBuffer` rxs are still available at that point
+    /// because stage-N's drain task only starts after stage-(N-1)
+    /// releases (lazy priming), so no operator above a boundary has
+    /// been executed before replan.
     child: Option<SendableRecordBatchStream>,
     plan: Arc<dyn ExecutionPlan>,
     rules: Vec<Arc<dyn RuntimeRule>>,
@@ -239,6 +256,15 @@ impl Stream for CoordinatorStream {
                 "RTO: stage {stage} released; downstream consumers can now \
                  drain the buffered data"
             );
+            // Prime the next stage in the (possibly replanned) plan now
+            // that stage K's data is flowing. Any boundaries rebuilt by
+            // the rule's transform_up are fresh — they get their drain
+            // tasks here, against the post-replan subtree.
+            if let Err(e) =
+                prime_buffers_at_stage(&this.plan, stage + 1, &this.context)
+            {
+                return Poll::Ready(Some(Err(e)));
+            }
         }
 
         // Lazily execute the plan once all stages have been processed.

From 3a3ec5f90c6d4c9c0b486ef77e845acd8d6805b4 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 12:45:34 -0600
Subject: [PATCH 17/18] Extract RuntimeRules into a runtime_rules module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runtime_optimizer.rs had grown to mix the RTO coordinator, the
RuntimeRule trait, and concrete rules in one file. Split:

  runtime_optimizer.rs         RuntimeRule trait + RuntimeOptimizerExec
                               + CoordinatorStream + helpers
  runtime_rules/mod.rs         module index + re-exports
  runtime_rules/swap_build_side_if_inverted.rs
                               SwapBuildSideIfInverted impl + its
                               helpers (just_completed_stage_of_join,
                               ensure_collect_left_single_partition,
                               side_runtime_rows,
                               sum_runtime_rows_across_partitions)

Pure refactor — 158 lines moved without behavior change. Future rules
land alongside, one file each. The single import-site update at
physical-optimizer/runtime_optimizer.rs splits the existing combined
import into two lines (the trait still lives at
datafusion_physical_plan::runtime_optimizer; the concrete rule moves
to datafusion_physical_plan::runtime_rules).

SLT and clippy unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/runtime_optimizer.rs                  |   5 +-
 datafusion/physical-plan/src/lib.rs           |   1 +
 .../physical-plan/src/runtime_optimizer.rs    | 158 +---------------
 .../physical-plan/src/runtime_rules/mod.rs    |  27 +++
 .../swap_build_side_if_inverted.rs            | 178 ++++++++++++++++++
 5 files changed, 210 insertions(+), 159 deletions(-)
 create mode 100644 datafusion/physical-plan/src/runtime_rules/mod.rs
 create mode 100644 datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs

diff --git a/datafusion/physical-optimizer/src/runtime_optimizer.rs b/datafusion/physical-optimizer/src/runtime_optimizer.rs
index 726c7f2dfb1d..3421125b14c0 100644
--- a/datafusion/physical-optimizer/src/runtime_optimizer.rs
+++ b/datafusion/physical-optimizer/src/runtime_optimizer.rs
@@ -29,9 +29,8 @@ use crate::PhysicalOptimizerRule;
 use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_physical_plan::ExecutionPlan;
-use datafusion_physical_plan::runtime_optimizer::{
-    RuntimeOptimizerExec, RuntimeRule, SwapBuildSideIfInverted,
-};
+use datafusion_physical_plan::runtime_optimizer::{RuntimeOptimizerExec, RuntimeRule};
+use datafusion_physical_plan::runtime_rules::SwapBuildSideIfInverted;
 
 #[derive(Default, Debug)]
 pub struct InsertRuntimeOptimizer;
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 59c730e21265..0b7f0a40605b 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -87,6 +87,7 @@ pub mod projection;
 pub mod recursive_query;
 pub mod repartition;
 pub mod runtime_optimizer;
+pub mod runtime_rules;
 pub mod scalar_subquery;
 pub mod sort_pushdown;
 pub mod sorts;
diff --git a/datafusion/physical-plan/src/runtime_optimizer.rs b/datafusion/physical-plan/src/runtime_optimizer.rs
index 737c36f9faa0..e942c2240f1b 100644
--- a/datafusion/physical-plan/src/runtime_optimizer.rs
+++ b/datafusion/physical-plan/src/runtime_optimizer.rs
@@ -41,18 +41,14 @@ use std::task::{Context, Poll, Waker};
 use arrow::array::RecordBatch;
 use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_execution::TaskContext;
 use futures::{Stream, StreamExt};
 use log::info;
 
-use crate::coalesce_partitions::CoalescePartitionsExec;
-use crate::joins::{HashJoinExec, PartitionMode};
 use crate::stage_boundary_buffer::StageBoundaryBuffer;
-use crate::statistics::StatisticsArgs;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
     SendableRecordBatchStream,
 };
 
@@ -260,9 +256,7 @@ impl Stream for CoordinatorStream {
             // that stage K's data is flowing. Any boundaries rebuilt by
             // the rule's transform_up are fresh — they get their drain
             // tasks here, against the post-replan subtree.
-            if let Err(e) =
-                prime_buffers_at_stage(&this.plan, stage + 1, &this.context)
-            {
+            if let Err(e) = prime_buffers_at_stage(&this.plan, stage + 1, &this.context) {
                 return Poll::Ready(Some(Err(e)));
             }
         }
@@ -329,151 +323,3 @@ fn collect_boundaries_by_stage(
         collect_boundaries_by_stage(child, out);
     }
 }
-
-// ---------------------------------------------------------------------------
-// SwapBuildSideIfInverted — first concrete RuntimeRule.
-//
-// When a HashJoinExec's current build side ends up larger at runtime
-// than the probe side, the static planner made the wrong choice — it
-// picked build based on (Inexact) estimates. This rule walks the plan
-// looking for joins whose children are `StageBoundaryBuffer`s in the
-// just-completed state (`is_ready && !streaming_started`); if l > r,
-// it logs intent. The actual `HashJoinExec::swap_inputs` call lands in
-// the next commit (#14).
-// ---------------------------------------------------------------------------
-
-#[derive(Default, Debug)]
-pub struct SwapBuildSideIfInverted;
-
-impl SwapBuildSideIfInverted {
-    pub fn new() -> Self {
-        Self
-    }
-}
-
-impl RuntimeRule for SwapBuildSideIfInverted {
-    fn name(&self) -> &str {
-        "SwapBuildSideIfInverted"
-    }
-
-    fn optimize(
-        &self,
-        plan: Arc<dyn ExecutionPlan>,
-        _config: &ConfigOptions,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        plan.transform_up(|node| {
-            let Some(join) = node.downcast_ref::<HashJoinExec>() else {
-                return Ok(Transformed::no(node));
-            };
-            if just_completed_stage_of_join(join).is_none() {
-                return Ok(Transformed::no(node));
-            }
-            let children = join.children();
-            // Current HashJoinExec: LEFT child is the build side.
-            let left = side_runtime_rows(children[0]);
-            let right = side_runtime_rows(children[1]);
-            let (Some(l), Some(r)) = (left, right) else {
-                return Ok(Transformed::no(node));
-            };
-            if l <= r {
-                return Ok(Transformed::no(node));
-            }
-            info!(
-                "SwapBuildSideIfInverted: flipping HashJoinExec — current \
-                 build (left) = {l} rows, probe (right) = {r} rows. \
-                 Calling swap_inputs to make the smaller side the new build."
-            );
-            let mode = *join.partition_mode();
-            let swapped = join.swap_inputs(mode)?;
-            let swapped = ensure_collect_left_single_partition(swapped)?;
-            Ok(Transformed::yes(swapped))
-        })
-        .map(|t| t.data)
-    }
-}
-
-/// Ensures the (possibly already-coalesced) plan satisfies HashJoin's
-/// CollectLeft invariant: under `PartitionMode::CollectLeft`, the left
-/// child must report exactly one output partition. After
-/// `swap_inputs`, the new left side is whatever used to be on the
-/// right — frequently multi-partition (e.g. behind a RepartitionExec).
-/// Wrap it in `CoalescePartitionsExec` when needed.
-///
-/// `swap_inputs` may also have wrapped the result in a
-/// `ProjectionExec` to preserve output column order; in that case the
-/// HashJoinExec is one level down. We walk through that.
-fn ensure_collect_left_single_partition(
-    plan: Arc<dyn ExecutionPlan>,
-) -> Result<Arc<dyn ExecutionPlan>> {
-    plan.transform_up(|node| {
-        let Some(join) = node.downcast_ref::<HashJoinExec>() else {
-            return Ok(Transformed::no(node));
-        };
-        if *join.partition_mode() != PartitionMode::CollectLeft {
-            return Ok(Transformed::no(node));
-        }
-        let children = join.children();
-        if children[0].output_partitioning().partition_count() == 1 {
-            return Ok(Transformed::no(node));
-        }
-        let coalesced: Arc<dyn ExecutionPlan> =
-            Arc::new(CoalescePartitionsExec::new(Arc::clone(children[0])));
-        let new_children = vec![coalesced, Arc::clone(children[1])];
-        Ok(Transformed::yes(node.with_new_children(new_children)?))
-    })
-    .map(|t| t.data)
-}
-
-/// Returns `Some(stage)` if `join`'s two children are both
-/// `StageBoundaryBuffer`s at the same stage in the just-completed state
-/// (`is_ready && !streaming_started`). Otherwise `None`.
-fn just_completed_stage_of_join(join: &HashJoinExec) -> Option<usize> {
-    let children = join.children();
-    if children.len() != 2 {
-        return None;
-    }
-    let left = children[0].downcast_ref::<StageBoundaryBuffer>()?;
-    let right = children[1].downcast_ref::<StageBoundaryBuffer>()?;
-    if left.stage() != right.stage() {
-        return None;
-    }
-    if left.is_ready()
-        && !left.streaming_started()
-        && right.is_ready()
-        && !right.streaming_started()
-    {
-        Some(left.stage())
-    } else {
-        None
-    }
-}
-
-/// Row count of a join input's subtree. Trusts `runtime_row_count`
-/// to propagate correctly through passthrough operators (Projection,
-/// StageBoundaryBuffer-when-ready, etc.); no recursive descent. The
-/// StageBoundaryBuffer in the chain returns `None` until its own
-/// `is_ready`, which is the natural gate — rules only see runtime
-/// stats once the underlying breaker is actually done.
-///
-/// Falls back to plan-time `statistics()` for pure static-source
-/// subtrees (e.g. a small in-memory table behind a
-/// `CoalescePartitionsExec`).
-fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
-    if let Some(rows) = sum_runtime_rows_across_partitions(plan) {
-        return Some(rows);
-    }
-    plan.statistics_with_args(&StatisticsArgs::new())
-        .ok()
-        .and_then(|s| s.num_rows.get_value().copied())
-}
-
-fn sum_runtime_rows_across_partitions(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
-    let n = plan.output_partitioning().partition_count();
-    let mut total: usize = 0;
-    for p in 0..n {
-        // Require every partition to report — partial sums are not
-        // meaningful for adaptive decisions.
-        total += plan.runtime_row_count(p)?;
-    }
-    Some(total)
-}
diff --git a/datafusion/physical-plan/src/runtime_rules/mod.rs b/datafusion/physical-plan/src/runtime_rules/mod.rs
new file mode 100644
index 000000000000..fa785b31e0eb
--- /dev/null
+++ b/datafusion/physical-plan/src/runtime_rules/mod.rs
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Concrete [`RuntimeRule`] implementations driven by
+//! [`RuntimeOptimizerExec`]. One rule per file; the trait itself lives
+//! in [`crate::runtime_optimizer`].
+//!
+//! [`RuntimeRule`]: crate::runtime_optimizer::RuntimeRule
+//! [`RuntimeOptimizerExec`]: crate::runtime_optimizer::RuntimeOptimizerExec
+
+pub mod swap_build_side_if_inverted;
+
+pub use swap_build_side_if_inverted::SwapBuildSideIfInverted;
diff --git a/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs b/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs
new file mode 100644
index 000000000000..4196bacfa4e3
--- /dev/null
+++ b/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`SwapBuildSideIfInverted`] — first concrete [`RuntimeRule`].
+//!
+//! When a HashJoinExec's current build side ends up larger at runtime
+//! than the probe side, the static planner made the wrong choice — it
+//! picked build based on (Inexact) estimates. This rule walks the plan
+//! looking for joins whose children are [`StageBoundaryBuffer`]s in the
+//! just-completed state (`is_ready && !streaming_started`); if `l > r`
+//! it calls [`HashJoinExec::swap_inputs`] and patches the result to
+//! preserve the join's distribution invariants.
+//!
+//! [`RuntimeRule`]: crate::runtime_optimizer::RuntimeRule
+
+use std::sync::Arc;
+
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use log::info;
+
+use crate::coalesce_partitions::CoalescePartitionsExec;
+use crate::joins::{HashJoinExec, PartitionMode};
+use crate::runtime_optimizer::RuntimeRule;
+use crate::stage_boundary_buffer::StageBoundaryBuffer;
+use crate::statistics::StatisticsArgs;
+use crate::{ExecutionPlan, ExecutionPlanProperties};
+
+#[derive(Default, Debug)]
+pub struct SwapBuildSideIfInverted;
+
+impl SwapBuildSideIfInverted {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl RuntimeRule for SwapBuildSideIfInverted {
+    fn name(&self) -> &str {
+        "SwapBuildSideIfInverted"
+    }
+
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        plan.transform_up(|node| {
+            let Some(join) = node.downcast_ref::<HashJoinExec>() else {
+                return Ok(Transformed::no(node));
+            };
+            if just_completed_stage_of_join(join).is_none() {
+                return Ok(Transformed::no(node));
+            }
+            let children = join.children();
+            // Current HashJoinExec: LEFT child is the build side.
+            let left = side_runtime_rows(children[0]);
+            let right = side_runtime_rows(children[1]);
+            let (Some(l), Some(r)) = (left, right) else {
+                return Ok(Transformed::no(node));
+            };
+            if l <= r {
+                return Ok(Transformed::no(node));
+            }
+            info!(
+                "SwapBuildSideIfInverted: flipping HashJoinExec — current \
+                 build (left) = {l} rows, probe (right) = {r} rows. \
+                 Calling swap_inputs to make the smaller side the new build."
+            );
+            let mode = *join.partition_mode();
+            let swapped = join.swap_inputs(mode)?;
+            let swapped = ensure_collect_left_single_partition(swapped)?;
+            Ok(Transformed::yes(swapped))
+        })
+        .map(|t| t.data)
+    }
+}
+
+/// Ensures the (possibly already-coalesced) plan satisfies HashJoin's
+/// CollectLeft invariant: under `PartitionMode::CollectLeft`, the left
+/// child must report exactly one output partition. After
+/// [`HashJoinExec::swap_inputs`], the new left side is whatever used
+/// to be on the right — frequently multi-partition (e.g. behind a
+/// RepartitionExec). Wrap it in [`CoalescePartitionsExec`] when needed.
+///
+/// `swap_inputs` may also have wrapped the result in a `ProjectionExec`
+/// to preserve output column order; in that case the HashJoinExec is
+/// one level down. We walk through that via `transform_up`.
+fn ensure_collect_left_single_partition(
+    plan: Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    plan.transform_up(|node| {
+        let Some(join) = node.downcast_ref::<HashJoinExec>() else {
+            return Ok(Transformed::no(node));
+        };
+        if *join.partition_mode() != PartitionMode::CollectLeft {
+            return Ok(Transformed::no(node));
+        }
+        let children = join.children();
+        if children[0].output_partitioning().partition_count() == 1 {
+            return Ok(Transformed::no(node));
+        }
+        let coalesced: Arc<dyn ExecutionPlan> =
+            Arc::new(CoalescePartitionsExec::new(Arc::clone(children[0])));
+        let new_children = vec![coalesced, Arc::clone(children[1])];
+        Ok(Transformed::yes(node.with_new_children(new_children)?))
+    })
+    .map(|t| t.data)
+}
+
+/// Returns `Some(stage)` if `join`'s two children are both
+/// [`StageBoundaryBuffer`]s at the same stage in the just-completed
+/// state (`is_ready && !streaming_started`). Otherwise `None`.
+fn just_completed_stage_of_join(join: &HashJoinExec) -> Option<usize> {
+    let children = join.children();
+    if children.len() != 2 {
+        return None;
+    }
+    let left = children[0].downcast_ref::<StageBoundaryBuffer>()?;
+    let right = children[1].downcast_ref::<StageBoundaryBuffer>()?;
+    if left.stage() != right.stage() {
+        return None;
+    }
+    if left.is_ready()
+        && !left.streaming_started()
+        && right.is_ready()
+        && !right.streaming_started()
+    {
+        Some(left.stage())
+    } else {
+        None
+    }
+}
+
+/// Row count of a join input's subtree. Trusts `runtime_row_count` to
+/// propagate correctly through passthrough operators (Projection,
+/// StageBoundaryBuffer-when-ready, etc.); no recursive descent. The
+/// StageBoundaryBuffer in the chain returns `None` until its own
+/// `is_ready`, which is the natural gate — rules only see runtime
+/// stats once the underlying breaker is actually done.
+///
+/// Falls back to plan-time `statistics()` for pure static-source
+/// subtrees (e.g. a small in-memory table behind a
+/// `CoalescePartitionsExec`).
+fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+    if let Some(rows) = sum_runtime_rows_across_partitions(plan) {
+        return Some(rows);
+    }
+    plan.statistics_with_args(&StatisticsArgs::new())
+        .ok()
+        .and_then(|s| s.num_rows.get_value().copied())
+}
+
+fn sum_runtime_rows_across_partitions(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
+    let n = plan.output_partitioning().partition_count();
+    let mut total: usize = 0;
+    for p in 0..n {
+        // Require every partition to report — partial sums are not
+        // meaningful for adaptive decisions.
+        total += plan.runtime_row_count(p)?;
+    }
+    Some(total)
+}

From 9b3decc0430a3bc17a3b75972092687074734957 Mon Sep 17 00:00:00 2001
From: Brent Gardner <bgardner@squarelabs.net>
Date: Thu, 25 Jun 2026 13:00:42 -0600
Subject: [PATCH 18/18] StageBoundaryBuffer counts its own rows; drop
 static-stat fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The buffer materializes its input — so it knows the true post-input
cardinality. Surface that directly:

- New `row_counts: Arc<Vec<AtomicUsize>>` field, one counter per
  partition. Drain task increments by `batch.num_rows()` as it ferries
  each batch through the channel.
- `runtime_row_count(p)` returns the partition's counter once
  `is_ready` flips. No longer delegates to `self.input.runtime_row_count`,
  which often returned None (CoalescePartitionsExec, etc. don't
  implement it) and forced the rule to fall back to static stats.

With true per-partition runtime counts always available at the
boundary, `SwapBuildSideIfInverted::side_runtime_rows` drops the
plan-time-statistics fallback and the StatisticsArgs import — runtime
is the only source of truth now.

`statistics_with_args` passthrough on the buffer stays (other static
rules in the optimizer pipeline still ask for stats through plan
trees), but its docstring is updated: the rule we wrote no longer
needs it.

SLT unchanged (still l=100, r=5), but the numbers now come from the
buffer's own counters. Cleaner story for the PR: "runtime stats are
authoritative; the buffer counts what flows through it."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../swap_build_side_if_inverted.rs            | 29 ++++---------
 .../src/stage_boundary_buffer.rs              | 42 ++++++++++++++-----
 2 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs b/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs
index 4196bacfa4e3..343fd4564c85 100644
--- a/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs
+++ b/datafusion/physical-plan/src/runtime_rules/swap_build_side_if_inverted.rs
@@ -38,7 +38,6 @@ use crate::coalesce_partitions::CoalescePartitionsExec;
 use crate::joins::{HashJoinExec, PartitionMode};
 use crate::runtime_optimizer::RuntimeRule;
 use crate::stage_boundary_buffer::StageBoundaryBuffer;
-use crate::statistics::StatisticsArgs;
 use crate::{ExecutionPlan, ExecutionPlanProperties};
 
 #[derive(Default, Debug)]
@@ -147,31 +146,19 @@ fn just_completed_stage_of_join(join: &HashJoinExec) -> Option<usize> {
     }
 }
 
-/// Row count of a join input's subtree. Trusts `runtime_row_count` to
-/// propagate correctly through passthrough operators (Projection,
-/// StageBoundaryBuffer-when-ready, etc.); no recursive descent. The
-/// StageBoundaryBuffer in the chain returns `None` until its own
-/// `is_ready`, which is the natural gate — rules only see runtime
-/// stats once the underlying breaker is actually done.
+/// Total runtime row count across all output partitions of a
+/// HashJoin input. The input is always a `StageBoundaryBuffer`
+/// (`InsertHashJoinBoundaries` inserts one above each side), and the
+/// buffer materializes its input — so once `is_ready` flips,
+/// `runtime_row_count(p)` returns the true post-input cardinality.
+/// No static-stat fallback needed.
 ///
-/// Falls back to plan-time `statistics()` for pure static-source
-/// subtrees (e.g. a small in-memory table behind a
-/// `CoalescePartitionsExec`).
+/// Returns `None` if any partition's count is unavailable, which the
+/// rule treats as "don't try to decide yet."
 fn side_runtime_rows(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
-    if let Some(rows) = sum_runtime_rows_across_partitions(plan) {
-        return Some(rows);
-    }
-    plan.statistics_with_args(&StatisticsArgs::new())
-        .ok()
-        .and_then(|s| s.num_rows.get_value().copied())
-}
-
-fn sum_runtime_rows_across_partitions(plan: &Arc<dyn ExecutionPlan>) -> Option<usize> {
     let n = plan.output_partitioning().partition_count();
     let mut total: usize = 0;
     for p in 0..n {
-        // Require every partition to report — partial sums are not
-        // meaningful for adaptive decisions.
         total += plan.runtime_row_count(p)?;
     }
     Some(total)
diff --git a/datafusion/physical-plan/src/stage_boundary_buffer.rs b/datafusion/physical-plan/src/stage_boundary_buffer.rs
index e48da20cf6e1..6f0cf8cf3f6b 100644
--- a/datafusion/physical-plan/src/stage_boundary_buffer.rs
+++ b/datafusion/physical-plan/src/stage_boundary_buffer.rs
@@ -51,6 +51,7 @@
 
 use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll, Waker};
 
@@ -114,6 +115,11 @@ pub struct StageBoundaryBuffer {
     /// Handles to spawned drain tasks. Auto-abort on Drop via
     /// `SpawnedTask::Drop`, so query cancellation cleans up cleanly.
     drain_tasks: Arc<Mutex<Vec<SpawnedTask<()>>>>,
+    /// Per-partition running row count, incremented by the drain task
+    /// as each batch flows through. The buffer materializes the input,
+    /// so by the time `is_ready` flips, this is the true post-input
+    /// cardinality — no static-stat fallback needed.
+    row_counts: Arc<Vec<AtomicUsize>>,
 }
 
 #[derive(Debug)]
@@ -141,6 +147,8 @@ impl StageBoundaryBuffer {
                 (Some(tx), Some(rx))
             })
             .unzip();
+        let row_counts =
+            Arc::new((0..num_partitions).map(|_| AtomicUsize::new(0)).collect());
         Self {
             input,
             cache,
@@ -156,6 +164,7 @@ impl StageBoundaryBuffer {
             txs: Arc::new(Mutex::new(txs)),
             rxs: Arc::new(Mutex::new(rxs)),
             drain_tasks: Arc::new(Mutex::new(Vec::new())),
+            row_counts,
         }
     }
 
@@ -188,8 +197,9 @@ impl StageBoundaryBuffer {
             let stream = self.input.execute(partition, Arc::clone(ctx))?;
             let state = Arc::clone(&self.state);
             let rto_waker = Arc::clone(&self.rto_waker);
+            let row_counts = Arc::clone(&self.row_counts);
             tasks.push(SpawnedTask::spawn(drain_partition(
-                partition, stream, tx, state, rto_waker,
+                partition, stream, tx, state, rto_waker, row_counts,
             )));
         }
         Ok(())
@@ -228,14 +238,22 @@ async fn drain_partition(
     tx: mpsc::UnboundedSender<Result<RecordBatch>>,
     state: Arc<Mutex<BufferState>>,
     rto_waker: Arc<AtomicWaker>,
+    row_counts: Arc<Vec<AtomicUsize>>,
 ) {
     while let Some(item) = stream.next().await {
         // tx.send returning Err means the consumer dropped its receiver
         // (query cancelled while we were mid-drain). Stop pulling.
+        let rows = match &item {
+            Ok(batch) => batch.num_rows(),
+            Err(_) => 0,
+        };
         let is_err = item.is_err();
         if tx.send(item).is_err() {
             break;
         }
+        if rows > 0 {
+            row_counts[partition].fetch_add(rows, Ordering::Relaxed);
+        }
         if is_err {
             break;
         }
@@ -255,7 +273,6 @@ async fn drain_partition(
         // reach the top-of-plan task. The shared AtomicWaker bridges.
         rto_waker.wake();
     }
-    let _ = partition; // reserved for future per-partition diagnostics
 }
 
 impl Drop for StageBoundaryBuffer {
@@ -306,22 +323,27 @@ impl ExecutionPlan for StageBoundaryBuffer {
         vec![true]
     }
 
-    /// Gated passthrough: rules only see runtime stats once every drain
-    /// task has reached EOF (signalled by `is_ready`). Before that the
-    /// child's per-partition counts are partial / not meaningful for
-    /// adaptive decisions.
+    /// Authoritative runtime cardinality: the drain task counts rows as
+    /// it materializes them, so once `is_ready` flips, this is the true
+    /// post-input count for `partition` (no static-stat fallback needed).
+    /// Returns `None` before `is_ready` — partial counts are not safe
+    /// to drive adaptive decisions with.
     fn runtime_row_count(&self, partition: usize) -> Option<usize> {
         if !self.is_ready() {
             return None;
         }
-        self.input.runtime_row_count(partition)
+        self.row_counts
+            .get(partition)
+            .map(|c| c.load(Ordering::Relaxed))
     }
 
     /// Passthrough: the buffer doesn't change row counts or column stats.
     /// Without this override, the default impl at execution_plan.rs:528
-    /// returns Statistics::new_unknown — so wrapping any subtree in a
-    /// buffer would blackhole the static stats `side_runtime_rows`
-    /// falls back to when runtime stats aren't yet available.
+    /// returns Statistics::new_unknown — wrapping any subtree in a
+    /// buffer would blackhole stats for any optimizer rule that asks.
+    /// (Runtime cardinality is also exposed via `runtime_row_count` as
+    /// the drain counts rows; this method is for other static rules
+    /// that inspect plan-time statistics.)
     fn statistics_with_args(&self, args: &StatisticsArgs) -> Result<Arc<Statistics>> {
         args.compute_child_statistics(&self.input, args.partition())
     }