From 5531d6162ad7c0ac7a302599bb8521cb4665e212 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Mon, 1 Jun 2026 17:03:28 -0700 Subject: [PATCH 01/64] spike(workflow): concurrent dynamic dispatch harness (DynamicNodeSupervisor + ctx.pipeline) RFC #92 reference harness. DynamicNodeSupervisor (gate-on-leaf, TaskGroup fan-out) + ctx.pipeline/ctx.parallel on the real ADK Workflow engine. 11 deterministic CI-safe tests (no LLM) + an env-gated live Gemini E2E. Proves barrier-free execution, failed-item isolation, control-exception cancellation (requires TaskGroup, not gather), nested no-deadlock with leaf gating (+ driver-gating deadlock contrast), and resume exactly-once for completed children. pyink/isort/mdformat clean. --- .../dynamic_supervisor_spike/README.md | 103 ++++ .../dynamic_supervisor_spike/supervisor.py | 171 +++++++ .../test_dynamic_supervisor_spike.py | 472 ++++++++++++++++++ .../test_live_gemini_e2e.py | 159 ++++++ 4 files changed, 905 insertions(+) create mode 100644 contributing/samples/workflows/dynamic_supervisor_spike/README.md create mode 100644 contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py create mode 100644 contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py create mode 100644 contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/README.md b/contributing/samples/workflows/dynamic_supervisor_spike/README.md new file mode 100644 index 00000000000..f89917ec51c --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/README.md @@ -0,0 +1,103 @@ +# Dynamic Supervisor Spike — concurrent dynamic dispatch for ADK Workflows + +Reproducible harness for an RFC proposing leaf-gated concurrent dynamic +dispatch (`ctx.pipeline` / `ctx.parallel`) on the ADK Workflow engine. 
+ +**The harness exists to prove the design on the real engine, not to ship an +API.** It pins exactly which properties hold: all five merge-gate properties (barrier-free execution, failed-item isolation, control-exception cancellation, nested no-deadlock under leaf gating, and resume exactly-once for completed children) +hold with a wrapper supervisor on the unmodified engine. The v1 interrupt +behavior is decided — cancel in-flight siblings and re-run them on resume; +checkpoint-then-pause is a deferred v2 product decision. + +## Environment + +- Built/run against `google/adk-python` (branch rebased onto current `main`). +- *Historical run evidence below was captured on ADK `2.0.0` at `origin/main` @ `4006fe40`; results re-verified on the rebased branch.* +- Python: 3.11+ (uses `asyncio.TaskGroup` + `except*`) + +## Files + +| File | Purpose | +| ---------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `supervisor.py` | Prototype `DynamicNodeSupervisor` (gate-on-leaf + `TaskGroup` fan-out) over the real `ctx.run_node()`. | +| `test_dynamic_supervisor_spike.py` | Deterministic regression harness (no LLM). The trustworthy artifact. | +| `test_live_gemini_e2e.py` | OPTIONAL live-model evidence; env-gated, skipped by default. | + +## Run the deterministic harness (CI-safe, no network) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q +``` + +### Expected current result: **11 passed** + +1. `test_concurrent_dispatch_correct_and_barrier_free` — concurrent `ctx.run_node` + executes correctly (distinct results, no corruption) and truly overlaps: peak in-flight = 4 and every child starts before any child ends (timing-independent). +1. `test_pipeline_barrier_free` — item 0 enters stage 2 before item 1 finishes stage 1. +1. `test_parallel_failed_item_isolation` — ordinary error → `None`, siblings unaffected. +1. `test_control_exception_propagates_and_cancels_siblings` — `NodeInterruptedError` + propagates **and cancels the running sibling**. 
Requires `asyncio.TaskGroup`: + `asyncio.gather` propagates but does **not** cancel siblings, so the supervisor + contract mandates TaskGroup-equivalent structured concurrency. +1. `test_nested_combinator_no_deadlock_leaf_gating` — a pipeline stage calling + `parallel` with `gate=2` completes; peak in-flight ≤ gate. +1. `test_driver_gating_deadlocks_as_predicted` — CONTRAST: gating *drivers* instead + of *leaves* deadlocks (timeout). Proves the leaf-gating decision empirically. +1. `test_sequential_resume_is_exactly_once` — sequential dispatch resumes + exactly-once (completed children fast-forward; interrupted node re-runs). +1. `test_concurrent_resume_completed_children_fast_forward` — **the merge gate.** + Under *concurrent* dispatch, children that COMPLETE before the interrupt + fast-forward on resume (exactly-once). No double-spend. +1. `test_concurrent_inflight_children_cancelled_on_interrupt_rerun` — pins the + **decided v1 semantic**: a sibling that interrupts while others are still IN + FLIGHT cancels them; cancelled (never-completed) children correctly re-run on + resume. (Checkpoint-then-pause is deferred to v2.) +1. `test_child_cancellederror_does_not_cancel_siblings` — a branch-originated + `asyncio.CancelledError` is asyncio task-cancellation: not propagated, siblings + untouched, slot left `None`. Only `NodeInterruptedError` / non-cancellation + `BaseException` cancel siblings. +1. `test_gate_must_be_positive` — `gate=0`/negative raises `ValueError` at + construction (would otherwise deadlock every dispatch). + +## Resume exactly-once: there is no engine gap (a correction) + +An earlier draft of this harness reported a resume "engine gap." That was a +**test artifact and has been retracted.** The earlier test let the +`RequestInput` child interrupt *before* its siblings finished, so the +`TaskGroup` cancelled still-running siblings; those **cancelled (never +completed)** children then re-ran on resume — which is *correct*, not a bug. 
+ +With the timing separated (test 8 vs test 9), the truth is: + +- **Completed** concurrent children **fast-forward** on resume (exactly-once) — + identical to sequential. No double-spend of completed LLM work. +- **In-flight** children cancelled by an interrupting sibling **re-run** on + resume — correctness-preserving (they never completed). + +The `"Workflow ...: cancelling N leftover tasks"` log is **benign cleanup** — it +appears even in the sequential exactly-once run, and completion is still +checkpointed correctly. It is not corruption. + +**Net: all five merge-gate properties hold with a wrapper supervisor + the real +engine; no `_workflow.py` change is required for resume correctness.** The one +behavior worth calling out in the RFC is a design trade-off, not a bug: +interrupting one branch cancels in-flight siblings and discards their partial +progress. If preserving that progress is desired, that is a separate design +decision (e.g. checkpoint-then-pause instead of cancel). + +## Optional: live model evidence (supporting only) + +Skipped unless explicitly configured — never runs in CI by accident: + +```bash +export SPIKE_LIVE=1 +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=YOUR_PROJECT_ID # replace with your GCP project ID; an empty value leaves the test skipped +export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves here +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py -q -s +``` + +Asserts the concurrent pipeline wall-clock is well under the serial sum of +per-call latencies. The deterministic engine tests — not this — are the +artifact maintainers should trust. 
diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py new file mode 100644 index 00000000000..51a03d6ea05 --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py @@ -0,0 +1,171 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prototype DynamicNodeSupervisor for concurrent dynamic dispatch. + +This is the RFC spike artifact (see README.md). It layers concurrent +``ctx.run_node()`` orchestration on the real ADK Workflow engine via a +framework-owned supervisor. Two design decisions are encoded here and +verified by the tests: + +1. The concurrency gate is acquired around each LEAF dispatch (a single + ``ctx.run_node`` call), NOT around drivers. Orchestration frames + (drivers, fan-out, nested pipeline/parallel) hold no permit while + awaiting children, so nesting a combinator inside a stage cannot + deadlock. (Gating drivers DOES deadlock — see the contrast test.) + +2. Fan-out uses ``asyncio.TaskGroup`` (structured concurrency), NOT + ``asyncio.gather``. ``gather`` propagates an exception but does not + cancel siblings. + +Failure / cancellation contract (verified by the tests): + +* Ordinary ``Exception`` in a branch -> that branch becomes ``None``; + siblings are unaffected. 
+* ``NodeInterruptedError`` (and other non-cancellation ``BaseException`` + such as ``KeyboardInterrupt`` / ``SystemExit``) -> propagates, and + ``TaskGroup`` cancels the remaining branches. +* External cancellation of the combinator -> propagates down to the + in-flight branches (standard structured concurrency). +* A branch raising ``asyncio.CancelledError`` itself is treated by asyncio + as that task's own cancellation: ``TaskGroup`` does NOT propagate it and + does NOT cancel siblings; the branch's slot is left ``None`` and the + others run to completion. (This is asyncio semantics, not something the + supervisor can override without bespoke handling — see the test.) +""" + +from __future__ import annotations + +import asyncio +import os +from typing import Any +from typing import Awaitable +from typing import Callable +from typing import Sequence + +from google.adk.workflow._errors import NodeInterruptedError + +# Control exceptions are NEVER converted to None. NodeInterruptedError is a +# BaseException by design so it cannot be swallowed by ``except Exception``. +_CONTROL_EXC = ( + NodeInterruptedError, + asyncio.CancelledError, + KeyboardInterrupt, + SystemExit, +) + + +def default_gate() -> int: + """min(16, cpu-2): matches the Claude Code reference runtime cap.""" + return min(16, max(1, (os.cpu_count() or 3) - 2)) + + +class DynamicNodeSupervisor: + """Drives concurrent dynamic ``ctx.run_node()`` chains under one parent.""" + + def __init__(self, ctx, *, gate: int | None = None) -> None: + self.ctx = ctx + resolved_gate = gate if gate is not None else default_gate() + if resolved_gate < 1: + raise ValueError(f"gate must be >= 1, got {resolved_gate}") + self.gate = asyncio.Semaphore(resolved_gate) + self.peak_in_flight = 0 + self._in_flight = 0 + + async def dispatch( + self, child, *, node_input: Any = None, run_id: str | None = None + ) -> Any: + """One leaf dispatch. 
The gate is held ONLY for the child execution.""" + async with self.gate: + self._in_flight += 1 + self.peak_in_flight = max(self.peak_in_flight, self._in_flight) + try: + return await self.ctx.run_node( + child, node_input=node_input, run_id=run_id + ) + finally: + self._in_flight -= 1 + + async def _guard_ordinary(self, factory: Callable[[], Awaitable[Any]]) -> Any: + """Ordinary Exception -> None (drop the branch). Control exceptions escape.""" + try: + return await factory() + except _CONTROL_EXC: + raise + except Exception: # noqa: BLE001 - includes DynamicNodeFailError + return None + + async def _supervise( + self, factories: Sequence[Callable[[], Awaitable[Any]]] + ) -> list[Any]: + """Structured fan-out via TaskGroup. See the failure/cancellation contract + in the module docstring: ordinary failure -> None; NodeInterruptedError + (and other non-cancellation BaseException) propagates and cancels the rest; + a branch's own CancelledError leaves its slot None without cancelling + siblings. Results preserve input order. + """ + results: list[Any] = [None] * len(factories) + + async def _run_one(i: int, f: Callable[[], Awaitable[Any]]) -> None: + results[i] = await self._guard_ordinary(f) + + try: + async with asyncio.TaskGroup() as tg: + for i, f in enumerate(factories): + tg.create_task(_run_one(i, f)) + except* NodeInterruptedError: + raise NodeInterruptedError() + return results + + async def parallel( + self, thunks: Sequence[Callable[[], Awaitable[Any]]] + ) -> list[Any]: + """BARRIER fan-out. thunks: zero-arg callables returning awaitables.""" + return await self._supervise(thunks) + + async def pipeline( + self, + items: Sequence[Any], + *stages: Callable[[Any, Any, int], Awaitable[Any]], + gate_drivers: bool = False, + ) -> list[Any]: + """Barrier-free per-item pipelining. Stage signature: (prev, item, index). + + Each item flows through all stages independently; item A may be in stage + k while item B is in stage 1. 
An ordinary Exception in a stage drops that + item to None; control exceptions propagate. + + ``gate_drivers=True`` is the intentionally-BUGGY variant used by the + contrast test to demonstrate the nested-combinator deadlock. + """ + + def make_driver(item: Any, i: int) -> Callable[[], Awaitable[Any]]: + async def drive() -> Any: + prev = item + for stage in stages: + prev = await stage(prev, item, i) + return prev + + if gate_drivers: + + async def gated() -> Any: + async with self.gate: # gating the DRIVER -> deadlock on nesting + return await drive() + + return gated + return drive + + return await self._supervise( + [make_driver(it, i) for i, it in enumerate(items)] + ) diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py b/contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py new file mode 100644 index 00000000000..f7d6af655cd --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py @@ -0,0 +1,472 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Deterministic regression harness for the DynamicNodeSupervisor spike. + +These tests use deterministic FunctionNodes (no LLM) and run against the REAL +ADK Workflow engine (Runner + InMemorySessionService). They are the artifact +that makes the RFC credible: they pin exactly which properties hold. + +Expected result: ALL PASS. 
(First captured on ADK 2.0.0; re-verified on the +branch rebased onto current upstream main.) +All five merge-gate properties hold with a wrapper supervisor on the unmodified +engine — barrier-free execution, pipeline barrier-free, failed-item isolation, +control-exception cancellation, nested no-deadlock (+ driver-gating deadlock +contrast), and resume exactly-once for children that COMPLETE before an +interrupt (both sequential and concurrent). The only documented behavior is a +design trade-off, not a bug: a child that interrupts while siblings are still +IN FLIGHT causes those siblings to be cancelled and re-run on resume. + +(An earlier draft reported a concurrent-resume "engine gap"; that was a test +artifact — the interrupt fired before siblings completed, so they were +cancelled, not completed. It has been retracted.) +""" + +from __future__ import annotations + +import asyncio +import collections +import os +import sys +import time + +from google.adk import Context +from google.adk import Event +from google.adk import Workflow +from google.adk.apps.app import App +from google.adk.apps.app import ResumabilityConfig +from google.adk.events.request_input import RequestInput +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.adk.workflow._errors import NodeInterruptedError +from google.adk.workflow.utils._workflow_hitl_utils import create_request_input_response +from google.adk.workflow.utils._workflow_hitl_utils import get_request_input_interrupt_ids +from google.adk.workflow.utils._workflow_hitl_utils import REQUEST_INPUT_FUNCTION_CALL_NAME +from google.genai import types +import pytest + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from supervisor import DynamicNodeSupervisor # noqa: E402 (local spike module) + + +# -------------------------------------------------------------------------- +# Harness helpers +# 
-------------------------------------------------------------------------- +async def _run(parent_fn, *, app_name="spike"): + wf = Workflow(name=app_name, edges=[("START", parent_fn)]) + ss = InMemorySessionService() + runner = Runner(app_name=wf.name, node=wf, session_service=ss) + session = await ss.create_session(app_name=wf.name, user_id="u") + msg = types.Content(parts=[types.Part(text="go")], role="user") + probes = [] + async for ev in runner.run_async( + user_id="u", session_id=session.id, new_message=msg + ): + if ( + isinstance(ev, Event) + and isinstance(ev.output, dict) + and "probe" in ev.output + ): + probes.append(ev.output) + return probes + + +def _child(name, delay=0.0, fail=None, log=None): + @node(name=name) + async def child(ctx, node_input): + if log is not None: + log.append((name, "start", time.perf_counter())) + await asyncio.sleep(delay) + if fail == "error": + raise ValueError(f"{name} boom") + if log is not None: + log.append((name, "end", time.perf_counter())) + yield Event(output=f"{name}<-{node_input}") + + return child + + +# -------------------------------------------------------------------------- +# 1. Concurrent dispatch executes correctly and barrier-free +# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_concurrent_dispatch_correct_and_barrier_free(): + log = [] + + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx) + delays = [0.05, 0.05, 0.05, 0.05] + res = await sup.parallel([ + ( + lambda i=i, d=delays[i]: sup.dispatch( + _child(f"c{i}", d, log=log), node_input=i, run_id=f"r{i}" + ) + ) + for i in range(4) + ]) + # peak_in_flight is the primary, timing-independent proof of concurrency. 
+ yield Event(output={"probe": "bf", "res": res, "peak": sup.peak_in_flight}) + + out = (await _run(parent))[0] + assert sorted(out["res"]) == [ + f"c{i}<-{i}" for i in range(4) + ] # correct + distinct + assert len(set(out["res"])) == 4 # no aliasing / corruption + assert out["peak"] == 4 # all 4 truly ran at once + # event-order overlap: every child starts before any child ends (true fan-out) + starts = sorted(t for (_, p, t) in log if p == "start") + ends = sorted(t for (_, p, t) in log if p == "end") + assert max(starts) < min(ends) # all started before any ended + + +# -------------------------------------------------------------------------- +# 2. pipeline barrier-free: item0 enters stage2 before item1 finishes stage1 +# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_pipeline_barrier_free(): + log = [] + + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx) + + async def s1(prev, item, i): + return await sup.dispatch( + _child(f"s1_{i}", 0.25 if i == 1 else 0.0, log=log), + node_input=item, + run_id=f"s1x{i}", + ) + + async def s2(prev, item, i): + return await sup.dispatch( + _child(f"s2_{i}", 0.0, log=log), node_input=prev, run_id=f"s2x{i}" + ) + + res = await sup.pipeline([0, 1], s1, s2) + yield Event(output={"probe": "pf", "res": res}) + + await _run(parent) + starts = {n: t for (n, p, t) in log if p == "start"} + ends = {n: t for (n, p, t) in log if p == "end"} + assert "s2_0" in starts and "s1_1" in ends + assert starts["s2_0"] < ends["s1_1"] # no inter-stage barrier + + +# -------------------------------------------------------------------------- +# 3. 
parallel failed-item isolation: ordinary error -> None, siblings fine +# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_parallel_failed_item_isolation(): + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx) + res = await sup.parallel([ + (lambda: sup.dispatch(_child("p0", 0.02), node_input=0, run_id="p0")), + ( + lambda: sup.dispatch( + _child("p1", 0.01, fail="error"), node_input=1, run_id="p1" + ) + ), + (lambda: sup.dispatch(_child("p2", 0.02), node_input=2, run_id="p2")), + ]) + yield Event(output={"probe": "fi", "res": res}) + + res = (await _run(parent))[0]["res"] + assert res == ["p0<-0", None, "p2<-2"] + + +# -------------------------------------------------------------------------- +# 4. Supervisor fan-out contract: ordinary -> None; control exception +# PROPAGATES and CANCELS siblings. Requires TaskGroup (gather would not +# cancel). Tested directly on the supervisor (no engine needed). 
+# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_control_exception_propagates_and_cancels_siblings(): + sup = DynamicNodeSupervisor(ctx=None) + cancelled = {"v": False} + + async def boom(): + raise ValueError("ordinary") + + async def interrupt(): + raise NodeInterruptedError() + + async def sibling(): + try: + await asyncio.sleep(1.0) + return "finished" + except asyncio.CancelledError: + cancelled["v"] = True + raise + + async def okk(): + return "ok" + + # ordinary error -> None; sibling unaffected + assert await sup.parallel([lambda: boom(), lambda: okk()]) == [None, "ok"] + + # control exception propagates AND cancels the running sibling + with pytest.raises(NodeInterruptedError): + await sup.parallel([lambda: interrupt(), lambda: sibling()]) + await asyncio.sleep(0) # let cancellation settle + assert cancelled["v"] is True # explicit sibling-cancellation assertion + + +@pytest.mark.asyncio +async def test_child_cancellederror_does_not_cancel_siblings(): + """Contract boundary (narrowed): a branch raising asyncio.CancelledError is + asyncio's own task-cancellation. TaskGroup does NOT propagate it and does NOT + cancel siblings — the branch's slot is left None and siblings complete. This + is asyncio semantics; the supervisor does not (and is not claimed to) override + it. Only NodeInterruptedError / non-cancellation BaseException cancel siblings. 
+ """ + sup = DynamicNodeSupervisor(ctx=None) + sib_finished = {"v": False} + + async def canceller(): + raise asyncio.CancelledError() + + async def sibling(): + await asyncio.sleep(0.03) + sib_finished["v"] = True + return "sib-done" + + res = await sup.parallel([lambda: canceller(), lambda: sibling()]) + assert res == [ + None, + "sib-done", + ] # cancelled branch -> None; sibling NOT cancelled + assert sib_finished["v"] is True + + +def test_gate_must_be_positive(): + """gate=0 would deadlock every dispatch; reject it at construction.""" + with pytest.raises(ValueError): + DynamicNodeSupervisor(ctx=None, gate=0) + with pytest.raises(ValueError): + DynamicNodeSupervisor(ctx=None, gate=-1) + + +# -------------------------------------------------------------------------- +# 5. Nested combinator no-deadlock with LEAF gating (gate=2). +# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_nested_combinator_no_deadlock_leaf_gating(): + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx, gate=2) + + async def stage(prev, item, i): + return await sup.parallel([ + ( + lambda k=k: sup.dispatch( + _child(f"n{item}_{k}", 0.02), + node_input=k, + run_id=f"n{item}x{k}", + ) + ) + for k in range(3) + ]) + + res = await sup.pipeline(list(range(5)), stage) + yield Event( + output={"probe": "nest", "n": len(res), "peak": sup.peak_in_flight} + ) + + out = await asyncio.wait_for(_run(parent), timeout=10.0) # must NOT hang + assert out[0]["n"] == 5 + assert out[0]["peak"] <= 2 # leaf-gating bounds in-flight to the gate + + +# -------------------------------------------------------------------------- +# 5b. 
CONTRAST: gating DRIVERS deadlocks on nesting (proves leaf-gating matters) +# -------------------------------------------------------------------------- +@pytest.mark.asyncio +async def test_driver_gating_deadlocks_as_predicted(): + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx, gate=2) + + async def stage(prev, item, i): + return await sup.parallel([ + ( + lambda k=k: sup.dispatch( + _child(f"d{item}_{k}", 0.02), + node_input=k, + run_id=f"d{item}x{k}", + ) + ) + for k in range(3) + ]) + + res = await sup.pipeline(list(range(5)), stage, gate_drivers=True) # BUGGY + yield Event(output={"probe": "dead", "n": len(res)}) + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(_run(parent), timeout=3.0) + + +# -------------------------------------------------------------------------- +# Resume exactly-once — the merge gate. +# +# CORRECTION (vs an earlier draft of this harness): there is NO resume engine +# gap. An earlier test let the RequestInput child interrupt *before* its +# siblings finished, so the TaskGroup CANCELLED still-running siblings; those +# cancelled (never-completed) children then re-ran on resume — which is +# correct, not a bug. The tests below separate the two cases cleanly: +# * children that COMPLETE before the interrupt -> fast-forward (exactly-once) +# * children still IN FLIGHT at the interrupt -> cancelled -> correctly re-run +# Both hold for sequential AND concurrent dispatch. +# -------------------------------------------------------------------------- +async def _resume_scenario(*, concurrent, ask_delay, child_delay): + """Dispatch 3 plain children + 1 RequestInput child, interrupt, resume. + + Returns (body_runs, completed) where body_runs[name] counts body ENTRIES and + `completed` lists children that ran to completion (emitted output) on run 1. + The counter is captured by closure (NOT a pydantic field) so every body + execution is observed by the same object. 
+ + Timing knobs decide whether children complete before the interrupt: + ask_delay -- ask sleeps this long before issuing RequestInput + child_delay -- each plain child sleeps this long before completing + """ + body_runs = collections.Counter() + completed = [] + + def plain(name): + @node(name=name) + async def child(ctx, node_input): + body_runs[name] += 1 + await asyncio.sleep(child_delay) + completed.append(name) + yield Event(output=f"{name}=done") + + return child + + @node(name="ask", rerun_on_resume=True) + async def ask(ctx: Context, node_input): + body_runs["ask"] += 1 + resume = getattr(ctx, "resume_inputs", {}).get("ask") + if resume is None: + await asyncio.sleep(ask_delay) + yield RequestInput(interrupt_id="ask", message="approve ask?") + else: + yield Event(output="ask=approved") + + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx, gate=8) + thunks = [ + (lambda: sup.dispatch(plain("a"), node_input=1, run_id="ax")), + (lambda: sup.dispatch(plain("b"), node_input=2, run_id="bx")), + (lambda: sup.dispatch(plain("c"), node_input=3, run_id="cx")), + (lambda: sup.dispatch(ask, node_input=4, run_id="askx")), + ] + if concurrent: + res = await sup.parallel(thunks) + else: + res = [await t() for t in thunks] # sequential control + yield Event(output={"probe": "resume", "res": res}) + + wf = Workflow(name="resume_wf", edges=[("START", parent)]) + app = App( + name="resume_app", + root_agent=wf, + resumability_config=ResumabilityConfig(is_resumable=True), + ) + ss = InMemorySessionService() + runner = Runner(app=app, session_service=ss) + session = await ss.create_session(app_name=app.name, user_id="u") + + # run 1 -> expect RequestInput interrupt + msg = types.Content(parts=[types.Part(text="go")], role="user") + ev1 = [ + e + async for e in runner.run_async( + user_id="u", session_id=session.id, new_message=msg + ) + ] + req = None + for e in ev1: + if getattr(e, "content", None) and e.content and 
e.content.parts: + for p in e.content.parts: + if ( + p.function_call + and p.function_call.name == REQUEST_INPUT_FUNCTION_CALL_NAME + ): + req = e + assert req is not None, "expected a RequestInput interrupt on run 1" + completed_on_run1 = list(completed) + interrupt_id = get_request_input_interrupt_ids(req)[0] + invocation_id = req.invocation_id + + # resume + part = create_request_input_response(interrupt_id, {"approved": "yes"}) + _ = [ + e + async for e in runner.run_async( + user_id="u", + session_id=session.id, + new_message=types.Content(parts=[part], role="user"), + invocation_id=invocation_id, + ) + ] + return body_runs, completed_on_run1 + + +@pytest.mark.asyncio +async def test_sequential_resume_is_exactly_once(): + """Baseline: sequential dispatch — children complete in order before ask.""" + runs, completed1 = await _resume_scenario( + concurrent=False, ask_delay=0.0, child_delay=0.0 + ) + assert set(completed1) == {"a", "b", "c"} # all completed on run 1 + assert ( + runs["a"] == 1 and runs["b"] == 1 and runs["c"] == 1 + ) # fast-forward on resume + assert runs["ask"] == 2 # interrupted node re-runs + + +@pytest.mark.asyncio +async def test_concurrent_resume_completed_children_fast_forward(): + """Merge gate: under CONCURRENT dispatch, children that COMPLETE before the + interrupt fast-forward on resume (exactly-once). ask sleeps so a/b/c finish + first.""" + runs, completed1 = await _resume_scenario( + concurrent=True, ask_delay=0.10, child_delay=0.0 + ) + assert set(completed1) == {"a", "b", "c"} # genuinely completed + assert ( + runs["a"] == 1 and runs["b"] == 1 and runs["c"] == 1 + ) # NOT re-run -> exactly-once + assert runs["ask"] == 2 + + +@pytest.mark.asyncio +async def test_concurrent_inflight_children_cancelled_on_interrupt_rerun(): + """Documents the one real behavior: under CONCURRENT dispatch, a sibling that + interrupts while others are still IN FLIGHT causes the TaskGroup to cancel + them. 
Cancelled (never-completed) children correctly re-run on resume. This + is correctness-preserving (not a double-spend of completed work), though it + does discard the cancelled siblings' partial progress — a design trade-off + the RFC should note.""" + runs, completed1 = await _resume_scenario( + concurrent=True, ask_delay=0.0, child_delay=0.10 + ) + assert completed1 == [] # none completed (all cancelled) + assert ( + runs["a"] == 2 and runs["b"] == 2 and runs["c"] == 2 + ) # re-run is CORRECT here diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py b/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py new file mode 100644 index 00000000000..7a638eae785 --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py @@ -0,0 +1,159 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""OPTIONAL live end-to-end evidence for the DynamicNodeSupervisor spike. + +This is supporting evidence only — NOT part of the deterministic merge gate. +It is skipped unless a real model is explicitly configured via env vars, so it +never runs in CI by accident and contains no hardcoded project/location/model. + +Enable with, e.g. 
(Vertex): + export SPIKE_LIVE=1 + export GOOGLE_GENAI_USE_VERTEXAI=1 + export GOOGLE_CLOUD_PROJECT= + export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves here + export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access + +The model is read from ``SPIKE_GEMINI_MODEL`` and **defaults to +``gemini-2.5-flash``** (broadly available in regional Vertex). To use +``gemini-3.5-flash`` set ``SPIKE_GEMINI_MODEL=gemini-3.5-flash`` and +``GOOGLE_CLOUD_LOCATION=global`` (it does not serve from ``us-central1``). + +It runs a 2-stage (review -> severity) pipeline over a few snippets, fanned out +concurrently through the supervisor, and asserts a real concurrency speedup. +""" + +from __future__ import annotations + +import asyncio +import os +import time + +import pytest + +_LIVE = os.environ.get("SPIKE_LIVE") == "1" and bool( + os.environ.get("GOOGLE_CLOUD_PROJECT") or os.environ.get("GOOGLE_API_KEY") +) + +pytestmark = pytest.mark.skipif( + not _LIVE, + reason=( + "live model not configured; set SPIKE_LIVE=1 and model/project env vars" + ), +) + +MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") + +SNIPPETS = [ + "def login(pw): return pw == 'admin123' # hardcoded password", + "query = f\"SELECT * FROM users WHERE id = {request.args['id']}\"", + "def add(a, b): return a + b", + "os.system('ping ' + user_supplied_host)", +] + + +@pytest.mark.asyncio +async def test_live_gemini_pipeline_speedup(): + import os as _os + import sys as _sys + + from google.adk import Agent + from google.adk import Context + from google.adk import Event + from google.adk import Workflow + from google.adk.runners import Runner + from google.adk.sessions.in_memory_session_service import InMemorySessionService + from google.adk.workflow import node + from google.genai import types + + _sys.path.insert(0, _os.path.dirname(_os.path.abspath(__file__))) + from supervisor import DynamicNodeSupervisor # noqa: E402 + + reviewer = Agent( + name="reviewer", + 
model=MODEL, + instruction=( + "You are a security reviewer. The user message is a code " + "snippet. In ONE short sentence, state the single biggest " + "security concern, or 'none'." + ), + ) + rater = Agent( + name="rater", + model=MODEL, + instruction=( + "The user message is a security concern. Reply with EXACTLY " + "one word: CRITICAL, HIGH, MEDIUM, LOW, or NONE." + ), + ) + + latencies: list[float] = [] + + async def timed(coro): + t = time.perf_counter() + out = await coro + latencies.append(time.perf_counter() - t) + return out + + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx, gate=8) + + async def review(_prev, snippet, i): + return await timed( + sup.dispatch(reviewer, node_input=snippet, run_id=f"rev{i}") + ) + + async def rate(concern, snippet, i): + return await timed( + sup.dispatch(rater, node_input=str(concern), run_id=f"rate{i}") + ) + + t0 = time.perf_counter() + res = await sup.pipeline(SNIPPETS, review, rate) + yield Event( + output={ + "probe": "live", + "res": res, + "wall": time.perf_counter() - t0, + "sum": sum(latencies), + "n": len(latencies), + } + ) + + wf = Workflow(name="live", edges=[("START", parent)]) + ss = InMemorySessionService() + runner = Runner(app_name=wf.name, node=wf, session_service=ss) + session = await ss.create_session(app_name=wf.name, user_id="u") + msg = types.Content(parts=[types.Part(text="go")], role="user") + out = None + async for ev in runner.run_async( + user_id="u", session_id=session.id, new_message=msg + ): + if ( + isinstance(ev, Event) + and isinstance(ev.output, dict) + and ev.output.get("probe") == "live" + ): + out = ev.output + + assert out is not None + assert out["n"] == len(SNIPPETS) * 2 # 2 real calls per item + assert len([r for r in out["res"] if r]) == len(SNIPPETS) + # concurrent pipeline wall-clock is well under the serial sum of call latencies + assert out["wall"] < out["sum"] * 0.6 + print( + f"\nlive {MODEL}: {out['n']} 
calls, wall={out['wall']:.2f}s " + f"vs serial-sum={out['sum']:.2f}s = {out['sum']/out['wall']:.1f}x" + ) From 627a5dcafcb4bd6f3bea3a0f918549a9ded50f18 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Mon, 1 Jun 2026 17:03:28 -0700 Subject: [PATCH 02/64] spike(workflow): RFC #93 authored-workflow demand-gate artifact A model emits a declarative, validated WorkflowSpec (typed data, not code) that the framework validates and executes on the real ADK engine via the #92 supervisor. authoring.py (WorkflowSpec plain kind-tagged recursive union; CapabilityRegistry; WorkflowSpecValidator; SpecInterpreter for step/fan_out/ branch/loop_until), 10 deterministic tests, env-gated live planner sweep (multi-stage/branch/loop on gemini-3.5-flash, shape-specific assertions). Findings folded into the RFC: open dict[str,X] maps are a structured-output hazard (Branch.routes -> list[Route]); Gemini response_schema rejects Field(discriminator=...), so the vocabulary is a plain kind-tagged union; planning vs capability quality are separable. pyink/isort/mdformat clean. 
--- .../authored_workflow_spike/README.md | 80 ++++ .../authored_workflow_spike/authoring.py | 347 +++++++++++++++++ .../authored_workflow_spike/test_authoring.py | 351 ++++++++++++++++++ .../test_live_planner_sweep.py | 328 ++++++++++++++++ 4 files changed, 1106 insertions(+) create mode 100644 contributing/samples/workflows/authored_workflow_spike/README.md create mode 100644 contributing/samples/workflows/authored_workflow_spike/authoring.py create mode 100644 contributing/samples/workflows/authored_workflow_spike/test_authoring.py create mode 100644 contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md new file mode 100644 index 00000000000..c206f84f6e2 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -0,0 +1,80 @@ +# Authored Workflow Spike — demand gate for RFC #93 + +Reference spike for **agent-authored typed Workflows** (RFC #93): a model emits +a declarative, validated `WorkflowSpec` (typed data, **not** code) that the +framework validates and executes on the real ADK Workflow engine via the #92 +`DynamicNodeSupervisor`. This directory is the re-runnable demand-gate artifact +behind the RFC's "can a model author good plans?" question. + +## Environment + +- ADK: `2.1.0` +- Built against `google/adk-python` upstream `main`. +- Python 3.11+ (recursive `kind`-tagged unions; `asyncio.TaskGroup` in #92). + +## Files + +| File | Purpose | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / branch / loop_until). 
| +| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | +| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. | + +## Deterministic tests (CI-safe, no network) + +```bash +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q +``` + +Expected: **10 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +valid spec and rejects unknown capability / non-preceding binding / duplicate id, +the open-map warning, and interpreter execution of fan_out→aggregate, branch +(correct route), and loop_until (stops + correct output). + +## Live planner sweep (optional evidence) + +Skipped unless configured — no hardcoded project/model: + +```bash +export SPIKE_LIVE=1 GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=your-project-id GOOGLE_CLOUD_LOCATION=global +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # 3.5 serves from `global` +pytest contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py -q -s +``` + +## Gate results (run on `gemini-3.5-flash`) + +**Initial gate (codebase audit):** planner authored a valid, sensible, executable +plan (`fan_out reviewer → triager`) matching a hand-wired baseline. **PASS.** + +**Shape sweep (this directory):** the planner authored + validated + executed all +three shapes: + +| Shape | Authored steps | Result | +| ----------- | ----------------------- | --------------------------------------- | +| multi-stage | `fan_out → step → step` | report → formatted note | +| branch | `step → branch` | took the matched route, produced a note | +| loop_until | `loop_until` | iterated to a headline | + +## Findings that fell out (and shaped the RFC) + +1. **Open-ended `dict[str, X]` maps are a structured-output reliability hazard.** + Surfaced **twice**: a capability's `counts: dict[str,int]` came back empty, and + the spec's own `Branch.routes: dict[str, list]` came back empty.
**Both fixed by + using enumerated/list structures** — capability outputs use fixed severity + fields; `Branch.routes` is now a `list[Route]`, not a map. The validator also + warns on open-map capability outputs. +1. **The strict `unmatched=fail` branch contract earns its keep** — when the planner + bound a branch switch to a whole object instead of its field, execution failed + loudly instead of silently mis-routing. +1. **Gemini `response_schema` rejects Pydantic's `Field(discriminator=...)`.** The + plan vocabulary is a PLAIN union of models that each carry a `kind` literal (a + *structurally-tagged* union). The strict discriminated form emits a + `discriminator` keyword that genai's `response_schema` refuses + (`Schema: extra_forbidden`, verified on `gemini-3.5-flash`); the `kind` tags + still make parsing and switching unambiguous. +1. **Planning vs capability quality are separable** — authoring/structure was + reliably good; the residual variance was per-capability output quality + (prompts/schemas/retries), not planning. + +This is a demand-gate artifact, not production code. diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py new file mode 100644 index 00000000000..ec52b7ed702 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -0,0 +1,347 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Agent-authored typed Workflows — reference spike (RFC #93). + +A minimal, faithful implementation of the RFC's authoring layer: + +* ``WorkflowSpec`` — a plain ``kind``-tagged recursive union (a typed plan + vocabulary; not Pydantic's discriminated union — see the SpecNode note). +* ``CapabilityRegistry`` — the closed set of agents/tools a plan may compose. +* ``WorkflowSpecValidator`` — semantic validation (capability refs, binding + scope, list/loop/branch rules) + an open-map output-schema warning. +* ``SpecInterpreter`` — executes a validated spec on the real ADK Workflow + engine via the #92 ``DynamicNodeSupervisor`` (step / fan_out / branch / + loop_until). + +This is a demand-gate artifact, not production code. See README.md. +""" + +from __future__ import annotations + +import json +import os +import sys +from typing import Any +from typing import Literal +from typing import Optional +from typing import Union + +from pydantic import BaseModel +from pydantic import Field +from pydantic import model_validator + +# The #92 supervisor lives in a sibling sample dir. 
# Inserted at index 0 so the sibling sample directory is searched first when
# resolving the bare `supervisor` module imported on the next statement.
sys.path.insert(
    0,
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "dynamic_supervisor_spike",
    ),
)
from supervisor import DynamicNodeSupervisor  # noqa: E402


# ----------------------------------------------------------------- WorkflowSpec
# NOTE(doc): the node models below are documented with `#` comments rather than
# docstrings on purpose. Pydantic copies a model's docstring into the generated
# JSON schema `description`, and these models are used as a model-facing
# response schema, so adding docstrings would change what the planner sees.
class Binding(BaseModel):
  """The only way a node sources input: a source + optional dotted path."""

  # "task": resolve against the task input handed to the interpreter.
  # "step": resolve against the recorded output of the step named by `step`.
  source: Literal["task", "step"]
  # Id of the producing step; must be set iff source == "step" (see _invariant).
  step: Optional[str] = None
  # Optional dotted path walked segment by segment into the resolved value; each
  # segment is used as a dict key when the current value is a dict, otherwise as
  # an attribute name.
  path: Optional[str] = None

  @model_validator(mode="after")
  def _invariant(self) -> Binding:
    # Both sides are booleans, so `!=` acts as XOR: reject "step" without a
    # step id as well as "task" with one.
    if (self.source == "step") != (self.step is not None):
      raise ValueError("source=='step' iff `step` is set")
    return self


# A single capability invocation: the interpreter resolves `input`, dispatches
# `capability` once, and records the result under `id` for later bindings.
class StepRef(BaseModel):
  kind: Literal["step"]  # structural tag used to tell SpecNode members apart
  id: str  # must be unique across the whole spec (validator enforces this)
  capability: str  # name looked up in the CapabilityRegistry
  input: Binding


# Per-item fan-out: `over` resolves to a collection and `capability` is
# dispatched once per element through the supervisor's pipeline. The validator
# requires the capability's input_kind to be "item"; the interpreter rejects
# collections longer than the capability's max_fan_out.
class FanOut(BaseModel):
  kind: Literal["fan_out"]
  id: str
  over: Binding
  capability: str
  # Only "list" is defined: the per-item results are stored as a list.
  collect: Literal["list"] = "list"


# One arm of a Branch. The interpreter compares `value` against str() of the
# resolved `on` binding and, on a match, runs `block` as a nested node list.
class Route(BaseModel):
  value: str
  block: list["SpecNode"]


# Multi-way switch on the string form of a resolved binding.
class Branch(BaseModel):
  kind: Literal["branch"]
  id: str
  on: Binding
  # Enumerated LIST of routes, NOT an open dict[str, ...] map: open maps are a
  # structured-output reliability hazard (the model leaves them empty). The
  # spike's branch shape exposed exactly this — see README.
  routes: list[Route]
  # Only "fail" is defined: the interpreter raises when no route value matches.
  unmatched: Literal["fail"] = "fail"


# Bounded loop: `body` runs up to `max_iters` times. After each iteration the
# interpreter dispatches `until_capability` on the resolved `until_input` and
# stops as soon as that verdict is truthy. The loop's recorded output is the
# output of the last body node from the final iteration that ran.
class LoopUntil(BaseModel):
  kind: Literal["loop_until"]
  id: str
  body: list["SpecNode"]
  until_capability: str
  # May reference a step inside `body` (the validator checks it against the
  # body's scope rather than only the steps preceding the loop).
  until_input: Binding
  max_iters: int = Field(ge=1)  # at least one iteration; 0 is rejected


# NOTE: a PLAIN union, not Pydantic's Field(discriminator="kind"). The discriminated
# form emits a JSON schema with a `discriminator` keyword that genai's response_schema
# rejects (Schema: extra_forbidden — verified on gemini-3.5-flash). Each member still
# carries a `kind` Literal, so this is a structurally-tagged union: unambiguous to parse
# and to switch on, AND accepted as a Gemini response_schema.
+SpecNode = Union[StepRef, FanOut, Branch, LoopUntil] + + +class WorkflowSpec(BaseModel): + goal: str + steps: list[SpecNode] + output: Binding + + +for _m in (StepRef, FanOut, Branch, Route, LoopUntil, WorkflowSpec): + _m.model_rebuild() + + +# ----------------------------------------------------------------- registry +class Capability(BaseModel): + """A registered capability the planner may compose by name.""" + + model_config = {"arbitrary_types_allowed": True} + + name: str + build: Any # () -> NodeLike (an ADK Agent, or a deterministic @node fn) + input_kind: Literal["item", "list"] + output_model: Optional[type[BaseModel]] = None + serialize_input: bool = ( + True # json.dumps the node_input (True for LLM agents) + ) + max_fan_out: int = 100 + side_effect: bool = False + + +class CapabilityRegistry: + + def __init__(self, capabilities: list[Capability]): + self._by_name = {c.name: c for c in capabilities} + + def __contains__(self, name): + return name in self._by_name + + def __getitem__(self, name): + return self._by_name[name] + + def open_map_warnings(self) -> list[str]: + """Spike lesson: open-ended dict[str, X] output fields are a structured- + output reliability hazard (Gemini fills them unreliably). 
Warn on them.""" + warnings = [] + for cap in self._by_name.values(): + model = cap.output_model + if model is None: + continue + for fname, field in model.model_fields.items(): + ann = str(field.annotation) + if "dict[" in ann.replace(" ", "") and "int]" not in ann.lower()[:0]: + if ann.replace(" ", "").startswith( + "dict[str," + ) or "dict[str," in ann.replace(" ", ""): + warnings.append( + f"capability '{cap.name}': output field '{fname}' is an open" + f" map ({ann}); prefer enumerated fields for reliable" + " structured output" + ) + return warnings + + +# ----------------------------------------------------------------- validator +class SpecValidationError(Exception): + pass + + +class WorkflowSpecValidator: + + def __init__(self, registry: CapabilityRegistry): + self.registry = registry + + def validate(self, spec: WorkflowSpec) -> list[str]: + """Raises SpecValidationError on a hard error; returns soft warnings.""" + ids: set[str] = set() + self._walk(spec.steps, set(), ids) + if spec.output.source == "step" and spec.output.step not in ids: + raise SpecValidationError( + f"output references unknown step {spec.output.step!r}" + ) + return self.registry.open_map_warnings() + + def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: + preceding = set(preceding) + for n in nodes: + if n.id in ids: + raise SpecValidationError(f"duplicate id {n.id!r}") + ids.add(n.id) + if isinstance(n, (StepRef, FanOut)) and n.capability not in self.registry: + raise SpecValidationError(f"unknown capability {n.capability!r}") + if isinstance(n, LoopUntil) and n.until_capability not in self.registry: + raise SpecValidationError( + f"unknown until_capability {n.until_capability!r}" + ) + # Entry bindings (input/over/on) reference a PRIOR step on this path. 
+ for f in ("input", "over", "on"): + b = getattr(n, f, None) + if ( + isinstance(b, Binding) + and b.source == "step" + and b.step not in preceding + ): + raise SpecValidationError( + f"{n.id}: binding references non-preceding step {b.step!r}" + ) + if ( + isinstance(n, FanOut) + and self.registry[n.capability].input_kind != "item" + ): + raise SpecValidationError( + f"fan_out {n.id}: capability must take an item" + ) + if isinstance(n, LoopUntil): + # body executes in-scope; until_input may reference a body step. + body_scope = self._walk(n.body, preceding | {n.id}, ids) + ui = n.until_input + if ui.source == "step" and ui.step not in body_scope: + raise SpecValidationError( + f"loop {n.id}: until_input references step {ui.step!r} not in its" + " body/scope" + ) + if isinstance(n, Branch): + for route in n.routes: + self._walk(route.block, preceding | {n.id}, ids) + preceding.add(n.id) + return preceding + + +def _bindings(n) -> list[Binding]: + out = [] + for f in ("input", "over", "on", "until_input"): + b = getattr(n, f, None) + if isinstance(b, Binding): + out.append(b) + return out + + +# ----------------------------------------------------------------- interpreter +class SpecInterpreter: + """Executes a validated WorkflowSpec on the real ADK engine via the #92 + supervisor. 
Handles step / fan_out / branch / loop_until.""" + + def __init__(self, registry: CapabilityRegistry, ctx, *, gate: int = 8): + self.registry = registry + self.ctx = ctx + self.sup = DynamicNodeSupervisor(ctx, gate=gate) + self.state: dict[str, Any] = {} + + def _resolve(self, binding: Binding, task_input): + base = task_input if binding.source == "task" else self.state[binding.step] + if binding.path: + cur = base + for part in binding.path.split("."): + cur = cur[part] if isinstance(cur, dict) else getattr(cur, part) + return cur + return base + + def _arg(self, cap: Capability, value): + return json.dumps(value, default=str) if cap.serialize_input else value + + async def _dispatch(self, cap_name: str, value, run_id: str): + cap = self.registry[cap_name] + return await self.sup.dispatch( + cap.build(), node_input=self._arg(cap, value), run_id=run_id + ) + + async def execute(self, spec: WorkflowSpec, task_input) -> Any: + await self._run_block(spec.steps, task_input, prefix="") + return self._resolve(spec.output, task_input) + + async def _run_block(self, nodes, task_input, prefix: str): + last = None + for n in nodes: + rid = f"{prefix}{n.id}" + if isinstance(n, StepRef): + self.state[n.id] = await self._dispatch( + n.capability, self._resolve(n.input, task_input), rid + ) + elif isinstance(n, FanOut): + cap = self.registry[n.capability] + items = self._resolve(n.over, task_input) + if len(items) > cap.max_fan_out: + raise SpecValidationError( + f"runtime: fan_out {len(items)} exceeds max_fan_out" + f" {cap.max_fan_out}" + ) + self.state[n.id] = await self.sup.pipeline( + items, + ( + lambda _p, it, i, c=cap, rid=rid: self.sup.dispatch( + c.build(), node_input=self._arg(c, it), run_id=f"{rid}_{i}" + ) + ), + ) + elif isinstance(n, Branch): + value = str(self._resolve(n.on, task_input)) + routes = {r.value: r.block for r in n.routes} + if value not in routes: + raise SpecValidationError( + f"runtime: branch {n.id} unmatched value {value!r}" + " (unmatched=fail)" 
+ ) + out = await self._run_block( + routes[value], task_input, prefix=f"{rid}_{value}_" + ) + self.state[n.id] = out + elif isinstance(n, LoopUntil): + out = None + for i in range(n.max_iters): + out = await self._run_block(n.body, task_input, prefix=f"{rid}_i{i}_") + verdict = await self._dispatch( + n.until_capability, + self._resolve(n.until_input, task_input), + f"{rid}_i{i}_until", + ) + if _truthy(verdict): + break + self.state[n.id] = out + last = self.state.get(n.id) + return last + + +def _truthy(v) -> bool: + if isinstance(v, bool): + return v + if isinstance(v, dict): + for k in ("result", "value", "done", "ok"): + if k in v: + return bool(v[k]) + return bool(v) diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py new file mode 100644 index 00000000000..7c97ae3732c --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -0,0 +1,351 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Deterministic, CI-safe tests for the authored-workflow spike (RFC #93). + +No LLM. Capabilities are deterministic stub nodes, so these exercise the +validator + the interpreter (step / fan_out / branch / loop_until + binding +scope) on the real ADK Workflow engine. The live planner sweep lives in +test_live_planner_sweep.py (env-gated). 
+""" + +from __future__ import annotations + +import os +import sys + +from google.adk import Event +from google.adk import Workflow +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel +import pytest + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from authoring import Binding # noqa: E402 +from authoring import Branch +from authoring import Capability +from authoring import CapabilityRegistry +from authoring import FanOut +from authoring import LoopUntil +from authoring import Route +from authoring import SpecInterpreter +from authoring import SpecValidationError +from authoring import StepRef +from authoring import WorkflowSpec +from authoring import WorkflowSpecValidator + + +# ----------------------------------------------------------------- stub caps +def _cap_node(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + +def _registry(): + return CapabilityRegistry([ + Capability( + name="review", + build=_cap_node( + "review", + lambda f: { + "path": f["path"], + "severity": "HIGH" if "bad" in f["code"] else "NONE", + }, + ), + input_kind="item", + serialize_input=False, + max_fan_out=10, + ), + Capability( + name="count", + build=_cap_node( + "count", + lambda findings: { + "n": len(findings), + "high": sum(1 for x in findings if x["severity"] == "HIGH"), + }, + ), + input_kind="list", + serialize_input=False, + ), + Capability( + name="classify", + build=_cap_node( + "classify", lambda s: "tech" if "code" in str(s) else "other" + ), + input_kind="item", + serialize_input=False, + ), + Capability( + name="tech_summary", + build=_cap_node("tech_summary", lambda s: "TECH:" + str(s)), + input_kind="item", + serialize_input=False, + ), + Capability( + name="other_summary", + 
build=_cap_node("other_summary", lambda s: "OTHER:" + str(s)), + input_kind="item", + serialize_input=False, + ), + Capability( + name="draft", + build=_cap_node("draft", lambda s: {"text": "v", "len": len(str(s))}), + input_kind="item", + serialize_input=False, + ), + Capability( + name="is_good", + build=_cap_node("is_good", lambda s: True), + input_kind="item", + serialize_input=False, + ), + ]) + + +async def _run_spec(spec, registry, task_input): + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + interp = SpecInterpreter(registry, ctx) + holder["out"] = await interp.execute(spec, task_input) + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name=wf.name, node=wf, session_service=ss) + s = await ss.create_session(app_name=wf.name, user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + return holder["out"] + + +# ----------------------------------------------------------------- validator +def test_binding_invariant(): + with pytest.raises(Exception): + Binding(source="step") # step missing + with pytest.raises(Exception): + Binding(source="task", step="x") # step set for task + + +def test_loop_max_iters_must_be_positive(): + with pytest.raises(Exception): + LoopUntil( + kind="loop_until", + id="l", + body=[], + until_capability="is_good", + until_input=Binding(source="task"), + max_iters=0, + ) + + +def _fanout_aggregate_spec(): + return WorkflowSpec( + goal="audit", + steps=[ + FanOut( + kind="fan_out", + id="rev", + over=Binding(source="task", path="files"), + capability="review", + ), + StepRef( + kind="step", + id="agg", + capability="count", + input=Binding(source="step", step="rev"), + ), + ], + output=Binding(source="step", step="agg"), + ) + + +def test_validator_accepts_valid_spec(): + 
WorkflowSpecValidator(_registry()).validate( + _fanout_aggregate_spec() + ) # no raise + + +def test_validator_rejects_unknown_capability(): + spec = _fanout_aggregate_spec() + spec.steps[0].capability = "nope" + with pytest.raises(SpecValidationError): + WorkflowSpecValidator(_registry()).validate(spec) + + +def test_validator_rejects_nonpreceding_binding(): + spec = WorkflowSpec( + goal="x", + steps=[ + StepRef( + kind="step", + id="a", + capability="count", + input=Binding(source="step", step="later"), + ) + ], # references a later/unknown step + output=Binding(source="step", step="a"), + ) + with pytest.raises(SpecValidationError): + WorkflowSpecValidator(_registry()).validate(spec) + + +def test_validator_rejects_duplicate_id(): + spec = WorkflowSpec( + goal="x", + steps=[ + StepRef( + kind="step", + id="a", + capability="classify", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="a", + capability="classify", + input=Binding(source="task"), + ), + ], + output=Binding(source="step", step="a"), + ) + with pytest.raises(SpecValidationError): + WorkflowSpecValidator(_registry()).validate(spec) + + +def test_open_map_warning(): + class BadReport(BaseModel): + total: int + counts: dict[str, int] # open map — should warn + + reg = CapabilityRegistry([ + Capability( + name="triage", + build=lambda: None, + input_kind="list", + output_model=BadReport, + ) + ]) + warnings = reg.open_map_warnings() + assert any("open map" in w for w in warnings) + + +# ----------------------------------------------------------------- interpreter +@pytest.mark.asyncio +async def test_interpreter_fanout_then_aggregate(): + files = [ + {"path": "a.py", "code": "bad thing"}, + {"path": "b.py", "code": "fine"}, + {"path": "c.py", "code": "bad"}, + ] + out = await _run_spec(_fanout_aggregate_spec(), _registry(), {"files": files}) + assert out == {"n": 3, "high": 2} + + +@pytest.mark.asyncio +async def test_interpreter_branch_takes_correct_route(): + spec = WorkflowSpec( + 
goal="branch", + steps=[ + StepRef( + kind="step", + id="cls", + capability="classify", + input=Binding(source="task"), + ), + Branch( + kind="branch", + id="br", + on=Binding(source="step", step="cls"), + routes=[ + Route( + value="tech", + block=[ + StepRef( + kind="step", + id="t", + capability="tech_summary", + input=Binding(source="task"), + ) + ], + ), + Route( + value="other", + block=[ + StepRef( + kind="step", + id="o", + capability="other_summary", + input=Binding(source="task"), + ) + ], + ), + ], + ), + ], + output=Binding(source="step", step="br"), + ) + WorkflowSpecValidator(_registry()).validate(spec) + assert (await _run_spec(spec, _registry(), "this is code")).startswith( + "TECH:" + ) + assert (await _run_spec(spec, _registry(), "hello world")).startswith( + "OTHER:" + ) + + +@pytest.mark.asyncio +async def test_interpreter_loop_until_stops_and_outputs(): + spec = WorkflowSpec( + goal="loop", + steps=[ + LoopUntil( + kind="loop_until", + id="lp", + body=[ + StepRef( + kind="step", + id="d", + capability="draft", + input=Binding(source="task"), + ) + ], + until_capability="is_good", + until_input=Binding(source="step", step="d"), + max_iters=3, + ), + ], + output=Binding(source="step", step="lp"), + ) + WorkflowSpecValidator(_registry()).validate(spec) + out = await _run_spec(spec, _registry(), "topic") + assert out == { + "text": "v", + "len": len("topic"), + } # loop output = last body node output diff --git a/contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py b/contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py new file mode 100644 index 00000000000..27425b5dc86 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py @@ -0,0 +1,328 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""OPTIONAL live planner sweep for RFC #93 — coverage across plan shapes. + +Skipped unless a real model is configured (no hardcoded project/model). Asks a +planner LlmAgent(output_schema=WorkflowSpec) to author plans for three shapes — +multi-stage, branch, loop_until — then validates and executes each on the real +ADK engine. Demonstrates authoring quality beyond the single fan-out/aggregate +shape from the original gate. + +Enable (Vertex): + export SPIKE_LIVE=1 GOOGLE_GENAI_USE_VERTEXAI=1 + export GOOGLE_CLOUD_PROJECT= GOOGLE_CLOUD_LOCATION=global + export SPIKE_GEMINI_MODEL=gemini-3.5-flash # 3.5 serves from `global` + pytest test_live_planner_sweep.py -q -s +""" + +from __future__ import annotations + +import os +import sys +from typing import Literal + +from google.adk import Agent +from google.adk import Event +from google.adk import Workflow +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel +import pytest + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry +from authoring import SpecInterpreter +from authoring import WorkflowSpec +from authoring import WorkflowSpecValidator + +_LIVE = os.environ.get("SPIKE_LIVE") == "1" and bool( + os.environ.get("GOOGLE_CLOUD_PROJECT") +) +pytestmark = pytest.mark.skipif( + not _LIVE, reason="set SPIKE_LIVE=1 + project/model env to run" +) 
+MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") +DET = types.GenerateContentConfig(temperature=0) + + +def _agent(name, schema, instr): + return Capability( + name=name, + input_kind="item", + output_model=schema, + serialize_input=True, + build=lambda: Agent( + name=name, + model=MODEL, + output_schema=schema, + generate_content_config=DET, + instruction=instr, + ), + ) + + +# Enumerated fields (NOT an open dict) — the contract lesson from the first gate. +class ReportFixed(BaseModel): + total: int + critical: int + high: int + medium: int + low: int + none: int + summary: str + + +class Finding(BaseModel): + path: str + severity: Literal["CRITICAL", "HIGH", "MEDIUM", "LOW", "NONE"] + issue: str + + +class Verdict(BaseModel): + is_tech: bool + + +class Category(BaseModel): + category: Literal["tech", "other"] + + +class Note(BaseModel): + note: str + + +def _registry(): + caps = [ + _agent( + "reviewer", + Finding, + "Input JSON with keys path and code. Output a Finding (echo path).", + ), + Capability( + name="triager", + input_kind="list", + output_model=ReportFixed, + serialize_input=True, + build=lambda: Agent( + name="triager", + model=MODEL, + output_schema=ReportFixed, + generate_content_config=DET, + instruction=( + "Input: JSON list of Findings. Output ReportFixed: total, " + "per-severity counts (sum to total), one-line summary." + ), + ), + ), + _agent( + "formatter", + Note, + "Input: a ReportFixed JSON. Output a Note: a one-line markdown" + " bullet.", + ), + _agent( + "writer", + Note, + "Input: a topic (maybe with feedback). Output a Note: a short tech" + " headline.", + ), + _agent( + "is_tech", + Verdict, + "Input: a headline/Note JSON. Output Verdict.is_tech=true iff it is" + " about technology/software.", + ), + _agent( + "classifier", + Category, + "Input: a short text. Output Category 'tech' or 'other'.", + ), + _agent( + "tech_note", + Note, + "Input: text. 
Output a Note summarizing it as a tech item.", + ), + _agent( + "other_note", + Note, + "Input: text. Output a Note summarizing it as a general item.", + ), + ] + # mark reviewer as item/list correctly + caps[0] = Capability( + name="reviewer", + input_kind="item", + output_model=Finding, + serialize_input=True, + build=lambda: Agent( + name="reviewer", + model=MODEL, + output_schema=Finding, + generate_content_config=DET, + instruction=( + "Input JSON with keys path and code. Output a Finding (echo" + " path)." + ), + ), + ) + return CapabilityRegistry(caps) + + +SHAPES = { + "multi_stage": { + "registry_desc": ( + "reviewer (item: a file with path and code -> Finding), triager" + " (LIST of Findings -> ReportFixed), formatter (item: a ReportFixed" + " -> Note)." + ), + "task": ( + "Audit files for security. Fan out reviewer over task.files (a list" + " of {path,code}), triager on the findings, then formatter on the" + " report. output=formatter." + ), + "task_input": { + "files": [ + {"path": "a.py", "code": "os.system('ping '+host)"}, + {"path": "b.py", "code": "def add(x,y): return x+y"}, + ] + }, + }, + "branch": { + "registry_desc": ( + "classifier (item: text -> Category with category tech or other)," + " tech_note (item -> Note), other_note (item -> Note)." + ), + "task": ( + "Classify task.text with classifier, then branch on the category." + " The classifier outputs a Category object, so bind the branch `on`" + " to its category field (Binding source=step, step=," + " path='category'). Routes: tech->tech_note, other->other_note" + " (both run on task.text). output=the branch." + ), + "task_input": {"text": "a new programming language for systems code"}, + }, + "loop": { + "registry_desc": ( + "writer (item: a topic -> a Note headline), is_tech (item: a Note" + " -> a Verdict with boolean is_tech)." + ), + "task": ( + "loop_until: body=[writer on task.topic], until_capability=is_tech" + " with until_input bound to the writer step, max_iters=3." 
+ " output=the loop." + ), + "task_input": {"topic": "quantum computing"}, + }, +} + + +async def _author_validate_execute(shape, cfg): + reg = _registry() + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=( + "Author a WorkflowSpec using ONLY these capabilities: " + + cfg["registry_desc"] + + " Use Binding(source='task', path=...) for task input and" + " Binding(source='step', step=) to chain. " + + cfg["task"] + ), + ) + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + raw = await ctx.run_node( + planner, node_input=f"Shape: {shape}. Author the plan.", run_id="plan" + ) + spec = WorkflowSpec.model_validate(raw) + holder["spec"] = spec + WorkflowSpecValidator(reg).validate(spec) # raises on invalid + holder["valid"] = True + interp = SpecInterpreter(reg, ctx) + holder["output"] = await interp.execute(spec, cfg["task_input"]) + yield Event(output={"_done": True}) + + wf = Workflow(name=shape, edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name=wf.name, node=wf, session_service=ss) + s = await ss.create_session(app_name=wf.name, user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + return holder + + +def _all_capabilities(nodes) -> set: + """Capabilities referenced anywhere in the plan tree (incl. 
branch/loop).""" + out = set() + for n in nodes: + if n.kind in ("step", "fan_out"): + out.add(n.capability) + elif n.kind == "loop_until": + out.add(n.until_capability) + out |= _all_capabilities(n.body) + elif n.kind == "branch": + for route in n.routes: + out |= _all_capabilities(route.block) + return out + + +@pytest.mark.parametrize("shape", list(SHAPES)) +@pytest.mark.asyncio +async def test_planner_sweep(shape): + h = await _author_validate_execute(shape, SHAPES[shape]) + spec = h["spec"] + top_kinds = [s.kind for s in spec.steps] + caps = _all_capabilities(spec.steps) + print( + f"\n[{shape}] top_kinds={top_kinds} caps={sorted(caps)} " + f"valid={h.get('valid')} output={str(h.get('output'))[:100]}" + ) + assert h.get("valid") is True + assert h.get("output") is not None + + # Shape-specific structure — a degenerate plan must NOT pass. + if shape == "multi_stage": + assert top_kinds == ["fan_out", "step", "step"] + assert {"reviewer", "triager", "formatter"} <= caps + elif shape == "branch": + assert "branch" in top_kinds and "step" in top_kinds + branch = next(n for n in spec.steps if n.kind == "branch") + route_values = {r.value for r in branch.routes} + route_caps = set() + for r in branch.routes: + route_caps |= _all_capabilities(r.block) + assert route_values == { + "tech", + "other", + }, route_values # exact routes, not just >=2 + assert { + "tech_note", + "other_note", + } <= route_caps # both routes wired correctly + assert "classifier" in caps + elif shape == "loop": + loop = next(n for n in spec.steps if n.kind == "loop_until") + assert len(loop.body) >= 1 + assert "writer" in caps and "is_tech" in caps From 6177a554fca788063d6b3ed11ebdf887ff25da80 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Mon, 1 Jun 2026 23:57:28 -0700 Subject: [PATCH 03/64] demo(workflow): ADK Web wrapper for RFC #93 authored workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A discoverable `root_agent` (a Workflow) that 
exposes the #93 flow in ADK Web: the model authors a typed WorkflowSpec, ADK validates it against the capability registry, freezes spec+hash to session state, and executes it on the real engine via the #92 supervisor — each step surfaced as a chat message so the authored plan, validation, capabilities, frozen hash, and final output are all visible in the ADK Web chat / State / Events surfaces. adk web contributing/samples/workflows/authored_workflow_demo Load-or-author: if a frozen spec already exists in the session it is REUSED (planner not re-invoked) and replayed — so the resume/reproducibility claim is real, verified by a CI-safe no-LLM test (test_demo_agent.py: import + name + registry + spec validation + reuse-path-with-stub-registry). Reuses the authored_workflow_spike/ stack; model from env (default gemini-2.5-flash; gemini-3.5-flash needs location=global); no hardcoded project. README is the ~7-min recording script. pyink/isort/mdformat clean; 4 demo tests pass. --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 108 +++++++ .../authored_workflow_demo/README.md | 70 +++++ .../security_audit_planner/__init__.py | 15 + .../security_audit_planner/agent.py | 268 ++++++++++++++++++ .../authored_workflow_demo/test_demo_agent.py | 178 ++++++++++++ 5 files changed, 639 insertions(+) create mode 100644 contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md create mode 100644 contributing/samples/workflows/authored_workflow_demo/README.md create mode 100644 contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py create mode 100644 contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py create mode 100644 contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md new file mode 100644 index 00000000000..fc60a51c07f --- 
/dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -0,0 +1,108 @@ +# Demo narrative — model-authored typed Workflows (RFC #93) + +A beat-by-beat narration for the ~7-minute recording, with a **real transcript** +captured on Vertex `gemini-3.5-flash`. Pair this with the run commands in +`README.md`. Bottom line: *ADK Web sells the product fit; pytest/CI sells the +correctness.* + +## Thesis (say this first, ~20s) + +> "#92 gives ADK a supervised concurrent executor. #93 lets a model **author** a +> typed `WorkflowSpec` — a plan as *data, not code* — that ADK validates against +> a capability allow-list, freezes, and executes reproducibly. Watch the model +> write a plan, ADK validate it, run it, and then **replay the exact same frozen +> plan** without re-invoking the model." + +## Beat 1 — author (ADK Web chat) + +Send: **"Plan and run a codebase security review."** The chat streams: + +``` +🧭 Model-authored Workflow — planning a security audit over 4 files using only + registered capabilities (reviewer, triager, formatter). + +📋 Authored plan (fan_out → step → step): + { + "goal": "Audit files and format the report", + "steps": [ + {"kind": "fan_out", "id": "review_files", "over": {"source":"task","path":"files"}, "capability": "reviewer"}, + {"kind": "step", "id": "triage_findings", "input": {"source":"step","step":"review_files"}, "capability": "triager"}, + {"kind": "step", "id": "format_report", "input": {"source":"step","step":"triage_findings"}, "capability": "formatter"} + ], + "output": {"source": "step", "step": "format_report"} + } +``` + +> "The model emitted a *typed plan*, not code — a fan-out of `reviewer` over the +> files, then `triager`, then `formatter`, with explicit data bindings between +> steps." + +## Beat 2 — validate (capability allow-list) + +``` +✅ Validation passed. Capabilities referenced (all registered): + ['formatter', 'reviewer', 'triager']. 
+``` + +> "Validation confirms every capability the plan names is in the registry. The +> model can only compose pre-approved capabilities — no arbitrary calls, no code +> execution. That's the security model: capability allow-listing, not a sandbox." + +## Beat 3 — freeze (State tab) + +``` +🔒 Frozen spec persisted to session state — hash 206fb4d3a27b. + Re-send the prompt: it replays this exact plan, not a new one. +``` + +> "Open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. The plan +> is now durable data you can store, diff, and audit." + +## Beat 4 — execute (Events / trace tab) + +``` +📄 Audit result: Identified 4 vulnerabilities: 1 critical (command injection), + 2 high (hardcoded credentials and SQL injection), and 1 medium (division by zero). +``` + +> "Open **Events**: ADK runs the plan on the real engine via the #92 supervisor — +> the `reviewer` fan-out over the 4 files, then `triager`, then `formatter`. The +> findings are real: a CRITICAL `os.system` injection, HIGH hardcoded creds and +> SQL injection, and a MEDIUM divide-by-zero." + +## Beat 5 — reproduce (re-send the same prompt) + +``` +♻️ Reusing frozen plan from session state — hash 206fb4d3a27b. + The model is NOT re-invoked; the exact prior plan is replayed. +✅ Validation passed. ... +📄 Audit result: ... +``` + +> "Send the same prompt again — **same hash, model not re-invoked**. The frozen +> plan is replayed. That's the reproducibility guarantee: authoring is a +> one-time, auditable step; execution is deterministic replay." + +**Verified outputs (this capture):** + +| Run | `reused` | `hash` | +| ----------- | -------- | -------------- | +| 1 (author) | `false` | `206fb4d3a27b` | +| 2 (re-send) | `true` | `206fb4d3a27b` | + +Same hash, `reused` flips to `true` — the model is not called the second time. 
+## Close (~20s) + +> "So: a model authored a typed, validated, capability-bounded plan; ADK executed +> it on the real engine; and a re-send replayed the exact frozen plan. The +> deterministic test suites — 11 (#92) + 10 (#93) + 4 (demo) — lock all of this +> in CI, including a no-LLM test of this reuse path." + +## Proof commands (terminal, ~60s) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 10 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 +``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md new file mode 100644 index 00000000000..6fb6b3279ca --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -0,0 +1,70 @@ +# ADK Web demo — model-authored typed Workflows (RFC #93) + +A ~7-minute demo: a model authors a typed `WorkflowSpec`, ADK validates it +against a capability registry, freezes it to session state, and executes it on +the real ADK engine via the #92 supervisor — all visible in ADK Web's chat, +state, and event surfaces. **ADK Web sells the product fit; pytest/CI sells the +correctness.** + +## 0. Configure a model (no hardcoded project) + +```bash +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=<your-project-id> +export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves from `global` +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access +``` + +## 1. Thesis (20s) + +- **#92** is the supervised concurrent executor (`DynamicNodeSupervisor` + `ctx.pipeline`). +- **#93** is the model-authored typed `WorkflowSpec` layer. +- The demo: a model authors a *validated* plan, then ADK executes that *frozen* plan reproducibly. + +## 2. 
ADK Web walkthrough (3–5 min) + +```bash +adk web contributing/samples/workflows/authored_workflow_demo \ + --port 8000 --session_service_uri "sqlite:///demo_sessions.db" +``` + +Open the UI, pick `security_audit_planner`, and send: + +```text +Plan and run a codebase security review. +``` + +Point at the ADK-native evidence as it streams: + +1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`fan_out → step → step`). +1. **Validation** — "Validation passed" + the capability list (all registered). +1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `authored_workflow:frozen_spec_hash`. +1. **Execution** — the **Events / trace** view shows the `reviewer` fan-out, then `triager`, then `formatter` node runs. +1. **Final output** — the triaged audit (e.g. 1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`; exact severities vary by model). + +(Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) + +## 3. Shape sweep — not a one-off (1–2 min) + +```bash +SPIKE_LIVE=1 pytest \ + contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py -q -s +``` + +Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; loop `loop_until`. + +## 4. Correctness proof (60s) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 10 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 +``` + +- Deterministic suites: #92 **11** + #93 **10** + demo **4** = **25** (incl. a no-LLM reuse-path test). +- PR #3 CI green except the documented fork-only `agent-triage` token job. + +## Recording notes + +- macOS `Cmd+Shift+5` or Loom; browser at 110–125% zoom, terminal font 16+. +- Hide project IDs / env vars. Keep it under ~7 minutes. 
diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py new file mode 100644 index 00000000000..1a38cf933e9 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import agent # noqa: F401 diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py new file mode 100644 index 00000000000..8d14a0fcaf7 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -0,0 +1,268 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ADK Web demo agent for RFC #93 — model-authored typed Workflows. + +`root_agent` is a `Workflow` whose single node: + 1. asks a planner `LlmAgent(output_schema=WorkflowSpec)` to author a plan, + 2. validates it (`WorkflowSpecValidator`) against a capability registry, + 3. persists the frozen spec + hash to session state, + 4. executes it on the real ADK engine via the #92 supervisor, +surfacing each step as a chat message so the ADK Web UI shows the authored +plan, validation, capabilities, frozen hash, and final output. Run with: + + adk web contributing/samples/workflows/authored_workflow_demo + +Configure a model first (no hardcoded project): + export GOOGLE_GENAI_USE_VERTEXAI=1 GOOGLE_CLOUD_PROJECT= + export GOOGLE_CLOUD_LOCATION=global SPIKE_GEMINI_MODEL=gemini-3.5-flash +""" + +from __future__ import annotations + +import hashlib +import json +import os +import sys +from typing import Literal + +from google.adk import Agent +from google.adk import Context +from google.adk import Event +from google.adk import Workflow +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel + +# Reuse the committed #93 authoring stack (sibling sample dir). +sys.path.insert( + 0, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", + "..", + "authored_workflow_spike", + ), +) +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import SpecInterpreter # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 + +MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") +DET = types.GenerateContentConfig(temperature=0) + +# A small, deliberately-mixed codebase to audit (3 vulnerable, 1 safe). 
+FILES = [ + { + "path": "auth.py", + "code": "def login(pw): return pw == 'admin123' # hardcoded", + }, + { + "path": "db.py", + "code": "q = 'SELECT * FROM users WHERE id=' + request.args['id']", + }, + {"path": "net.py", "code": "os.system('ping ' + user_supplied_host)"}, + {"path": "math.py", "code": "def mean(xs):\n return sum(xs) / len(xs)"}, +] + + +class Finding(BaseModel): + path: str + severity: Literal["CRITICAL", "HIGH", "MEDIUM", "LOW", "NONE"] + issue: str + + +class ReportFixed(BaseModel): + total: int + critical: int + high: int + medium: int + low: int + none: int + summary: str + + +class Note(BaseModel): + note: str + + +def _registry() -> CapabilityRegistry: + return CapabilityRegistry([ + Capability( + name="reviewer", + input_kind="item", + output_model=Finding, + serialize_input=True, + max_fan_out=50, + build=lambda: Agent( + name="reviewer", + model=MODEL, + output_schema=Finding, + generate_content_config=DET, + instruction=( + "Input JSON with keys path and code. Output a Finding" + " (echo the path)." + ), + ), + ), + Capability( + name="triager", + input_kind="list", + output_model=ReportFixed, + serialize_input=True, + build=lambda: Agent( + name="triager", + model=MODEL, + output_schema=ReportFixed, + generate_content_config=DET, + instruction=( + "Input: a JSON list of Findings. Output ReportFixed:" + " total, per-severity counts (must sum to total), and" + " a one-line summary." + ), + ), + ), + Capability( + name="formatter", + input_kind="item", + output_model=Note, + serialize_input=True, + build=lambda: Agent( + name="formatter", + model=MODEL, + output_schema=Note, + generate_content_config=DET, + instruction=( + "Input: a ReportFixed JSON. Output a Note: a one-line" + " markdown bullet summarizing the audit." + ), + ), + ), + ]) + + +_REGISTRY_DESC = ( + "reviewer (item: a file with path and code -> Finding), triager (LIST of" + " Findings -> ReportFixed), formatter (item: a ReportFixed -> Note)." 
+) +_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." + " Author:" + " a fan_out of reviewer over task.files, then a step running triager on the" + " findings, then a step running formatter on the report. Use" + " Binding(source='task', path='files') and Binding(source='step'," + " step=) to chain. Set output to the formatter step." +) + + +def _msg(text: str) -> Event: + return Event( + content=types.Content(role="model", parts=[types.Part(text=text)]) + ) + + +def _hash(spec: WorkflowSpec) -> str: + return hashlib.sha256( + json.dumps(spec.model_dump(), sort_keys=True).encode() + ).hexdigest()[:12] + + +@node(rerun_on_resume=True) +async def author_validate_execute(ctx: Context, node_input): + reg = _registry() + + # 1. LOAD-OR-AUTHOR. If a frozen spec exists in this session, REUSE it (do not + # re-author) — this is the resume/reproducibility claim. Otherwise the model + # authors a fresh typed WorkflowSpec (data, not code). + existing = ctx.state.get("authored_workflow:frozen_spec") + if existing: + spec = WorkflowSpec.model_validate(existing) + spec_hash = ctx.state.get("authored_workflow:frozen_spec_hash") or _hash( + spec + ) + reused = True + yield _msg( + f"♻️ **Reusing frozen plan** from session state — hash `{spec_hash}`. " + "The model is NOT re-invoked; the exact prior plan is replayed." + ) + else: + reused = False + yield _msg( + "🧭 **Model-authored Workflow** — planning a security audit over " + f"{len(FILES)} files using only registered capabilities " + "(`reviewer`, `triager`, `formatter`)." 
+ ) + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_PLANNER_INSTR, + ) + raw = await ctx.run_node( + planner, + node_input=f"Audit these files: {[f['path'] for f in FILES]}.", + run_id="plan", + ) + spec = WorkflowSpec.model_validate(raw) + spec_hash = _hash(spec) + steps = " → ".join(s.kind for s in spec.steps) + yield _msg( + f"📋 **Authored plan** (`{steps}`):\n```json\n" + f"{json.dumps(spec.model_dump(), indent=1)}\n```" + ) + + # 2. VALIDATE — semantic validation against the registry (always). + warnings = WorkflowSpecValidator(reg).validate(spec) # raises on hard error + caps = sorted({getattr(s, "capability", None) for s in spec.steps} - {None}) + yield _msg( + "✅ **Validation passed.** Capabilities referenced (all registered): " + f"`{caps}`." + + (f"\n⚠️ warnings: {warnings}" if warnings else "") + ) + + # 3. FREEZE — persist spec + hash to session state on first author only + # (visible in the State tab; reused runs already have it). + if not reused: + ctx.state["authored_workflow:frozen_spec"] = spec.model_dump() + ctx.state["authored_workflow:frozen_spec_hash"] = spec_hash + yield _msg( + f"🔒 **Frozen spec** persisted to session state — hash `{spec_hash}`. " + "Re-send the prompt: it replays this exact plan, not a new one." + ) + + # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). 
+ result = await SpecInterpreter(reg, ctx).execute(spec, {"files": FILES}) + yield _msg( + "📄 **Audit result:**" + f" {result.get('note') if isinstance(result, dict) else result}" + ) + yield Event( + output={ + "hash": spec_hash, + "result": result, + "capabilities": caps, + "reused": reused, + } + ) + + +root_agent = Workflow( + name="security_audit_planner", + edges=[("START", author_validate_execute)], +) diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py new file mode 100644 index 00000000000..e8dc40a129f --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -0,0 +1,178 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CI-safe tests for the ADK Web demo wrapper (no live model). + +Covers: import + `root_agent` shape, the demo registry, demo-spec validation, +and — crucially — the **reuse path** end-to-end with a stubbed (deterministic) +capability registry, so the frozen-spec/replay claim the demo makes on camera +is actually verified without calling Gemini. 
+""" + +from __future__ import annotations + +import os +import sys + +from google.adk import Event +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.adk.workflow import Workflow +from google.genai import types +import pytest + +_HERE = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_HERE, "security_audit_planner")) +sys.path.insert(0, os.path.join(_HERE, "..", "authored_workflow_spike")) +import agent as demo # noqa: E402 +from authoring import Binding # noqa: E402 +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import FanOut # noqa: E402 +from authoring import StepRef # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 + + +def _demo_spec() -> WorkflowSpec: + return WorkflowSpec( + goal="audit", + steps=[ + FanOut( + kind="fan_out", + id="rev", + over=Binding(source="task", path="files"), + capability="reviewer", + ), + StepRef( + kind="step", + id="tri", + capability="triager", + input=Binding(source="step", step="rev"), + ), + StepRef( + kind="step", + id="fmt", + capability="formatter", + input=Binding(source="step", step="tri"), + ), + ], + output=Binding(source="step", step="fmt"), + ) + + +def test_root_agent_importable_and_named(): + assert isinstance(demo.root_agent, Workflow) + assert demo.root_agent.name == "security_audit_planner" + assert len(demo.root_agent.edges) == 1 + + +def test_demo_registry_is_clean(): + reg = demo._registry() + for name in ("reviewer", "triager", "formatter"): + assert name in reg + assert reg["reviewer"].input_kind == "item" + assert reg["triager"].input_kind == "list" + # ReportFixed uses enumerated fields, not an open dict[str, X] map. 
+ assert reg.open_map_warnings() == [] + + +def test_demo_spec_validates(): + WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) # no raise + + +def _stub_registry() -> CapabilityRegistry: + def stub(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + return CapabilityRegistry([ + Capability( + name="reviewer", + input_kind="item", + serialize_input=False, + build=stub( + "reviewer", + lambda f: {"path": f["path"], "severity": "HIGH", "issue": "x"}, + ), + ), + Capability( + name="triager", + input_kind="list", + serialize_input=False, + build=stub("triager", lambda findings: {"total": len(findings)}), + ), + Capability( + name="formatter", + input_kind="item", + serialize_input=False, + build=stub( + "formatter", lambda r: {"note": f"audited {r['total']} files"} + ), + ), + ]) + + +@pytest.mark.asyncio +async def test_reuse_path_no_llm(monkeypatch): + """Pre-seed a frozen spec + stub the registry: the demo must REUSE the plan + (no planner/Gemini call) and still surface hash, capabilities, and output.""" + monkeypatch.setattr(demo, "_registry", _stub_registry) + spec = _demo_spec() + + ss = InMemorySessionService() + session = await ss.create_session( + app_name="demo", + user_id="u", + state={ + "authored_workflow:frozen_spec": spec.model_dump(), + "authored_workflow:frozen_spec_hash": "deadbeef0000", + }, + ) + runner = Runner(app_name="demo", node=demo.root_agent, session_service=ss) + + out, reused_msg, authored_msg = None, False, False + async for ev in runner.run_async( + user_id="u", + session_id=session.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + if isinstance(ev, Event) and ev.content and ev.content.parts: + for p in ev.content.parts: + if p.text and "Reusing frozen plan" in p.text: + reused_msg = True + if p.text and "Authored plan" in p.text: + authored_msg = True + if ( + isinstance(ev, Event) + and 
isinstance(ev.output, dict) + and "hash" in ev.output + ): + out = ev.output + + assert reused_msg and not authored_msg # reused; planner NOT invoked + assert out is not None + assert out["reused"] is True + assert ( + out["hash"] == "deadbeef0000" + ) # same frozen hash, not re-derived from a new plan + assert set(out["capabilities"]) == {"reviewer", "triager", "formatter"} + assert out["result"]["note"].startswith("audited") From f7ad788e546e56a4c97da8b38b306fe8c4dbd0de Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 00:36:30 -0700 Subject: [PATCH 04/64] docs(workflow): RFC #93 canonical design (incl. plan export/storage tiers) Standalone design for the authored-workflow spike: data model, validator, interpreter, frozen-spec contract, security model, testing, empirical findings, and the plan export/storage tiers (v1 per-run persist; v1.1 portable JSON export envelope; v2 reusable templates with import-time registry revalidation; compiled Workflow is a derived artifact, never the stored source of truth). --- .../authored_workflow_spike/DESIGN.md | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 contributing/samples/workflows/authored_workflow_spike/DESIGN.md diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md new file mode 100644 index 00000000000..58cba980fd6 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -0,0 +1,198 @@ +# Design — Agent-authored typed Workflows (RFC #93) + +Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analytics-SDK#93). Mirrors the issue's Technical Design comment. Covers the data model, validator, interpreter/compilation, frozen-spec contract, security model, framework changes, testing, and the empirical findings that shaped it. Audience: implementers / technical reviewers. + +## 1. 
Data model — `WorkflowSpec` + +A discriminated-by-`kind`, recursive, ordered **tree of blocks** (not a graph with jumps). `id`s are globally-unique **binding names** for dataflow, never jump targets — which removes join / fall-through / GOTO ambiguity by construction. + +```python +# src/google/adk/workflow/authoring/_spec.py (NEW) + +# Typed dataflow: a Binding is the ONLY way a node sources input — a source + +# optional dotted path, validated against the producer's output schema. +class Binding(BaseModel): + source: Literal["task", "step"] # the workflow task input, or a prior step's output + step: str | None = None # REQUIRED iff source == "step"; None when source == "task" + path: str | None = None # optional dotted field path; checked vs the schema + # model_validator enforces: (source == "step") == (step is not None) + +class StepRef(BaseModel): + kind: Literal["step"] + id: str + capability: str # MUST resolve in the registry + input: Binding # validated against the capability's input_schema + +class FanOut(BaseModel): + kind: Literal["fan_out"] + id: str + over: Binding # MUST resolve to a LIST-typed value + capability: str # run once per element (compiles to ctx.pipeline/parallel) + collect: Literal["list"] = "list" # per-item outputs aggregate to an order-preserving list + +class Route(BaseModel): + value: str # the switch value this route matches + block: list["SpecNode"] # non-empty; output = block's last-node output + +class Branch(BaseModel): + kind: Literal["branch"] + id: str + on: Binding # switch value; MUST resolve to a STRING/STR-ENUM schema + routes: list[Route] # ENUMERATED LIST, not an open dict[str, ...] 
map (see Findings) + unmatched: Literal["fail"] = "fail" # unmatched value at runtime = FAIL (no runtime re-plan in v1) + +class LoopUntil(BaseModel): + kind: Literal["loop_until"] + id: str + body: list["SpecNode"] # non-empty; loop output = LAST iteration's last-node output + until_capability: str # MUST declare a STRICT-bool output schema + until_input: Binding # predicate input (validated vs until_capability.input_schema) + max_iters: int = Field(ge=1) # REQUIRED, >= 1 + +# PLAIN union, each member carrying a `kind` Literal (structurally-tagged) — NOT +# Annotated[..., Field(discriminator="kind")]: the discriminated form emits a +# JSON-schema `discriminator` keyword that Gemini's response_schema rejects +# (Schema: extra_forbidden — verified). `kind` still disambiguates parsing. +SpecNode = Union[StepRef, FanOut, Branch, LoopUntil] + +class WorkflowSpec(BaseModel): + goal: str + steps: list[SpecNode] # ordered blocks (sequence by list order) + output: Binding # terminal output selection (validated) +``` + +**Block-output rule:** a block's output is its **last node's** output — so a `Branch`'s output is the taken route's last-node output, and a `LoopUntil`'s is the last iteration's last-node output. This gives every composite node a well-defined output schema, which is what makes `Binding(source="step", step=...)` schema-checkable. + +**Binding scope:** a `Binding` may reference only a step that lexically precedes it on the **same** root-to-node path (ancestors + earlier same-level siblings). References into a not-taken sibling route, or to a later step, are rejected at validation. + +## 2. 
The agent + +```python +# src/google/adk/workflow/authoring/_agent.py (NEW) +class AuthoredWorkflowAgent(BaseAgent): + planner_model: str + registry: CapabilityRegistry # the ONLY capabilities a plan may reference + max_replans: int = 1 + + async def _run_async_impl(self, ctx): + frozen = await self._load_frozen_spec(ctx) # resume: reuse the SAME spec, never re-plan + if frozen is None: + spec = await self._author(ctx) # LlmAgent(output_schema=WorkflowSpec) + WorkflowSpecValidator(self.registry).validate(spec) + frozen = self._freeze_and_persist(ctx, spec) # see Frozen-spec contract + workflow = WorkflowCompiler(self.registry).compile(frozen.spec) # -> a real Workflow + async for event in workflow.run_async(ctx): # deterministic + resumable + yield event +``` + +- **Authoring** = `LlmAgent(output_schema=WorkflowSpec)`; ADK validates structured output, so a malformed plan is caught and re-planned (bounded by `max_replans`). +- **Validation** is a **new semantic validator** (below) that *lowers to* `Graph.validate_graph()` for structural checks. +- **Compilation** lowers the block tree: sequence → edges; `Branch` → conditional route edges over nested blocks; `FanOut`/`LoopUntil` → the #92 `ctx.pipeline`/`ctx.parallel` + bounded loop. The compiled artifact is an ordinary `Workflow` — nothing downstream knows it was machine-authored. +- **Registry** = developer-supplied capabilities (an agent, or a tool wrapped as a node), each with per-capability policy. + +## 3. 
Validator — semantic, then structural + +`WorkflowSpecValidator` checks what `Graph` cannot, then lowers: + +- capability refs resolve in the registry; +- `Binding` invariant + path/type compatibility vs the producer's `output_schema` and consumer's `input_schema`; +- `FanOut.over` resolves to a list; the fan-out capability takes an item; +- `Branch.on` is string/str-enum-typed; route blocks share a compatible last-node output schema; non-exhaustive enum domain is flagged (unmatched at runtime fails); +- `LoopUntil`: strict-bool `until_capability`, present/compatible `until_input`, `max_iters >= 1`; +- globally-unique `id`s; binding-scope (no non-preceding / cross-route references); +- registry-version match vs a frozen spec (drift = hard error). + +Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reachability, duplicate edges, unconditional cycles on the compiled graph. + +## 4. Semantics + +- **Authoring non-deterministic; execution deterministic.** Once frozen, execution + resume replay is fully deterministic (it's just a `Workflow`). +- **Reuses #92 + the engine wholesale.** Fan-out → supervised `ctx.pipeline`/`ctx.parallel` (bounded, interrupt-safe); sequence/branch → edges + routes; loop → bounded loop. No new executor. +- **Re-plan is pre-execution-only.** `max_replans` applies only to validation failures; an execution failure fails the frozen run; recovery = a new explicit run/version. No recursive planner-spawning-planner. +- **Budget + agent caps from #92** bound a mis-plan's spend. + +## 5. Frozen-spec contract (correctness requirement) + +Before any execution, persist: spec JSON, content hash, planner model+version, registry/capability version, validation result. Deterministic replay holds **only** if resume loads the **same** frozen spec → **resume MUST reuse it and MUST NOT re-plan** unless the user starts a new run; a registry-version mismatch on resume is a hard error. 
+ +- **Storage target (v1):** session state under an **unprefixed (session-scoped) key** (e.g. `authored_workflow:frozen_spec`). **Not** `app:` — that's app-scoped (`State.APP_PREFIX`, extracted in `_session_util.extract_state_delta`) and shared across sessions/users, leaking per-run data and breaking per-run resume. +- **Audit event shape:** persist **state-only** — `Event(state={"authored_workflow:frozen_spec": frozen_record})`, hash/kind inside the record or a sibling key. **Not** `Event.output` (`NodeRunner._track_event_in_context` sets `ctx.output = event.output` and `Context.output` rejects a second output → "Output already set"). **Not** `Event.content` (would re-enter a model's context). + +## 6. Security model + +Going declarative **eliminates the code-execution / sandbox-escape class** — but **not** all risk (bad args, prompt-injected inputs, side-effectful tools, expensive fan-out/loops). Controls = validation **+ per-capability policy**: + +- **Capability allow-list** — non-registry refs rejected at validation. +- **No code execution** — nothing to sandbox. +- **Per-capability policy** (registry-declared): `max_calls`, `max_fan_out`, allowed caller/edge constraints, `side_effect` (requires explicit approval to appear in a plan), argument constraints/schema. **Static vs runtime split:** the validator enforces statically-knowable policy (static call counts, `max_iters`, side-effect approval, caller/edge, arg schemas); runtime enforces data-dependent caps before dispatch (`max_fan_out` vs actual list size, realized branch-path call counts). +- **Output-schema guidance (from the spike):** registered capabilities should avoid open `dict[str, X]` output maps (Gemini fills them unreliably); the registry/validator SHOULD warn, and outputs should carry invariants (e.g. counts sum to total) checked with one repair retry. +- **Per-capability permissions unchanged** — each agent runs under its own ADK tool allowlist; authoring grants no elevation. 
+- **Bounded blast radius** — current ADK enforces `RunConfig.max_llm_calls` (default 500); the proposed #92 limits (leaf gate, optional per-run agent cap, optional `max_tokens`) bound further; `max_iters`/`max_replans` bound loops. +- **Auditable** — frozen spec (+ hash, versions) persisted; humans can review/pre-approve. + +Residual: "model composes approved capabilities, within policy, in a wasteful-but-bounded order, possibly on injected inputs" — dramatically smaller than executing model-authored Python, but **not zero**; argument-level injection into an approved side-effectful tool is the sharpest residual (hence side-effect caps default to approval-required). + +## 7. Backward compatibility + +Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to existing agents, `Workflow`, or the engine. Opt-in; the compiled artifact is a plain `Workflow`. + +## 8. Testing + +- **Semantic validator rejects:** unknown capability; `Binding` invariant / incompatible path-type; `FanOut.over` non-list; `Branch.on` non-string or incompatible route output schemas; `LoopUntil` non-strict-bool predicate / missing `until_input` / `max_iters < 1`; non-preceding or cross-route binding; duplicate `id`; registry drift. +- **Structural lowering:** `Graph.validate_graph()` catches duplicate names / unreachable / unconditional cycles. +- **Frozen-spec contract:** persisted before execution; resume reuses, does not re-plan; registry-version mismatch is a hard error. +- **Per-capability policy:** plan exceeding `max_calls`/`max_fan_out` or placing an unapproved side-effect capability is rejected pre-execution. +- **Compiler:** golden test — a `WorkflowSpec` lowers to a `Workflow` matching a hand-written equivalent; fan-out → bounded `ctx.pipeline`. +- **`AuthoredWorkflowAgent`:** malformed planner output → bounded re-plan → fail past `max_replans`. +- **Determinism:** frozen spec replays identically, resumes exactly-once (inherits #92). 
+- **Two gates:** *planning* (valid + sensible + executable + structurally matches a hand-wired baseline) and *output-quality* (intermediate outputs match, capability invariants hold, one repair retry). + +## 9. Empirical findings (from the demand-gate spike on `gemini-3.5-flash`) + +1. **Gate passed.** A planner authored a valid, structurally-correct spec for a codebase audit, validated first try, executed on the real engine, matched a hand-wired baseline — across multi-stage / branch / loop_until shapes. +1. **Open-`dict[str, X]` maps are a structured-output reliability hazard** — hit twice: a capability's `counts: dict[str,int]` came back empty, and the spec's own `Branch.routes` (an open map) came back empty. **Both fixed by enumerated/list structures** (`Branch.routes` → `list[Route]`; capability outputs use fixed fields). The validator warns on open-map capability outputs. +1. **Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. +1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. + +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. + +## 10. Plan export & storage — the frozen spec as a durable artifact + +**Source of truth = the typed `WorkflowSpec`.** The compiled `Workflow` is a *derived* artifact. 
Storage is tiered, scoped to keep generated code and compiled graphs out of v1: + +- **v1 (required) — persist frozen spec per run.** Already core (§5): session state `authored_workflow:frozen_spec` + hash, for resume/replay. + +- **v1.1 (recommended) — export the frozen spec as a portable JSON envelope.** An explicit "Export plan" operation producing a self-describing record: + + ```json + { + "schema_version": "v1", + "spec": { "...": "the WorkflowSpec" }, + "spec_hash": "...", + "planner_model": "...", + "registry_version": "...", + "capability_versions": { "reviewer": "...", "triager": "..." }, + "validation": { "passed": true, "warnings": [] }, + "created_at": "", + "task_input_digest": "" + } + ``` + + This is the enterprise story: a model-authored plan becomes **reviewable, diffable, auditable, replayable** data. `created_at` is stamped at export (not at replay); `task_input_digest` is a hash so a portable plan doesn't carry raw task content. + + ```python + def export_plan(frozen) -> dict: ... # the envelope above + def import_plan(envelope, registry) -> WorkflowSpec: # re-validates vs the CURRENT registry + ``` + +- **v2 (optional) — promote an exported plan to a reusable template.** A human approves a spec and saves it as a template. **On import, ADK MUST re-validate against the *current* registry**; registry/capability drift **fails loudly or requires explicit migration** — never a silent run against a changed capability set. (The envelope's `registry_version` / `capability_versions` are what make drift detectable.) + +- **Deferred — compiled `Workflow`/graph (or generated Python) as the source of truth.** The compiled `Workflow` is regenerated from the spec on demand; it is **not** stored as canonical, because compiler behavior and ADK internals evolve. Persisting generated code or a compiled graph is explicitly out of scope. 
+ +Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. + +## References + +- #92 — supervised concurrent dynamic dispatch + `ctx.pipeline` (executor). +- Claude Code Dynamic Workflows — https://code.claude.com/docs/en/workflows +- ADK: `Workflow`/`Graph` (`src/google/adk/workflow/_graph.py`), `LlmAgent.output_schema` / `validate_schema`, `BaseAgent.run_async`, `_session_util.extract_state_delta`, `NodeRunner._track_event_in_context`. From bfd90a44870c7f8328ade8c38b4b7330dcbdcd0d Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 00:40:07 -0700 Subject: [PATCH 05/64] =?UTF-8?q?docs(workflow):=20export=20contract=20?= =?UTF-8?q?=E2=80=94=20digest=20definition,=20task=5Finput=5Fschema,=20imp?= =?UTF-8?q?ort-input=20rule?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DESIGN.md §10: spec_hash/task_input_digest defined as sha256 over canonical JSON; envelope carries an optional task_input_schema; import contract — digest is advisory provenance for replaying the original run, template reuse validates a new task input against task_input_schema, else replay-only on matching digest or explicit template promotion. Never silently bind a stored plan to an incompatible task shape. --- .../samples/workflows/authored_workflow_spike/DESIGN.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 58cba980fd6..954ab2955ee 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -174,11 +174,16 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deter "capability_versions": { "reviewer": "...", "triager": "..." 
}, "validation": { "passed": true, "warnings": [] }, "created_at": "", - "task_input_digest": "" + "task_input_schema": { "...": "expected task-input JSON schema, or null" }, + "task_input_digest": "" } ``` - This is the enterprise story: a model-authored plan becomes **reviewable, diffable, auditable, replayable** data. `created_at` is stamped at export (not at replay); `task_input_digest` is a hash so a portable plan doesn't carry raw task content. + This is the enterprise story: a model-authored plan becomes **reviewable, diffable, auditable, replayable** data. `created_at` is stamped at export (not at replay); `task_input_digest` is a digest so a portable plan doesn't carry raw task content. + + **Digest/hash definition.** `spec_hash` and `task_input_digest` are `sha256` over **canonical JSON** — `json.dumps(value, sort_keys=True, separators=(",", ":"))` — of the spec and the task input respectively. A single fixed definition so two exporters produce identical hashes for the same logical value (no whitespace/key-order drift). + + **Execution-input contract on import.** `task_input_digest` is *advisory provenance* for replaying the **original** run. Reusing a plan against a **new** task input is template behavior: ADK validates the new input against the captured `task_input_schema`. If `task_input_schema` is null (none captured), import may only **replay** with a matching `task_input_digest`, or must go through explicit **template promotion** (which attaches a `task_input_schema`) first. A stored plan must never silently bind (e.g. `task.files`) against an incompatible task shape. ```python def export_plan(frozen) -> dict: ... 
# the envelope above From c4c1b23157f330c3598e0580b1dffcb78c8a6cf6 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 00:44:33 -0700 Subject: [PATCH 06/64] docs(workflow): unify FrozenWorkflowRecord + import integrity (review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One FrozenWorkflowRecord backs session state, audit event, and export envelope (§5/§10) — v1 persists the full record under authored_workflow:frozen_record, not a weaker {spec,hash} subset. import_plan recomputes spec_hash (reject on mismatch) and re-runs validation against the current registry rather than trusting envelope.validation; replay vs template execution-input rule made explicit. 'discriminated-by-kind' -> 'plain kind-tagged union' wording fix. --- .../authored_workflow_spike/DESIGN.md | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 954ab2955ee..743d494419e 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -4,7 +4,7 @@ Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analy ## 1. Data model — `WorkflowSpec` -A discriminated-by-`kind`, recursive, ordered **tree of blocks** (not a graph with jumps). `id`s are globally-unique **binding names** for dataflow, never jump targets — which removes join / fall-through / GOTO ambiguity by construction. +A plain `kind`-tagged, recursive, ordered **tree of blocks** (not a graph with jumps). `id`s are globally-unique **binding names** for dataflow, never jump targets — which removes join / fall-through / GOTO ambiguity by construction. 
```python # src/google/adk/workflow/authoring/_spec.py (NEW) @@ -113,10 +113,27 @@ Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reac ## 5. Frozen-spec contract (correctness requirement) -Before any execution, persist: spec JSON, content hash, planner model+version, registry/capability version, validation result. Deterministic replay holds **only** if resume loads the **same** frozen spec → **resume MUST reuse it and MUST NOT re-plan** unless the user starts a new run; a registry-version mismatch on resume is a hard error. +Persist **one** `FrozenWorkflowRecord` before any execution — the *same* shape backs session state, the audit event, and the export envelope (§10), so v1 storage is never a weaker subset: -- **Storage target (v1):** session state under an **unprefixed (session-scoped) key** (e.g. `authored_workflow:frozen_spec`). **Not** `app:` — that's app-scoped (`State.APP_PREFIX`, extracted in `_session_util.extract_state_delta`) and shared across sessions/users, leaking per-run data and breaking per-run resume. -- **Audit event shape:** persist **state-only** — `Event(state={"authored_workflow:frozen_spec": frozen_record})`, hash/kind inside the record or a sibling key. **Not** `Event.output` (`NodeRunner._track_event_in_context` sets `ctx.output = event.output` and `Context.output` rejects a second output → "Output already set"). **Not** `Event.content` (would re-enter a model's context). 
+```python +class FrozenWorkflowRecord(BaseModel): + schema_version: str # "v1" + spec: WorkflowSpec + spec_hash: str # sha256(canonical_json(spec)) — see §10 + planner_model: str + registry_version: str + capability_versions: dict[str, str] + validation: ValidationResult # {passed: bool, warnings: [...]} + created_at: str # ISO-8601, stamped at freeze + task_input_schema: dict | None # expected root task-input schema (enables template reuse) + task_input_digest: str | None # sha256(canonical_json(task_input)) +``` + +Deterministic replay holds **only** if resume loads the **same** record → **resume MUST reuse it and MUST NOT re-plan** unless the user starts a new run; a registry/capability-version mismatch on resume is a hard error. + +- **Storage target (v1):** the **full record** in session state under an **unprefixed (session-scoped) key** `authored_workflow:frozen_record` — not just `{spec, hash}`, so drift detection and audit have everything they need. **Not** `app:` (app-scoped — `State.APP_PREFIX`, extracted in `_session_util.extract_state_delta` — would leak per-run data and break per-run resume). +- **Audit event shape:** persist **state-only** — `Event(state={"authored_workflow:frozen_record": record})`. **Not** `Event.output` (`NodeRunner._track_event_in_context` sets `ctx.output = event.output`; `Context.output` rejects a second output → "Output already set"). **Not** `Event.content` (would re-enter a model's context). +- *(The committed demo persists a minimal `{spec, hash}` subset for illustration; the canonical v1 shape is `FrozenWorkflowRecord`.)* ## 6. Security model @@ -160,9 +177,9 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deter **Source of truth = the typed `WorkflowSpec`.** The compiled `Workflow` is a *derived* artifact. 
Storage is tiered, scoped to keep generated code and compiled graphs out of v1: -- **v1 (required) — persist frozen spec per run.** Already core (§5): session state `authored_workflow:frozen_spec` + hash, for resume/replay. +- **v1 (required) — persist the full `FrozenWorkflowRecord` per run** (§5) under `authored_workflow:frozen_record` — for resume/replay **and** drift detection. -- **v1.1 (recommended) — export the frozen spec as a portable JSON envelope.** An explicit "Export plan" operation producing a self-describing record: +- **v1.1 (recommended) — export the record as a portable JSON envelope.** The envelope **is a serialized `FrozenWorkflowRecord`** (§5) — same fields, never a weaker shape — produced by an explicit "Export plan" operation: ```json { @@ -186,8 +203,16 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deter **Execution-input contract on import.** `task_input_digest` is *advisory provenance* for replaying the **original** run. Reusing a plan against a **new** task input is template behavior: ADK validates the new input against the captured `task_input_schema`. If `task_input_schema` is null (none captured), import may only **replay** with a matching `task_input_digest`, or must go through explicit **template promotion** (which attaches a `task_input_schema`) first. A stored plan must never silently bind (e.g. `task.files`) against an incompatible task shape. ```python - def export_plan(frozen) -> dict: ... # the envelope above - def import_plan(envelope, registry) -> WorkflowSpec: # re-validates vs the CURRENT registry + def export_plan(record: FrozenWorkflowRecord) -> dict: ... # serialize the §5 record + def import_plan(envelope, registry, *, task_input=None) -> WorkflowSpec: + # INTEGRITY (never trust the envelope's own `validation`): + # 1. recompute sha256(canonical_json(spec)); REJECT if != envelope["spec_hash"] + # 2. re-run WorkflowSpecValidator against the CURRENT registry + # 3. 
registry/capability drift -> fail loudly (or explicit migration) + # EXECUTION-INPUT: + # replay : task_input digest must match envelope["task_input_digest"] (else audit-only) + # template : task_input validated against envelope["task_input_schema"] before execution + # neither : do NOT execute against arbitrary new input ``` - **v2 (optional) — promote an exported plan to a reusable template.** A human approves a spec and saves it as a template. **On import, ADK MUST re-validate against the *current* registry**; registry/capability drift **fails loudly or requires explicit migration** — never a silent run against a changed capability set. (The envelope's `registry_version` / `capability_versions` are what make drift detectable.) From 921fbbecc2cd119bd8a5a718c147b326d6111194 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 00:47:35 -0700 Subject: [PATCH 07/64] docs(workflow): make positioning/phasing explicit (MVP-first, #92-first, 3-5-task build gate) --- .../samples/workflows/authored_workflow_spike/DESIGN.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 743d494419e..14f5cf7af9c 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -2,6 +2,8 @@ Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analytics-SDK#93). Mirrors the issue's Technical Design comment. Covers the data model, validator, interpreter/compilation, frozen-spec contract, security model, framework changes, testing, and the empirical findings that shaped it. Audience: implementers / technical reviewers. +> **Phasing (MVP-first).** Ship **#92 first**; build full #93 only once leadership commits it as a product bet **and** 3–5 real tasks beat hand-wired workflows. 
**MVP scope** = `WorkflowSpec` + validator + **freeze/replay + export**; **defer** templates (v2), complex loops, and broad compiler features. (Strategic rationale: the concise RFC's *Positioning & priority*.) + ## 1. Data model — `WorkflowSpec` A plain `kind`-tagged, recursive, ordered **tree of blocks** (not a graph with jumps). `id`s are globally-unique **binding names** for dataflow, never jump targets — which removes join / fall-through / GOTO ambiguity by construction. From 5aa2dc65e4d685213ec4f9dedc64ef37a72a6c4e Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:00:28 -0700 Subject: [PATCH 08/64] docs(workflow): make demo-vs-production persistence explicit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Demo persists only {spec, hash}; production v1 stores the full FrozenWorkflowRecord (DESIGN.md §5). State it explicitly in DESIGN.md, the demo agent.py freeze step, and DEMO_NARRATIVE.md so the demo isn't misread as the canonical persistence contract. --- .../workflows/authored_workflow_demo/DEMO_NARRATIVE.md | 2 ++ .../authored_workflow_demo/security_audit_planner/agent.py | 5 +++++ .../samples/workflows/authored_workflow_spike/DESIGN.md | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index fc60a51c07f..65ffc2e0db7 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -58,6 +58,8 @@ Send: **"Plan and run a codebase security review."** The chat streams: > "Open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. The plan > is now durable data you can store, diff, and audit." +*(Presenter note: this demo persists only `{spec, hash}` for readability. 
Production v1 stores the full `FrozenWorkflowRecord` — planner/registry/capability versions, validation, `task_input_schema`/`digest` — see `authored_workflow_spike/DESIGN.md` §5. The demo illustrates the behavior; it is not the canonical persistence contract.)* + ## Beat 4 — execute (Events / trace tab) ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 8d14a0fcaf7..8c50ef60644 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -238,6 +238,11 @@ async def author_validate_execute(ctx: Context, node_input): # 3. FREEZE — persist spec + hash to session state on first author only # (visible in the State tab; reused runs already have it). + # NOTE: this demo persists only a minimal {spec, hash} subset to keep the + # walkthrough readable. Production v1 would store the full FrozenWorkflowRecord + # (planner/registry/capability versions, validation, task_input_schema/digest) + # — see authored_workflow_spike/DESIGN.md §5. The demo is illustrative, not the + # canonical persistence contract. 
if not reused: ctx.state["authored_workflow:frozen_spec"] = spec.model_dump() ctx.state["authored_workflow:frozen_spec_hash"] = spec_hash diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 14f5cf7af9c..30669e23a5c 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -135,7 +135,7 @@ Deterministic replay holds **only** if resume loads the **same** record → **re - **Storage target (v1):** the **full record** in session state under an **unprefixed (session-scoped) key** `authored_workflow:frozen_record` — not just `{spec, hash}`, so drift detection and audit have everything they need. **Not** `app:` (app-scoped — `State.APP_PREFIX`, extracted in `_session_util.extract_state_delta` — would leak per-run data and break per-run resume). - **Audit event shape:** persist **state-only** — `Event(state={"authored_workflow:frozen_record": record})`. **Not** `Event.output` (`NodeRunner._track_event_in_context` sets `ctx.output = event.output`; `Context.output` rejects a second output → "Output already set"). **Not** `Event.content` (would re-enter a model's context). -- *(The committed demo persists a minimal `{spec, hash}` subset for illustration; the canonical v1 shape is `FrozenWorkflowRecord`.)* +- **Demo vs production:** the committed demo persists only a minimal `{spec, hash}` subset to keep the walkthrough readable — **it illustrates the behavior; production v1 would store the full `FrozenWorkflowRecord`.** The demo is illustrative, not the canonical contract. ## 6. 
Security model From 5aa73fdb64db29147ea6a41ea5454c06d2e952f9 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:01:15 -0700 Subject: [PATCH 09/64] docs(workflow): rename to 'Reproducible Model-Authored Workflows for ADK' --- .../samples/workflows/authored_workflow_spike/DESIGN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 30669e23a5c..ee10e5de8a0 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -1,4 +1,4 @@ -# Design — Agent-authored typed Workflows (RFC #93) +# Design — Reproducible Model-Authored Workflows for ADK (RFC #93) Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analytics-SDK#93). Mirrors the issue's Technical Design comment. Covers the data model, validator, interpreter/compilation, frozen-spec contract, security model, framework changes, testing, and the empirical findings that shaped it. Audience: implementers / technical reviewers. From 305c62e4ce3dc0c3f6f75aeef074c90edd6c75aa Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:11:33 -0700 Subject: [PATCH 10/64] docs(workflow): add Pipeline block (uses #92 ctx.pipeline) + Claude Code comparison Pipeline/PipelineStage make barrier-free multi-stage per-item flow first-class so the authoring vocabulary is not less expressive than its #92 executor. Add a candid 'Comparison to Claude Code Dynamic Workflows' (wins: audit/safety; gaps: expressiveness/maturity, plan-size ceiling, quality-pattern templates, scale). Hierarchical/sub-plan authoring noted as post-gate future, not MVP. 
--- .../authored_workflow_spike/DESIGN.md | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index ee10e5de8a0..21b5d81a488 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -1,4 +1,4 @@ -# Design — Reproducible Model-Authored Workflows for ADK (RFC #93) +# Technical Design — Agent-authored typed Workflows (RFC #93) Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analytics-SDK#93). Mirrors the issue's Technical Design comment. Covers the data model, validator, interpreter/compilation, frozen-spec contract, security model, framework changes, testing, and the empirical findings that shaped it. Audience: implementers / technical reviewers. @@ -32,6 +32,20 @@ class FanOut(BaseModel): capability: str # run once per element (compiles to ctx.pipeline/parallel) collect: Literal["list"] = "list" # per-item outputs aggregate to an order-preserving list +class PipelineStage(BaseModel): + capability: str # MUST resolve in the registry; takes an item + input: Binding | None = None # defaults to the previous stage's per-item output + +class Pipeline(BaseModel): + kind: Literal["pipeline"] + id: str + over: Binding # MUST resolve to a LIST-typed value + stages: list[PipelineStage] # each item flows through ALL stages, BARRIER-FREE + collect: Literal["list"] = "list" # outputs aggregate to an order-preserving list + # Compiles to #92 ctx.pipeline: item A may be in stage k while item B is in stage 1. + # Failed item -> None; control exceptions follow #92. stage[0] input defaults to the + # per-item element; stage[n] input defaults to stage[n-1]'s per-item output. 
+ class Route(BaseModel): value: str # the switch value this route matches block: list["SpecNode"] # non-empty; output = block's last-node output @@ -55,7 +69,7 @@ class LoopUntil(BaseModel): # Annotated[..., Field(discriminator="kind")]: the discriminated form emits a # JSON-schema `discriminator` keyword that Gemini's response_schema rejects # (Schema: extra_forbidden — verified). `kind` still disambiguates parsing. -SpecNode = Union[StepRef, FanOut, Branch, LoopUntil] +SpecNode = Union[StepRef, FanOut, Pipeline, Branch, LoopUntil] class WorkflowSpec(BaseModel): goal: str @@ -89,7 +103,7 @@ class AuthoredWorkflowAgent(BaseAgent): - **Authoring** = `LlmAgent(output_schema=WorkflowSpec)`; ADK validates structured output, so a malformed plan is caught and re-planned (bounded by `max_replans`). - **Validation** is a **new semantic validator** (below) that *lowers to* `Graph.validate_graph()` for structural checks. -- **Compilation** lowers the block tree: sequence → edges; `Branch` → conditional route edges over nested blocks; `FanOut`/`LoopUntil` → the #92 `ctx.pipeline`/`ctx.parallel` + bounded loop. The compiled artifact is an ordinary `Workflow` — nothing downstream knows it was machine-authored. +- **Compilation** lowers the block tree: sequence → edges; `Branch` → conditional route edges over nested blocks; `FanOut` → `ctx.parallel`-map; `Pipeline` → barrier-free `ctx.pipeline` (multi-stage); `LoopUntil` → bounded loop. The compiled artifact is an ordinary `Workflow` — nothing downstream knows it was machine-authored. - **Registry** = developer-supplied capabilities (an agent, or a tool wrapped as a node), each with per-capability policy. ## 3. 
Validator — semantic, then structural @@ -100,6 +114,7 @@ class AuthoredWorkflowAgent(BaseAgent): - `Binding` invariant + path/type compatibility vs the producer's `output_schema` and consumer's `input_schema`; - `FanOut.over` resolves to a list; the fan-out capability takes an item; - `Branch.on` is string/str-enum-typed; route blocks share a compatible last-node output schema; non-exhaustive enum domain is flagged (unmatched at runtime fails); +- `Pipeline`: `over` resolves to a list; every stage `capability` is registered and takes an item; stage[0] input defaults to the per-item element, stage[n] to stage[n-1]'s output; the last stage's output type defines the pipeline output (validated for downstream bindings); - `LoopUntil`: strict-bool `until_capability`, present/compatible `until_input`, `max_iters >= 1`; - globally-unique `id`s; binding-scope (no non-preceding / cross-route references); - registry-version match vs a frozen spec (drift = hard error). @@ -110,6 +125,7 @@ Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reac - **Authoring non-deterministic; execution deterministic.** Once frozen, execution + resume replay is fully deterministic (it's just a `Workflow`). - **Reuses #92 + the engine wholesale.** Fan-out → supervised `ctx.pipeline`/`ctx.parallel` (bounded, interrupt-safe); sequence/branch → edges + routes; loop → bounded loop. No new executor. +- **`Pipeline` is barrier-free per-item** (compiles directly to #92's `ctx.pipeline`): item A may be in stage *k* while item B is in stage 1; an ordinary failure drops that item to `None`; control exceptions follow #92. This closes the gap where the vocabulary was *less* expressive than its own executor — a single-capability `fan_out` is parallel-map; `Pipeline` is the multi-stage barrier-free form. - **Re-plan is pre-execution-only.** `max_replans` applies only to validation failures; an execution failure fails the frozen run; recovery = a new explicit run/version. 
No recursive planner-spawning-planner. - **Budget + agent caps from #92** bound a mis-plan's spend. @@ -223,6 +239,10 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deter Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. +## 11. Future (post-gate, NOT MVP) + +**Hierarchical / sub-plan authoring** — a registered capability that is itself an `AuthoredWorkflowAgent`, so a step can expand into its own authored sub-plan. This is the likely path to parity with Claude Code's unbounded orchestration (it lifts the single-response plan-size ceiling), but it is **out of MVP scope** and should be evaluated **only after the 3–5-task build gate**. MVP stays single-level: `WorkflowSpec` + validator + freeze/replay + export. + ## References - #92 — supervised concurrent dynamic dispatch + `ctx.pipeline` (executor). From 40e2814986c3060a6295b60eacbd7ff9bb0e3ac0 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:17:04 -0700 Subject: [PATCH 11/64] spike(workflow): implement Pipeline (barrier-free per-item) + tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit authoring.py now covers step/fan_out/pipeline/branch/loop_until. Pipeline + PipelineStage compile to #92 ctx.pipeline: each item threads ALL stages barrier-free (item A in stage k while item B in stage 1) — NOT two barriered fan_outs. Validator: over is a list, every stage capability exists and takes an item, stage input scope. Interpreter: stage[0] input defaults to the per-item element, stage[n] to stage[n-1] output; collect=list. 3 new deterministic tests (13 total): validator accept/reject pipeline, and an ordered + BARRIER-FREE proof (a verifier starts before the slow reviewer finishes — impossible with two barriered fan_outs). 
--- .../authored_workflow_spike/README.md | 4 +- .../authored_workflow_spike/authoring.py | 78 +++++++++++++++- .../authored_workflow_spike/test_authoring.py | 89 +++++++++++++++++++ 3 files changed, 165 insertions(+), 6 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index c206f84f6e2..99be63bdd67 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,9 +26,9 @@ behind the RFC's "can a model author good plans?" question. pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **10 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **13 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, -the open-map warning, and interpreter execution of fan_out→aggregate, branch +the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify)**, branch (correct route), and loop_until (stops + correct output). ## Live planner sweep (optional evidence) diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index ec52b7ed702..cdb9d68b3f5 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -22,8 +22,8 @@ * ``WorkflowSpecValidator`` — semantic validation (capability refs, binding scope, list/loop/branch rules) + an open-map output-schema warning. * ``SpecInterpreter`` — executes a validated spec on the real ADK Workflow - engine via the #92 ``DynamicNodeSupervisor`` (step / fan_out / branch / - loop_until). 
+ engine via the #92 ``DynamicNodeSupervisor`` (step / fan_out / pipeline / + branch / loop_until). This is a demand-gate artifact, not production code. See README.md. """ @@ -84,6 +84,24 @@ class FanOut(BaseModel): collect: Literal["list"] = "list" +class PipelineStage(BaseModel): + capability: str # registered; takes an item + input: Binding | None = ( + None # defaults to the previous stage's per-item output + ) + + +class Pipeline(BaseModel): + # Barrier-free per-item multi-stage flow: each item runs through ALL stages + # via #92 ctx.pipeline (item A can be in stage k while item B is in stage 1) — + # NOT two barriered fan_outs. Compiles to DynamicNodeSupervisor.pipeline. + kind: Literal["pipeline"] + id: str + over: Binding # MUST resolve to a list + stages: list[PipelineStage] + collect: Literal["list"] = "list" + + class Route(BaseModel): value: str block: list["SpecNode"] @@ -114,7 +132,7 @@ class LoopUntil(BaseModel): # rejects (Schema: extra_forbidden — verified on gemini-3.5-flash). Each member still # carries a `kind` Literal, so this is a structurally-tagged union: unambiguous to parse # and to switch on, AND accepted as a Gemini response_schema. 
-SpecNode = Union[StepRef, FanOut, Branch, LoopUntil] +SpecNode = Union[StepRef, FanOut, Pipeline, Branch, LoopUntil] class WorkflowSpec(BaseModel): @@ -123,7 +141,16 @@ class WorkflowSpec(BaseModel): output: Binding -for _m in (StepRef, FanOut, Branch, Route, LoopUntil, WorkflowSpec): +for _m in ( + StepRef, + FanOut, + PipelineStage, + Pipeline, + Branch, + Route, + LoopUntil, + WorkflowSpec, +): _m.model_rebuild() @@ -227,6 +254,25 @@ def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: raise SpecValidationError( f"fan_out {n.id}: capability must take an item" ) + if isinstance(n, Pipeline): + if not n.stages: + raise SpecValidationError(f"pipeline {n.id}: needs >= 1 stage") + for st in n.stages: + if st.capability not in self.registry: + raise SpecValidationError(f"unknown capability {st.capability!r}") + if self.registry[st.capability].input_kind != "item": + raise SpecValidationError( + f"pipeline {n.id}: stage {st.capability!r} must take an item" + ) + if ( + isinstance(st.input, Binding) + and st.input.source == "step" + and st.input.step not in preceding + ): + raise SpecValidationError( + f"pipeline {n.id}: stage input references non-preceding step" + f" {st.input.step!r}" + ) if isinstance(n, LoopUntil): # body executes in-scope; until_input may reference a body step. body_scope = self._walk(n.body, preceding | {n.id}, ids) @@ -309,6 +355,30 @@ async def _run_block(self, nodes, task_input, prefix: str): ) ), ) + elif isinstance(n, Pipeline): + # Barrier-free per-item multi-stage flow via #92 ctx.pipeline — each item + # threads ALL stages; item A can be in stage k while item B is in stage 1 + # (NOT two barriered fan_outs). stage[0] input defaults to the per-item + # element; stage[k] input defaults to stage[k-1]'s per-item output. 
+ items = self._resolve(n.over, task_input) + stage_fns = [] + for si, st in enumerate(n.stages): + + def stage(prev, it, i, si=si, st=st, rid=rid): + cap = self.registry[st.capability] + value = ( + self._resolve(st.input, task_input) + if st.input is not None + else (it if si == 0 else prev) + ) + return self.sup.dispatch( + cap.build(), + node_input=self._arg(cap, value), + run_id=f"{rid}_{i}_{si}", + ) + + stage_fns.append(stage) + self.state[n.id] = await self.sup.pipeline(items, *stage_fns) elif isinstance(n, Branch): value = str(self._resolve(n.on, task_input)) routes = {r.value: r.block for r in n.routes} diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index 7c97ae3732c..47b042a6714 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -41,6 +41,8 @@ from authoring import CapabilityRegistry from authoring import FanOut from authoring import LoopUntil +from authoring import Pipeline +from authoring import PipelineStage from authoring import Route from authoring import SpecInterpreter from authoring import SpecValidationError @@ -349,3 +351,90 @@ async def test_interpreter_loop_until_stops_and_outputs(): "text": "v", "len": len("topic"), } # loop output = last body node output + + +# ----------------------------------------------------------------- pipeline +def _timed_registry(log): + """reviewer (stage 0) + verifier (stage 1) as deterministic timed stubs.""" + import asyncio + import time + + def stage_cap(name, slow_for=None, key="r"): + def build(): + @node(name=name) + async def n(ctx, node_input): + item = node_input + log.append((name, "start", time.perf_counter())) + await asyncio.sleep( + 0.05 if (slow_for is not None and item == slow_for) else 0.0 + ) + log.append((name, "end", time.perf_counter())) + yield Event(output={key: 
item}) + + return n + + return Capability( + name=name, build=build, input_kind="item", serialize_input=False + ) + + return CapabilityRegistry([ + stage_cap("reviewer", slow_for=1, key="review"), + stage_cap("verifier", key="verdict"), + ]) + + +def _pipeline_spec(): + return WorkflowSpec( + goal="pipe", + steps=[ + Pipeline( + kind="pipeline", + id="pp", + over=Binding(source="task", path="items"), + stages=[ + PipelineStage(capability="reviewer"), + PipelineStage(capability="verifier"), + ], + ) + ], + output=Binding(source="step", step="pp"), + ) + + +def test_validator_accepts_pipeline(): + log = [] + WorkflowSpecValidator(_timed_registry(log)).validate(_pipeline_spec()) + + +def test_validator_rejects_pipeline_list_stage(): + spec = _pipeline_spec() + # "count" takes a list, not an item -> invalid pipeline stage + spec.steps[0].stages[1] = PipelineStage(capability="count") + with pytest.raises(SpecValidationError): + WorkflowSpecValidator(_registry()).validate(spec) + + +@pytest.mark.asyncio +async def test_interpreter_pipeline_ordered_and_barrier_free(): + log = [] + reg = _timed_registry(log) + # input items [0, 1]; reviewer is slow for item 1 only. + out = await _run_spec(_pipeline_spec(), reg, {"items": [0, 1]}) + + # Ordered, per-item review->verify (verdict carries the reviewed value): + assert out == [{"verdict": {"review": 0}}, {"verdict": {"review": 1}}] + + starts = {n: t for (n, p, t) in log if p == "start"} + ends = {n: t for (n, p, t) in log if p == "end"} + # BARRIER-FREE proof: item 0 reaches stage 2 (verifier) BEFORE item 1 finishes + # stage 1 (reviewer). Two barriered fan_outs could NOT do this — every + # reviewer would finish before any verifier started. 
+ assert "verifier" in starts and "reviewer" in ends + # earliest verifier start precedes the latest reviewer end: + first_verifier_start = min( + t for (n, p, t) in log if n == "verifier" and p == "start" + ) + last_reviewer_end = max( + t for (n, p, t) in log if n == "reviewer" and p == "end" + ) + assert first_verifier_start < last_reviewer_end From 64fcb95d70ea0634d901b27db8ab484f54f3b608 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:23:51 -0700 Subject: [PATCH 12/64] spike(workflow): enforce max_fan_out per Pipeline stage + sync docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P2: each Pipeline stage dispatches once per item, so every stage capability is subject to the same data-dependent fan_out cap as FanOut. The interpreter now rejects (pre-dispatch) when len(items) > stage cap.max_fan_out, closing a gap where a pipeline over N items bypassed a capability's cap — the RFC security model relies on runtime enforcement of these caps. New deterministic test (14 total) asserts rejection before any stage runs. P3: sync stale shape lists (add pipeline) and test counts after the prior Pipeline addition — authoring 10/13 -> 14, totals 11+14+4 = 29, across authoring.py, test_authoring.py, both READMEs, DESIGN.md, DEMO_NARRATIVE.md. 
--- .../authored_workflow_demo/DEMO_NARRATIVE.md | 4 ++-- .../workflows/authored_workflow_demo/README.md | 4 ++-- .../workflows/authored_workflow_spike/DESIGN.md | 2 +- .../workflows/authored_workflow_spike/README.md | 14 +++++++------- .../authored_workflow_spike/authoring.py | 11 ++++++++++- .../authored_workflow_spike/test_authoring.py | 15 ++++++++++++++- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 65ffc2e0db7..d83c05fff6b 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -98,13 +98,13 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > "So: a model authored a typed, validated, capability-bounded plan; ADK executed > it on the real engine; and a re-send replayed the exact frozen plan. The -> deterministic test suites — 11 (#92) + 10 (#93) + 4 (demo) — lock all of this +> deterministic test suites — 11 (#92) + 14 (#93) + 4 (demo) — lock all of this > in CI, including a no-LLM test of this reuse path." 
## Proof commands (terminal, ~60s) ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 10 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 14 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 6fb6b3279ca..9b251a57572 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -57,11 +57,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 10 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 14 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` -- Deterministic suites: #92 **11** + #93 **10** + demo **4** = **25** (incl. a no-LLM reuse-path test). +- Deterministic suites: #92 **11** + #93 **14** + demo **4** = **29** (incl. a no-LLM reuse-path test). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 21b5d81a488..726ff49a84f 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -189,7 +189,7 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. 
**Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (10 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (14 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 99be63bdd67..36ed92f2691 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -14,11 +14,11 @@ behind the RFC's "can a model author good plans?" question. ## Files -| File | Purpose | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / branch / loop_until). 
| -| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | -| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. | +| File | Purpose | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / pipeline / branch / loop_until). | +| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | +| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. | ## Deterministic tests (CI-safe, no network) @@ -26,9 +26,9 @@ behind the RFC's "can a model author good plans?" question. pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **13 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **14 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, -the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify)**, branch +the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output). 
## Live planner sweep (optional evidence) diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index cdb9d68b3f5..c251a1daa2a 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -301,7 +301,7 @@ def _bindings(n) -> list[Binding]: # ----------------------------------------------------------------- interpreter class SpecInterpreter: """Executes a validated WorkflowSpec on the real ADK engine via the #92 - supervisor. Handles step / fan_out / branch / loop_until.""" + supervisor. Handles step / fan_out / pipeline / branch / loop_until.""" def __init__(self, registry: CapabilityRegistry, ctx, *, gate: int = 8): self.registry = registry @@ -361,6 +361,15 @@ async def _run_block(self, nodes, task_input, prefix: str): # (NOT two barriered fan_outs). stage[0] input defaults to the per-item # element; stage[k] input defaults to stage[k-1]'s per-item output. items = self._resolve(n.over, task_input) + # Each stage dispatches once per item, so every stage capability is + # subject to the same data-dependent fan-out cap as a FanOut. + for st in n.stages: + cap = self.registry[st.capability] + if len(items) > cap.max_fan_out: + raise SpecValidationError( + f"runtime: pipeline stage {st.capability!r} fan_out" + f" {len(items)} exceeds max_fan_out {cap.max_fan_out}" + ) stage_fns = [] for si, st in enumerate(n.stages): diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index 47b042a6714..cedf8eb1295 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -15,7 +15,7 @@ """Deterministic, CI-safe tests for the authored-workflow spike (RFC #93). No LLM. 
Capabilities are deterministic stub nodes, so these exercise the -validator + the interpreter (step / fan_out / branch / loop_until + binding +validator + the interpreter (step / fan_out / pipeline / branch / loop_until + binding scope) on the real ADK Workflow engine. The live planner sweep lives in test_live_planner_sweep.py (env-gated). """ @@ -438,3 +438,16 @@ async def test_interpreter_pipeline_ordered_and_barrier_free(): t for (n, p, t) in log if n == "reviewer" and p == "end" ) assert first_verifier_start < last_reviewer_end + + +@pytest.mark.asyncio +async def test_interpreter_pipeline_enforces_max_fan_out(): + # Each stage dispatches once per item, so a stage capability's max_fan_out is + # a data-dependent cap that must be enforced at runtime (same as FanOut). + log = [] + reg = _timed_registry(log) + reg["verifier"].max_fan_out = 1 # 2 items > cap -> reject before dispatch + with pytest.raises(SpecValidationError): + await _run_spec(_pipeline_spec(), reg, {"items": [0, 1]}) + # rejected pre-dispatch: no stage ran. + assert log == [] From 75264e548300db3ff430273e60d066f3339160b6 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:30:40 -0700 Subject: [PATCH 13/64] demo(workflow): author a Pipeline (reviewer->verifier) in the security-audit demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The demo now exercises Pipeline — the construct that closes the Claude Code gap — without adding visual complexity. The planner authors pipeline -> step -> step: a reviewer->verifier pipeline over the files (each file reviewed then its finding verified, barrier-free per item), then triager, then formatter. - agent.py: add a 'verifier' capability (Finding -> confirmed Finding); planner instruction authors the pipeline; capability collection walks pipeline stages so the displayed/audited capability set includes stage caps. 
- test_demo_agent.py: demo spec + stub registry use Pipeline/verifier; reuse path still proves no-LLM frozen replay. - Narrative + README: Beat 1 plan is pipeline -> step -> step; Beat 4 calls out reviewer/verifier interleaving per file; fresh captured hash 1f4c0883beb6. Validated live on gemini-3.5-flash: planner authored the pipeline 3/3 trials (no flakiness); reuse replays the same hash without re-invoking the model. --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 44 ++++++++++-------- .../authored_workflow_demo/README.md | 6 +-- .../security_audit_planner/agent.py | 45 +++++++++++++++---- .../authored_workflow_demo/test_demo_agent.py | 31 ++++++++++--- 4 files changed, 91 insertions(+), 35 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index d83c05fff6b..1c77f99af28 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -19,29 +19,34 @@ Send: **"Plan and run a codebase security review."** The chat streams: ``` 🧭 Model-authored Workflow — planning a security audit over 4 files using only - registered capabilities (reviewer, triager, formatter). + registered capabilities (reviewer, verifier, triager, formatter). 
-📋 Authored plan (fan_out → step → step): +📋 Authored plan (pipeline → step → step): { "goal": "Audit files and format the report", "steps": [ - {"kind": "fan_out", "id": "review_files", "over": {"source":"task","path":"files"}, "capability": "reviewer"}, - {"kind": "step", "id": "triage_findings", "input": {"source":"step","step":"review_files"}, "capability": "triager"}, - {"kind": "step", "id": "format_report", "input": {"source":"step","step":"triage_findings"}, "capability": "formatter"} + {"kind": "pipeline", "id": "review_pipeline", + "over": {"source":"task","path":"files"}, + "stages": [{"capability":"reviewer"}, {"capability":"verifier"}], + "collect": "list"}, + {"kind": "step", "id": "triage_step", "input": {"source":"step","step":"review_pipeline"}, "capability": "triager"}, + {"kind": "step", "id": "format_step", "input": {"source":"step","step":"triage_step"}, "capability": "formatter"} ], - "output": {"source": "step", "step": "format_report"} + "output": {"source": "step", "step": "format_step"} } ``` -> "The model emitted a *typed plan*, not code — a fan-out of `reviewer` over the -> files, then `triager`, then `formatter`, with explicit data bindings between -> steps." +> "The model emitted a *typed plan*, not code — a **pipeline** over the files +> (`reviewer → verifier` per file, barrier-free), then `triager`, then +> `formatter`, with explicit data bindings between steps. The pipeline is the +> construct that lets each file flow review→verify independently — item A can be +> verifying while item B is still being reviewed." ## Beat 2 — validate (capability allow-list) ``` ✅ Validation passed. Capabilities referenced (all registered): - ['formatter', 'reviewer', 'triager']. + ['formatter', 'reviewer', 'triager', 'verifier']. ``` > "Validation confirms every capability the plan names is in the registry. 
The @@ -51,7 +56,7 @@ Send: **"Plan and run a codebase security review."** The chat streams: ## Beat 3 — freeze (State tab) ``` -🔒 Frozen spec persisted to session state — hash 206fb4d3a27b. +🔒 Frozen spec persisted to session state — hash 1f4c0883beb6. Re-send the prompt: it replays this exact plan, not a new one. ``` @@ -67,15 +72,18 @@ Send: **"Plan and run a codebase security review."** The chat streams: 2 high (hardcoded credentials and SQL injection), and 1 medium (division by zero). ``` -> "Open **Events**: ADK runs the plan on the real engine via the #92 supervisor — -> the `reviewer` fan-out over the 4 files, then `triager`, then `formatter`. The -> findings are real: a CRITICAL `os.system` injection, HIGH hardcoded creds and -> SQL injection, and a MEDIUM divide-by-zero." +> "Open **Events**: ADK runs the plan on the real engine via the #92 supervisor. +> Note the interleaving — `reviewer` and `verifier` events alternate **per +> file** (a file is being verified while another is still under review); that's +> the barrier-free pipeline, not two separate fan-out waves. Then `triager` over +> all verified findings, then `formatter`. The findings are real: a CRITICAL +> `os.system` injection, HIGH hardcoded creds and SQL injection, and a MEDIUM +> divide-by-zero." ## Beat 5 — reproduce (re-send the same prompt) ``` -♻️ Reusing frozen plan from session state — hash 206fb4d3a27b. +♻️ Reusing frozen plan from session state — hash 1f4c0883beb6. The model is NOT re-invoked; the exact prior plan is replayed. ✅ Validation passed. ... 📄 Audit result: ... @@ -89,8 +97,8 @@ Send: **"Plan and run a codebase security review."** The chat streams: | Run | `reused` | `hash` | | ----------- | -------- | -------------- | -| 1 (author) | `false` | `206fb4d3a27b` | -| 2 (re-send) | `true` | `206fb4d3a27b` | +| 1 (author) | `false` | `1f4c0883beb6` | +| 2 (re-send) | `true` | `1f4c0883beb6` | Same hash, `reused` flips to `true` — the model is not called the second time. 
diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 9b251a57572..3a1ff6a9e04 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -36,11 +36,11 @@ Plan and run a codebase security review. Point at the ADK-native evidence as it streams: -1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`fan_out → step → step`). +1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`pipeline → step → step`: a `reviewer → verifier` pipeline over the files, then `triager`, then `formatter`). 1. **Validation** — "Validation passed" + the capability list (all registered). 1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. -1. **Execution** — the **Events / trace** view shows the `reviewer` fan-out, then `triager`, then `formatter` node runs. -1. **Final output** — the triaged audit (e.g. 3 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). +1. **Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. +1. **Final output** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) 
diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 8c50ef60644..c83e9eb109b 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -118,6 +118,24 @@ def _registry() -> CapabilityRegistry: ), ), ), + Capability( + name="verifier", + input_kind="item", + output_model=Finding, + serialize_input=True, + max_fan_out=50, + build=lambda: Agent( + name="verifier", + model=MODEL, + output_schema=Finding, + generate_content_config=DET, + instruction=( + "Input: a Finding JSON (path, severity, issue). Confirm the" + " severity and keep or adjust the issue. Output the Finding" + " (echo the path)." + ), + ), + ), Capability( name="triager", input_kind="list", @@ -155,18 +173,23 @@ def _registry() -> CapabilityRegistry: _REGISTRY_DESC = ( - "reviewer (item: a file with path and code -> Finding), triager (LIST of" - " Findings -> ReportFixed), formatter (item: a ReportFixed -> Note)." + "reviewer (item: a file with path and code -> Finding), verifier (item: a" + " Finding -> a confirmed Finding), triager (LIST of Findings ->" + " ReportFixed), formatter (item: a ReportFixed -> Note)." ) _PLANNER_INSTR = ( "Author a WorkflowSpec using ONLY these capabilities: " + _REGISTRY_DESC + " The task input has a 'files' list of objects with path and code." - " Author:" - " a fan_out of reviewer over task.files, then a step running triager on the" - " findings, then a step running formatter on the report. Use" - " Binding(source='task', path='files') and Binding(source='step'," - " step=) to chain. Set output to the formatter step." 
+ " Author, in order:" + " (1) a pipeline over task.files with two stages, reviewer then verifier," + " so each file is reviewed and then its finding is verified per item;" + " (2) a step running triager on the pipeline output;" + " (3) a step running formatter on the report." + " Use Binding(source='task', path='files') for the pipeline's over, and" + " Binding(source='step', step=) to chain steps. A pipeline stage takes" + " its input from the previous stage automatically, so stages need no input" + " binding. Set output to the formatter step." ) @@ -229,7 +252,13 @@ async def author_validate_execute(ctx: Context, node_input): # 2. VALIDATE — semantic validation against the registry (always). warnings = WorkflowSpecValidator(reg).validate(spec) # raises on hard error - caps = sorted({getattr(s, "capability", None) for s in spec.steps} - {None}) + caps = set() + for s in spec.steps: + if getattr(s, "capability", None): + caps.add(s.capability) + for st in getattr(s, "stages", None) or []: # pipeline stage capabilities + caps.add(st.capability) + caps = sorted(caps) yield _msg( "✅ **Validation passed.** Capabilities referenced (all registered): " f"`{caps}`." 
diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py index e8dc40a129f..85992eb41cb 100644 --- a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -40,7 +40,8 @@ from authoring import Binding # noqa: E402 from authoring import Capability # noqa: E402 from authoring import CapabilityRegistry # noqa: E402 -from authoring import FanOut # noqa: E402 +from authoring import Pipeline # noqa: E402 +from authoring import PipelineStage # noqa: E402 from authoring import StepRef # noqa: E402 from authoring import WorkflowSpec # noqa: E402 from authoring import WorkflowSpecValidator # noqa: E402 @@ -50,11 +51,14 @@ def _demo_spec() -> WorkflowSpec: return WorkflowSpec( goal="audit", steps=[ - FanOut( - kind="fan_out", + Pipeline( + kind="pipeline", id="rev", over=Binding(source="task", path="files"), - capability="reviewer", + stages=[ + PipelineStage(capability="reviewer"), + PipelineStage(capability="verifier"), + ], ), StepRef( kind="step", @@ -81,9 +85,10 @@ def test_root_agent_importable_and_named(): def test_demo_registry_is_clean(): reg = demo._registry() - for name in ("reviewer", "triager", "formatter"): + for name in ("reviewer", "verifier", "triager", "formatter"): assert name in reg assert reg["reviewer"].input_kind == "item" + assert reg["verifier"].input_kind == "item" # pipeline stages take an item assert reg["triager"].input_kind == "list" # ReportFixed uses enumerated fields, not an open dict[str, X] map. 
assert reg.open_map_warnings() == [] @@ -114,6 +119,15 @@ async def n(ctx, node_input): lambda f: {"path": f["path"], "severity": "HIGH", "issue": "x"}, ), ), + Capability( + name="verifier", + input_kind="item", + serialize_input=False, + build=stub( + "verifier", + lambda finding: {**finding, "issue": finding["issue"] + "!"}, + ), + ), Capability( name="triager", input_kind="list", @@ -174,5 +188,10 @@ async def test_reuse_path_no_llm(monkeypatch): assert ( out["hash"] == "deadbeef0000" ) # same frozen hash, not re-derived from a new plan - assert set(out["capabilities"]) == {"reviewer", "triager", "formatter"} + assert set(out["capabilities"]) == { + "reviewer", + "verifier", + "triager", + "formatter", + } assert out["result"]["note"].startswith("audited") From f48b985dff4af8a060d7c0e65a82cc6e26b85713 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 01:35:19 -0700 Subject: [PATCH 14/64] demo(workflow): derive the planning message's capability list from the registry The first chat message hardcoded (reviewer, triager, formatter) and so contradicted the validation list after verifier was added. Derive it from CapabilityRegistry.names() (new accessor) so the recording can't drift from the registered set again. 
--- .../authored_workflow_demo/security_audit_planner/agent.py | 3 ++- .../samples/workflows/authored_workflow_spike/authoring.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index c83e9eb109b..9aff51a7fc1 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -225,10 +225,11 @@ async def author_validate_execute(ctx: Context, node_input): ) else: reused = False + cap_list = ", ".join(f"`{n}`" for n in reg.names()) yield _msg( "🧭 **Model-authored Workflow** — planning a security audit over " f"{len(FILES)} files using only registered capabilities " - "(`reviewer`, `triager`, `formatter`)." + f"({cap_list})." ) planner = Agent( name="planner", diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index c251a1daa2a..4a23c3cbf9f 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -182,6 +182,9 @@ def __contains__(self, name): def __getitem__(self, name): return self._by_name[name] + def names(self) -> list[str]: + return list(self._by_name) + def open_map_warnings(self) -> list[str]: """Spike lesson: open-ended dict[str, X] output fields are a structured- output reliability hazard (Gemini fills them unreliably). 
Warn on them.""" From 82bd893e36821e0046d1e38ee3bdcee02a13cb4c Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 03:15:55 -0700 Subject: [PATCH 15/64] =?UTF-8?q?feat(workflow):=20exportable=20FrozenWork?= =?UTF-8?q?flowRecord=20=E2=80=94=20export=5Fplan/import=5Fplan=20+=20demo?= =?UTF-8?q?=20beat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the frozen plan a first-class, portable artifact (DESIGN.md §10), so the RFC's enterprise claim (reviewable / diffable / replayable model-authored plans) is real, not paper. authoring.py: - FrozenWorkflowRecord (the single §5 shape) + ValidationResult; FrozenWorkflowRecord.freeze() captures spec_hash, planner_model, registry + per-capability versions, validation, task_input_digest. - canonical_json / sha256_hex — the one fixed hash definition (sort_keys + tight separators) so two exporters agree. - export_plan(record) -> dict; import_plan(envelope, registry, task_input=None) that NEVER trusts the envelope: recompute sha256 (reject tamper), re-validate vs CURRENT registry (reject dropped capability), reject per-capability version drift; execution-input contract (replay needs matching input digest; template needs task_input_schema). - Capability.version + CapabilityRegistry.capability_versions() for drift detection; referenced_capabilities() walker. test_authoring.py (+5 -> 19): round-trip replays same hash; tamper rejected; dropped capability rejected; version drift rejected; new input without template schema rejected (and accepted once a schema is attached). demo: an 'Export plan' beat writes the full envelope to security_audit_plan.json and re-imports it (proving defensive import). Unifies the displayed hash on the canonical definition. Narrative/README gain Beat 3b; counts 11+19+4 = 34. Generated envelope + demo session dbs gitignored. Validated live on gemini-3.5-flash: export beat writes a complete envelope, re-import passes, reuse replays the same hash. 
--- .gitignore | 6 + .../authored_workflow_demo/DEMO_NARRATIVE.md | 45 ++++- .../authored_workflow_demo/README.md | 5 +- .../security_audit_planner/agent.py | 56 +++++- .../authored_workflow_spike/DESIGN.md | 4 +- .../authored_workflow_spike/README.md | 16 +- .../authored_workflow_spike/authoring.py | 185 +++++++++++++++++- .../authored_workflow_spike/test_authoring.py | 67 +++++++ 8 files changed, 355 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index c3ddc7ea990..7bacd1d1d43 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,9 @@ CLAUDE.md # Conformance test outputs (timestamped folders from --test mode) **/conformance/20*-*-*_*-*-*/ + +# Generated by the authored_workflow_demo "Export plan" beat (sample output) +security_audit_plan.json + +# ADK Web demo session stores (runtime) +demo_sessions*.db diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 1c77f99af28..845613ca942 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -56,14 +56,40 @@ Send: **"Plan and run a codebase security review."** The chat streams: ## Beat 3 — freeze (State tab) ``` -🔒 Frozen spec persisted to session state — hash 1f4c0883beb6. +🔒 Frozen spec persisted to session state — hash 71997cdf0669. Re-send the prompt: it replays this exact plan, not a new one. ``` > "Open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. The plan > is now durable data you can store, diff, and audit." -*(Presenter note: this demo persists only `{spec, hash}` for readability. Production v1 stores the full `FrozenWorkflowRecord` — planner/registry/capability versions, validation, `task_input_schema`/`digest` — see `authored_workflow_spike/DESIGN.md` §5. 
The demo illustrates the behavior; it is not the canonical persistence contract.)* +*(Presenter note: session **state** keeps a minimal `{spec, hash}` subset so the State tab stays readable. The **export** beat below serializes the full `FrozenWorkflowRecord`. Production v1 would persist the full record to state too — see `authored_workflow_spike/DESIGN.md` §5. The split here is presentational, not the canonical contract.)* + +## Beat 3b — export the plan (the enterprise artifact) + +``` +📦 Exported plan → security_audit_plan.json (full 71997cdf0669, schema v1, + planner gemini-3.5-flash). Re-imported OK — import recomputes the hash and + re-validates against the current registry, never trusting the envelope's own + validation. This is the reviewable / diffable / replayable audit artifact. +``` + +> "The frozen plan isn't just in-memory state — it serializes to a **portable +> JSON envelope**: the spec, its `sha256`, the planner model, registry + +> per-capability versions, the validation result, and a *digest* of the task +> input (not the raw input). `cat security_audit_plan.json` — this is the thing +> you check into a repo, diff in a PR, and hand to an auditor. And import is +> **defensive**: it recomputes the hash (rejects a tampered spec), re-validates +> against the *current* registry (rejects a dropped capability), and flags +> per-capability version drift — it never trusts the envelope's own `validation` +> stamp. That defensive import is exactly what makes a model-authored plan safe +> to store and replay later." + +Show the file on camera: + +```bash +cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, capability_versions, validation}' +``` ## Beat 4 — execute (Events / trace tab) @@ -83,7 +109,7 @@ Send: **"Plan and run a codebase security review."** The chat streams: ## Beat 5 — reproduce (re-send the same prompt) ``` -♻️ Reusing frozen plan from session state — hash 1f4c0883beb6. 
+♻️ Reusing frozen plan from session state — hash 71997cdf0669. The model is NOT re-invoked; the exact prior plan is replayed. ✅ Validation passed. ... 📄 Audit result: ... @@ -97,22 +123,23 @@ Send: **"Plan and run a codebase security review."** The chat streams: | Run | `reused` | `hash` | | ----------- | -------- | -------------- | -| 1 (author) | `false` | `1f4c0883beb6` | -| 2 (re-send) | `true` | `1f4c0883beb6` | +| 1 (author) | `false` | `71997cdf0669` | +| 2 (re-send) | `true` | `71997cdf0669` | Same hash, `reused` flips to `true` — the model is not called the second time. ## Close (~20s) > "So: a model authored a typed, validated, capability-bounded plan; ADK executed -> it on the real engine; and a re-send replayed the exact frozen plan. The -> deterministic test suites — 11 (#92) + 14 (#93) + 4 (demo) — lock all of this -> in CI, including a no-LLM test of this reuse path." +> it on the real engine; the plan **exported** to a portable, defensively-imported +> audit artifact; and a re-send replayed the exact frozen plan. The deterministic +> test suites — 11 (#92) + 19 (#93) + 4 (demo) — lock all of this in CI, including +> the no-LLM reuse path and the export round-trip / tamper / drift checks." 
## Proof commands (terminal, ~60s) ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 14 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 19 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 3a1ff6a9e04..ecb333b2b34 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -39,6 +39,7 @@ Point at the ADK-native evidence as it streams: 1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`pipeline → step → step`: a `reviewer → verifier` pipeline over the files, then `triager`, then `formatter`). 1. **Validation** — "Validation passed" + the capability list (all registered). 1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. +1. **Exported plan** — `📦 Exported plan → security_audit_plan.json`. The full `FrozenWorkflowRecord` (spec, `sha256`, planner model, registry + capability versions, validation, task-input digest) as a portable envelope; import recomputes the hash and re-validates against the current registry. `cat security_audit_plan.json | jq .` on camera. 1. **Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. 1. **Final output** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). 
@@ -57,11 +58,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 14 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 19 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` -- Deterministic suites: #92 **11** + #93 **14** + demo **4** = **29** (incl. a no-LLM reuse-path test). +- Deterministic suites: #92 **11** + #93 **19** + demo **4** = **34** (incl. a no-LLM reuse-path test). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 9aff51a7fc1..a9316c6f62f 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -31,7 +31,7 @@ from __future__ import annotations -import hashlib +import datetime import json import os import sys @@ -57,6 +57,10 @@ ) from authoring import Capability # noqa: E402 from authoring import CapabilityRegistry # noqa: E402 +from authoring import export_plan # noqa: E402 +from authoring import FrozenWorkflowRecord # noqa: E402 +from authoring import import_plan # noqa: E402 +from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 from authoring import WorkflowSpec # noqa: E402 from authoring import WorkflowSpecValidator # noqa: E402 @@ -200,9 +204,13 @@ def _msg(text: str) -> Event: def _hash(spec: WorkflowSpec) -> str: - return hashlib.sha256( - json.dumps(spec.model_dump(), sort_keys=True).encode() - ).hexdigest()[:12] + # The one canonical 
hash definition (authored_workflow_spike/authoring.py), + # shown truncated; the full digest lives in the exported FrozenWorkflowRecord. + return sha256_hex(spec.model_dump(mode="json"))[:12] + + +# Where the "Export plan" beat writes the portable envelope (cwd of `adk web`). +_EXPORT_PATH = os.path.join(os.getcwd(), "security_audit_plan.json") @node(rerun_on_resume=True) @@ -268,11 +276,12 @@ async def author_validate_execute(ctx: Context, node_input): # 3. FREEZE — persist spec + hash to session state on first author only # (visible in the State tab; reused runs already have it). - # NOTE: this demo persists only a minimal {spec, hash} subset to keep the - # walkthrough readable. Production v1 would store the full FrozenWorkflowRecord - # (planner/registry/capability versions, validation, task_input_schema/digest) - # — see authored_workflow_spike/DESIGN.md §5. The demo is illustrative, not the - # canonical persistence contract. + # NOTE: session state keeps a minimal {spec, hash} subset so the State tab + # stays readable for the resume/reuse beat. The EXPORT beat below serializes + # the full FrozenWorkflowRecord (planner/registry/capability versions, + # validation, task_input_digest) — see authored_workflow_spike/DESIGN.md §5/§10. + # Production v1 would persist that full record to state too; the split here is + # presentational, not the canonical contract. if not reused: ctx.state["authored_workflow:frozen_spec"] = spec.model_dump() ctx.state["authored_workflow:frozen_spec_hash"] = spec_hash @@ -281,6 +290,35 @@ async def author_validate_execute(ctx: Context, node_input): "Re-send the prompt: it replays this exact plan, not a new one." ) + # 3b. EXPORT — serialize the full FrozenWorkflowRecord to a portable JSON + # envelope (DESIGN.md §10), then prove the import contract by re-importing + # it: import_plan recomputes the hash and re-validates against the CURRENT + # registry — it never trusts the envelope's own `validation`. 
+ record = FrozenWorkflowRecord.freeze( + spec, + planner_model=MODEL, + registry=reg, + created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + task_input={"files": FILES}, + ) + envelope = export_plan(record) + try: + with open(_EXPORT_PATH, "w") as f: + json.dump(envelope, f, indent=2) + import_plan( + envelope, reg, task_input={"files": FILES} + ) # re-hash+re-validate + yield _msg( + f"📦 **Exported plan** → `{os.path.basename(_EXPORT_PATH)}` " + f"(full `{record.spec_hash[:12]}`, schema `{record.schema_version}`, " + f"planner `{record.planner_model}`). Re-imported OK — import " + "recomputes the hash and re-validates against the current registry, " + "never trusting the envelope's own validation. This is the " + "reviewable / diffable / replayable audit artifact." + ) + except OSError as e: + yield _msg(f"📦 Export skipped (filesystem): {e}") + # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). result = await SpecInterpreter(reg, ctx).execute(spec, {"files": FILES}) yield _msg( diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 726ff49a84f..c4309ec4c1f 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -189,10 +189,12 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. **Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). 
The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (14 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (19 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact +> **Spike status:** `export_plan` / `import_plan` / `FrozenWorkflowRecord` are **implemented** in `authoring.py` and exercised by deterministic tests (round-trip, tamper, dropped-capability, version-drift, replay-vs-template input) and a live demo "Export plan" beat. The *tiering* below remains the production roadmap. + **Source of truth = the typed `WorkflowSpec`.** The compiled `Workflow` is a *derived* artifact. Storage is tiered, scoped to keep generated code and compiled graphs out of v1: - **v1 (required) — persist the full `FrozenWorkflowRecord` per run** (§5) under `authored_workflow:frozen_record` — for resume/replay **and** drift detection. diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 36ed92f2691..132056e6c9c 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -14,11 +14,11 @@ behind the RFC's "can a model author good plans?" question. 
## Files -| File | Purpose | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / pipeline / branch / loop_until). | -| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | -| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. | +| File | Purpose | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / pipeline / branch / loop_until), and `FrozenWorkflowRecord` / `export_plan` / `import_plan` (portable plan envelope + defensive import). | +| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | +| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. | ## Deterministic tests (CI-safe, no network) @@ -26,10 +26,12 @@ behind the RFC's "can a model author good plans?" question. 
pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **14 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **19 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch -(correct route), and loop_until (stops + correct output). +(correct route), and loop_until (stops + correct output); plus **plan export/import** +(round-trip replays the same hash; import rejects a tampered spec, a dropped +capability, capability version drift, and a new input with no template schema). ## Live planner sweep (optional evidence) diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index 4a23c3cbf9f..d9c086c6da8 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -24,12 +24,17 @@ * ``SpecInterpreter`` — executes a validated spec on the real ADK Workflow engine via the #92 ``DynamicNodeSupervisor`` (step / fan_out / pipeline / branch / loop_until). +* ``FrozenWorkflowRecord`` / ``export_plan`` / ``import_plan`` — the frozen spec + as a first-class, portable artifact (DESIGN.md §10): export to a JSON + envelope; import recomputes the hash and re-validates against the *current* + registry, never trusting the envelope's own ``validation``. This is a demand-gate artifact, not production code. See README.md. 
""" from __future__ import annotations +import hashlib import json import os import sys @@ -169,12 +174,14 @@ class Capability(BaseModel): ) max_fan_out: int = 100 side_effect: bool = False + version: str = "1" # bumped when the capability's contract changes (drift) class CapabilityRegistry: - def __init__(self, capabilities: list[Capability]): + def __init__(self, capabilities: list[Capability], *, version: str = "1"): self._by_name = {c.name: c for c in capabilities} + self.version = version # registry_version (coarse drift signal) def __contains__(self, name): return name in self._by_name @@ -185,6 +192,16 @@ def __getitem__(self, name): def names(self) -> list[str]: return list(self._by_name) + def capability_versions( + self, only: Optional[set[str]] = None + ) -> dict[str, str]: + """name -> version for drift detection on import (optionally filtered).""" + return { + n: c.version + for n, c in self._by_name.items() + if only is None or n in only + } + def open_map_warnings(self) -> list[str]: """Spike lesson: open-ended dict[str, X] output fields are a structured- output reliability hazard (Gemini fills them unreliably). Warn on them.""" @@ -301,6 +318,172 @@ def _bindings(n) -> list[Binding]: return out +# ----------------------------------------------------------- export / import +# +# DESIGN.md §10: the frozen spec is a first-class, exportable artifact. The +# source of truth is the typed WorkflowSpec; the compiled Workflow is derived +# and never stored. A single canonical hash definition keeps two exporters in +# agreement, and import NEVER trusts the envelope's own `validation` — it +# recomputes the hash and re-validates against the *current* registry. 
+ + +def canonical_json(value) -> str: + """The one fixed serialization for hashing (no whitespace/key-order drift).""" + return json.dumps(value, sort_keys=True, separators=(",", ":")) + + +def sha256_hex(value) -> str: + return hashlib.sha256(canonical_json(value).encode()).hexdigest() + + +def referenced_capabilities(spec: WorkflowSpec) -> set[str]: + """Every capability name a spec composes (walks pipeline stages, branch + routes, and loop bodies — not just top-level steps).""" + found: set[str] = set() + + def walk(nodes): + for n in nodes: + cap = getattr(n, "capability", None) + if cap: + found.add(cap) + for st in getattr(n, "stages", None) or []: + found.add(st.capability) + for route in getattr(n, "routes", None) or []: + walk(route.block) + if getattr(n, "until_capability", None): + found.add(n.until_capability) + if getattr(n, "body", None): + walk(n.body) + + walk(spec.steps) + return found + + +class ValidationResult(BaseModel): + passed: bool + warnings: list[str] = Field(default_factory=list) + + +class FrozenWorkflowRecord(BaseModel): + """The single shape behind session state, the audit event, and the export + envelope (DESIGN.md §5) — v1 storage is never a weaker subset.""" + + schema_version: str = "v1" + spec: WorkflowSpec + spec_hash: str + planner_model: str + registry_version: str + capability_versions: dict[str, str] + validation: ValidationResult + created_at: str # ISO-8601, stamped at freeze (caller supplies; not now()) + task_input_schema: Optional[dict] = None + task_input_digest: Optional[str] = None + + @classmethod + def freeze( + cls, + spec: WorkflowSpec, + *, + planner_model: str, + registry: CapabilityRegistry, + created_at: str, + task_input=None, + task_input_schema: Optional[dict] = None, + ) -> "FrozenWorkflowRecord": + """Validate + capture everything needed for replay and drift detection.""" + warnings = WorkflowSpecValidator(registry).validate(spec) # raises on hard + refs = referenced_capabilities(spec) + return cls( + 
spec=spec, + spec_hash=sha256_hex(spec.model_dump(mode="json")), + planner_model=planner_model, + registry_version=registry.version, + capability_versions=registry.capability_versions(only=refs), + validation=ValidationResult(passed=True, warnings=warnings), + created_at=created_at, + task_input_schema=task_input_schema, + task_input_digest=( + None if task_input is None else sha256_hex(task_input) + ), + ) + + +class PlanImportError(Exception): + """Raised when an exported plan fails integrity, drift, or input checks.""" + + +def export_plan(record: FrozenWorkflowRecord) -> dict: + """Serialize the §5 record to a portable JSON-able envelope.""" + return record.model_dump(mode="json") + + +def import_plan( + envelope: dict, registry: CapabilityRegistry, *, task_input=None +) -> WorkflowSpec: + """Re-hydrate an exported plan, NEVER trusting the envelope's own checks. + + Integrity + drift (DESIGN.md §10): + 1. recompute sha256(canonical_json(spec)); REJECT if != envelope spec_hash; + 2. re-run WorkflowSpecValidator against the CURRENT registry (catches a + dropped/renamed capability); + 3. per-capability version drift vs the envelope -> fail loudly. + Execution-input contract: + * replay (no schema): task_input digest MUST match the envelope's; + * template (schema): task_input is validated against task_input_schema; + * neither: do NOT execute against arbitrary new input. + """ + spec = WorkflowSpec.model_validate(envelope["spec"]) + + # 1. integrity — recompute, don't trust. + recomputed = sha256_hex(spec.model_dump(mode="json")) + if recomputed != envelope.get("spec_hash"): + raise PlanImportError( + "spec_hash mismatch: envelope has" + f" {envelope.get('spec_hash')!r}, recomputed {recomputed!r} — the spec" + " was tampered with or re-serialized under a different definition" + ) + + # 2. re-validate against the CURRENT registry (dropped capability fails here). 
+ try: + WorkflowSpecValidator(registry).validate(spec) + except SpecValidationError as e: + raise PlanImportError(f"re-validation against current registry failed: {e}") + + # 3. per-capability version drift. + current = registry.capability_versions(only=referenced_capabilities(spec)) + recorded = envelope.get("capability_versions", {}) + drifted = { + n: (recorded.get(n), current.get(n)) + for n in current + if recorded.get(n) != current.get(n) + } + if drifted: + raise PlanImportError( + f"capability version drift (recorded vs current): {drifted} — promote" + " to a template with explicit migration before reuse" + ) + + # Execution-input contract. + if task_input is not None: + schema = envelope.get("task_input_schema") + if schema is not None: + missing = [k for k in schema.get("required", []) if k not in task_input] + if missing: + raise PlanImportError( + f"task input missing required keys {missing} for this template" + ) + else: + digest = sha256_hex(task_input) + if digest != envelope.get("task_input_digest"): + raise PlanImportError( + "task_input digest mismatch and no task_input_schema captured:" + " this plan can only be REPLAYED on its original input (promote to" + " a template to reuse it on new input)" + ) + + return spec + + # ----------------------------------------------------------------- interpreter class SpecInterpreter: """Executes a validated WorkflowSpec on the real ADK engine via the #92 diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index cedf8eb1295..2f6934a5eb2 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -22,6 +22,7 @@ from __future__ import annotations +import json import os import sys @@ -39,11 +40,16 @@ from authoring import Branch from authoring import Capability from authoring import 
CapabilityRegistry +from authoring import export_plan from authoring import FanOut +from authoring import FrozenWorkflowRecord +from authoring import import_plan from authoring import LoopUntil +from authoring import PlanImportError from authoring import Pipeline from authoring import PipelineStage from authoring import Route +from authoring import sha256_hex from authoring import SpecInterpreter from authoring import SpecValidationError from authoring import StepRef @@ -451,3 +457,64 @@ async def test_interpreter_pipeline_enforces_max_fan_out(): await _run_spec(_pipeline_spec(), reg, {"items": [0, 1]}) # rejected pre-dispatch: no stage ran. assert log == [] + + +# ----------------------------------------------------------------- export/import +_TASK = {"files": [{"path": "a.py", "code": "bad"}]} + + +def _frozen(): + return FrozenWorkflowRecord.freeze( + _fanout_aggregate_spec(), + planner_model="gemini-3.5-flash", + registry=_registry(), + created_at="2026-06-02T00:00:00Z", + task_input=_TASK, + ) + + +def test_export_then_import_roundtrip_replays_same_hash(): + env = export_plan(_frozen()) + # the envelope is JSON-serializable and carries the full §5 record. + assert json.loads(json.dumps(env))["schema_version"] == "v1" + assert set(env["capability_versions"]) == {"review", "count"} + # re-import on the ORIGINAL input (replay path) succeeds and recomputes the + # SAME hash from the spec — integrity holds. + spec = import_plan(env, _registry(), task_input=_TASK) + assert sha256_hex(spec.model_dump(mode="json")) == env["spec_hash"] + + +def test_import_rejects_tampered_spec(): + env = export_plan(_frozen()) + # tamper with the spec but leave the recorded hash -> integrity check fires. 
+ env["spec"]["goal"] = "exfiltrate" + with pytest.raises(PlanImportError, match="spec_hash mismatch"): + import_plan(env, _registry(), task_input=_TASK) + + +def test_import_rejects_dropped_capability(): + env = export_plan(_frozen()) + # current registry no longer has `count` -> re-validation against the CURRENT + # registry fails (we never trust the envelope's own `validation`). + shrunk = CapabilityRegistry([_registry()["review"]]) + with pytest.raises(PlanImportError, match="re-validation"): + import_plan(env, shrunk, task_input=_TASK) + + +def test_import_rejects_capability_version_drift(): + env = export_plan(_frozen()) + # same capabilities, but `review` was bumped since export -> drift. + bumped = _registry() + bumped["review"].version = "2" + with pytest.raises(PlanImportError, match="version drift"): + import_plan(env, bumped, task_input=_TASK) + + +def test_import_rejects_new_input_without_template_schema(): + env = export_plan(_frozen()) # no task_input_schema captured -> replay-only + other = {"files": [{"path": "z.py", "code": "ok"}]} + with pytest.raises(PlanImportError, match="digest mismatch"): + import_plan(env, _registry(), task_input=other) + # but template promotion (a captured schema) lets a new input through: + env["task_input_schema"] = {"required": ["files"]} + assert import_plan(env, _registry(), task_input=other) is not None From 0bc33236d03020e2a68e78168b9e7943931c84fd Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 2 Jun 2026 03:25:45 -0700 Subject: [PATCH 16/64] fix(workflow): enforce schema_version + registry_version on import; isort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1: isort ordering in test_authoring.py imports (PlanImportError after PipelineStage) — pre-commit clean. P2: import_plan now hard-errors on registry_version drift (envelope vs current registry.version), matching DESIGN.md §10 'registry-version match … drift = hard error'. 
Previously only dropped capabilities + per-capability versions were checked. P3: import_plan rejects an unsupported schema_version (only 'v1' supported) — a defensive importer refuses formats it can't read. +2 deterministic tests (-> 21): registry-version drift and unsupported schema_version both rejected. Counts 11+21+4 = 36. --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 4 ++-- .../authored_workflow_demo/README.md | 4 ++-- .../authored_workflow_spike/DESIGN.md | 2 +- .../authored_workflow_spike/README.md | 5 ++-- .../authored_workflow_spike/authoring.py | 24 +++++++++++++++++-- .../authored_workflow_spike/test_authoring.py | 20 +++++++++++++++- 6 files changed, 49 insertions(+), 10 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 845613ca942..8d9ad460b42 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -133,13 +133,13 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > "So: a model authored a typed, validated, capability-bounded plan; ADK executed > it on the real engine; the plan **exported** to a portable, defensively-imported > audit artifact; and a re-send replayed the exact frozen plan. The deterministic -> test suites — 11 (#92) + 19 (#93) + 4 (demo) — lock all of this in CI, including +> test suites — 11 (#92) + 21 (#93) + 4 (demo) — lock all of this in CI, including > the no-LLM reuse path and the export round-trip / tamper / drift checks." 
## Proof commands (terminal, ~60s) ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 19 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 21 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index ecb333b2b34..5aa4fb7f2ba 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -58,11 +58,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 19 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 21 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 ``` -- Deterministic suites: #92 **11** + #93 **19** + demo **4** = **34** (incl. a no-LLM reuse-path test). +- Deterministic suites: #92 **11** + #93 **21** + demo **4** = **36** (incl. a no-LLM reuse-path test). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index c4309ec4c1f..6f597f0f66d 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -189,7 +189,7 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. 
**Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (19 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (21 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 132056e6c9c..6508a12ff6d 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,12 +26,13 @@ behind the RFC's "can a model author good plans?" question. 
pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **19 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **21 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output); plus **plan export/import** (round-trip replays the same hash; import rejects a tampered spec, a dropped -capability, capability version drift, and a new input with no template schema). +capability, capability/registry version drift, an unsupported schema_version, +and a new input with no template schema). ## Live planner sweep (optional evidence) diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index d9c086c6da8..daea013e73a 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -412,6 +412,9 @@ class PlanImportError(Exception): """Raised when an exported plan fails integrity, drift, or input checks.""" +SUPPORTED_SCHEMA_VERSION = "v1" + + def export_plan(record: FrozenWorkflowRecord) -> dict: """Serialize the §5 record to a portable JSON-able envelope.""" return record.model_dump(mode="json") @@ -423,15 +426,24 @@ def import_plan( """Re-hydrate an exported plan, NEVER trusting the envelope's own checks. Integrity + drift (DESIGN.md §10): + 0. reject an unsupported schema_version; 1. recompute sha256(canonical_json(spec)); REJECT if != envelope spec_hash; 2. re-run WorkflowSpecValidator against the CURRENT registry (catches a dropped/renamed capability); - 3. 
per-capability version drift vs the envelope -> fail loudly. + 3. registry-version and per-capability version drift -> fail loudly. Execution-input contract: * replay (no schema): task_input digest MUST match the envelope's; * template (schema): task_input is validated against task_input_schema; * neither: do NOT execute against arbitrary new input. """ + # 0. schema_version — a defensive importer refuses formats it doesn't know. + schema_version = envelope.get("schema_version") + if schema_version != SUPPORTED_SCHEMA_VERSION: + raise PlanImportError( + f"unsupported schema_version {schema_version!r} (this importer supports" + f" {SUPPORTED_SCHEMA_VERSION!r})" + ) + spec = WorkflowSpec.model_validate(envelope["spec"]) # 1. integrity — recompute, don't trust. @@ -449,7 +461,15 @@ def import_plan( except SpecValidationError as e: raise PlanImportError(f"re-validation against current registry failed: {e}") - # 3. per-capability version drift. + # 3a. registry-version drift is a hard error (DESIGN.md §10). + if envelope.get("registry_version") != registry.version: + raise PlanImportError( + "registry_version drift (recorded" + f" {envelope.get('registry_version')!r} vs current" + f" {registry.version!r}) — re-validate / migrate before reuse" + ) + + # 3b. per-capability version drift. 
current = registry.capability_versions(only=referenced_capabilities(spec)) recorded = envelope.get("capability_versions", {}) drifted = { diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index 2f6934a5eb2..37662cf1d31 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -45,9 +45,9 @@ from authoring import FrozenWorkflowRecord from authoring import import_plan from authoring import LoopUntil -from authoring import PlanImportError from authoring import Pipeline from authoring import PipelineStage +from authoring import PlanImportError from authoring import Route from authoring import sha256_hex from authoring import SpecInterpreter @@ -510,6 +510,24 @@ def test_import_rejects_capability_version_drift(): import_plan(env, bumped, task_input=_TASK) +def test_import_rejects_unsupported_schema_version(): + env = export_plan(_frozen()) + env["schema_version"] = "v2" # an importer must refuse formats it can't read + with pytest.raises(PlanImportError, match="schema_version"): + import_plan(env, _registry(), task_input=_TASK) + + +def test_import_rejects_registry_version_drift(): + env = export_plan(_frozen()) + # same capabilities/versions, but the whole registry was re-versioned -> + # hard error per DESIGN.md §10. 
+ v2_registry = CapabilityRegistry( + list(_registry()._by_name.values()), version="2" + ) + with pytest.raises(PlanImportError, match="registry_version"): + import_plan(env, v2_registry, task_input=_TASK) + + def test_import_rejects_new_input_without_template_schema(): env = export_plan(_frozen()) # no task_input_schema captured -> replay-only other = {"files": [{"path": "z.py", "code": "ok"}]} From 7dc25cfb0cef685c60a4ee887b161d5ff9d1cef2 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 13:07:46 -0700 Subject: [PATCH 17/64] docs(workflow): converge with ADK AgentConfig; answer storage/tools/observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds DESIGN §11 'Convergence with ADK AgentConfig' (renumbers Future -> §12) in response to reviewer questions on issue #93: - Lower the static subset (sequence/parallel/loop) to ADK's Sequential/Parallel/ LoopAgentConfig instead of reinventing serialization; keep branch/fan_out/ pipeline as new types only because config can't express them (static sub_agents resolved once at load; no ConditionalAgent; needs #92 ctx.pipeline). - Why the planner does NOT emit raw AgentConfig: static graph; Discriminator union rejected by response_schema; FQN tool/agent/callback refs (importlib, no allow-list) re-open the code-exec surface the declarative+allow-list model closes. - Q1 storage: FrozenWorkflowRecord in session State + audit event + export envelope. - Q2 custom tools: registered capability by registry name (allow-list), not FQN. - Q3 version/observability: spec_hash + registry/capability versions -> drift rejected on import; compiled Workflow runs on the real engine so ADK tracing applies. All claims source-verified against agents/agent_config.py and config_agent_utils.py. 
--- .../authored_workflow_spike/DESIGN.md | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 6f597f0f66d..1d6814752df 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -241,10 +241,41 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (21 deter Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. -## 11. Future (post-gate, NOT MVP) +## 11. Convergence with ADK `AgentConfig` (+ storage, custom tools, observability) + +A reviewer asked whether the planner should author ADK's existing **`AgentConfig`** (the `root_agent.yaml` format) directly. Verified against source — `agents/agent_config.py`, `agents/base_agent_config.py`, `agents/{llm,sequential,parallel,loop}_agent_config.py`, `agents/common_configs.py`, `tools/_tool_configs.py` (names), and `agents/config_agent_utils.py`: + +**Lower to config where it fits.** ADK config already models the *static* shapes; `WorkflowSpec`'s static subset compiles to them rather than reinventing a parallel serialization: + +| `WorkflowSpec` block | Lowers to | Notes | +| ----------------------------------------------- | ----------------------- | ----------------------------------------------------------- | +| sequence | `SequentialAgentConfig` | `sub_agents: list[AgentRefConfig]` | +| parallel-map (`fan_out` over a **static** list) | `ParallelAgentConfig` | static sub-agent list | +| bounded loop | `LoopAgentConfig` | `max_iterations` | +| `branch` (route on a value) | — | **no config type** (`ConditionalAgent` doesn't exist) | +| `fan_out` over a **runtime** list | — | `sub_agents` is resolved once at load; no runtime 
iteration | +| `pipeline` (barrier-free per-item) | — | requires #92 `ctx.pipeline` | + +**Why the planner does not emit raw `AgentConfig`:** + +1. **Static graph, build-time only.** `config_agent_utils.from_config(path)` parses the YAML and resolves the *entire* `sub_agents` tree at load (`base_agent.__create_kwargs` → `resolve_agent_reference` per child), before any request. There is no runtime list iteration — so dynamic per-item fan-out and conditional branch cannot be expressed. +1. **Not a clean `response_schema`.** `AgentConfig` is a `RootModel` over a `Discriminator(agent_config_discriminator)` union; Gemini's `response_schema` rejects the emitted `discriminator` keyword (`Schema: extra_forbidden` — the spike's §9 lesson). It also carries open `extra='allow'` maps (`ToolArgsConfig`, `BaseAgentConfig.model_extra`). +1. **Code-bearing fields re-open the execution surface.** Tools/agents/callbacks are named by **fully-qualified importable path** — `CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks` — resolved via `importlib.import_module` + `getattr` (`config_agent_utils.resolve_code_reference`) with **no allow-list**. A model authoring those would reintroduce exactly the arbitrary-import risk the declarative + allow-list model removes. + +**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe). Compile its static subset to `AgentConfig` so those shapes share ADK's serialization and tooling; the `branch` / `fan_out` / `pipeline` types and capability allow-listing exist only for what config cannot express or cannot do safely. The compiled artifact is still an ordinary `Workflow` (§2). + +**Q1 — spec storage.** §5/§10: one `FrozenWorkflowRecord` in session State (`authored_workflow:frozen_record`, unprefixed/session-scoped; resume reuses, never re-plans), a state-only audit event, and a v1.1 export envelope. Compiled `Workflow` is derived, never canonical. 
+ +**Q2 — custom tools.** A custom tool is a **registered capability** referenced by **registry name** (the registry is the allow-list), carrying per-capability policy (`max_calls`, `max_fan_out`, `side_effect`→approval, arg constraints) — §6. Deliberately *not* config's FQN `tools:` field: the model never names an import path. + +**Q3 — version control & observability.** Drift surface = `spec_hash` (sha256/canonical-JSON) + `planner_model` + `registry_version` + per-capability `capability_versions` in the record (§5); import hard-errors on schema-version, hash, registry-version, or capability-version drift (spike-enforced, §10). The export envelope is diffable for PR/audit review. Runtime observability is unchanged: the compiled `Workflow` runs on the real engine, so existing ADK tracing/events apply; the frozen record + hash anchor each run to its plan. + +## 12. Future (post-gate, NOT MVP) **Hierarchical / sub-plan authoring** — a registered capability that is itself an `AuthoredWorkflowAgent`, so a step can expand into its own authored sub-plan. This is the likely path to parity with Claude Code's unbounded orchestration (it lifts the single-response plan-size ceiling), but it is **out of MVP scope** and should be evaluated **only after the 3–5-task build gate**. MVP stays single-level: `WorkflowSpec` + validator + freeze/replay + export. +**Upstream `AgentConfig` extension (optional).** If the dynamic constructs prove their value, the cleaner long-term home for `branch` / dynamic `fan_out` may be **new agent-config types upstream** (a conditional agent; a runtime-fan-out agent) plus an allow-listed capability-reference tool field — at which point authoring could converge fully onto an extended `AgentConfig`. Out of scope here; depends on upstream accepting new agent classes. + ## References - #92 — supervised concurrent dynamic dispatch + `ctx.pipeline` (executor). 
From 0061706772fae273c1a65b54a3624d874d82829c Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 15:17:40 -0700 Subject: [PATCH 18/64] =?UTF-8?q?docs(workflow):=20correct=20AgentConfig?= =?UTF-8?q?=20convergence=20framing=20(=C2=A711)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review on the convergence section: - 'design converges / should lower', not 'now lowers' — the spike does not yet implement an AgentConfig-lowering compiler (explicit caveat added). - precise table: static parallel block -> ParallelAgentConfig; runtime fan_out/pipeline/branch have no direct config equivalent (ParallelAgentConfig is static parallel sub-agents, not data-mapping over a runtime list). - soften FQN wording to a trust-boundary mismatch (FQN imports are fine for developer-authored config; the concern is a MODEL authoring raw FQNs), not 'config is unsafe'. --- .../authored_workflow_spike/DESIGN.md | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 1d6814752df..26384f4916a 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -245,24 +245,24 @@ Net: this turns the proposal from "a model can author plans" into "**model-autho A reviewer asked whether the planner should author ADK's existing **`AgentConfig`** (the `root_agent.yaml` format) directly. 
Verified against source — `agents/agent_config.py`, `agents/base_agent_config.py`, `agents/{llm,sequential,parallel,loop}_agent_config.py`, `agents/common_configs.py`, `tools/_tool_configs.py` (names), and `agents/config_agent_utils.py`: -**Lower to config where it fits.** ADK config already models the *static* shapes; `WorkflowSpec`'s static subset compiles to them rather than reinventing a parallel serialization: +**Lower to config where it fits.** ADK config already models the *static* shapes, so the static subset **should lower to** them rather than reinvent a serialization. **This is a design direction — the spike does not yet implement an `AgentConfig`-lowering compiler.** -| `WorkflowSpec` block | Lowers to | Notes | -| ----------------------------------------------- | ----------------------- | ----------------------------------------------------------- | -| sequence | `SequentialAgentConfig` | `sub_agents: list[AgentRefConfig]` | -| parallel-map (`fan_out` over a **static** list) | `ParallelAgentConfig` | static sub-agent list | -| bounded loop | `LoopAgentConfig` | `max_iterations` | -| `branch` (route on a value) | — | **no config type** (`ConditionalAgent` doesn't exist) | -| `fan_out` over a **runtime** list | — | `sub_agents` is resolved once at load; no runtime iteration | -| `pipeline` (barrier-free per-item) | — | requires #92 `ctx.pipeline` | +| `WorkflowSpec` block | ADK config relationship | +| ----------------------------------------- | ---------------------------------------------------------------------- | +| sequence | lowers to `SequentialAgentConfig` (`sub_agents: list[AgentRefConfig]`) | +| static parallel block | lowers to `ParallelAgentConfig` (static sub-agent list) | +| bounded loop | lowers to `LoopAgentConfig` (`max_iterations`) | +| runtime `fan_out` / `pipeline` / `branch` | no direct config equivalent | -**Why the planner does not emit raw `AgentConfig`:** +`ParallelAgentConfig` models a **static** set of parallel sub-agents, 
**not** data-mapping over a runtime list — so per-item `fan_out` sits in the "no equivalent" row, not the parallel row. -1. **Static graph, build-time only.** `config_agent_utils.from_config(path)` parses the YAML and resolves the *entire* `sub_agents` tree at load (`base_agent.__create_kwargs` → `resolve_agent_reference` per child), before any request. There is no runtime list iteration — so dynamic per-item fan-out and conditional branch cannot be expressed. +**Why the planner should not emit raw `AgentConfig`:** + +1. **Static graph, build-time only.** `config_agent_utils.from_config(path)` parses the YAML and resolves the *entire* `sub_agents` tree at load (`base_agent.__create_kwargs` → `resolve_agent_reference` per child), before any request. There is no runtime list iteration — so per-item fan-out, pipeline, and conditional branch routing cannot be expressed. 1. **Not a clean `response_schema`.** `AgentConfig` is a `RootModel` over a `Discriminator(agent_config_discriminator)` union; Gemini's `response_schema` rejects the emitted `discriminator` keyword (`Schema: extra_forbidden` — the spike's §9 lesson). It also carries open `extra='allow'` maps (`ToolArgsConfig`, `BaseAgentConfig.model_extra`). -1. **Code-bearing fields re-open the execution surface.** Tools/agents/callbacks are named by **fully-qualified importable path** — `CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks` — resolved via `importlib.import_module` + `getattr` (`config_agent_utils.resolve_code_reference`) with **no allow-list**. A model authoring those would reintroduce exactly the arbitrary-import risk the declarative + allow-list model removes. +1. **Trust-boundary mismatch on tool/agent refs.** Tools/agents/callbacks are named by **fully-qualified importable path** — `CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks` — resolved via `importlib.import_module` + `getattr` (`config_agent_utils.resolve_code_reference`). 
That FQN-import model is appropriate for **developer-authored** config; the concern is specifically letting a **model** author raw FQNs. For model-authored plans we want **capability allow-listing**, not arbitrary import paths — a trust-boundary difference, not a flaw in config. -**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe). Compile its static subset to `AgentConfig` so those shapes share ADK's serialization and tooling; the `branch` / `fan_out` / `pipeline` types and capability allow-listing exist only for what config cannot express or cannot do safely. The compiled artifact is still an ordinary `Workflow` (§2). +**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe); lower its static subset to `AgentConfig` so those shapes share ADK's serialization and tooling; keep `branch` / `fan_out` / `pipeline` + capability allow-listing as new surface only for the dynamic and trust-boundary pieces config doesn't cover. The compiled artifact is still an ordinary `Workflow` (§2). **Q1 — spec storage.** §5/§10: one `FrozenWorkflowRecord` in session State (`authored_workflow:frozen_record`, unprefixed/session-scoped; resume reuses, never re-plans), a state-only audit event, and a v1.1 export envelope. Compiled `Workflow` is derived, never canonical. From 11b32154d6ce8800313f1bb5b323b4fd4f948265 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 15:22:55 -0700 Subject: [PATCH 19/64] demo(workflow): add honest AgentConfig-convergence talking point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tie the demo to RFC #93 §11 without overclaiming: this plan's top-level sequence is the kind of static shape that should lower to SequentialAgentConfig, while the reviewer->verifier pipeline (per-item over a runtime list) is exactly what AgentConfig can't express. 
Explicitly notes this is a design direction — the demo runs via SpecInterpreter and does NOT lower to AgentConfig (no such compiler in the spike). README section + a presenter aside in the narrative. --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 12 ++++++++++++ .../workflows/authored_workflow_demo/README.md | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 8d9ad460b42..e6a87659d25 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -136,6 +136,18 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > test suites — 11 (#92) + 21 (#93) + 4 (demo) — lock all of this in CI, including > the no-LLM reuse path and the export round-trip / tamper / drift checks." +**Optional aside — convergence with ADK `AgentConfig`** (if a reviewer asks "why not author `root_agent.yaml`?"): + +> "The static parts of a plan — this one's a top-level sequence of three steps — +> are exactly what ADK config already models (`SequentialAgent`). The RFC's +> direction is to **lower those to `AgentConfig`** rather than reinvent them. But +> the `reviewer → verifier` **pipeline** is per-item over a *runtime* list, which +> config can't express (`sub_agents` resolve once at load; there's no conditional +> agent), and capabilities are referenced by **registry name, not importable +> FQN** — so a model never names an import path. That dynamic + trust-boundary +> delta is the only reason `WorkflowSpec` exists. 
(Direction, not shown here — the +> demo runs via the interpreter; no `AgentConfig` compiler in the spike yet.)" + ## Proof commands (terminal, ~60s) ```bash diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 5aa4fb7f2ba..37a2397cd14 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -45,6 +45,15 @@ Point at the ADK-native evidence as it streams: (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) +### Relationship to ADK `AgentConfig` (talking point) + +The RFC's direction is to **converge with ADK config** (RFC #93 → "Relationship to ADK `AgentConfig`"; DESIGN §11): the *static* shapes of an authored plan should lower to `Sequential`/`Parallel`/`LoopAgentConfig`, while the dynamic constructs stay `WorkflowSpec`-only. This demo's plan makes the split concrete: + +- the **top-level sequence** (`pipeline → triager → formatter`) is the kind of static composition that maps to a `SequentialAgent`; +- the **`reviewer → verifier` pipeline** (per-item, barrier-free over a runtime list) is exactly what `AgentConfig` **can't** express — no `ConditionalAgent`, and `sub_agents` are resolved once at load — which is why `WorkflowSpec` exists. + +Honest scope: this is a **design direction**, not shown here — the demo executes via the `SpecInterpreter` on the real engine; it does **not** emit or lower to `AgentConfig` (no such compiler in the spike yet). + ## 3. 
Shape sweep — not a one-off (1–2 min) ```bash From 625d34b7f0a3ac04052b45d7967532bca0897163 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 15:45:13 -0700 Subject: [PATCH 20/64] =?UTF-8?q?feat(workflow):=20demonstrate=20AgentConf?= =?UTF-8?q?ig=20lowering=20of=20the=20static=20subset=20(=C2=A711)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the convergence concrete instead of paper. lower_to_agent_config() projects a WorkflowSpec's static skeleton onto ADK AgentConfig shapes: - sequence -> SequentialAgent; loop -> LoopAgent (max_iterations); leaf step -> LlmAgent, referenced by ALLOW-LISTED capability name (never an importable FQN); - dynamic blocks (fan_out over a runtime list, pipeline, branch) are emitted as explicit markers, never fabricated as config. Illustrative structural projection — NOT a loadable root_agent.yaml. A full loadable-config compiler stays future (DESIGN §12). - authoring.py: lower_to_agent_config / agent_config_coverage / _lower_block. - test_authoring.py (+4 -> 25): pure-sequence lowers to SequentialAgent; loop -> LoopAgent; dynamic blocks flagged unsupported; projection never emits an FQN. - demo: a '🧬 AgentConfig lowering' beat prints the projection (2/3 of the demo plan lowers; pipeline flagged) — validated live on gemini-3.5-flash. - demo test (+1 -> 5): demo plan lowers to SequentialAgent + 2 LlmAgent leaves, pipeline no-equivalent, no FQN. - docs: README/DESIGN §11/narrative updated to 'demonstrated' (not 'not shown'); counts 11+25+5 = 41. 
--- .../authored_workflow_demo/DEMO_NARRATIVE.md | 41 ++++++-- .../authored_workflow_demo/README.md | 11 ++- .../security_audit_planner/agent.py | 18 ++++ .../authored_workflow_demo/test_demo_agent.py | 21 +++++ .../authored_workflow_spike/DESIGN.md | 4 +- .../authored_workflow_spike/README.md | 4 +- .../authored_workflow_spike/authoring.py | 94 +++++++++++++++++++ .../authored_workflow_spike/test_authoring.py | 91 ++++++++++++++++++ 8 files changed, 266 insertions(+), 18 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index e6a87659d25..1a06bba92f6 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -91,6 +91,27 @@ Show the file on camera: cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, capability_versions, validation}' ``` +## Beat 3c — lower the static subset to AgentConfig + +``` +🧬 AgentConfig lowering (static subset) — 2/3 top-level steps project to ADK + config; dynamic blocks stay SpecInterpreter-only: ['pipeline']. + { "agent_class": "SequentialAgent", "name": "security_audit_planner", + "sub_agents": [ + { "agent_class": "", "workflowspec_kind": "pipeline", … }, + { "agent_class": "LlmAgent", "name": "triage_step", "capability": "triager" }, + { "agent_class": "LlmAgent", "name": "format_step", "capability": "formatter" } ] } +``` + +> "This is the convergence with ADK config, made concrete. The plan's **static +> skeleton** — the top-level sequence — projects onto `AgentConfig` shapes: a +> `SequentialAgent` whose two leaf steps are `LlmAgent`s, referenced by +> **capability name, not an importable FQN**. The `reviewer → verifier` +> **pipeline** is flagged as having no `AgentConfig` equivalent — it's per-item over a +> runtime list, which config can't express — rather than faked. 
Honest framing: +> this is an *illustrative projection* (RFC #93 §11), not a loadable +> `root_agent.yaml`; execution still runs through the interpreter." + ## Beat 4 — execute (Events / trace tab) ``` @@ -133,25 +154,25 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > "So: a model authored a typed, validated, capability-bounded plan; ADK executed > it on the real engine; the plan **exported** to a portable, defensively-imported > audit artifact; and a re-send replayed the exact frozen plan. The deterministic -> test suites — 11 (#92) + 21 (#93) + 4 (demo) — lock all of this in CI, including +> test suites — 11 (#92) + 25 (#93) + 5 (demo) — lock all of this in CI, including > the no-LLM reuse path and the export round-trip / tamper / drift checks." -**Optional aside — convergence with ADK `AgentConfig`** (if a reviewer asks "why not author `root_agent.yaml`?"): +**Convergence with ADK `AgentConfig`** — this is what Beat 3c shows, if a reviewer asks "why not author `root_agent.yaml`?": -> "The static parts of a plan — this one's a top-level sequence of three steps — -> are exactly what ADK config already models (`SequentialAgent`). The RFC's -> direction is to **lower those to `AgentConfig`** rather than reinvent them. But -> the `reviewer → verifier` **pipeline** is per-item over a *runtime* list, which +> "The static parts of a plan are exactly what ADK config already models +> (`SequentialAgent`), and Beat 3c projects them onto that shape. But the +> `reviewer → verifier` **pipeline** is per-item over a *runtime* list, which > config can't express (`sub_agents` resolve once at load; there's no conditional > agent), and capabilities are referenced by **registry name, not importable > FQN** — so a model never names an import path. That dynamic + trust-boundary -> delta is the only reason `WorkflowSpec` exists. 
(Direction, not shown here — the -> demo runs via the interpreter; no `AgentConfig` compiler in the spike yet.)" +> delta is the only reason `WorkflowSpec` exists. The lowering shown is an +> *illustrative projection* (RFC #93 §11), not a loadable `root_agent.yaml`; a +> full config compiler is future work." ## Proof commands (terminal, ~60s) ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 21 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 25 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 5 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 37a2397cd14..4caf45e4aa6 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -40,6 +40,7 @@ Point at the ADK-native evidence as it streams: 1. **Validation** — "Validation passed" + the capability list (all registered). 1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. 1. **Exported plan** — `📦 Exported plan → security_audit_plan.json`. The full `FrozenWorkflowRecord` (spec, `sha256`, planner model, registry + capability versions, validation, task-input digest) as a portable envelope; import recomputes the hash and re-validates against the current registry. `cat security_audit_plan.json | jq .` on camera. +1. **AgentConfig lowering** — `🧬 AgentConfig lowering (static subset) — 2/3 …`. 
The plan's static skeleton projects onto ADK `AgentConfig` shapes (`SequentialAgent` + `LlmAgent` leaves by capability name); the `reviewer → verifier` pipeline is flagged **no-AgentConfig-equivalent**, not fabricated. An illustrative projection (RFC #93 §11) — see the talking point below. 1. **Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. 1. **Final output** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). @@ -52,7 +53,9 @@ The RFC's direction is to **converge with ADK config** (RFC #93 → "Relationshi - the **top-level sequence** (`pipeline → triager → formatter`) is the kind of static composition that maps to a `SequentialAgent`; - the **`reviewer → verifier` pipeline** (per-item, barrier-free over a runtime list) is exactly what `AgentConfig` **can't** express — no `ConditionalAgent`, and `sub_agents` are resolved once at load — which is why `WorkflowSpec` exists. -Honest scope: this is a **design direction**, not shown here — the demo executes via the `SpecInterpreter` on the real engine; it does **not** emit or lower to `AgentConfig` (no such compiler in the spike yet). +The demo now **shows** this split: the 🧬 lowering beat prints the static skeleton projected onto `AgentConfig` shapes (2/3 of the demo plan), with the pipeline marked no-equivalent. + +Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (child YAML / an allow-listed capability-ref field) is future work (DESIGN §12). ## 3. 
Shape sweep — not a one-off (1–2 min) @@ -67,11 +70,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 21 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 4 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 25 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 5 ``` -- Deterministic suites: #92 **11** + #93 **21** + demo **4** = **36** (incl. a no-LLM reuse-path test). +- Deterministic suites: #92 **11** + #93 **25** + demo **5** = **41** (incl. a no-LLM reuse-path test). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index a9316c6f62f..3724ebb0b36 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -55,11 +55,13 @@ "authored_workflow_spike", ), ) +from authoring import agent_config_coverage # noqa: E402 from authoring import Capability # noqa: E402 from authoring import CapabilityRegistry # noqa: E402 from authoring import export_plan # noqa: E402 from authoring import FrozenWorkflowRecord # noqa: E402 from authoring import import_plan # noqa: E402 +from authoring import lower_to_agent_config # noqa: E402 from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 from authoring import WorkflowSpec # noqa: E402 @@ -319,6 +321,22 @@ async def author_validate_execute(ctx: Context, node_input): except OSError as e: yield _msg(f"📦 
Export skipped (filesystem): {e}") + # 3c. LOWER — project the plan's STATIC subset onto ADK AgentConfig shapes + # (RFC #93 §11 convergence, shown concretely). Illustrative structural + # projection — NOT a loadable root_agent.yaml: leaves are referenced by + # allow-listed capability name (never an importable FQN), and dynamic blocks + # (pipeline/fan_out/branch) are flagged unsupported, never fabricated. + cov = agent_config_coverage(spec) + lowered = lower_to_agent_config(spec, name="security_audit_planner") + yield _msg( + "🧬 **AgentConfig lowering (static subset)** —" + f" {cov['lowerable']}/{cov['total']} top-level steps project to ADK" + " config; dynamic blocks stay SpecInterpreter-only:" + f" {cov['dynamic']}.\n```json\n{json.dumps(lowered, indent=1)}\n```\n_Illustrative" + " structural projection (RFC #93 §11) — leaves by capability name, not" + " an importable FQN; not a loadable `root_agent.yaml`._" + ) + # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). result = await SpecInterpreter(reg, ctx).execute(spec, {"files": FILES}) yield _msg( diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py index 85992eb41cb..1ebc150dc04 100644 --- a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -22,6 +22,7 @@ from __future__ import annotations +import json import os import sys @@ -98,6 +99,26 @@ def test_demo_spec_validates(): WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) # no raise +def test_demo_spec_agentconfig_lowering(): + # The demo's plan (pipeline -> step -> step) is exactly the static/dynamic + # split RFC #93 §11 describes: the two trailing steps lower to LlmAgent under + # a SequentialAgent; the reviewer->verifier pipeline has no AgentConfig + # equivalent. 
(Illustrative projection — leaves by capability name, not FQN.) + from authoring import agent_config_coverage # noqa: E402 + from authoring import lower_to_agent_config # noqa: E402 + + cfg = lower_to_agent_config(_demo_spec(), name="security_audit_planner") + assert cfg["agent_class"] == "SequentialAgent" + kinds = [s["agent_class"] for s in cfg["sub_agents"]] + assert kinds == ["", "LlmAgent", "LlmAgent"] + assert agent_config_coverage(_demo_spec()) == { + "total": 3, + "lowerable": 2, + "dynamic": ["pipeline"], + } + assert '"code"' not in json.dumps(cfg) # never an importable FQN + + def _stub_registry() -> CapabilityRegistry: def stub(name, fn): def build(): diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 26384f4916a..474bb5539e5 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -189,7 +189,7 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. **Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (21 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. 
+Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (25 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact @@ -245,7 +245,7 @@ Net: this turns the proposal from "a model can author plans" into "**model-autho A reviewer asked whether the planner should author ADK's existing **`AgentConfig`** (the `root_agent.yaml` format) directly. Verified against source — `agents/agent_config.py`, `agents/base_agent_config.py`, `agents/{llm,sequential,parallel,loop}_agent_config.py`, `agents/common_configs.py`, `tools/_tool_configs.py` (names), and `agents/config_agent_utils.py`: -**Lower to config where it fits.** ADK config already models the *static* shapes, so the static subset **should lower to** them rather than reinvent a serialization. **This is a design direction — the spike does not yet implement an `AgentConfig`-lowering compiler.** +**Lower to config where it fits.** ADK config already models the *static* shapes, so the static subset **should lower to** them rather than reinvent a serialization. The spike **demonstrates** this with an illustrative structural projection (`lower_to_agent_config` — `SequentialAgent`/`LoopAgent`/`LlmAgent` shapes, leaves by capability name, dynamic blocks flagged as having no `AgentConfig` equivalent); a **full loadable-`root_agent.yaml` compiler** (child YAML / an allow-listed capability-ref field) remains future work (§12). 
| `WorkflowSpec` block | ADK config relationship | | ----------------------------------------- | ---------------------------------------------------------------------- | diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 6508a12ff6d..c350d37f277 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,13 +26,13 @@ behind the RFC's "can a model author good plans?" question. pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **21 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **25 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output); plus **plan export/import** (round-trip replays the same hash; import rejects a tampered spec, a dropped capability, capability/registry version drift, an unsupported schema_version, -and a new input with no template schema). +and a new input with no template schema); plus **AgentConfig lowering** of the static subset (sequence→`SequentialAgent`, loop→`LoopAgent`, leaf→`LlmAgent` by capability name; dynamic blocks flagged no-equivalent). 
## Live planner sweep (optional evidence) diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index daea013e73a..46eced1a285 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -504,6 +504,100 @@ def import_plan( return spec +# ------------------------------------------------- AgentConfig lowering (§11) +# A STRUCTURAL PROJECTION of a WorkflowSpec's static skeleton onto ADK +# `AgentConfig` shapes — the convergence direction from DESIGN §11, shown +# concretely. It is deliberately NOT a loadable `root_agent.yaml`: +# * the static subset projects to SequentialAgent / LoopAgent / LlmAgent shapes; +# * leaf agents are referenced by ALLOW-LISTED capability name, never by an +# importable FQN (the trust-boundary point — a model never names an import); +# * the dynamic blocks (fan_out over a runtime list, pipeline, branch) have NO +# `AgentConfig` equivalent and are emitted as explicit `unsupported` markers, +# never fabricated as config. +# A full loadable-config compiler (child YAML / an allow-listed capability-ref +# field) is future work (DESIGN §12). 
+ +AGENTCONFIG_UNSUPPORTED = "" + + +def _lower_block(node) -> dict: + if isinstance(node, StepRef): + return { + "agent_class": "LlmAgent", + "name": node.id, + "capability": node.capability, + } + if isinstance(node, LoopUntil): + return { + "agent_class": "LoopAgent", + "name": node.id, + "max_iterations": node.max_iters, + "sub_agents": [_lower_block(b) for b in node.body], + "_note": ( + f"until-predicate ({node.until_capability}) has no AgentConfig" + " field; enforced by SpecInterpreter" + ), + } + if isinstance(node, FanOut): + return { + "agent_class": AGENTCONFIG_UNSUPPORTED, + "workflowspec_kind": "fan_out", + "name": node.id, + "capability": node.capability, + "reason": ( + "per-item over a runtime list; AgentConfig sub_agents are static" + ), + } + if isinstance(node, Pipeline): + return { + "agent_class": AGENTCONFIG_UNSUPPORTED, + "workflowspec_kind": "pipeline", + "name": node.id, + "stages": [st.capability for st in node.stages], + "reason": "barrier-free per-item multi-stage; needs #92 ctx.pipeline", + } + if isinstance(node, Branch): + return { + "agent_class": AGENTCONFIG_UNSUPPORTED, + "workflowspec_kind": "branch", + "name": node.id, + "reason": "route-on-value; AgentConfig has no ConditionalAgent", + } + raise TypeError(f"unknown block: {type(node).__name__}") + + +def lower_to_agent_config( + spec: WorkflowSpec, *, name: str = "authored_workflow" +) -> dict: + """Project the static skeleton of `spec` onto an ADK `AgentConfig` shape. + + Illustrative (see the module note above), not a loadable `root_agent.yaml`: + the ordered `steps` sequence projects to a `SequentialAgent`; leaf steps to + `LlmAgent` (by capability name, not FQN); dynamic blocks are flagged + `unsupported`, never fabricated. 
+ """ + return { + "agent_class": "SequentialAgent", + "name": name, + "sub_agents": [_lower_block(s) for s in spec.steps], + } + + +def agent_config_coverage(spec: WorkflowSpec) -> dict: + """A quick 'X of N top-level blocks lower to config' number for the demo.""" + lowered = lower_to_agent_config(spec)["sub_agents"] + dynamic = [ + b["workflowspec_kind"] + for b in lowered + if b["agent_class"] == AGENTCONFIG_UNSUPPORTED + ] + return { + "total": len(lowered), + "lowerable": len(lowered) - len(dynamic), + "dynamic": dynamic, + } + + # ----------------------------------------------------------------- interpreter class SpecInterpreter: """Executes a validated WorkflowSpec on the real ADK engine via the #92 diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index 37662cf1d31..bf34778f191 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -36,6 +36,8 @@ import pytest sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from authoring import agent_config_coverage +from authoring import AGENTCONFIG_UNSUPPORTED from authoring import Binding # noqa: E402 from authoring import Branch from authoring import Capability @@ -45,6 +47,7 @@ from authoring import FrozenWorkflowRecord from authoring import import_plan from authoring import LoopUntil +from authoring import lower_to_agent_config from authoring import Pipeline from authoring import PipelineStage from authoring import PlanImportError @@ -528,6 +531,94 @@ def test_import_rejects_registry_version_drift(): import_plan(env, v2_registry, task_input=_TASK) +def test_lower_static_sequence_to_sequential_agent(): + spec = WorkflowSpec( + goal="x", + steps=[ + StepRef( + kind="step", + id="c", + capability="classify", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="s", + 
capability="tech_summary", + input=Binding(source="step", step="c"), + ), + ], + output=Binding(source="step", step="s"), + ) + cfg = lower_to_agent_config(spec) + assert cfg["agent_class"] == "SequentialAgent" + assert [s["agent_class"] for s in cfg["sub_agents"]] == [ + "LlmAgent", + "LlmAgent", + ] + assert [s["capability"] for s in cfg["sub_agents"]] == [ + "classify", + "tech_summary", + ] + assert AGENTCONFIG_UNSUPPORTED not in [ + s["agent_class"] for s in cfg["sub_agents"] + ] + + +def test_lower_loop_to_loop_agent(): + spec = WorkflowSpec( + goal="x", + steps=[ + LoopUntil( + kind="loop_until", + id="lp", + body=[ + StepRef( + kind="step", + id="d", + capability="draft", + input=Binding(source="task"), + ) + ], + until_capability="is_good", + until_input=Binding(source="step", step="d"), + max_iters=3, + ) + ], + output=Binding(source="step", step="lp"), + ) + loop = lower_to_agent_config(spec)["sub_agents"][0] + assert loop["agent_class"] == "LoopAgent" + assert loop["max_iterations"] == 3 + assert loop["sub_agents"][0]["capability"] == "draft" + + +def test_lower_marks_dynamic_blocks_unsupported(): + # pipeline is per-item over a runtime list -> no AgentConfig equivalent. + cov = agent_config_coverage(_pipeline_spec()) + assert cov == {"total": 1, "lowerable": 0, "dynamic": ["pipeline"]} + + +def test_lower_never_emits_importable_fqn(): + # leaves are referenced by allow-listed capability name, never by an + # importable path; the FQN-bearing keys ADK config would use are absent. 
+ spec = WorkflowSpec( + goal="x", + steps=[ + StepRef( + kind="step", + id="c", + capability="classify", + input=Binding(source="task"), + ) + ], + output=Binding(source="step", step="c"), + ) + blob = json.dumps(lower_to_agent_config(spec)) + assert '"code"' not in blob and '"config_path"' not in blob + assert '"capability": "classify"' in blob + + def test_import_rejects_new_input_without_template_schema(): env = export_plan(_frozen()) # no task_input_schema captured -> replay-only other = {"files": [{"path": "z.py", "code": "ok"}]} From 5a911368de9ceed04e82653d1fc03c8a8c8bf917 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 16:08:36 -0700 Subject: [PATCH 21/64] docs(workflow): note AgentConfig is deprecated/experimental; qualify loop lowering; fix count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - §11: AgentConfig + Sequential/Parallel/LoopAgentConfig + BaseAgentConfig are @deprecated + @experimental in this checkout (agent_config.py:72-73, sequential_agent_config.py:28, loop_agent_config.py:30) — so this is convergence with the existing config SHAPE for compatibility, not a long-term dependency on deprecated YAML config. - qualify LoopUntil lowering: only the max_iterations skeleton; the until-predicate has no AgentConfig field (enforced by the interpreter). - §9: demo count 4 -> 5 CI-safe tests. --- .../samples/workflows/authored_workflow_spike/DESIGN.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 474bb5539e5..619166e7939 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -189,7 +189,7 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. 
**Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (25 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 4 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (25 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 5 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact @@ -251,10 +251,12 @@ A reviewer asked whether the planner should author ADK's existing **`AgentConfig | ----------------------------------------- | ---------------------------------------------------------------------- | | sequence | lowers to `SequentialAgentConfig` (`sub_agents: list[AgentRefConfig]`) | | static parallel block | lowers to `ParallelAgentConfig` (static sub-agent list) | -| bounded loop | lowers to `LoopAgentConfig` (`max_iterations`) | +| bounded loop | lowers `max_iterations` skeleton to `LoopAgentConfig` | | runtime `fan_out` / `pipeline` / `branch` | no direct config equivalent | -`ParallelAgentConfig` models a **static** set of parallel sub-agents, **not** data-mapping over a runtime list — so per-item `fan_out` sits in the "no equivalent" row, not the parallel row. 
+`ParallelAgentConfig` models a **static** set of parallel sub-agents, **not** data-mapping over a runtime list — so per-item `fan_out` sits in the "no equivalent" row, not the parallel row. `LoopUntil` lowers only its **`max_iterations` skeleton**; the `until_capability` predicate has no `AgentConfig` field and is enforced by the interpreter. + +**Caveat (ADK source):** `AgentConfig` and the concrete config classes (`Sequential`/`Parallel`/`LoopAgentConfig`, `BaseAgentConfig`) are currently marked **`@deprecated` + `@experimental`** in this checkout (`agents/agent_config.py:72-73`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the existing config *shape* for compatibility/illustration — not a long-term dependency** on (deprecated) YAML config. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. **Why the planner should not emit raw `AgentConfig`:** From 1aebca2873b428b7c7ead95340d8850bfc2bf9ca Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 16:20:42 -0700 Subject: [PATCH 22/64] demo(workflow): add AgentConfig deprecated/experimental caveat to demo docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match RFC §11 in the demo-facing materials so an ADK TL watching the demo gets the same one-sentence caveat: AgentConfig + the concrete config classes are @deprecated + @experimental in ADK source, so the lowering is convergence with the config SHAPE for compatibility, not a long-term dependency on YAML config. Added to the README talking point and the narrative aside. 
--- .../workflows/authored_workflow_demo/DEMO_NARRATIVE.md | 4 +++- .../samples/workflows/authored_workflow_demo/README.md | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 1a06bba92f6..a99da37048c 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -167,7 +167,9 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > FQN** — so a model never names an import path. That dynamic + trust-boundary > delta is the only reason `WorkflowSpec` exists. The lowering shown is an > *illustrative projection* (RFC #93 §11), not a loadable `root_agent.yaml`; a -> full config compiler is future work." +> full config compiler is future work. And note `AgentConfig` is currently +> `@deprecated` + `@experimental` in ADK source — so this is convergence with +> the config *shape* for compatibility, not a bet on deprecated YAML config." ## Proof commands (terminal, ~60s) diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 4caf45e4aa6..4fdccdba885 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -57,6 +57,8 @@ The demo now **shows** this split: the 🧬 lowering beat prints the static skel Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (child YAML / an allow-listed capability-ref field) is future work (DESIGN §12). 
+> **If asked "why build on deprecated config?"** — `AgentConfig` and the concrete config classes are currently `@deprecated` + `@experimental` in ADK source, so this is convergence with the existing config **shape** for compatibility/illustration, **not** a long-term dependency on YAML config (RFC §11). + ## 3. Shape sweep — not a one-off (1–2 min) ```bash From 3774e4aa148ec281d66de050982b3c9929ee7a9f Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 16:39:16 -0700 Subject: [PATCH 23/64] docs(workflow): clarify config lowering vs loop_config raw YAML --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 49 ++++++++++--------- .../authored_workflow_demo/README.md | 17 ++++--- .../security_audit_planner/agent.py | 6 +-- .../authored_workflow_spike/DESIGN.md | 34 ++++++------- .../authored_workflow_spike/README.md | 5 +- 5 files changed, 59 insertions(+), 52 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index a99da37048c..f2b36fb3dc2 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -91,10 +91,10 @@ Show the file on camera: cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, capability_versions, validation}' ``` -## Beat 3c — lower the static subset to AgentConfig +## Beat 3c — lower the static subset to ADK config ``` -🧬 AgentConfig lowering (static subset) — 2/3 top-level steps project to ADK +🧬 ADK config lowering (static subset) — 2/3 top-level steps project to ADK config; dynamic blocks stay SpecInterpreter-only: ['pipeline']. 
{ "agent_class": "SequentialAgent", "name": "security_audit_planner", "sub_agents": [ @@ -103,14 +103,16 @@ cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, ca { "agent_class": "LlmAgent", "name": "format_step", "capability": "formatter" } ] } ``` -> "This is the convergence with ADK config, made concrete. The plan's **static -> skeleton** — the top-level sequence — projects onto `AgentConfig` shapes: a -> `SequentialAgent` whose two leaf steps are `LlmAgent`s, referenced by -> **capability name, not an importable FQN**. The `reviewer → verifier` -> **pipeline** is flagged `no-AgentConfig-equivalent` — it's per-item over a -> runtime list, which config can't express — rather than faked. Honest framing: -> this is an *illustrative projection* (RFC #93 §11), not a loadable -> `root_agent.yaml`; execution still runs through the interpreter." +> "This is the convergence with ADK config, made concrete. The static parts are +> what the `loop_config/root_agent.yaml` style is good at: a known Workflow graph +> and known child agents. This demo projects the top-level sequence onto that +> family of config shapes, with leaves referenced by **capability name, not an +> importable FQN**. The `reviewer → verifier` **pipeline** is flagged +> `no-AgentConfig-equivalent` because it is per-item over a runtime list; raw +> YAML would need a wrapper node, while `WorkflowSpec` keeps it typed and +> policy-checked. Honest framing: this is an *illustrative projection* (RFC #93 +> §11), not a loadable `root_agent.yaml`; execution still runs through the +> interpreter." ## Beat 4 — execute (Events / trace tab) @@ -157,19 +159,20 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > test suites — 11 (#92) + 25 (#93) + 5 (demo) — lock all of this in CI, including > the no-LLM reuse path and the export round-trip / tamper / drift checks."
-**Convergence with ADK `AgentConfig`** — this is what Beat 3c shows, if a reviewer asks "why not author `root_agent.yaml`?": - -> "The static parts of a plan are exactly what ADK config already models -> (`SequentialAgent`), and Beat 3c projects them onto that shape. But the -> `reviewer → verifier` **pipeline** is per-item over a *runtime* list, which -> config can't express (`sub_agents` resolve once at load; there's no conditional -> agent), and capabilities are referenced by **registry name, not importable -> FQN** — so a model never names an import path. That dynamic + trust-boundary -> delta is the only reason `WorkflowSpec` exists. The lowering shown is an -> *illustrative projection* (RFC #93 §11), not a loadable `root_agent.yaml`; a -> full config compiler is future work. And note `AgentConfig` is currently -> `@deprecated` + `@experimental` in ADK source — so this is convergence with -> the config *shape* for compatibility, not a bet on deprecated YAML config." +**Convergence with ADK Workflow config / `root_agent.yaml`** — this is what Beat 3c shows, if a reviewer asks "why not author `loop_config/root_agent.yaml`?": + +> "`loop_config/root_agent.yaml` is a good **derived target** for static graph +> structure: it has `agent_class: Workflow`, fixed `edges`, child YAML files, and +> route functions like `.agent.route_headline`. It is not the right **raw model +> output** because those refs are exactly what we don't want a model to invent: +> Python functions, `_code` refs, child config paths, tools/callbacks, or FQNs. +> #93 keeps the planner output closed and allow-listed, then lowers static parts +> toward config. The `reviewer → verifier` pipeline stays a first-class +> `WorkflowSpec` block because it dispatches per item over a runtime list; raw +> YAML would need a wrapper. The lowering shown is illustrative, not a loadable +> `root_agent.yaml`; a full config compiler is future work. 
Also, the current +> config loader path is `@deprecated` + `@experimental`, so this is convergence +> with the config *shape* for compatibility, not a bet on today's YAML loader." ## Proof commands (terminal, ~60s) diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 4fdccdba885..a25dbcb93d9 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -40,24 +40,25 @@ Point at the ADK-native evidence as it streams: 1. **Validation** — "Validation passed" + the capability list (all registered). 1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. 1. **Exported plan** — `📦 Exported plan → security_audit_plan.json`. The full `FrozenWorkflowRecord` (spec, `sha256`, planner model, registry + capability versions, validation, task-input digest) as a portable envelope; import recomputes the hash and re-validates against the current registry. `cat security_audit_plan.json | jq .` on camera. -1. **AgentConfig lowering** — `🧬 AgentConfig lowering (static subset) — 2/3 …`. The plan's static skeleton projects onto ADK `AgentConfig` shapes (`SequentialAgent` + `LlmAgent` leaves by capability name); the `reviewer → verifier` pipeline is flagged **no-AgentConfig-equivalent**, not fabricated. An illustrative projection (RFC #93 §11) — see the talking point below. +1. **ADK config lowering** — `🧬 ADK config lowering (static subset) — 2/3 …`. The plan's static skeleton projects toward ADK Workflow/agent config shapes (a static `Workflow`/`SequentialAgent` skeleton + `LlmAgent` leaves by capability name); the `reviewer → verifier` pipeline is flagged **no-AgentConfig-equivalent**, not fabricated. An illustrative projection (RFC #93 §11) — see the talking point below. 1. 
**Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. 1. **Final output** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) -### Relationship to ADK `AgentConfig` (talking point) +### Relationship to ADK Workflow config / `root_agent.yaml` (talking point) -The RFC's direction is to **converge with ADK config** (RFC #93 → "Relationship to ADK `AgentConfig`"; DESIGN §11): the *static* shapes of an authored plan should lower to `Sequential`/`Parallel`/`LoopAgentConfig`, while the dynamic constructs stay `WorkflowSpec`-only. This demo's plan makes the split concrete: +The RFC's direction is to **converge with ADK config where it fits** (RFC #93 → "Relationship to ADK Workflow config / `root_agent.yaml`"; DESIGN §11). The linked `loop_config/root_agent.yaml` sample is the right mental model for the **static** portion: a human-authored `agent_class: Workflow` YAML graph with known `edges`, child YAML files, and function refs like `.agent.route_headline`. #93 should be able to lower/export static graph skeletons toward that style, while the model-facing format stays `WorkflowSpec`. -- the **top-level sequence** (`pipeline → triager → formatter`) is the kind of static composition that maps to a `SequentialAgent`; -- the **`reviewer → verifier` pipeline** (per-item, barrier-free over a runtime list) is exactly what `AgentConfig` **can't** express — no `ConditionalAgent`, and `sub_agents` are resolved once at load — which is why `WorkflowSpec` exists. 
+- the **top-level sequence** (`pipeline → triager → formatter`) is the kind of static composition that can lower to a static Workflow/config skeleton; +- the **`reviewer → verifier` pipeline** (per-item, barrier-free over a runtime list) is exactly what raw YAML **doesn't express directly** today; it would need a wrapper node, while `WorkflowSpec` can keep it typed and policy-checked as a first-class runtime block; +- raw YAML can name function refs, `_code` refs, child YAML files, tools, callbacks, or importable FQNs; model-authored plans should reference only allow-listed capability names. -The demo now **shows** this split: the 🧬 lowering beat prints the static skeleton projected onto `AgentConfig` shapes (2/3 of the demo plan), with the pipeline marked no-equivalent. +The demo now **shows** this split: the 🧬 lowering beat prints the static skeleton projected onto ADK config shapes (2/3 of the demo plan), with the pipeline marked no-equivalent. -Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (child YAML / an allow-listed capability-ref field) is future work (DESIGN §12). +Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (Workflow YAML edges + child YAML + an allow-listed capability-ref field) is future work (DESIGN §12). -> **If asked "why build on deprecated config?"** — `AgentConfig` and the concrete config classes are currently `@deprecated` + `@experimental` in ADK source, so this is convergence with the existing config **shape** for compatibility/illustration, **not** a long-term dependency on YAML config (RFC §11). 
+> **If asked "why not just author `loop_config/root_agent.yaml`?"** — use that YAML shape as a lowering/export target for static graphs, not as the raw model output. The sample intentionally resolves Python function refs and child YAML refs; #93 needs a closed, response-schema-safe, capability-allow-listed authoring format first. Also, the current config loader path is `@deprecated` + `@experimental`, so this is convergence with the config **shape** for compatibility/illustration, not a long-term dependency on today's YAML loader (RFC §11). ## 3. Shape sweep — not a one-off (1–2 min) diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 3724ebb0b36..105fa64c905 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -321,7 +321,7 @@ async def author_validate_execute(ctx: Context, node_input): except OSError as e: yield _msg(f"📦 Export skipped (filesystem): {e}") - # 3c. LOWER — project the plan's STATIC subset onto ADK AgentConfig shapes + # 3c. LOWER — project the plan's STATIC subset toward ADK config shapes # (RFC #93 §11 convergence, shown concretely). 
Illustrative structural # projection — NOT a loadable root_agent.yaml: leaves are referenced by # allow-listed capability name (never an importable FQN), and dynamic blocks @@ -329,12 +329,12 @@ async def author_validate_execute(ctx: Context, node_input): cov = agent_config_coverage(spec) lowered = lower_to_agent_config(spec, name="security_audit_planner") yield _msg( - "🧬 **AgentConfig lowering (static subset)** —" + "🧬 **ADK config lowering (static subset)** —" f" {cov['lowerable']}/{cov['total']} top-level steps project to ADK" " config; dynamic blocks stay SpecInterpreter-only:" f" {cov['dynamic']}.\n```json\n{json.dumps(lowered, indent=1)}\n```\n_Illustrative" " structural projection (RFC #93 §11) — leaves by capability name, not" - " an importable FQN; not a loadable `root_agent.yaml`._" + " an importable FQN; not raw model-authored `root_agent.yaml`._" ) # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 619166e7939..9c0f90651cf 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -241,30 +241,30 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (25 deter Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. -## 11. Convergence with ADK `AgentConfig` (+ storage, custom tools, observability) +## 11. Convergence with ADK Workflow config / `root_agent.yaml` (+ storage, custom tools, observability) -A reviewer asked whether the planner should author ADK's existing **`AgentConfig`** (the `root_agent.yaml` format) directly. 
Verified against source — `agents/agent_config.py`, `agents/base_agent_config.py`, `agents/{llm,sequential,parallel,loop}_agent_config.py`, `agents/common_configs.py`, `tools/_tool_configs.py` (names), and `agents/config_agent_utils.py`: +A reviewer asked whether the planner should author ADK's existing **YAML config** directly, specifically the `contributing/samples/workflows/loop_config/root_agent.yaml` pattern. Verified against source and the sample — `loop_config/root_agent.yaml` is `agent_class: Workflow` with static `edges`, function refs like `.agent.route_headline`, and child YAML refs like `generate_headline.yaml`; the lower-level loader still goes through the `AgentConfig` / `BaseAgentConfig` path and resolves code/config refs via `config_agent_utils.py`. -**Lower to config where it fits.** ADK config already models the *static* shapes, so the static subset **should lower to** them rather than reinvent a serialization. The spike **demonstrates** this with an illustrative structural projection (`lower_to_agent_config` — `SequentialAgent`/`LoopAgent`/`LlmAgent` shapes, leaves by capability name, dynamic blocks flagged `no-AgentConfig-equivalent`); a **full loadable-`root_agent.yaml` compiler** (child YAML / an allow-listed capability-ref field) remains future work (§12). +**Lower to config where it fits.** ADK Workflow YAML already models a useful *static* graph shape (`agent_class: Workflow`, `edges`, route labels, child agent YAML files). The static subset **should lower to that style** rather than inventing a separate serialization. The spike **demonstrates the first step** with an illustrative structural projection (`lower_to_agent_config` — `SequentialAgent`/`LoopAgent`/`LlmAgent` shapes, leaves by capability name, dynamic blocks flagged `no-AgentConfig-equivalent`); a **full loadable-`root_agent.yaml` compiler** (Workflow YAML edges + child YAML + an allow-listed capability-ref field) remains future work (§12).
-| `WorkflowSpec` block | ADK config relationship | -| ----------------------------------------- | ---------------------------------------------------------------------- | -| sequence | lowers to `SequentialAgentConfig` (`sub_agents: list[AgentRefConfig]`) | -| static parallel block | lowers to `ParallelAgentConfig` (static sub-agent list) | -| bounded loop | lowers `max_iterations` skeleton to `LoopAgentConfig` | -| runtime `fan_out` / `pipeline` / `branch` | no direct config equivalent | +| `WorkflowSpec` block | ADK config relationship | +| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| sequence / static branch / static route loop | should lower to `agent_class: Workflow` YAML (`edges`), like `contributing/samples/workflows/loop_config` | +| leaf capability | should lower to child agent YAML or an allow-listed capability-ref field, not an importable FQN from a model | +| bounded `LoopUntil` | can lower the bounded graph skeleton; its `until_capability` predicate remains interpreter/compiler logic | +| runtime `fan_out` / `pipeline` | no direct YAML equivalent for per-item runtime list dispatch / barrier-free multi-stage flow | -`ParallelAgentConfig` models a **static** set of parallel sub-agents, **not** data-mapping over a runtime list — so per-item `fan_out` sits in the "no equivalent" row, not the parallel row. `LoopUntil` lowers only its **`max_iterations` skeleton**; the `until_capability` predicate has no `AgentConfig` field and is enforced by the interpreter. +`loop_config` is the right mental model for the **static** portion: a known graph with known function/agent references. It is not enough for #93's planner-facing contract because the model would be authoring those references. The safe contract is still `WorkflowSpec` → validate against the registry → optionally lower/export to Workflow YAML as a **derived artifact**. 
-**Caveat (ADK source):** `AgentConfig` and the concrete config classes (`Sequential`/`Parallel`/`LoopAgentConfig`, `BaseAgentConfig`) are currently marked **`@deprecated` + `@experimental`** in this checkout (`agents/agent_config.py:72-73`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the existing config *shape* for compatibility/illustration — not a long-term dependency** on (deprecated) YAML config. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. +**Caveat (ADK source):** the current `AgentConfig` / `BaseAgentConfig` loader path and concrete `Sequential`/`Parallel`/`LoopAgentConfig` classes are marked **`@deprecated` + `@experimental`** in this checkout (`agents/agent_config.py:72-73`, `base_agent_config.py:30`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the existing config *shape* for compatibility/illustration — not a long-term dependency** on today's YAML loader. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. -**Why the planner should not emit raw `AgentConfig`:** +**Why the planner should not emit raw `root_agent.yaml`:** -1. **Static graph, build-time only.** `config_agent_utils.from_config(path)` parses the YAML and resolves the *entire* `sub_agents` tree at load (`base_agent.__create_kwargs` → `resolve_agent_reference` per child), before any request. There is no runtime list iteration — so per-item fan-out, pipeline, and conditional branch routing cannot be expressed. -1. **Not a clean `response_schema`.** `AgentConfig` is a `RootModel` over a `Discriminator(agent_config_discriminator)` union; Gemini's `response_schema` rejects the emitted `discriminator` keyword (`Schema: extra_forbidden` — the spike's §9 lesson). 
It also carries open `extra='allow'` maps (`ToolArgsConfig`, `BaseAgentConfig.model_extra`). -1. **Trust-boundary mismatch on tool/agent refs.** Tools/agents/callbacks are named by **fully-qualified importable path** — `CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks` — resolved via `importlib.import_module` + `getattr` (`config_agent_utils.resolve_code_reference`). That FQN-import model is appropriate for **developer-authored** config; the concern is specifically letting a **model** author raw FQNs. For model-authored plans we want **capability allow-listing**, not arbitrary import paths — a trust-boundary difference, not a flaw in config. +1. **It is static / load-time.** `loop_config` wires known nodes and routes ahead of time. That is great for human-authored graphs, but runtime per-item `fan_out` and barrier-free `pipeline` need dispatch over the actual input list; YAML can only call a wrapper node for that today, not express the dynamic dispatch itself. +1. **It is not a clean `response_schema`.** The loader model uses `AgentConfig` as a `RootModel` over a `Discriminator(agent_config_discriminator)` union; Gemini's `response_schema` rejects the emitted `discriminator` keyword (`Schema: extra_forbidden` — the spike's §9 lesson). It also carries open `extra='allow'` maps (`ToolArgsConfig`, `BaseAgentConfig.model_extra`). +1. **Trust-boundary mismatch on refs.** `loop_config` intentionally resolves `.agent.process_input`, `.agent.route_headline`, `output_schema_code: .agent.Feedback`, and child YAML files. Tools/agents/callbacks can also be named by **fully-qualified importable path** (`CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks`) resolved via `importlib`. That is appropriate for **developer-authored** config; the concern is specifically letting a **model** author those raw refs. 
For model-authored plans we want **capability allow-listing**, not arbitrary code/config/import paths — a trust-boundary difference, not a flaw in config. -**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe); lower its static subset to `AgentConfig` so those shapes share ADK's serialization and tooling; keep `branch` / `fan_out` / `pipeline` + capability allow-listing as new surface only for the dynamic and trust-boundary pieces config doesn't cover. The compiled artifact is still an ordinary `Workflow` (§2). +**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe); lower/export its static graph subset to ADK Workflow YAML so those shapes share ADK's serialization and tooling; keep runtime `fan_out` / `pipeline` + capability allow-listing as new surface only for the dynamic and trust-boundary pieces config doesn't cover. The compiled artifact is still an ordinary `Workflow` (§2). **Q1 — spec storage.** §5/§10: one `FrozenWorkflowRecord` in session State (`authored_workflow:frozen_record`, unprefixed/session-scoped; resume reuses, never re-plans), a state-only audit event, and a v1.1 export envelope. Compiled `Workflow` is derived, never canonical. @@ -276,7 +276,7 @@ A reviewer asked whether the planner should author ADK's existing **`AgentConfig **Hierarchical / sub-plan authoring** — a registered capability that is itself an `AuthoredWorkflowAgent`, so a step can expand into its own authored sub-plan. This is the likely path to parity with Claude Code's unbounded orchestration (it lifts the single-response plan-size ceiling), but it is **out of MVP scope** and should be evaluated **only after the 3–5-task build gate**. MVP stays single-level: `WorkflowSpec` + validator + freeze/replay + export. 
-**Upstream `AgentConfig` extension (optional).** If the dynamic constructs prove their value, the cleaner long-term home for `branch` / dynamic `fan_out` may be **new agent-config types upstream** (a conditional agent; a runtime-fan-out agent) plus an allow-listed capability-reference tool field — at which point authoring could converge fully onto an extended `AgentConfig`. Out of scope here; depends on upstream accepting new agent classes. +**Upstream config extension (optional).** If the dynamic constructs prove their value, the cleaner long-term home for runtime `fan_out` / `pipeline` may be **new Workflow YAML block types upstream** plus an allow-listed capability-reference field — at which point authoring could converge more fully onto an extended ADK config shape. Out of scope here; depends on upstream accepting those config/compiler extensions. ## References diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index c350d37f277..cc265a63ee6 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -32,7 +32,10 @@ the open-map warning, and interpreter execution of fan_out→aggregate, **pipeli (correct route), and loop_until (stops + correct output); plus **plan export/import** (round-trip replays the same hash; import rejects a tampered spec, a dropped capability, capability/registry version drift, an unsupported schema_version, -and a new input with no template schema); plus **AgentConfig lowering** of the static subset (sequence→`SequentialAgent`, loop→`LoopAgent`, leaf→`LlmAgent` by capability name; dynamic blocks flagged no-equivalent). 
+and a new input with no template schema); plus **ADK config lowering** of the +static subset (an illustrative projection toward static Workflow/agent config +shapes: sequence/loop/leaf by capability name; runtime fan_out/pipeline/branch +flagged no-equivalent rather than fabricated). ## Live planner sweep (optional evidence) From f6b4b0e44b91de78a523a1de52bcac6da1547913 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 3 Jun 2026 16:59:22 -0700 Subject: [PATCH 24/64] docs(workflow): separate Workflow YAML target from deprecated config sugar --- .../workflows/authored_workflow_demo/DEMO_NARRATIVE.md | 8 +++++--- .../samples/workflows/authored_workflow_demo/README.md | 2 +- .../samples/workflows/authored_workflow_spike/DESIGN.md | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index f2b36fb3dc2..19c638a0432 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -170,9 +170,11 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > toward config. The `reviewer → verifier` pipeline stays a first-class > `WorkflowSpec` block because it dispatches per item over a runtime list; raw > YAML would need a wrapper. The lowering shown is illustrative, not a loadable -> `root_agent.yaml`; a full config compiler is future work. Also, the current -> config loader path is `@deprecated` + `@experimental`, so this is convergence -> with the config *shape* for compatibility, not a bet on today's YAML loader." +> `root_agent.yaml`; a full config compiler is future work. 
`Workflow` itself is +> not deprecated, but the current config loader path and agent-config sugar +> classes are `@deprecated` + `@experimental`, so this is convergence with the +> Workflow config *shape* for compatibility, not a bet on today's loader or +> deprecated sugar." ## Proof commands (terminal, ~60s) diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index a25dbcb93d9..d76c100bf56 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -58,7 +58,7 @@ The demo now **shows** this split: the 🧬 lowering beat prints the static skel Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (Workflow YAML edges + child YAML + an allow-listed capability-ref field) is future work (DESIGN §12). -> **If asked "why not just author `loop_config/root_agent.yaml`?"** — use that YAML shape as a lowering/export target for static graphs, not as the raw model output. The sample intentionally resolves Python function refs and child YAML refs; #93 needs a closed, response-schema-safe, capability-allow-listed authoring format first. Also, the current config loader path is `@deprecated` + `@experimental`, so this is convergence with the config **shape** for compatibility/illustration, not a long-term dependency on today's YAML loader (RFC §11). +> **If asked "why not just author `loop_config/root_agent.yaml`?"** — use that YAML shape as a lowering/export target for static graphs, not as the raw model output. The sample intentionally resolves Python function refs and child YAML refs; #93 needs a closed, response-schema-safe, capability-allow-listed authoring format first. 
Also, `Workflow` itself is not deprecated, but the current config loader path and agent-config sugar classes are `@deprecated` + `@experimental`; this is convergence with the Workflow config **shape** for compatibility/illustration, not a long-term dependency on today's loader or deprecated sugar (RFC §11). ## 3. Shape sweep — not a one-off (1–2 min) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 9c0f90651cf..8d2c39dcf31 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -256,7 +256,7 @@ A reviewer asked whether the planner should author ADK's existing **YAML config* `loop_config` is the right mental model for the **static** portion: a known graph with known function/agent references. It is not enough for #93's planner-facing contract because the model would be authoring those references. The safe contract is still `WorkflowSpec` → validate against the registry → optionally lower/export to Workflow YAML as a **derived artifact**. -**Caveat (ADK source):** the current `AgentConfig` / `BaseAgentConfig` loader path and concrete `Sequential`/`Parallel`/`LoopAgentConfig` classes are marked **`@deprecated` + `@experimental`** in this checkout (`agents/agent_config.py:72-73`, `base_agent_config.py:30`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the existing config *shape* for compatibility/illustration — not a long-term dependency** on today's YAML loader. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. +**Caveat (ADK source):** `Workflow` itself is not marked deprecated in this checkout; the recommended static target is the `agent_class: Workflow` graph-YAML shape. 
What *is* marked **`@deprecated` + `@experimental`** is the current `AgentConfig` / `BaseAgentConfig` loader path and the concrete `Sequential`/`Parallel`/`LoopAgentConfig` sugar classes (`agents/agent_config.py:72-73`, `base_agent_config.py:30`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the Workflow config *shape* for compatibility/illustration — not a long-term dependency** on today's YAML loader or deprecated agent-config sugar. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. **Why the planner should not emit raw `root_agent.yaml`:** From 8d61f0e623f2df95459f6855d34cb7af8c498042 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Thu, 4 Jun 2026 16:59:48 -0700 Subject: [PATCH 25/64] =?UTF-8?q?demo(workflow):=20trim=20lowering=20outpu?= =?UTF-8?q?t=20=E2=80=94=20drop=20per-block=20reason=20+=20message=20trail?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Declutter the 🧬 ADK config lowering beat: - remove the 'reason' field from the fan_out/pipeline/branch unsupported markers (workflowspec_kind already names the construct); no test referenced it. - drop the trailing italic 'illustrative structural projection…' line from the demo chat message (the README/narrative/DESIGN already carry that framing). 
--- .../authored_workflow_demo/security_audit_planner/agent.py | 4 +--- .../samples/workflows/authored_workflow_spike/authoring.py | 5 ----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 105fa64c905..a72ee314813 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -332,9 +332,7 @@ async def author_validate_execute(ctx: Context, node_input): "🧬 **ADK config lowering (static subset)** —" f" {cov['lowerable']}/{cov['total']} top-level steps project to ADK" " config; dynamic blocks stay SpecInterpreter-only:" - f" {cov['dynamic']}.\n```json\n{json.dumps(lowered, indent=1)}\n```\n_Illustrative" - " structural projection (RFC #93 §11) — leaves by capability name, not" - " an importable FQN; not raw model-authored `root_agent.yaml`._" + f" {cov['dynamic']}.\n```json\n{json.dumps(lowered, indent=1)}\n```" ) # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). 
diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index 46eced1a285..136b23d1959 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -544,9 +544,6 @@ def _lower_block(node) -> dict: "workflowspec_kind": "fan_out", "name": node.id, "capability": node.capability, - "reason": ( - "per-item over a runtime list; AgentConfig sub_agents are static" - ), } if isinstance(node, Pipeline): return { @@ -554,14 +551,12 @@ def _lower_block(node) -> dict: "workflowspec_kind": "pipeline", "name": node.id, "stages": [st.capability for st in node.stages], - "reason": "barrier-free per-item multi-stage; needs #92 ctx.pipeline", } if isinstance(node, Branch): return { "agent_class": AGENTCONFIG_UNSUPPORTED, "workflowspec_kind": "branch", "name": node.id, - "reason": "route-on-value; AgentConfig has no ConditionalAgent", } raise TypeError(f"unknown block: {type(node).__name__}") From 98c3ed9268b935ab81f975ff5529615abc23f2a5 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 11:58:07 -0700 Subject: [PATCH 26/64] spike(workflow): pattern-coverage sweep, plan-quality lints, loop-carried LoopUntil.init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the empirical upgrades from 'Claude Dynamic Workflows: Scaling Complex Work' (aipractitioner.substack.com) to the #93 spike: * Pattern coverage: the six coordination patterns (classify-route, fan-out/ synthesize, generate-filter, loop-until-done, adversarial verification, tournament) all author+validate+execute; explicit deterministic tests for the two non-obvious shapes. * Finding: tournament surfaced a real vocabulary gap — data-dependent pairing needs loop-carried state. 
Added LoopUntil.init (seed binding) + body bindings to the loop's own id read the carried value; binding it without init is a validation error. * Plan-quality lints (soft warnings): same-capability self-review (self-preferential bias) and unsynthesized fan-out; independence_facts() derives the positive provenance statements the frozen record can prove. * SpecInterpreter.dispatch_count for cheap cost visibility. * DESIGN.md/README.md: lints + init semantics, pattern table, budget-binding and no-plan escape hatch as future work; 25 -> 31 deterministic tests. --- .../authored_workflow_spike/DESIGN.md | 24 ++- .../authored_workflow_spike/README.md | 35 +++- .../authored_workflow_spike/authoring.py | 196 ++++++++++++++++-- .../authored_workflow_spike/test_authoring.py | 196 ++++++++++++++++++ 4 files changed, 433 insertions(+), 18 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 8d2c39dcf31..b151affbae3 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -64,6 +64,11 @@ class LoopUntil(BaseModel): until_capability: str # MUST declare a STRICT-bool output schema until_input: Binding # predicate input (validated vs until_capability.input_schema) max_iters: int = Field(ge=1) # REQUIRED, >= 1 + init: Binding | None = None # LOOP-CARRIED seed: a body step may bind the loop's OWN id to + # read the prior iteration's body output (`init` on round 0). + # Surfaced by the tournament pattern (pairs recomputed per round + # from prior winners); required for any accumulate-and-refine loop. + # Binding the loop's id in the body WITHOUT init = validation error. 
# PLAIN union, each member carrying a `kind` Literal (structurally-tagged) — NOT # Annotated[..., Field(discriminator="kind")]: the discriminated form emits a @@ -115,12 +120,19 @@ class AuthoredWorkflowAgent(BaseAgent): - `FanOut.over` resolves to a list; the fan-out capability takes an item; - `Branch.on` is string/str-enum-typed; route blocks share a compatible last-node output schema; non-exhaustive enum domain is flagged (unmatched at runtime fails); - `Pipeline`: `over` resolves to a list; every stage `capability` is registered and takes an item; stage[0] input defaults to the per-item element, stage[n] to stage[n-1]'s output; the last stage's output type defines the pipeline output (validated for downstream bindings); -- `LoopUntil`: strict-bool `until_capability`, present/compatible `until_input`, `max_iters >= 1`; +- `LoopUntil`: strict-bool `until_capability`, present/compatible `until_input`, `max_iters >= 1`; a body binding to the loop's own id requires `init`; - globally-unique `id`s; binding-scope (no non-preceding / cross-route references); - registry-version match vs a frozen spec (drift = hard error). Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reachability, duplicate edges, unconditional cycles on the compiled graph. +**Plan-quality lints (soft warnings).** Multi-agent quality rests on isolation — it mitigates the documented single-agent failure modes (*agentic laziness*, *self-preferential bias*, *goal drift*; see [Dynamic Workflows: scaling complex work](https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling)). 
Because dataflow is typed `Binding`s, independence is **statically checkable** — something model-authored orchestration *code* cannot offer — and the validator lints two violations: + +- **self-review**: a node (or pipeline stage) consuming output produced by the *same capability* — same-capability review cannot provide independent verification; +- **unsynthesized fan-out**: the terminal output binds a bare per-item `fan_out` never combined or verified downstream. + +The complementary positive facts (`independence_facts`) are derivable from the frozen spec — e.g. *"stage `verifier` sees ONLY stage `reviewer`'s per-item output"* — which is what lets the frozen record **prove** structural bias controls to an auditor, not just assert them. + ## 4. Semantics - **Authoring non-deterministic; execution deterministic.** Once frozen, execution + resume replay is fully deterministic (it's just a `Workflow`). @@ -181,6 +193,8 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to - **`AuthoredWorkflowAgent`:** malformed planner output → bounded re-plan → fail past `max_replans`. - **Determinism:** frozen spec replays identically, resumes exactly-once (inherits #92). - **Two gates:** *planning* (valid + sensible + executable + structurally matches a hand-wired baseline) and *output-quality* (intermediate outputs match, capability invariants hold, one repair retry). +- **Pattern coverage:** the six empirically common coordination patterns (classify-route, fan-out/synthesize, generate-filter, loop-until-done, adversarial verification, tournament) all author + validate + execute. The two non-obvious shapes have explicit deterministic tests; tournament exercises loop-carried state. +- **Plan-quality lints:** same-capability self-review and unsynthesized fan-out warn; an independent (different-capability) verification plan lints clean. ## 9. Empirical findings (from the demand-gate spike on `gemini-3.5-flash`) @@ -188,8 +202,9 @@ Fully additive. 
New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. **Open-`dict[str, X]` maps are a structured-output reliability hazard** — hit twice: a capability's `counts: dict[str,int]` came back empty, and the spec's own `Branch.routes` (an open map) came back empty. **Both fixed by enumerated/list structures** (`Branch.routes` → `list[Route]`; capability outputs use fixed fields). The validator warns on open-map capability outputs. 1. **Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. +1. **The pattern-coverage sweep surfaced a real vocabulary gap** — the tournament shape (pairs recomputed per round from the prior round's winners) is inexpressible without **loop-carried state**: a body step must read the previous iteration's output, which the binding-scope rules statically forbid. Fixed with `LoopUntil.init` (seed binding) + the rule that a body binding to the loop's own id reads the carried value. Pattern-driven gate-task selection finds these gaps; single ad-hoc tasks don't. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (25 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 5 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (31 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 6 CI-safe tests incl. 
the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. Plan export & storage — the frozen spec as a durable artifact @@ -278,8 +293,13 @@ A reviewer asked whether the planner should author ADK's existing **YAML config* **Upstream config extension (optional).** If the dynamic constructs prove their value, the cleaner long-term home for runtime `fan_out` / `pipeline` may be **new Workflow YAML block types upstream** plus an allow-listed capability-reference field — at which point authoring could converge more fully onto an extended ADK config shape. Out of scope here; depends on upstream accepting those config/compiler extensions. +**Budget as a bindable runtime value (v1.1-sized).** #92 caps *bound* spend, but a plan cannot *react* to it. Allowing `until_input` (or any `Binding`) to source a runtime-provided budget struct — e.g. `Binding(source="runtime", path="budget.remaining_tokens")` — makes loop-until-budget expressible declaratively, with no new node kind. + +**A "no-plan" escape hatch.** Each orchestration level adds overhead; small, linear tasks are solved more efficiently by a single agent. Letting the planner's output schema include a degenerate direct-execution variant (a single `StepRef`, or an explicit `kind: "direct"`) lets trivial inputs skip orchestration — classify-and-route applied to the meta-decision of whether to orchestrate at all. + ## References - #92 — supervised concurrent dynamic dispatch + `ctx.pipeline` (executor). - Claude Code Dynamic Workflows — https://code.claude.com/docs/en/workflows +- Empirical patterns & failure modes: *Claude Dynamic Workflows: Scaling Complex Work* — https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling - ADK: `Workflow`/`Graph` (`src/google/adk/workflow/_graph.py`), `LlmAgent.output_schema` / `validate_schema`, `BaseAgent.run_async`, `_session_util.extract_state_delta`, `NodeRunner._track_event_in_context`. 
diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index cc265a63ee6..854186d7b37 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,7 +26,7 @@ behind the RFC's "can a model author good plans?" question. pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **25 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **31 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output); plus **plan export/import** @@ -35,7 +35,25 @@ capability, capability/registry version drift, an unsupported schema_version, and a new input with no template schema); plus **ADK config lowering** of the static subset (an illustrative projection toward static Workflow/agent config shapes: sequence/loop/leaf by capability name; runtime fan_out/pipeline/branch -flagged no-equivalent rather than fabricated). +flagged no-equivalent rather than fabricated); plus **pattern coverage** +(adversarial verification and tournament via loop-carried `init`, incl. the +no-`init` validation error) and **plan-quality lints** (same-capability +self-review and unsynthesized fan-out warn; an independent plan lints clean). 
+ +## Pattern coverage — the six coordination shapes + +The six empirically common coordination patterns ([Dynamic Workflows: scaling +complex work](https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling)) +are all expressible in the v1 vocabulary, with deterministic tests: + +| Pattern | `WorkflowSpec` expression | Test | +|---|---|---| +| classify & route | `StepRef(classifier)` → `Branch` | `test_interpreter_branch_takes_correct_route` | +| fan-out / synthesize | `FanOut` → `StepRef(synthesizer)` | `test_interpreter_fanout_then_aggregate` | +| generate & filter | `FanOut(generate)` → `StepRef(filter)` | same shape as above | +| loop until done | `LoopUntil` + `until_capability` | `test_interpreter_loop_until_stops_and_outputs` | +| adversarial verification | `FanOut(skeptics)` → threshold/filter step | `test_pattern_adversarial_verification` | +| tournament | `LoopUntil(init=…, body=[pair_maker, FanOut(judge)])` | `test_pattern_tournament_loop_carried` | ## Live planner sweep (optional evidence) @@ -82,5 +100,18 @@ three shapes: 1. **Planning vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), not planning. +1. **The pattern-coverage sweep surfaced a real vocabulary gap.** The tournament + shape (pairs recomputed each round from the prior round's winners) needs + **loop-carried state**, which the binding-scope rules statically forbade. + Fixed with `LoopUntil.init` + the rule that a body binding to the loop's own + id reads the carried value (validation error without `init`). This is why + gate tasks should be selected per coordination pattern, not ad hoc — single + tasks don't find these gaps. +1. 
**Typed bindings make agent independence statically checkable.** The + validator now lints same-capability self-review (self-preferential bias) + and unsynthesized fan-out; `independence_facts()` derives the positive + provenance statements (e.g. *"stage `verifier` sees ONLY stage `reviewer`'s + per-item output"*) the frozen record can prove to an auditor. Model-authored + orchestration *code* cannot be checked this way. This is a demand-gate artifact, not production code. diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index 136b23d1959..2fab312b6a6 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -130,6 +130,12 @@ class LoopUntil(BaseModel): until_capability: str until_input: Binding max_iters: int = Field(ge=1) + # Loop-carried state (optional): seeds the value a body step reads when it + # binds the loop's OWN id; after each iteration the carried value becomes the + # body's last-node output. Surfaced by the pattern-coverage sweep: a + # tournament (pairs recomputed each round from the prior round's winners) + # is inexpressible without it — every other accumulate-and-refine loop too. + init: Optional[Binding] = None # NOTE: a PLAIN union, not Pydantic's Field(discriminator="kind"). The discriminated @@ -242,7 +248,101 @@ def validate(self, spec: WorkflowSpec) -> list[str]: raise SpecValidationError( f"output references unknown step {spec.output.step!r}" ) - return self.registry.open_map_warnings() + return self.registry.open_map_warnings() + self.quality_lints(spec) + + def quality_lints(self, spec: WorkflowSpec) -> list[str]: + """Plan-quality lints (soft warnings, never hard errors). + + Multi-agent quality rests on isolation: it is what mitigates + self-preferential bias (an agent grading its own output) and goal drift. 
+ Typed bindings make two such properties STATICALLY checkable — something + model-authored orchestration *code* cannot offer: + + * self-review: a node consuming output produced by the SAME capability + cannot provide independent verification; + * unsynthesized fan-out: a plan whose terminal output is a bare per-item + fan_out never combined or verified by a downstream capability. + """ + lints: list[str] = [] + producer_cap: dict[str, str] = {} # node id -> capability producing output + consumed: set[str] = set() # step ids some other node reads from + + def walk(nodes): + for n in nodes: + if isinstance(n, (StepRef, FanOut)): + producer_cap[n.id] = n.capability + elif isinstance(n, Pipeline): + producer_cap[n.id] = n.stages[-1].capability if n.stages else "" + for b in _bindings(n): + if b.source == "step": + consumed.add(b.step) + if isinstance(n, Pipeline): + for prev, st in zip(n.stages, n.stages[1:]): + if st.input is not None and st.input.source == "step": + consumed.add(st.input.step) + if st.capability == prev.capability and st.input is None: + lints.append( + f"plan-quality: pipeline {n.id!r} stage" + f" {st.capability!r} re-checks its own capability's output —" + " same-capability review cannot provide independent" + " verification (self-preferential bias)" + ) + if isinstance(n, Branch): + for route in n.routes: + walk(route.block) + if isinstance(n, LoopUntil): + walk(n.body) + + walk(spec.steps) + + def walk_consumers(nodes): + for n in nodes: + my_cap = getattr(n, "capability", None) + b = getattr(n, "input", None) or getattr(n, "over", None) + if ( + my_cap + and isinstance(b, Binding) + and b.source == "step" + and producer_cap.get(b.step) == my_cap + ): + lints.append( + f"plan-quality: {n.id!r} consumes the output of {b.step!r} via" + f" the same capability {my_cap!r} — same-capability review" + " cannot provide independent verification (self-preferential" + " bias)" + ) + for route in getattr(n, "routes", None) or []: + 
walk_consumers(route.block) + if getattr(n, "body", None): + walk_consumers(n.body) + + walk_consumers(spec.steps) + + if spec.output.source == "step": + terminal = spec.output.step + + def find(nodes): + for n in nodes: + if n.id == terminal: + return n + for route in getattr(n, "routes", None) or []: + hit = find(route.block) + if hit is not None: + return hit + if getattr(n, "body", None): + hit = find(n.body) + if hit is not None: + return hit + return None + + node_ = find(spec.steps) + if isinstance(node_, FanOut) and terminal not in consumed: + lints.append( + f"plan-quality: output binds directly to fan_out {terminal!r}" + " with no downstream synthesis or verification step — parallel" + " findings are never combined or independently checked" + ) + return lints def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: preceding = set(preceding) @@ -256,8 +356,8 @@ def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: raise SpecValidationError( f"unknown until_capability {n.until_capability!r}" ) - # Entry bindings (input/over/on) reference a PRIOR step on this path. - for f in ("input", "over", "on"): + # Entry bindings (input/over/on/init) reference a PRIOR step on this path. + for f in ("input", "over", "on", "init"): b = getattr(n, f, None) if ( isinstance(b, Binding) @@ -294,6 +394,13 @@ def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: f" {st.input.step!r}" ) if isinstance(n, LoopUntil): + # A body step may bind the loop's OWN id to read the loop-carried + # value — but only if `init` seeds it (else iteration 0 has nothing). + if n.init is None and _references_step(n.body, n.id): + raise SpecValidationError( + f"loop {n.id}: body reads the loop-carried value (binds the" + " loop's own id) but no `init` binding seeds it" + ) # body executes in-scope; until_input may reference a body step. 
body_scope = self._walk(n.body, preceding | {n.id}, ids) ui = n.until_input @@ -311,13 +418,29 @@ def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]: def _bindings(n) -> list[Binding]: out = [] - for f in ("input", "over", "on", "until_input"): + for f in ("input", "over", "on", "until_input", "init"): b = getattr(n, f, None) if isinstance(b, Binding): out.append(b) + for st in getattr(n, "stages", None) or []: + if isinstance(st.input, Binding): + out.append(st.input) return out +def _references_step(nodes, step_id: str) -> bool: + """True if any binding in `nodes` (recursively) reads `step_id`.""" + for n in nodes: + if any(b.source == "step" and b.step == step_id for b in _bindings(n)): + return True + for route in getattr(n, "routes", None) or []: + if _references_step(route.block, step_id): + return True + if getattr(n, "body", None) and _references_step(n.body, step_id): + return True + return False + + # ----------------------------------------------------------- export / import # # DESIGN.md §10: the frozen spec is a first-class, exportable artifact. The @@ -359,6 +482,44 @@ def walk(nodes): return found +def independence_facts(spec: WorkflowSpec) -> list[str]: + """Human-readable provenance facts derivable STATICALLY from the bindings. + + Each fact states what a step can possibly see — its only input is a typed + binding, so isolation (no shared context, no inherited reasoning) is a + checkable property of the frozen plan, not a runtime hope. This is what + makes structural bias controls auditable: the record proves a verifier saw + only the producer's output and that synthesis traces back to the task input. 
+ """ + facts: list[str] = [] + + def walk(nodes): + for n in nodes: + if isinstance(n, Pipeline): + for prev, st in zip(n.stages, n.stages[1:]): + if st.input is None and st.capability != prev.capability: + facts.append( + f"pipeline {n.id!r}: stage {st.capability!r} sees ONLY stage" + f" {prev.capability!r}'s per-item output — independent" + " verification, per item" + ) + b = getattr(n, "input", None) or getattr(n, "over", None) + if isinstance(b, Binding): + src = ( + "the task input" + if b.source == "task" + else f"the typed output of {b.step!r}" + ) + facts.append(f"{n.id!r} consumes ONLY {src}") + for route in getattr(n, "routes", None) or []: + walk(route.block) + if getattr(n, "body", None): + walk(n.body) + + walk(spec.steps) + return facts + + class ValidationResult(BaseModel): passed: bool warnings: list[str] = Field(default_factory=list) @@ -603,6 +764,7 @@ def __init__(self, registry: CapabilityRegistry, ctx, *, gate: int = 8): self.ctx = ctx self.sup = DynamicNodeSupervisor(ctx, gate=gate) self.state: dict[str, Any] = {} + self.dispatch_count = 0 # capability dispatches — cheap cost visibility def _resolve(self, binding: Binding, task_input): base = task_input if binding.source == "task" else self.state[binding.step] @@ -616,12 +778,15 @@ def _resolve(self, binding: Binding, task_input): def _arg(self, cap: Capability, value): return json.dumps(value, default=str) if cap.serialize_input else value - async def _dispatch(self, cap_name: str, value, run_id: str): - cap = self.registry[cap_name] - return await self.sup.dispatch( + def _dispatch_cap(self, cap: Capability, value, run_id: str): + self.dispatch_count += 1 + return self.sup.dispatch( cap.build(), node_input=self._arg(cap, value), run_id=run_id ) + async def _dispatch(self, cap_name: str, value, run_id: str): + return await self._dispatch_cap(self.registry[cap_name], value, run_id) + async def execute(self, spec: WorkflowSpec, task_input) -> Any: await self._run_block(spec.steps, task_input, 
prefix="") return self._resolve(spec.output, task_input) @@ -645,8 +810,8 @@ async def _run_block(self, nodes, task_input, prefix: str): self.state[n.id] = await self.sup.pipeline( items, ( - lambda _p, it, i, c=cap, rid=rid: self.sup.dispatch( - c.build(), node_input=self._arg(c, it), run_id=f"{rid}_{i}" + lambda _p, it, i, c=cap, rid=rid: self._dispatch_cap( + c, it, f"{rid}_{i}" ) ), ) @@ -675,11 +840,7 @@ def stage(prev, it, i, si=si, st=st, rid=rid): if st.input is not None else (it if si == 0 else prev) ) - return self.sup.dispatch( - cap.build(), - node_input=self._arg(cap, value), - run_id=f"{rid}_{i}_{si}", - ) + return self._dispatch_cap(cap, value, f"{rid}_{i}_{si}") stage_fns.append(stage) self.state[n.id] = await self.sup.pipeline(items, *stage_fns) @@ -696,9 +857,16 @@ def stage(prev, it, i, si=si, st=st, rid=rid): ) self.state[n.id] = out elif isinstance(n, LoopUntil): + # Loop-carried state: `init` seeds state[loop.id]; after every + # iteration the carried value becomes the body's last-node output, so + # a body step binding the loop's own id reads the PRIOR round's result + # (tournament: pairs recomputed each round from the prior winners). 
+ if n.init is not None: + self.state[n.id] = self._resolve(n.init, task_input) out = None for i in range(n.max_iters): out = await self._run_block(n.body, task_input, prefix=f"{rid}_i{i}_") + self.state[n.id] = out verdict = await self._dispatch( n.until_capability, self._resolve(n.until_input, task_input), diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index bf34778f191..bf8d088691d 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -627,3 +627,199 @@ def test_import_rejects_new_input_without_template_schema(): # but template promotion (a captured schema) lets a new input through: env["task_input_schema"] = {"required": ["files"]} assert import_plan(env, _registry(), task_input=other) is not None + + +# ------------------------------------------------------------ pattern coverage +# The six empirically common coordination patterns (classify-route, fan-out/ +# synthesize, generate-filter, loop-until-done, adversarial verification, +# tournament) must all be expressible in the v1 vocabulary. Four are already +# exercised above (branch test = classify-route; fanout_then_aggregate = +# fan-out/synthesize AND generate-filter; loop test = loop-until-done). The two +# non-obvious shapes get explicit tests here. Tournament is the one that +# surfaced a vocabulary gap: data-dependent pairing needs LOOP-CARRIED state +# (`LoopUntil.init` + body bindings to the loop's own id). 
+ + +def _pattern_registry(): + return CapabilityRegistry([ + Capability( + name="pair_maker", + build=_cap_node( + "pair_maker", + lambda lst: [lst[i : i + 2] for i in range(0, len(lst), 2)], + ), + input_kind="list", + serialize_input=False, + ), + Capability( + name="judge", + build=_cap_node("judge", lambda pair: min(pair)), + input_kind="item", + serialize_input=False, + ), + Capability( + name="single_winner", + build=_cap_node("single_winner", lambda lst: len(lst) == 1), + input_kind="list", + serialize_input=False, + ), + Capability( + name="skeptic", + build=_cap_node( + "skeptic", + lambda f: {"claim": f["claim"], "refuted": not f["evidence"]}, + ), + input_kind="item", + serialize_input=False, + ), + Capability( + name="keep_unrefuted", + build=_cap_node( + "keep_unrefuted", + lambda vs: [v["claim"] for v in vs if not v["refuted"]], + ), + input_kind="list", + serialize_input=False, + ), + ]) + + +def _tournament_spec(): + return WorkflowSpec( + goal="single elimination", + steps=[ + LoopUntil( + kind="loop_until", + id="tourney", + init=Binding(source="task", path="candidates"), + body=[ + StepRef( + kind="step", + id="pairs", + capability="pair_maker", + # reads the LOOP-CARRIED value: the candidates on round 0, + # the prior round's winners afterwards. 
+ input=Binding(source="step", step="tourney"), + ), + FanOut( + kind="fan_out", + id="round_winners", + over=Binding(source="step", step="pairs"), + capability="judge", + ), + ], + until_capability="single_winner", + until_input=Binding(source="step", step="round_winners"), + max_iters=4, + ), + ], + output=Binding(source="step", step="tourney"), + ) + + +@pytest.mark.asyncio +async def test_pattern_tournament_loop_carried(): + reg = _pattern_registry() + assert WorkflowSpecValidator(reg).validate(_tournament_spec()) == [] + out = await _run_spec( + _tournament_spec(), + reg, + {"candidates": ["delta", "bravo", "charlie", "alpha"]}, + ) + # round 1: (delta,bravo)->bravo, (charlie,alpha)->alpha; round 2: -> alpha. + assert out == ["alpha"] + + +def test_validator_rejects_loop_carried_read_without_init(): + spec = _tournament_spec() + spec.steps[0].init = None # body still binds the loop's own id + with pytest.raises(SpecValidationError, match="init"): + WorkflowSpecValidator(_pattern_registry()).validate(spec) + + +@pytest.mark.asyncio +async def test_pattern_adversarial_verification(): + # Independent skeptics per finding (fan_out) + a threshold/filter step: + # only evidence-backed claims survive. No new vocabulary needed. 
+ spec = WorkflowSpec( + goal="verify findings adversarially", + steps=[ + FanOut( + kind="fan_out", + id="verdicts", + over=Binding(source="task", path="findings"), + capability="skeptic", + ), + StepRef( + kind="step", + id="confirmed", + capability="keep_unrefuted", + input=Binding(source="step", step="verdicts"), + ), + ], + output=Binding(source="step", step="confirmed"), + ) + reg = _pattern_registry() + assert WorkflowSpecValidator(reg).validate(spec) == [] + out = await _run_spec( + spec, + reg, + { + "findings": [ + {"claim": "A", "evidence": True}, + {"claim": "B", "evidence": False}, + {"claim": "C", "evidence": True}, + ] + }, + ) + assert out == ["A", "C"] + + +# ------------------------------------------------------------ quality lints +def test_lint_warns_on_same_capability_review(): + # classify reviewing classify's own output cannot be independent. + spec = WorkflowSpec( + goal="x", + steps=[ + StepRef( + kind="step", + id="a", + capability="classify", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="b", + capability="classify", + input=Binding(source="step", step="a"), + ), + ], + output=Binding(source="step", step="b"), + ) + warnings = WorkflowSpecValidator(_registry()).validate(spec) + assert any("same capability 'classify'" in w for w in warnings) + + +def test_lint_warns_on_unsynthesized_fanout(): + spec = WorkflowSpec( + goal="x", + steps=[ + FanOut( + kind="fan_out", + id="rev", + over=Binding(source="task", path="files"), + capability="review", + ), + ], + output=Binding(source="step", step="rev"), + ) + warnings = WorkflowSpecValidator(_registry()).validate(spec) + assert any("no downstream synthesis" in w for w in warnings) + + +def test_lints_clean_on_independent_plan(): + # review -> count: different capabilities, fan_out is synthesized. Clean. 
+ assert ( + WorkflowSpecValidator(_registry()).validate(_fanout_aggregate_spec()) + == [] + ) From c6bc3350d3c4a389faf16e2daedcea529e38fafd Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 11:59:43 -0700 Subject: [PATCH 27/64] demo(workflow): independence-lints beat + cost beat; narrative and counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Beat 2b (independence lints): the typed bindings prove the verifier stage sees only the reviewer's per-item output and each step consumes only its upstream's typed output — the static-independence advantage over model-written orchestration code, shown on camera. * Beat 4 cost line: N capability dispatches + at-most-one planner call (zero on frozen replay); dispatches surfaced in the output event. * README/DEMO_NARRATIVE: new beats, close updated; suite counts 11+31+6=48. --- .../authored_workflow_demo/DEMO_NARRATIVE.md | 47 +++++++++++++++---- .../authored_workflow_demo/README.md | 9 ++-- .../security_audit_planner/agent.py | 32 ++++++++++++- .../authored_workflow_demo/test_demo_agent.py | 18 +++++++ 4 files changed, 93 insertions(+), 13 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 19c638a0432..050dd471f03 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -53,6 +53,29 @@ Send: **"Plan and run a codebase security review."** The chat streams: > model can only compose pre-approved capabilities — no arbitrary calls, no code > execution. That's the security model: capability allow-listing, not a sandbox." +## Beat 2b — independence lints (the quality argument, made static) + +``` +🧪 Plan-quality lints: 0 warnings. 
Agent independence is statically checkable + from the typed bindings — the frozen record *proves* it to an auditor: + - pipeline 'review_pipeline': stage 'verifier' sees ONLY stage 'reviewer''s + per-item output — independent verification, per item + - 'review_pipeline' consumes ONLY the task input + - 'triage_step' consumes ONLY the typed output of 'review_pipeline' + - 'format_step' consumes ONLY the typed output of 'triage_step' +``` + +> "This is the multi-agent quality argument, made structural. Isolation is what +> mitigates the documented single-agent failure modes — an agent grading its own +> output (self-preferential bias) and requirements decaying through layers of +> summarization (goal drift). Because every step's only input is a typed +> binding, those properties are **statically checkable**: the validator warns +> if a plan has a step reviewing its own capability's output, or a fan-out +> that's never synthesized. Here it's clean — and the frozen record *proves* +> the verifier saw only the reviewer's output. You can't prove that about +> model-written orchestration code; you can about a typed plan. For a regulated +> audience, that's the sharpest line in the demo." + ## Beat 3 — freeze (State tab) ``` @@ -119,6 +142,8 @@ cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, ca ``` 📄 Audit result: Identified 4 vulnerabilities: 1 critical (command injection), 2 high (hardcoded credentials and SQL injection), and 1 medium (division by zero). +📊 Cost: 10 capability dispatches in 8.2s + 1 planner call — per-step work runs + outside the planner's context. ``` > "Open **Events**: ADK runs the plan on the real engine via the #92 supervisor. @@ -127,7 +152,9 @@ cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, ca > the barrier-free pipeline, not two separate fan-out waves. Then `triager` over > all verified findings, then `formatter`. 
The findings are real: a CRITICAL > `os.system` injection, HIGH hardcoded creds and SQL injection, and a MEDIUM -> divide-by-zero." +> divide-by-zero. And note the cost line: **one** planner call, ten capability +> dispatches — the plan is authored once, the work scales outside the planner's +> context. On the replay beat it'll say **zero** planner calls." ## Beat 5 — reproduce (re-send the same prompt) @@ -153,11 +180,15 @@ Same hash, `reused` flips to `true` — the model is not called the second time. ## Close (~20s) -> "So: a model authored a typed, validated, capability-bounded plan; ADK executed -> it on the real engine; the plan **exported** to a portable, defensively-imported -> audit artifact; and a re-send replayed the exact frozen plan. The deterministic -> test suites — 11 (#92) + 25 (#93) + 5 (demo) — lock all of this in CI, including -> the no-LLM reuse path and the export round-trip / tamper / drift checks." +> "So: a model authored a typed, validated, capability-bounded plan whose +> **agent independence is statically proven**; ADK executed it on the real +> engine at a visible cost (one planner call, the work outside its context); +> the plan **exported** to a portable, defensively-imported audit artifact; and +> a re-send replayed the exact frozen plan with zero planner calls. The +> deterministic test suites — 11 (#92) + 31 (#93) + 6 (demo) — lock all of this +> in CI, including the no-LLM reuse path, the export round-trip / tamper / +> drift checks, the plan-quality lints, and the six-coordination-pattern +> coverage sweep (adversarial verification and tournament included)." **Convergence with ADK Workflow config / `root_agent.yaml`** — this is what Beat 3c shows, if a reviewer asks "why not author `loop_config/root_agent.yaml`?": @@ -180,6 +211,6 @@ Same hash, `reused` flips to `true` — the model is not called the second time. 
```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 25 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 5 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 6 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index d76c100bf56..a06285c50b1 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -38,11 +38,12 @@ Point at the ADK-native evidence as it streams: 1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`pipeline → step → step`: a `reviewer → verifier` pipeline over the files, then `triager`, then `formatter`). 1. **Validation** — "Validation passed" + the capability list (all registered). +1. **Independence lints** — `🧪 Plan-quality lints: 0 warnings.` The typed bindings make agent isolation **statically checkable**: the verifier stage provably sees only the reviewer's per-item output (independent verification, per file), and each downstream step provably consumes only its upstream's typed output. The frozen record can *prove* these structural bias controls to an auditor — model-written orchestration code can't be checked this way. 1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. 1. **Exported plan** — `📦 Exported plan → security_audit_plan.json`. The full `FrozenWorkflowRecord` (spec, `sha256`, planner model, registry + capability versions, validation, task-input digest) as a portable envelope; import recomputes the hash and re-validates against the current registry. 
`cat security_audit_plan.json | jq .` on camera. 1. **ADK config lowering** — `🧬 ADK config lowering (static subset) — 2/3 …`. The plan's static skeleton projects toward ADK Workflow/agent config shapes (a static `Workflow`/`SequentialAgent` skeleton + `LlmAgent` leaves by capability name); the `reviewer → verifier` pipeline is flagged **no-AgentConfig-equivalent**, not fabricated. An illustrative projection (RFC #93 §11) — see the talking point below. 1. **Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. -1. **Final output** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`). +1. **Final output + cost** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`), then `📊 Cost: 10 capability dispatches in N.Ns + 1 planner call` — the planner is invoked at most once (zero on replay); all per-step work runs outside its context. (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) @@ -73,11 +74,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 25 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 5 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 6 ``` -- Deterministic suites: #92 **11** + #93 **25** + demo **5** = **41** (incl. a no-LLM reuse-path test). +- Deterministic suites: #92 **11** + #93 **31** + demo **6** = **48** (incl. 
a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — and the plan-quality lints). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index a72ee314813..8b1bc21093b 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -35,6 +35,7 @@ import json import os import sys +import time from typing import Literal from google.adk import Agent @@ -61,6 +62,7 @@ from authoring import export_plan # noqa: E402 from authoring import FrozenWorkflowRecord # noqa: E402 from authoring import import_plan # noqa: E402 +from authoring import independence_facts # noqa: E402 from authoring import lower_to_agent_config # noqa: E402 from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 @@ -276,6 +278,19 @@ async def author_validate_execute(ctx: Context, node_input): + (f"\n⚠️ warnings: {warnings}" if warnings else "") ) + # 2b. INDEPENDENCE — the quality argument, made static. Isolation is what + # mitigates self-preferential bias and goal drift in multi-agent work; with + # typed bindings it is a checkable property of the frozen plan (the validator + # lints same-capability self-review and unsynthesized fan-out), not a runtime + # hope. Model-authored orchestration *code* cannot be checked this way. 
+ lints = [w for w in warnings if w.startswith("plan-quality")] + facts = "\n".join(f" - {f}" for f in independence_facts(spec)) + yield _msg( + f"🧪 **Plan-quality lints: {len(lints)} warnings.** Agent independence" + " is statically checkable from the typed bindings — the frozen record" + f" *proves* it to an auditor:\n{facts}" + ) + # 3. FREEZE — persist spec + hash to session state on first author only # (visible in the State tab; reused runs already have it). # NOTE: session state keeps a minimal {spec, hash} subset so the State tab @@ -336,17 +351,32 @@ async def author_validate_execute(ctx: Context, node_input): ) # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). - result = await SpecInterpreter(reg, ctx).execute(spec, {"files": FILES}) + t0 = time.perf_counter() + interp = SpecInterpreter(reg, ctx) + result = await interp.execute(spec, {"files": FILES}) + elapsed = time.perf_counter() - t0 yield _msg( "📄 **Audit result:**" f" {result.get('note') if isinstance(result, dict) else result}" ) + # 4b. COST — cheap visibility into what the orchestration spent. The planner + # was invoked at most once (zero on frozen replay); every capability dispatch + # ran OUTSIDE the planner's context. + planner_cost = ( + "0 planner calls (frozen replay)" if reused else "1 planner call" + ) + yield _msg( + f"📊 **Cost:** {interp.dispatch_count} capability dispatches in" + f" {elapsed:.1f}s + {planner_cost} — per-step work runs outside the" + " planner's context." 
+ ) yield Event( output={ "hash": spec_hash, "result": result, "capabilities": caps, "reused": reused, + "dispatches": interp.dispatch_count, } ) diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py index 1ebc150dc04..bbd87cbdb43 100644 --- a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -99,6 +99,22 @@ def test_demo_spec_validates(): WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) # no raise +def test_demo_spec_quality_lints_clean_and_independent(): + # Zero plan-quality lints: verification is by a DIFFERENT capability + # (reviewer -> verifier), and the fan-out is synthesized (triager). + warnings = WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + # The independence facts the demo shows on camera are derivable statically: + # the verifier stage provably sees only the reviewer's per-item output, and + # each downstream step provably consumes only its upstream's typed output. + from authoring import independence_facts # noqa: E402 + + facts = "\n".join(independence_facts(_demo_spec())) + assert "stage 'verifier' sees ONLY stage 'reviewer'" in facts + assert "'tri' consumes ONLY the typed output of 'rev'" in facts + assert "'fmt' consumes ONLY the typed output of 'tri'" in facts + + def test_demo_spec_agentconfig_lowering(): # The demo's plan (pipeline -> step -> step) is exactly the static/dynamic # split RFC #93 §11 describes: the two trailing steps lower to LlmAgent under @@ -216,3 +232,5 @@ async def test_reuse_path_no_llm(monkeypatch): "formatter", } assert out["result"]["note"].startswith("audited") + # cost visibility: 4 files x 2 pipeline stages + triager + formatter = 10. 
+ assert out["dispatches"] == 10 From 47dc9070cf1f5883786f45ad92431dc757c81f3a Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 12:01:19 -0700 Subject: [PATCH 28/64] docs(workflow): mdformat table alignment in spike README --- .../workflows/authored_workflow_spike/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 854186d7b37..d31a94d04be 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -46,14 +46,14 @@ The six empirically common coordination patterns ([Dynamic Workflows: scaling complex work](https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling)) are all expressible in the v1 vocabulary, with deterministic tests: -| Pattern | `WorkflowSpec` expression | Test | -|---|---|---| -| classify & route | `StepRef(classifier)` → `Branch` | `test_interpreter_branch_takes_correct_route` | -| fan-out / synthesize | `FanOut` → `StepRef(synthesizer)` | `test_interpreter_fanout_then_aggregate` | -| generate & filter | `FanOut(generate)` → `StepRef(filter)` | same shape as above | -| loop until done | `LoopUntil` + `until_capability` | `test_interpreter_loop_until_stops_and_outputs` | -| adversarial verification | `FanOut(skeptics)` → threshold/filter step | `test_pattern_adversarial_verification` | -| tournament | `LoopUntil(init=…, body=[pair_maker, FanOut(judge)])` | `test_pattern_tournament_loop_carried` | +| Pattern | `WorkflowSpec` expression | Test | +| ------------------------ | ----------------------------------------------------- | ----------------------------------------------- | +| classify & route | `StepRef(classifier)` → `Branch` | `test_interpreter_branch_takes_correct_route` | +| fan-out / synthesize | `FanOut` → `StepRef(synthesizer)` | 
`test_interpreter_fanout_then_aggregate` | +| generate & filter | `FanOut(generate)` → `StepRef(filter)` | same shape as above | +| loop until done | `LoopUntil` + `until_capability` | `test_interpreter_loop_until_stops_and_outputs` | +| adversarial verification | `FanOut(skeptics)` → threshold/filter step | `test_pattern_adversarial_verification` | +| tournament | `LoopUntil(init=…, body=[pair_maker, FanOut(judge)])` | `test_pattern_tournament_loop_carried` | ## Live planner sweep (optional evidence) From d449d1b5fd17bf2c7a5bb5ede4bea14545744f92 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 13:03:35 -0700 Subject: [PATCH 29/64] =?UTF-8?q?demo(workflow):=20quality-gate=20beat=20?= =?UTF-8?q?=E2=80=94=20adversarial=20ask,=20lint=20fires=20on=20camera,=20?= =?UTF-8?q?plan=20rejected?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Send 'Plan a sloppy review: have the reviewer double-check its own findings' and the planner authors a VALID plan (registered capabilities, typed bindings) whose pipeline is reviewer -> reviewer. Plain validation passes; the plan-quality lint catches the structural self-review bias and the gate rejects the plan before freezing or executing (production: bounded re-plan). * agent.py: trigger-phrase path (checked before load-or-author so it works in any session), live-model sloppy planner, lint beat, rejection beat. * test_demo_agent.py: CI-safe test pinning the lint on the sloppy shape (7). * README/DEMO_NARRATIVE: Beat 6 + counts 11+31+7=49. 
--- .../authored_workflow_demo/DEMO_NARRATIVE.md | 29 +++++++- .../authored_workflow_demo/README.md | 12 +++- .../security_audit_planner/agent.py | 68 +++++++++++++++++++ .../authored_workflow_demo/test_demo_agent.py | 37 ++++++++++ 4 files changed, 142 insertions(+), 4 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 050dd471f03..f2ea5b4c6bc 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -178,6 +178,31 @@ cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, ca Same hash, `reused` flips to `true` — the model is not called the second time. +## Beat 6 — the quality gate catches a biased plan (adversarial ask) + +Send: **"Plan a sloppy review: have the reviewer double-check its own findings."** + +``` +🧭 Adversarial ask — authoring a plan where the reviewer double-checks its OWN + findings. Watch the quality gate. +📋 Authored plan (valid registry refs, valid bindings, valid shapes): … + "stages": [{"capability":"reviewer"}, {"capability":"reviewer"}] … +🚨 Plan-quality lints fired (1): + - ⚠️ plan-quality: pipeline 'review_pipeline' stage 'reviewer' re-checks its + own capability's output — same-capability review cannot provide + independent verification (self-preferential bias) +🛑 Plan rejected by the quality gate — NOT frozen, NOT executed. +``` + +> "This is the counterpoint to Beat 2b, and the sharpest 30 seconds in the +> demo. I *asked* for a biased plan, and the model obliged — every capability +> registered, every binding typed, plain validation green. A code-authoring +> system would now run it. Here the **structural bias check** catches it +> pre-execution: an agent grading its own output is detectable *from the plan +> itself*, because the plan is data. 
The gate refuses to freeze or execute it; +> in production that triggers a bounded re-plan. Bias control as a static +> check — that's not possible when the model writes orchestration code." + ## Close (~20s) > "So: a model authored a typed, validated, capability-bounded plan whose @@ -185,7 +210,7 @@ Same hash, `reused` flips to `true` — the model is not called the second time. > engine at a visible cost (one planner call, the work outside its context); > the plan **exported** to a portable, defensively-imported audit artifact; and > a re-send replayed the exact frozen plan with zero planner calls. The -> deterministic test suites — 11 (#92) + 31 (#93) + 6 (demo) — lock all of this +> deterministic test suites — 11 (#92) + 31 (#93) + 7 (demo) — lock all of this > in CI, including the no-LLM reuse path, the export round-trip / tamper / > drift checks, the plan-quality lints, and the six-coordination-pattern > coverage sweep (adversarial verification and tournament included)." @@ -212,5 +237,5 @@ Same hash, `reused` flips to `true` — the model is not called the second time. ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 6 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 7 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index a06285c50b1..9c3b89ba354 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -47,6 +47,14 @@ Point at the ADK-native evidence as it streams: (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) 
+Then run the **quality-gate beat** — send: + +```text +Plan a sloppy review: have the reviewer double-check its own findings. +``` + +The planner authors a *valid* plan (registered capabilities, typed bindings) whose pipeline is `reviewer → reviewer` — and the **plan-quality lint fires on camera**: `🚨 plan-quality: pipeline 'rev' stage 'reviewer' re-checks its own capability's output — same-capability review cannot provide independent verification (self-preferential bias)`, followed by `🛑 Plan rejected by the quality gate — NOT frozen, NOT executed`. Talking point: *plain validation passes; only the structural bias check catches it — before anything runs, and provably.* + ### Relationship to ADK Workflow config / `root_agent.yaml` (talking point) The RFC's direction is to **converge with ADK config where it fits** (RFC #93 → "Relationship to ADK Workflow config / `root_agent.yaml`"; DESIGN §11). The linked `loop_config/root_agent.yaml` sample is the right mental model for the **static** portion: a human-authored `agent_class: Workflow` YAML graph with known `edges`, child YAML files, and function refs like `.agent.route_headline`. #93 should be able to lower/export static graph skeletons toward that style, while the model-facing format stays `WorkflowSpec`. @@ -75,10 +83,10 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 6 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 7 ``` -- Deterministic suites: #92 **11** + #93 **31** + demo **6** = **48** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — and the plan-quality lints). 
+- Deterministic suites: #92 **11** + #93 **31** + demo **7** = **49** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — and the plan-quality lints). - PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index 8b1bc21093b..f38d72bb4df 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -201,6 +201,27 @@ def _registry() -> CapabilityRegistry: ) +# The QUALITY-GATE beat: a deliberately biased ask — the reviewer +# double-checking its OWN findings. Registry/bindings/shapes are all valid, so +# plain validation passes; only the plan-quality lints catch the structural +# self-review bias, and the demo rejects the plan before freezing or running. +_SLOPPY_TRIGGERS = ("sloppy", "self-review", "own findings", "double-check") +_SLOPPY_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." + " Author, in order:" + " (1) a pipeline over task.files with two stages, reviewer then reviewer" + " AGAIN — the reviewer double-checks its own findings per item;" + " (2) a step running triager on the pipeline output;" + " (3) a step running formatter on the report." + " Use Binding(source='task', path='files') for the pipeline's over, and" + " Binding(source='step', step=) to chain steps. A pipeline stage takes" + " its input from the previous stage automatically, so stages need no input" + " binding. Set output to the formatter step." 
+) + + def _msg(text: str) -> Event: return Event( content=types.Content(role="model", parts=[types.Part(text=text)]) @@ -221,6 +242,53 @@ def _hash(spec: WorkflowSpec) -> str: async def author_validate_execute(ctx: Context, node_input): reg = _registry() + # 0. QUALITY-GATE path (checked before load-or-author so it works in any + # session): an adversarial ask makes the planner author a structurally + # biased plan; the lints catch it and the gate rejects it pre-execution. + if any(k in str(node_input or "").lower() for k in _SLOPPY_TRIGGERS): + yield _msg( + "🧭 **Adversarial ask** — authoring a plan where the reviewer" + " double-checks its OWN findings. Watch the quality gate." + ) + sloppy = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_SLOPPY_PLANNER_INSTR, + ) + raw = await ctx.run_node( + sloppy, + node_input=f"Audit these files: {[f['path'] for f in FILES]}.", + run_id="plan_sloppy", + ) + spec = WorkflowSpec.model_validate(raw) + yield _msg( + "📋 **Authored plan** (valid registry refs, valid bindings, valid" + f" shapes):\n```json\n{json.dumps(spec.model_dump(), indent=1)}\n```" + ) + lints = [ + w + for w in WorkflowSpecValidator(reg).validate(spec) + if w.startswith("plan-quality") + ] + if lints: + fired = "\n".join(f" - ⚠️ {w}" for w in lints) + yield _msg( + f"🚨 **Plan-quality lints fired ({len(lints)}):**\n{fired}\n\n🛑" + " **Plan rejected by the quality gate** — NOT frozen, NOT executed." + " Plain validation passed (every capability is registered, every" + " binding is typed); only the structural bias check caught it. In" + " production this triggers a bounded re-plan (`max_replans`)." + ) + else: + yield _msg( + "ℹ️ The planner did not author the biased shape this time —" + " re-send the prompt to retry the adversarial ask." + ) + yield Event(output={"rejected": bool(lints), "lints": len(lints)}) + return + # 1. LOAD-OR-AUTHOR. 
If a frozen spec exists in this session, REUSE it (do not # re-author) — this is the resume/reproducibility claim. Otherwise the model # authors a fresh typed WorkflowSpec (data, not code). diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py index bbd87cbdb43..9d527b0b5b6 100644 --- a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -99,6 +99,43 @@ def test_demo_spec_validates(): WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) # no raise +def test_quality_gate_rejects_self_review_plan(): + # The adversarial-ask beat: reviewer double-checking its OWN findings is a + # VALID plan (registry, bindings, shapes) — only the plan-quality lint + # catches the structural bias. This pins the shape the live beat relies on. + sloppy = WorkflowSpec( + goal="audit", + steps=[ + Pipeline( + kind="pipeline", + id="rev", + over=Binding(source="task", path="files"), + stages=[ + PipelineStage(capability="reviewer"), + PipelineStage(capability="reviewer"), # self-review + ], + ), + StepRef( + kind="step", + id="tri", + capability="triager", + input=Binding(source="step", step="rev"), + ), + StepRef( + kind="step", + id="fmt", + capability="formatter", + input=Binding(source="step", step="tri"), + ), + ], + output=Binding(source="step", step="fmt"), + ) + warnings = WorkflowSpecValidator(demo._registry()).validate(sloppy) + lints = [w for w in warnings if w.startswith("plan-quality")] + assert len(lints) == 1 + assert "re-checks its own capability's output" in lints[0] + + def test_demo_spec_quality_lints_clean_and_independent(): # Zero plan-quality lints: verification is by a DIFFERENT capability # (reviewer -> verifier), and the fan-out is synthesized (triager). 
From 1a2957efdd549bcf1fbc22b12833ee6bdcded9b1 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:04:23 -0700 Subject: [PATCH 30/64] spike+demo(workflow): contract-hash drift, auditable lint waivers, free-authoring beat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the three maintainer-review items that strengthen gate evidence (the rest stay RFC-design-level): * Contract-hash drift (review #3): Capability.contract_hash() = sha256(input_kind + output schema), recorded per referenced capability in FrozenWorkflowRecord; import rejects contract drift even when the manual version string was never bumped (manual versions demoted to secondary). * Auditable lint suppression (review #6): allow_self_chain capability policy opts legitimate draft->critique->redraft refinement out of the self-review lint; per-plan lint_waivers (node id -> justification) suppress lints AND are recorded in the frozen record/export envelope. * Free-authoring beat (demo review): 'freely/decompose' trigger gives the planner ONLY goal + capability descriptions (no recipe) — the honest model-authored claim; recipe-free instruction pinned by a CI test. Suites: 11 + 34 + 8 = 53 deterministic tests. 
--- .../authored_workflow_demo/DEMO_NARRATIVE.md | 23 +++- .../authored_workflow_demo/README.md | 14 ++- .../security_audit_planner/agent.py | 41 +++++- .../authored_workflow_demo/test_demo_agent.py | 12 ++ .../authored_workflow_spike/DESIGN.md | 11 +- .../authored_workflow_spike/README.md | 7 +- .../authored_workflow_spike/authoring.py | 119 ++++++++++++++++-- .../authored_workflow_spike/test_authoring.py | 53 +++++++- 8 files changed, 250 insertions(+), 30 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index f2ea5b4c6bc..8f7e98c3bad 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -42,6 +42,23 @@ Send: **"Plan and run a codebase security review."** The chat streams: > construct that lets each file flow review→verify independently — item A can be > verifying while item B is still being reviewed." +## Beat 1b — free authoring (the honest "model-authored" claim) + +In a new session, send: **"Freely plan a security review of the files — decompose it yourself."** + +``` +🧭 Free authoring — the planner receives ONLY the goal + capability + descriptions (reviewer, verifier, triager, formatter); no plan recipe. + The shape below is the model's own decomposition (it may differ run to + run — and the freeze beat then makes THIS run replayable). +``` + +> "Beat 1 was instruction-guided so the recording is reproducible — the model +> filled in a known shape. This beat is the real claim: goal + capabilities +> in, plan out, no recipe. Whatever it authors, the same validator, lints, +> freeze, and replay machinery apply — free authoring composes with +> everything you're about to see, and the frozen hash pins *this* run." 
+ ## Beat 2 — validate (capability allow-list) ``` @@ -210,7 +227,7 @@ Send: **"Plan a sloppy review: have the reviewer double-check its own findings." > engine at a visible cost (one planner call, the work outside its context); > the plan **exported** to a portable, defensively-imported audit artifact; and > a re-send replayed the exact frozen plan with zero planner calls. The -> deterministic test suites — 11 (#92) + 31 (#93) + 7 (demo) — lock all of this +> deterministic test suites — 11 (#92) + 34 (#93) + 8 (demo) — lock all of this > in CI, including the no-LLM reuse path, the export round-trip / tamper / > drift checks, the plan-quality lints, and the six-coordination-pattern > coverage sweep (adversarial verification and tournament included)." @@ -236,6 +253,6 @@ Send: **"Plan a sloppy review: have the reviewer double-check its own findings." ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 7 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 34 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 9c3b89ba354..000fffa48ef 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -47,6 +47,14 @@ Point at the ADK-native evidence as it streams: (Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) +Then run the **free-authoring beat** — in a **new session**, send: + +```text +Freely plan a security review of the files — decompose it yourself. 
+``` + +The planner receives ONLY the goal + capability descriptions (no plan recipe — `test_free_planner_instruction_is_recipe_free` pins this). The shape may differ run to run; that's the point — and the freeze beat then makes *this* run replayable. Talking point: *the default walkthrough shows the mechanics on a scripted plan; this beat is the honest "model-authored" claim.* + Then run the **quality-gate beat** — send: ```text @@ -82,11 +90,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 31 -pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 7 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 34 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 ``` -- Deterministic suites: #92 **11** + #93 **31** + demo **7** = **49** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — and the plan-quality lints). +- Deterministic suites: #92 **11** + #93 **34** + demo **8** = **53** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — the plan-quality lints with `allow_self_chain` policy + recorded waivers, contract-hash drift rejection, and the recipe-free free-authoring instruction pin). - PR #3 CI green except the documented fork-only `agent-triage` token job. 
## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py index f38d72bb4df..62bcbc85049 100644 --- a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -222,6 +222,26 @@ def _registry() -> CapabilityRegistry: ) +# The FREE-AUTHORING beat: the planner receives ONLY the goal + capability +# descriptions — no plan recipe. This is the honest "model-authored" claim +# (the default _PLANNER_INSTR dictates the shape for recording reliability; +# the spike's demand gate also used free authoring). +_FREE_TRIGGERS = ("freely", "free-form", "your own plan", "decompose") +_FREE_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." + " GOAL: audit the files for security issues and produce a one-line" + " report note. Decompose the goal into a plan YOURSELF — no recipe is" + " provided. Choose whichever control blocks fit (step / fan_out /" + " pipeline / branch / loop_until). Binding rules:" + " Binding(source='task', path='files') reads the file list;" + " Binding(source='step', step=) reads a prior step's output; a" + " pipeline stage takes the previous stage's per-item output" + " automatically. Set output to the final step." 
+) + + def _msg(text: str) -> Event: return Event( content=types.Content(role="model", parts=[types.Part(text=text)]) @@ -305,18 +325,27 @@ async def author_validate_execute(ctx: Context, node_input): ) else: reused = False + free = any(k in str(node_input or "").lower() for k in _FREE_TRIGGERS) cap_list = ", ".join(f"`{n}`" for n in reg.names()) - yield _msg( - "🧭 **Model-authored Workflow** — planning a security audit over " - f"{len(FILES)} files using only registered capabilities " - f"({cap_list})." - ) + if free: + yield _msg( + "🧭 **Free authoring** — the planner receives ONLY the goal +" + f" capability descriptions ({cap_list}); no plan recipe. The shape" + " below is the model's own decomposition (it may differ run to" + " run — and the freeze beat then makes THIS run replayable)." + ) + else: + yield _msg( + "🧭 **Model-authored Workflow** — planning a security audit over " + f"{len(FILES)} files using only registered capabilities " + f"({cap_list})." + ) planner = Agent( name="planner", model=MODEL, output_schema=WorkflowSpec, generate_content_config=DET, - instruction=_PLANNER_INSTR, + instruction=_FREE_PLANNER_INSTR if free else _PLANNER_INSTR, ) raw = await ctx.run_node( planner, diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py index 9d527b0b5b6..cc7cc02025d 100644 --- a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -136,6 +136,18 @@ def test_quality_gate_rejects_self_review_plan(): assert "re-checks its own capability's output" in lints[0] +def test_free_planner_instruction_is_recipe_free(): + # The honesty contract behind the free-authoring beat: the default + # instruction dictates the plan; the free instruction must NOT — only the + # goal, the capability descriptions, and the binding rules. 
+ assert "reviewer then verifier" in demo._PLANNER_INSTR # scripted (default) + assert "reviewer then verifier" not in demo._FREE_PLANNER_INSTR + assert "(1)" not in demo._FREE_PLANNER_INSTR # no step recipe + assert "YOURSELF" in demo._FREE_PLANNER_INSTR + # trigger sets must not overlap (a prompt can't be both free and sloppy). + assert not set(demo._FREE_TRIGGERS) & set(demo._SLOPPY_TRIGGERS) + + def test_demo_spec_quality_lints_clean_and_independent(): # Zero plan-quality lints: verification is by a DIFFERENT capability # (reviewer -> verifier), and the fan-out is synthesized (triager). diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index b151affbae3..820ff8955bc 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -131,6 +131,8 @@ Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reac - **self-review**: a node (or pipeline stage) consuming output produced by the *same capability* — same-capability review cannot provide independent verification; - **unsynthesized fan-out**: the terminal output binds a bare per-item `fan_out` never combined or verified downstream. +**Suppression** (so the lints stay credible instead of globally disabled): a capability registered with `allow_self_chain=True` opts out of the self-review lint (legitimate `draft → critique → redraft` refinement), and per-plan `lint_waivers` (node id → justification) are **recorded in the `FrozenWorkflowRecord`** — a suppressed lint is an auditable decision, not a silenced one. + The complementary positive facts (`independence_facts`) are derivable from the frozen spec — e.g. *"stage `verifier` sees ONLY stage `reviewer`'s per-item output"* — which is what lets the frozen record **prove** structural bias controls to an auditor, not just assert them. ## 4. 
Semantics @@ -152,7 +154,9 @@ class FrozenWorkflowRecord(BaseModel): spec_hash: str # sha256(canonical_json(spec)) — see §10 planner_model: str registry_version: str - capability_versions: dict[str, str] + capability_versions: dict[str, str] # manual bumps — coarse SECONDARY signal + capability_contract_hashes: dict[str, str] # DERIVED sha256(input_kind+output schema) — primary drift signal + lint_waivers: dict[str, str] # node id -> justification; auditable lint suppression validation: ValidationResult # {passed: bool, warnings: [...]} created_at: str # ISO-8601, stamped at freeze task_input_schema: dict | None # expected root task-input schema (enables template reuse) @@ -243,7 +247,10 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (31 deter # INTEGRITY (never trust the envelope's own `validation`): # 1. recompute sha256(canonical_json(spec)); REJECT if != envelope["spec_hash"] # 2. re-run WorkflowSpecValidator against the CURRENT registry - # 3. registry/capability drift -> fail loudly (or explicit migration) + # 3. registry/capability drift -> fail loudly (or explicit migration); + # capability drift = manual version (secondary) AND derived contract + # hash sha256(input_kind + output schema) (primary — catches schema + # changes nobody versioned) # EXECUTION-INPUT: # replay : task_input digest must match envelope["task_input_digest"] (else audit-only) # template : task_input validated against envelope["task_input_schema"] before execution diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index d31a94d04be..69e850efd6d 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,7 +26,7 @@ behind the RFC's "can a model author good plans?" question. 
pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **31 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **34 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output); plus **plan export/import** @@ -38,7 +38,10 @@ shapes: sequence/loop/leaf by capability name; runtime fan_out/pipeline/branch flagged no-equivalent rather than fabricated); plus **pattern coverage** (adversarial verification and tournament via loop-carried `init`, incl. the no-`init` validation error) and **plan-quality lints** (same-capability -self-review and unsynthesized fan-out warn; an independent plan lints clean). +self-review and unsynthesized fan-out warn; an independent plan lints clean; +`allow_self_chain` policy and recorded per-plan waivers suppress auditably); +plus **contract-hash drift** (import rejects a changed capability schema even +when the manual version string was never bumped). 
## Pattern coverage — the six coordination shapes diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index 2fab312b6a6..d7bd0d15a2a 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -180,7 +180,25 @@ class Capability(BaseModel): ) max_fan_out: int = 100 side_effect: bool = False - version: str = "1" # bumped when the capability's contract changes (drift) + version: str = "1" # manual bump — a coarse SECONDARY drift signal only + # Lint policy: same-capability chains (draft -> critique own draft -> + # redraft) are legitimate refinement for some capabilities; opting in here + # suppresses the self-review lint for this capability. + allow_self_chain: bool = False + + def contract_hash(self) -> str: + """Derived drift signal — sha256 over the capability's declared contract. + + Manual version strings don't get bumped when someone tweaks a schema; the + contract hash changes automatically, so drift detection on import does + not rely on developer discipline. 
+ """ + schema = ( + None + if self.output_model is None + else self.output_model.model_json_schema() + ) + return sha256_hex({"input_kind": self.input_kind, "output_schema": schema}) class CapabilityRegistry: @@ -201,13 +219,23 @@ def names(self) -> list[str]: def capability_versions( self, only: Optional[set[str]] = None ) -> dict[str, str]: - """name -> version for drift detection on import (optionally filtered).""" + """name -> MANUAL version (coarse secondary drift signal).""" return { n: c.version for n, c in self._by_name.items() if only is None or n in only } + def capability_contract_hashes( + self, only: Optional[set[str]] = None + ) -> dict[str, str]: + """name -> DERIVED contract hash (the primary drift signal on import).""" + return { + n: c.contract_hash() + for n, c in self._by_name.items() + if only is None or n in only + } + def open_map_warnings(self) -> list[str]: """Spike lesson: open-ended dict[str, X] output fields are a structured- output reliability hazard (Gemini fills them unreliably). Warn on them.""" @@ -240,17 +268,34 @@ class WorkflowSpecValidator: def __init__(self, registry: CapabilityRegistry): self.registry = registry - def validate(self, spec: WorkflowSpec) -> list[str]: - """Raises SpecValidationError on a hard error; returns soft warnings.""" + def validate( + self, + spec: WorkflowSpec, + *, + lint_waivers: Optional[dict[str, str]] = None, + ) -> list[str]: + """Raises SpecValidationError on a hard error; returns soft warnings. + + `lint_waivers` (node id -> justification) suppresses plan-quality lints + for the named nodes; record waivers in the FrozenWorkflowRecord so the + suppression itself is auditable. 
+ """ ids: set[str] = set() self._walk(spec.steps, set(), ids) if spec.output.source == "step" and spec.output.step not in ids: raise SpecValidationError( f"output references unknown step {spec.output.step!r}" ) - return self.registry.open_map_warnings() + self.quality_lints(spec) + return self.registry.open_map_warnings() + self.quality_lints( + spec, lint_waivers=lint_waivers + ) - def quality_lints(self, spec: WorkflowSpec) -> list[str]: + def quality_lints( + self, + spec: WorkflowSpec, + *, + lint_waivers: Optional[dict[str, str]] = None, + ) -> list[str]: """Plan-quality lints (soft warnings, never hard errors). Multi-agent quality rests on isolation: it is what mitigates @@ -262,7 +307,13 @@ def quality_lints(self, spec: WorkflowSpec) -> list[str]: cannot provide independent verification; * unsynthesized fan-out: a plan whose terminal output is a bare per-item fan_out never combined or verified by a downstream capability. + + Suppression (so the lints stay credible instead of globally disabled): + a capability registered with `allow_self_chain=True` opts out of the + self-review lint (legitimate draft -> critique -> redraft refinement); + `lint_waivers` suppresses lints for specific node ids per plan. 
""" + waivers = lint_waivers or {} lints: list[str] = [] producer_cap: dict[str, str] = {} # node id -> capability producing output consumed: set[str] = set() # step ids some other node reads from @@ -280,7 +331,15 @@ def walk(nodes): for prev, st in zip(n.stages, n.stages[1:]): if st.input is not None and st.input.source == "step": consumed.add(st.input.step) - if st.capability == prev.capability and st.input is None: + if ( + st.capability == prev.capability + and st.input is None + and n.id not in waivers + and not ( + st.capability in self.registry + and self.registry[st.capability].allow_self_chain + ) + ): lints.append( f"plan-quality: pipeline {n.id!r} stage" f" {st.capability!r} re-checks its own capability's output —" @@ -304,6 +363,11 @@ def walk_consumers(nodes): and isinstance(b, Binding) and b.source == "step" and producer_cap.get(b.step) == my_cap + and n.id not in waivers + and not ( + my_cap in self.registry + and self.registry[my_cap].allow_self_chain + ) ): lints.append( f"plan-quality: {n.id!r} consumes the output of {b.step!r} via" @@ -336,7 +400,11 @@ def find(nodes): return None node_ = find(spec.steps) - if isinstance(node_, FanOut) and terminal not in consumed: + if ( + isinstance(node_, FanOut) + and terminal not in consumed + and terminal not in waivers + ): lints.append( f"plan-quality: output binds directly to fan_out {terminal!r}" " with no downstream synthesis or verification step — parallel" @@ -535,6 +603,12 @@ class FrozenWorkflowRecord(BaseModel): planner_model: str registry_version: str capability_versions: dict[str, str] + # DERIVED sha256 over each referenced capability's declared contract — the + # primary drift signal (manual versions above are secondary). + capability_contract_hashes: dict[str, str] = Field(default_factory=dict) + # Per-plan lint waivers (node id -> justification), recorded so a + # suppressed lint is an AUDITABLE decision, not a silenced one. 
+ lint_waivers: dict[str, str] = Field(default_factory=dict) validation: ValidationResult created_at: str # ISO-8601, stamped at freeze (caller supplies; not now()) task_input_schema: Optional[dict] = None @@ -550,9 +624,12 @@ def freeze( created_at: str, task_input=None, task_input_schema: Optional[dict] = None, + lint_waivers: Optional[dict[str, str]] = None, ) -> "FrozenWorkflowRecord": """Validate + capture everything needed for replay and drift detection.""" - warnings = WorkflowSpecValidator(registry).validate(spec) # raises on hard + warnings = WorkflowSpecValidator(registry).validate( + spec, lint_waivers=lint_waivers + ) # raises on hard error refs = referenced_capabilities(spec) return cls( spec=spec, @@ -560,6 +637,10 @@ def freeze( planner_model=planner_model, registry_version=registry.version, capability_versions=registry.capability_versions(only=refs), + capability_contract_hashes=registry.capability_contract_hashes( + only=refs + ), + lint_waivers=dict(lint_waivers or {}), validation=ValidationResult(passed=True, warnings=warnings), created_at=created_at, task_input_schema=task_input_schema, @@ -630,7 +711,7 @@ def import_plan( f" {registry.version!r}) — re-validate / migrate before reuse" ) - # 3b. per-capability version drift. + # 3b. per-capability MANUAL version drift (coarse secondary signal). current = registry.capability_versions(only=referenced_capabilities(spec)) recorded = envelope.get("capability_versions", {}) drifted = { @@ -644,6 +725,24 @@ def import_plan( " to a template with explicit migration before reuse" ) + # 3c. per-capability CONTRACT drift (primary, derived signal): catches a + # changed input_kind / output schema even when nobody bumped a version. 
+ current_ch = registry.capability_contract_hashes( + only=referenced_capabilities(spec) + ) + recorded_ch = envelope.get("capability_contract_hashes") or {} + contract_drift = { + n: (recorded_ch[n], current_ch[n]) + for n in current_ch + if n in recorded_ch and recorded_ch[n] != current_ch[n] + } + if contract_drift: + raise PlanImportError( + "capability contract drift (recorded vs current schema hash):" + f" {contract_drift} — the capability's declared contract changed" + " since export; re-validate / migrate before reuse" + ) + # Execution-input contract. if task_input is not None: schema = envelope.get("task_input_schema") diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index bf8d088691d..24aa85c6290 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -776,9 +776,8 @@ async def test_pattern_adversarial_verification(): # ------------------------------------------------------------ quality lints -def test_lint_warns_on_same_capability_review(): - # classify reviewing classify's own output cannot be independent. - spec = WorkflowSpec( +def _self_review_spec(): + return WorkflowSpec( goal="x", steps=[ StepRef( @@ -796,10 +795,56 @@ def test_lint_warns_on_same_capability_review(): ], output=Binding(source="step", step="b"), ) - warnings = WorkflowSpecValidator(_registry()).validate(spec) + + +def test_lint_warns_on_same_capability_review(): + # classify reviewing classify's own output cannot be independent. + warnings = WorkflowSpecValidator(_registry()).validate(_self_review_spec()) assert any("same capability 'classify'" in w for w in warnings) +def test_lint_self_chain_policy_suppresses(): + # draft -> critique-own-draft -> redraft is legitimate refinement; a + # capability can opt out of the self-review lint via allow_self_chain. 
+ reg = _registry() + reg["classify"].allow_self_chain = True + warnings = WorkflowSpecValidator(reg).validate(_self_review_spec()) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + + +def test_lint_waiver_suppresses_and_is_recorded(): + # A per-plan waiver (node id -> justification) suppresses the lint AND is + # recorded in the frozen record — auditable suppression, not silence. + waivers = {"b": "intentional self-refinement pass"} + warnings = WorkflowSpecValidator(_registry()).validate( + _self_review_spec(), lint_waivers=waivers + ) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + rec = FrozenWorkflowRecord.freeze( + _self_review_spec(), + planner_model="gemini-3.5-flash", + registry=_registry(), + created_at="2026-06-09T00:00:00Z", + lint_waivers=waivers, + ) + assert export_plan(rec)["lint_waivers"] == waivers + + +def test_import_rejects_contract_hash_drift(): + # The DERIVED drift signal: change a capability's declared contract (here, + # its output schema) WITHOUT bumping the manual version — manual-version + # drift stays silent; the contract hash catches it. 
+ env = export_plan(_frozen()) + + class NewCountReport(BaseModel): + n: int # narrower contract than before + + changed = _registry() + changed["count"].output_model = NewCountReport # version string unchanged + with pytest.raises(PlanImportError, match="contract drift"): + import_plan(env, changed, task_input=_TASK) + + def test_lint_warns_on_unsynthesized_fanout(): spec = WorkflowSpec( goal="x", From 826ab5b62c87ec1954a3b2f25b5b279e21179365 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:16:15 -0700 Subject: [PATCH 31/64] fix(workflow): fail closed on stripped contract hashes in import_plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review finding (High): import_plan() compared contract hashes only for names present in the envelope, so deleting capability_contract_hashes (or a single entry) silently bypassed drift detection — undercutting the 'automatic drift detection' claim. * import_plan now REQUIRES a recorded contract hash for every referenced capability in a v1 envelope; missing field or missing entry is a hard PlanImportError (fail closed), checked before the drift comparison. * Tests: exact reviewer reproduction (strip field + change schema without a version bump), strip-field-with-no-drift, and partial (single entry) stripping. 34 -> 36 spike tests; suites 11 + 36 + 8 = 55. * DESIGN.md: fail-closed contract documented; new deferred item — envelope metadata beyond spec_hash is not integrity-protected (a tampered task_input_schema could turn a replay-only plan into a template); production v1.1 should sign/hash the full record. * README/demo docs: counts + fail-closed wording synced. 
--- .../authored_workflow_demo/DEMO_NARRATIVE.md | 4 +-- .../authored_workflow_demo/README.md | 4 +-- .../authored_workflow_spike/DESIGN.md | 8 +++-- .../authored_workflow_spike/README.md | 5 +-- .../authored_workflow_spike/authoring.py | 13 +++++++- .../authored_workflow_spike/test_authoring.py | 32 +++++++++++++++++++ 6 files changed, 57 insertions(+), 9 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md index 8f7e98c3bad..45603b40319 100644 --- a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -227,7 +227,7 @@ Send: **"Plan a sloppy review: have the reviewer double-check its own findings." > engine at a visible cost (one planner call, the work outside its context); > the plan **exported** to a portable, defensively-imported audit artifact; and > a re-send replayed the exact frozen plan with zero planner calls. The -> deterministic test suites — 11 (#92) + 34 (#93) + 8 (demo) — lock all of this +> deterministic test suites — 11 (#92) + 36 (#93) + 8 (demo) — lock all of this > in CI, including the no-LLM reuse path, the export round-trip / tamper / > drift checks, the plan-quality lints, and the six-coordination-pattern > coverage sweep (adversarial verification and tournament included)." @@ -253,6 +253,6 @@ Send: **"Plan a sloppy review: have the reviewer double-check its own findings." 
```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 34 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 36 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 ``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md index 000fffa48ef..a22490bb4cd 100644 --- a/contributing/samples/workflows/authored_workflow_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -90,11 +90,11 @@ Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; ```bash pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 -pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 34 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 36 pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 ``` -- Deterministic suites: #92 **11** + #93 **34** + demo **8** = **53** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — the plan-quality lints with `allow_self_chain` policy + recorded waivers, contract-hash drift rejection, and the recipe-free free-authoring instruction pin). +- Deterministic suites: #92 **11** + #93 **36** + demo **8** = **55** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — the plan-quality lints with `allow_self_chain` policy + recorded waivers, contract-hash drift rejection (fail-closed on stripped hashes), and the recipe-free free-authoring instruction pin). 
- PR #3 CI green except the documented fork-only `agent-triage` token job. ## Recording notes diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md index 820ff8955bc..4bb8bc55e59 100644 --- a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -208,7 +208,7 @@ Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to 1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. 1. **The pattern-coverage sweep surfaced a real vocabulary gap** — the tournament shape (pairs recomputed per round from the prior round's winners) is inexpressible without **loop-carried state**: a body step must read the previous iteration's output, which the binding-scope rules statically forbid. Fixed with `LoopUntil.init` (seed binding) + the rule that a body binding to the loop's own id reads the carried value. Pattern-driven gate-task selection finds these gaps; single ad-hoc tasks don't. -Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (31 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 6 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (36 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 8 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. ## 10. 
Plan export & storage — the frozen spec as a durable artifact @@ -250,7 +250,9 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (31 deter # 3. registry/capability drift -> fail loudly (or explicit migration); # capability drift = manual version (secondary) AND derived contract # hash sha256(input_kind + output schema) (primary — catches schema - # changes nobody versioned) + # changes nobody versioned). FAIL CLOSED: a v1 envelope must carry a + # contract hash for EVERY referenced capability — a stripped field or + # entry is a hard import error, never a silent bypass. # EXECUTION-INPUT: # replay : task_input digest must match envelope["task_input_digest"] (else audit-only) # template : task_input validated against envelope["task_input_schema"] before execution @@ -259,6 +261,8 @@ Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (31 deter - **v2 (optional) — promote an exported plan to a reusable template.** A human approves a spec and saves it as a template. **On import, ADK MUST re-validate against the *current* registry**; registry/capability drift **fails loudly or requires explicit migration** — never a silent run against a changed capability set. (The envelope's `registry_version` / `capability_versions` are what make drift detectable.) +- **Deferred — envelope-level integrity beyond `spec_hash`.** `spec_hash` protects the *plan*; envelope metadata (`task_input_schema`, `created_at`, …) is re-checked against the current registry where possible but not integrity-protected — a tampered `task_input_schema` could turn a replay-only plan into a template. Production v1.1 should sign or hash the full serialized record. + - **Deferred — compiled `Workflow`/graph (or generated Python) as the source of truth.** The compiled `Workflow` is regenerated from the spec on demand; it is **not** stored as canonical, because compiler behavior and ADK internals evolve. 
Persisting generated code or a compiled graph is explicitly out of scope. Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md index 69e850efd6d..e95b4f52c41 100644 --- a/contributing/samples/workflows/authored_workflow_spike/README.md +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -26,7 +26,7 @@ behind the RFC's "can a model author good plans?" question. pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q ``` -Expected: **34 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +Expected: **36 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a valid spec and rejects unknown capability / non-preceding binding / duplicate id, the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch (correct route), and loop_until (stops + correct output); plus **plan export/import** @@ -41,7 +41,8 @@ no-`init` validation error) and **plan-quality lints** (same-capability self-review and unsynthesized fan-out warn; an independent plan lints clean; `allow_self_chain` policy and recorded per-plan waivers suppress auditably); plus **contract-hash drift** (import rejects a changed capability schema even -when the manual version string was never bumped). +when the manual version string was never bumped, and **fails closed** when the +envelope's contract hashes are stripped entirely or partially). 
## Pattern coverage — the six coordination shapes diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py index d7bd0d15a2a..942d67e4381 100644 --- a/contributing/samples/workflows/authored_workflow_spike/authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -727,14 +727,25 @@ def import_plan( # 3c. per-capability CONTRACT drift (primary, derived signal): catches a # changed input_kind / output schema even when nobody bumped a version. + # FAIL CLOSED: a v1 envelope MUST record a contract hash for every + # referenced capability — otherwise stripping the field (or one entry) + # from the envelope would silently bypass drift detection. current_ch = registry.capability_contract_hashes( only=referenced_capabilities(spec) ) recorded_ch = envelope.get("capability_contract_hashes") or {} + missing_ch = sorted(n for n in current_ch if n not in recorded_ch) + if missing_ch: + raise PlanImportError( + f"envelope is missing contract hashes for {missing_ch} — a v1" + " envelope must record a contract hash for every referenced" + " capability (fail closed: a stripped field must not bypass drift" + " detection)" + ) contract_drift = { n: (recorded_ch[n], current_ch[n]) for n in current_ch - if n in recorded_ch and recorded_ch[n] != current_ch[n] + if recorded_ch[n] != current_ch[n] } if contract_drift: raise PlanImportError( diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py index 24aa85c6290..ee59d490596 100644 --- a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -830,6 +830,38 @@ def test_lint_waiver_suppresses_and_is_recorded(): assert export_plan(rec)["lint_waivers"] == waivers +def test_import_rejects_missing_contract_hashes(): + 
# Review finding (High): stripping capability_contract_hashes from the + # envelope must NOT bypass drift detection. Exact reproduction: export, + # delete the field, change a capability's output schema without bumping + # the manual version — import must fail closed on the missing hashes. + env = export_plan(_frozen()) + del env["capability_contract_hashes"] + + class NewCountReport(BaseModel): + n: int + + changed = _registry() + changed["count"].output_model = NewCountReport # version string unchanged + with pytest.raises(PlanImportError, match="missing contract hashes"): + import_plan(env, changed, task_input=_TASK) + # fail closed even with NO drift at all — the field itself is required: + env2 = export_plan(_frozen()) + del env2["capability_contract_hashes"] + with pytest.raises(PlanImportError, match="missing contract hashes"): + import_plan(env2, _registry(), task_input=_TASK) + + +def test_import_rejects_partial_contract_hashes(): + # Dropping a SINGLE capability's hash must fail closed too. 
+ env = export_plan(_frozen()) + del env["capability_contract_hashes"]["count"] + with pytest.raises( + PlanImportError, match=r"missing contract hashes for \['count'\]" + ): + import_plan(env, _registry(), task_input=_TASK) + + def test_import_rejects_contract_hash_drift(): # The DERIVED drift signal: change a capability's declared contract (here, # its output schema) WITHOUT bumping the manual version — manual-version From 5d5ad04709a9479636af6f1f969741865ecb7722 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:38:23 -0700 Subject: [PATCH 32/64] =?UTF-8?q?demo(workflow):=20BQ=20Conversational=20A?= =?UTF-8?q?nalytics=20planner=20=E2=80=94=207=20prompts,=207=20authored=20?= =?UTF-8?q?shapes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New sample authored_workflow_ca_demo/bq_ca_planner, styled after BigQuery Conversational Analytics: NL questions over a mock thelook_ecommerce dataset, where the planner authors a DIFFERENT typed WorkflowSpec per scenario from one closed capability vocabulary (nl2sql, dry_run, run_query, profile_table, skeptic, chart judging — execution/profiling are deterministic mocks, language steps are live Gemini): sequence revenue-by-region ask (nl2sql -> dry_run -> run -> summarize) fan-out per-table data-quality profiling -> one report pipeline dashboard: nl2sql -> dry_run per question, barrier-free branch classify & route (metadata questions skip SQL) loop_until SQL self-repair via loop-carried init (draft -> failed dry run -> repair using the error; simulated transient failure -> exactly one repair iteration per run) adversarial independent skeptics per insight; $1M AOV claim refuted tournament pairwise chart judging to a single winner Every scenario: author -> validate -> independence lints -> freeze (per-scenario state key; re-send = frozen replay, 0 planner calls) -> execute on the #92 supervisor -> cost line. 
11 CI-safe tests: all seven shapes validated + lint-clean + executed end-to-end with language capabilities stubbed (loop repairs exactly once, branch routes away from SQL, audit rejects the implausible insight, tournament converges). CA tests import the agent as a package (bq_ca_planner.agent) to avoid sys.modules collision with the sibling demo's bare 'agent' module under combined pytest collection. Suites: 11 + 36 + 8 + 11 = 66. --- .../authored_workflow_ca_demo/README.md | 83 +++ .../bq_ca_planner/__init__.py | 15 + .../bq_ca_planner/agent.py | 612 ++++++++++++++++++ .../test_ca_demo_agent.py | 487 ++++++++++++++ 4 files changed, 1197 insertions(+) create mode 100644 contributing/samples/workflows/authored_workflow_ca_demo/README.md create mode 100644 contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py create mode 100644 contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py create mode 100644 contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md new file mode 100644 index 00000000000..311fd5bde67 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -0,0 +1,83 @@ +# ADK Web demo — model-authored workflows for BigQuery Conversational Analytics (RFC #93) + +One agent, **seven prompts, seven workflow shapes**. Styled after [BigQuery +Conversational Analytics](https://docs.cloud.google.com/bigquery/docs/conversational-analytics): +a user asks data questions in natural language, and the planner **authors a +different typed `WorkflowSpec` per scenario** over Conversational-Analytics +capabilities — `nl2sql`, `dry_run`, `run_query`, `profile_table`, `skeptic`, +chart judging — against a mock `thelook_ecommerce` dataset (the dataset the +CA docs demo against). 
Query execution / dry-run / profiling are +deterministic mocks (**no BigQuery project needed**); the language steps +(NL2SQL, summaries, classification, skeptics) are live Gemini calls. + +Every scenario runs the full #93 machinery: **author → validate → +independence lints → freeze (per-scenario key) → execute on the real engine +(#92 supervisor) → cost line**, and every shape is pinned in CI with the +language capabilities stubbed. + +## 0. Configure a model (no hardcoded project) + +```bash +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=<your-project-id> +export GOOGLE_CLOUD_LOCATION=global +export SPIKE_GEMINI_MODEL=gemini-3.5-flash +``` + +## 1. Run it + +```bash +adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 +``` + +Open the UI, pick `bq_ca_planner`, and send the prompts below — **one +scenario per prompt**, each authoring a different coordination shape: + +| # | Send this prompt | Shape authored | CA story | +| --- | ------------------------------------------------------------ | ---------------------------------------------------- | ---------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → summarize` | the basic ask-a-question flow | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV
claim gets refuted | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | + +What to point at as each one streams: + +- **🗂️ scenario banner** — the expected shape, named before the model authors it. +- **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. +- **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. +- **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. +- **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. + +Talking point for scenario 5 (the differentiated one): *the repair loop needs +**loop-carried state** — the drafting step reads the loop's own id to get the +prior round's failed dry-run output. That's `LoopUntil.init`, the vocabulary +gap the pattern-coverage sweep surfaced. And the whole loop is frozen and +replayable — a turn-by-turn agent retry never is.* + +## 2. Correctness proof (no LLM, no BigQuery) + +```bash +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 11 +``` + +All seven expected shapes are built by hand, validated + lint-checked against +the demo registry, and **executed end-to-end** with the language capabilities +stubbed: the loop repairs exactly once, the branch routes the metadata +question away from SQL, the audit rejects the implausible insight, the +tournament converges to `bar`. The fan-out and tournament scenarios execute +against the **live** registry (their capabilities are deterministic mocks). 
+ +## Notes + +- Honesty: like the security-audit demo, scenario recipes are + instruction-guided so each prompt reliably authors its intended shape; the + free-decomposition evidence is the spike's demand gate and the main demo's + free-authoring beat. The *variety* — seven shapes from one closed + vocabulary — is the claim here. +- The `flaky_dry_run` failure is simulated (every odd call fails) so the + repair loop behaves identically on every run and in CI. +- Frozen plans are per-scenario (`authored_workflow:ca:<scenario-key>`), so all + seven replay independently within a session. diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py new file mode 100644 index 00000000000..1a38cf933e9 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .
import agent # noqa: F401 diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py new file mode 100644 index 00000000000..9def4038220 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -0,0 +1,612 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ADK Web demo agent for RFC #93 — BigQuery Conversational Analytics planner. + +One agent, SEVEN scenario prompts, each making the planner author a DIFFERENT +workflow shape over Conversational-Analytics-flavored capabilities (nl2sql, +dry_run, run_query, profiling, insight verification) against a mock +``thelook_ecommerce`` dataset: + + sequence "What was revenue by region last quarter?" + fan-out "Profile data quality across the dataset tables." + pipeline "Build a dashboard for these three questions." + branch "Route my question: what does order status 'Complete' mean?" + loop_until "Answer with SQL self-repair — the dry run is unreliable." + adversarial "Audit these insights — verify each one independently." + tournament "Pick the best chart for revenue by region." 
+ +Each scenario runs the same machinery as the security-audit demo: author +(live planner) -> validate -> independence lints -> freeze (per-scenario +state key; re-send replays without re-invoking the model) -> execute on the +real engine via the #92 supervisor -> cost line. Query execution and dry-run +are deterministic mocks (no BigQuery project needed); language steps +(nl2sql, summaries, classification, skeptics) are live Gemini calls. Run: + + adk web contributing/samples/workflows/authored_workflow_ca_demo + +Configure a model first (no hardcoded project): + export GOOGLE_GENAI_USE_VERTEXAI=1 GOOGLE_CLOUD_PROJECT= + export GOOGLE_CLOUD_LOCATION=global SPIKE_GEMINI_MODEL=gemini-3.5-flash +""" + +from __future__ import annotations + +import json +import os +import sys +import time +from typing import Literal + +from google.adk import Agent +from google.adk import Context +from google.adk import Event +from google.adk import Workflow +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel + +# Reuse the committed #93 authoring stack (sibling sample dir). +sys.path.insert( + 0, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", + "..", + "authored_workflow_spike", + ), +) +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import independence_facts # noqa: E402 +from authoring import sha256_hex # noqa: E402 +from authoring import SpecInterpreter # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 + +MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") +DET = types.GenerateContentConfig(temperature=0) + +# ------------------------------------------------- mock thelook_ecommerce +# A miniature of bigquery-public-data.thelook_ecommerce — the dataset the +# Conversational Analytics docs demo against. 
run_query/dry_run/profiling +# are deterministic mocks so the demo needs no BigQuery project. +TABLES = { + "orders": "order_id, user_id, status, created_at, num_of_item", + "order_items": "id, order_id, product_id, sale_price, status", + "products": "id, name, category, brand, retail_price, department", + "users": "id, email, age, country, traffic_source, created_at", +} + +_CANNED_ROWS = [ + {"region": "US-West", "revenue": 412300.50}, + {"region": "US-East", "revenue": 387910.25}, + {"region": "EMEA", "revenue": 295004.10}, + {"region": "APAC", "revenue": 188777.75}, +] + +_CANNED_PROFILES = { + "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, + "order_items": { + "table": "order_items", + "row_count": 181830, + "null_pct": 0.0, + }, + "products": {"table": "products", "row_count": 29120, "null_pct": 3.4}, + "users": {"table": "users", "row_count": 100000, "null_pct": 7.9}, +} + +_SCHEMA_NOTES = { + "default": ( + "orders.status takes Complete / Shipped / Processing / Cancelled /" + " Returned; 'Complete' means the order was delivered and the return" + " window has closed." + ) +} + +# Simulated transient dry-run failure (repair-loop scenario): every ODD call +# fails, so EVERY run of the loop shows exactly one repair iteration — +# deterministic on camera and in CI, and replay behaves identically. 
+_FLAKY_CALLS = {"n": 0} + +_JUDGE_RANK = {"bar": 0, "line": 1, "scatter": 2, "pie": 3} + + +# ------------------------------------------------- typed outputs (LLM caps) +class Sql(BaseModel): + sql: str + + +class Insight(BaseModel): + insight: str + + +class Category(BaseModel): + category: Literal["data", "schema"] + + +class Verdict(BaseModel): + insight: str + refuted: bool + + +def _stub(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + +def _llm(name, output_schema, instruction): + return lambda: Agent( + name=name, + model=MODEL, + output_schema=output_schema, + generate_content_config=DET, + instruction=instruction, + ) + + +def _registry() -> CapabilityRegistry: + schema_blurb = "; ".join(f"{t}({c})" for t, c in TABLES.items()) + return CapabilityRegistry([ + # ---- live language capabilities (Gemini) ---- + Capability( + name="nl2sql", + input_kind="item", + output_model=Sql, + serialize_input=True, + build=_llm( + "nl2sql", + Sql, + "Translate the question in the input JSON to one BigQuery" + f" StandardSQL SELECT over thelook_ecommerce: {schema_blurb}." + " Output Sql.", + ), + ), + Capability( + name="draft_or_repair_sql", + input_kind="item", + output_model=Sql, + serialize_input=True, + build=_llm( + "draft_or_repair_sql", + Sql, + "Input JSON has a question, and possibly a prior sql + error" + " from a failed dry run. Draft (or repair, using the error)" + " one BigQuery StandardSQL SELECT over thelook_ecommerce:" + f" {schema_blurb}. Output Sql.", + ), + ), + Capability( + name="summarize_insight", + input_kind="item", + output_model=Insight, + serialize_input=True, + build=_llm( + "summarize_insight", + Insight, + "Input: JSON query results (or profiling stats). 
Output" + " Insight: one crisp analyst sentence.", + ), + ), + Capability( + name="classify_question", + input_kind="item", + output_model=Category, + serialize_input=True, + build=_llm( + "classify_question", + Category, + "Classify the user question: 'data' if it needs a SQL query" + " over the tables, 'schema' if it asks what a column/value" + " means. Output Category.", + ), + ), + Capability( + name="skeptic", + input_kind="item", + output_model=Verdict, + serialize_input=True, + build=_llm( + "skeptic", + Verdict, + "You are an adversarial data reviewer. Input: one insight" + " about an e-commerce dataset (avg order ~ $60-90, 100k" + " users). Try to REFUTE it; refuted=true if implausible." + " Echo the insight. Output Verdict.", + ), + ), + # ---- deterministic mocks (no BigQuery needed) ---- + Capability( + name="dry_run", + input_kind="item", + serialize_input=False, + build=_stub( + "dry_run", + lambda s: { + "sql": (s or {}).get("sql", ""), + "valid": "select" in str((s or {}).get("sql", "")).lower(), + "error": None, + }, + ), + ), + Capability( + name="flaky_dry_run", + input_kind="item", + serialize_input=False, + build=_stub("flaky_dry_run", lambda s: _flaky_dry_run(s)), + ), + Capability( + name="sql_ok", + input_kind="item", + serialize_input=False, + build=_stub("sql_ok", lambda s: bool((s or {}).get("valid"))), + ), + Capability( + name="run_query", + input_kind="item", + serialize_input=False, + build=_stub("run_query", lambda s: {"rows": _CANNED_ROWS}), + ), + Capability( + name="profile_table", + input_kind="item", + serialize_input=False, + max_fan_out=20, + build=_stub( + "profile_table", + lambda t: _CANNED_PROFILES.get( + str(t), {"table": str(t), "row_count": 0, "null_pct": 0.0} + ), + ), + ), + Capability( + name="quality_report", + input_kind="list", + serialize_input=False, + build=_stub( + "quality_report", + lambda profiles: { + "tables": len(profiles), + "worst_table": max(profiles, key=lambda p: p["null_pct"])[ + "table" + ], + 
"max_null_pct": max(p["null_pct"] for p in profiles), + }, + ), + ), + Capability( + name="describe_schema", + input_kind="item", + serialize_input=False, + build=_stub( + "describe_schema", + lambda q: {"answer": _SCHEMA_NOTES["default"]}, + ), + ), + Capability( + name="keep_verified", + input_kind="list", + serialize_input=False, + build=_stub( + "keep_verified", + lambda vs: { + "verified": [ + v["insight"] for v in vs if not v.get("refuted") + ], + "rejected": [v["insight"] for v in vs if v.get("refuted")], + }, + ), + ), + Capability( + name="pair_charts", + input_kind="list", + serialize_input=False, + build=_stub( + "pair_charts", + lambda lst: [lst[i : i + 2] for i in range(0, len(lst), 2)], + ), + ), + Capability( + name="judge_chart", + input_kind="item", + serialize_input=False, + build=_stub( + "judge_chart", + lambda pair: min(pair, key=lambda c: _JUDGE_RANK.get(c, 99)), + ), + ), + Capability( + name="single_chart", + input_kind="list", + serialize_input=False, + build=_stub("single_chart", lambda lst: len(lst) == 1), + ), + ]) + + +def _flaky_dry_run(s): + _FLAKY_CALLS["n"] += 1 + if _FLAKY_CALLS["n"] % 2 == 1: # every odd call fails -> 1 repair per run + return { + "question": (s or {}).get("question", ""), + "sql": (s or {}).get("sql", ""), + "valid": False, + "error": "Table not found: `thelook.order` (did you mean orders?)", + } + return { + "question": (s or {}).get("question", ""), + "sql": (s or {}).get("sql", ""), + "valid": True, + "error": None, + } + + +# ------------------------------------------------- scenarios +_CAPS_BLURB = ( + "nl2sql (item: {question} -> Sql), draft_or_repair_sql (item: {question," + " sql?, error?} -> Sql), summarize_insight (item: rows/stats JSON ->" + " Insight), classify_question (item: {question} -> Category with" + " category 'data'|'schema'), skeptic (item: one insight -> Verdict)," + " dry_run (item: Sql -> {sql, valid, error}), flaky_dry_run (same, may" + " fail transiently), sql_ok (item: dry-run output 
-> bool), run_query" + " (item: validated sql -> {rows}), profile_table (item: table name ->" + " stats), quality_report (LIST of stats -> report), describe_schema" + " (item: {question} -> {answer}), keep_verified (LIST of Verdicts ->" + " {verified, rejected}), pair_charts (LIST -> list of pairs)," + " judge_chart (item: pair -> winner), single_chart (LIST -> bool)." +) + +_BINDING_RULES = ( + " Binding rules: Binding(source='task', path=) reads the task" + " input; Binding(source='step', step=) chains steps; pipeline" + " stages take the previous stage's per-item output automatically." +) + + +def _scenario_defs(): + """key -> (title, shape, triggers, task_input, planner recipe).""" + q_region = "What was revenue by region last quarter?" + return { + "sequence": dict( + title="Ask a question (sequence)", + shape="step → step → step → step", + triggers=("revenue by region", "sequence"), + task={"question": q_region}, + recipe=( + "Author, in order: (1) a step running nl2sql on the task;" + " (2) a step running dry_run on it; (3) a step running" + " run_query on that; (4) a step running summarize_insight on" + " the rows. Output = the summarize step." + ), + ), + "fanout": dict( + title="Profile data quality (fan-out / synthesize)", + shape="fan_out → step", + triggers=("profile", "data quality"), + task={"tables": list(TABLES)}, + recipe=( + "Author: (1) a fan_out over task.tables running profile_table" + " per table; (2) a step running quality_report on the fan_out" + " output. Output = the report step." 
+ ), + ), + "pipeline": dict( + title="Build a dashboard (pipeline)", + shape="pipeline(nl2sql → dry_run) → step", + triggers=("dashboard",), + task={ + "questions": [ + {"question": "Top 5 product categories by revenue?"}, + {"question": "Monthly active users by traffic source?"}, + {"question": "Return rate by department?"}, + ] + }, + recipe=( + "Author: (1) a pipeline over task.questions with two stages," + " nl2sql then dry_run, so each dashboard question is" + " translated and validated per item; (2) a step running" + " summarize_insight on the pipeline output. Output = the" + " summarize step." + ), + ), + "branch": dict( + title="Route the question (classify & route)", + shape="step → branch", + triggers=("route", "what does", "mean"), + task={"question": "What does order status 'Complete' mean?"}, + recipe=( + "Author: (1) a step running classify_question on the task;" + " (2) a branch on that step's 'category' field" + " (Binding(source='step', step=, path='category')) with" + " TWO routes: value 'data' -> a block [nl2sql on task," + " dry_run, run_query, summarize_insight]; value 'schema' -> a" + " block [describe_schema on task]. Output = the branch." + ), + ), + "loop": dict( + title="SQL self-repair (loop_until + loop-carried state)", + shape="loop_until(init=task, body=[draft_or_repair, flaky_dry_run])", + triggers=("repair", "unreliable", "retry"), + task={"question": q_region}, + recipe=( + "Author ONE loop_until: init = Binding(source='task'); body =" + " [(a) a step running draft_or_repair_sql whose input is" + " Binding(source='step', step=) — it reads" + " the loop-carried value: the task on round 0, the failed" + " dry-run output (sql + error) afterwards; (b) a step running" + " flaky_dry_run on (a)]; until_capability = sql_ok with" + " until_input = Binding(source='step', step=);" + " max_iters = 3. Output = the loop." 
+ ), + ), + "adversarial": dict( + title="Audit insights (adversarial verification)", + shape="fan_out(skeptic) → step(keep_verified)", + triggers=("audit", "verify insights"), + task={ + "insights": [ + "Average order value is roughly $75.", + "The average order value is $1,000,000.", + "Most users arrive via organic search.", + ] + }, + recipe=( + "Author: (1) a fan_out over task.insights running skeptic per" + " insight; (2) a step running keep_verified on the fan_out" + " output. Output = the keep_verified step." + ), + ), + "tournament": dict( + title="Pick the best chart (tournament)", + shape="loop_until(init=task.chart_options, body=[pair, fan_out])", + triggers=("best chart", "tournament"), + task={"chart_options": ["pie", "bar", "line", "scatter"]}, + recipe=( + "Author ONE loop_until: init = Binding(source='task'," + " path='chart_options'); body = [(a) a step running" + " pair_charts whose input is Binding(source='step', step=); (b) a fan_out over (a) running judge_chart" + " per pair]; until_capability = single_chart with until_input" + " = Binding(source='step', step=); max_iters" + " = 3. Output = the loop." + ), + ), + } + + +SCENARIOS = _scenario_defs() + + +def _scenario_for(text: str) -> str: + t = (text or "").lower() + for key, sc in SCENARIOS.items(): + if any(trigger in t for trigger in sc["triggers"]): + return key + return "sequence" + + +def _planner_instruction(sc) -> str: + return ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _CAPS_BLURB + + f" Task input: {json.dumps(sc['task'])}. 
" + + sc["recipe"] + + _BINDING_RULES + ) + + +def _msg(text: str) -> Event: + return Event( + content=types.Content(role="model", parts=[types.Part(text=text)]) + ) + + +def _hash(spec: WorkflowSpec) -> str: + return sha256_hex(spec.model_dump(mode="json"))[:12] + + +@node(rerun_on_resume=True) +async def plan_and_run(ctx: Context, node_input): + reg = _registry() + key = _scenario_for(str(node_input or "")) + sc = SCENARIOS[key] + state_key = f"authored_workflow:ca:{key}" + + yield _msg( + f"🗂️ **Scenario: {sc['title']}** — expected shape `{sc['shape']}`," + " over mock `thelook_ecommerce`" + f" ({', '.join(TABLES)})." + ) + + # 1. LOAD-OR-AUTHOR (per-scenario frozen key: each shape replays + # independently — re-send the same prompt to replay without the model). + existing = ctx.state.get(state_key) + if existing: + spec = WorkflowSpec.model_validate(existing) + spec_hash = _hash(spec) + reused = True + yield _msg( + f"♻️ **Reusing frozen plan** for `{key}` — hash `{spec_hash}`. The" + " model is NOT re-invoked; the exact prior plan is replayed." + ) + else: + reused = False + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_planner_instruction(sc), + ) + raw = await ctx.run_node( + planner, node_input=json.dumps(sc["task"]), run_id=f"plan_{key}" + ) + spec = WorkflowSpec.model_validate(raw) + spec_hash = _hash(spec) + steps = " → ".join(s.kind for s in spec.steps) + yield _msg( + f"📋 **Authored plan** (`{steps}`):\n```json\n" + f"{json.dumps(spec.model_dump(exclude_none=True), indent=1)}\n```" + ) + + # 2. VALIDATE + 2b. INDEPENDENCE LINTS. + warnings = WorkflowSpecValidator(reg).validate(spec) + lints = [w for w in warnings if w.startswith("plan-quality")] + facts = "\n".join(f" - {f}" for f in independence_facts(spec)) + yield _msg( + f"✅ **Validation passed.** 🧪 plan-quality lints: {len(lints)}." 
+ f" Provenance (statically provable):\n{facts}" + + (f"\n⚠️ {lints}" if lints else "") + ) + + # 3. FREEZE (per scenario). + if not reused: + ctx.state[state_key] = spec.model_dump() + yield _msg( + f"🔒 **Frozen** under `{state_key}` — hash `{spec_hash}`. Re-send" + " this prompt: same plan, zero planner calls." + ) + + # 4. EXECUTE on the real engine via the #92 supervisor. + t0 = time.perf_counter() + interp = SpecInterpreter(reg, ctx) + result = await interp.execute(spec, sc["task"]) + elapsed = time.perf_counter() - t0 + yield _msg( + f"📄 **Result:**\n```json\n{json.dumps(result, indent=1, default=str)}" + f"\n```\n📊 **Cost:** {interp.dispatch_count} capability dispatches in" + f" {elapsed:.1f}s + " + + ("0 planner calls (frozen replay)." if reused else "1 planner call.") + ) + yield Event( + output={ + "scenario": key, + "hash": spec_hash, + "reused": reused, + "dispatches": interp.dispatch_count, + "result": result, + } + ) + + +root_agent = Workflow( + name="bq_ca_planner", + edges=[("START", plan_and_run)], +) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py new file mode 100644 index 00000000000..bba5a5d15e8 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -0,0 +1,487 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""CI-safe tests for the BQ Conversational Analytics workflow demo (no LLM). + +Each demo scenario's expected workflow shape is built by hand, validated + +lint-checked against the demo registry, and EXECUTED end-to-end on the real +ADK engine with the language capabilities (nl2sql, summaries, classifier, +skeptic) swapped for deterministic stubs — so all seven coordination shapes +the demo authors on camera are pinned in CI. +""" + +from __future__ import annotations + +import json +import os +import sys + +from google.adk import Event +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.adk.workflow import Workflow +from google.genai import types +import pytest + +_HERE = os.path.dirname(os.path.abspath(__file__)) +# Import as a PACKAGE (bq_ca_planner.agent), not a bare `agent` module — the +# sibling demo's tests import their own `agent`, and a bare import would +# collide in sys.modules when pytest collects both directories. 
+sys.path.insert(0, _HERE) +sys.path.insert(0, os.path.join(_HERE, "..", "authored_workflow_spike")) +from authoring import Binding # noqa: E402 +from authoring import Branch # noqa: E402 +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import FanOut # noqa: E402 +from authoring import LoopUntil # noqa: E402 +from authoring import Pipeline # noqa: E402 +from authoring import PipelineStage # noqa: E402 +from authoring import Route # noqa: E402 +from authoring import SpecInterpreter # noqa: E402 +from authoring import StepRef # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 +from bq_ca_planner import agent as demo # noqa: E402 + +_LLM_CAPS = ( + "nl2sql", + "draft_or_repair_sql", + "summarize_insight", + "classify_question", + "skeptic", +) + + +def _stub(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + +def _stub_registry() -> CapabilityRegistry: + """The demo registry with the live language capabilities stubbed.""" + real = demo._registry() + stubs = [ + Capability( + name="nl2sql", + input_kind="item", + serialize_input=False, + build=_stub( + "nl2sql", + lambda s: { + "sql": ( + "SELECT region, SUM(sale_price) AS revenue FROM" + " order_items GROUP BY region" + ) + }, + ), + ), + Capability( + name="draft_or_repair_sql", + input_kind="item", + serialize_input=False, + build=_stub( + "draft_or_repair_sql", + lambda s: {"sql": "SELECT status FROM orders LIMIT 10"}, + ), + ), + Capability( + name="summarize_insight", + input_kind="item", + serialize_input=False, + build=_stub( + "summarize_insight", + lambda s: {"insight": "US-West leads revenue."}, + ), + ), + Capability( + name="classify_question", + input_kind="item", + serialize_input=False, + build=_stub( + "classify_question", + lambda s: { + "category": ( + "schema" if 
"mean" in json.dumps(s).lower() else "data" + ) + }, + ), + ), + Capability( + name="skeptic", + input_kind="item", + serialize_input=False, + build=_stub( + "skeptic", + lambda v: {"insight": str(v), "refuted": "1,000,000" in str(v)}, + ), + ), + ] + passthrough = [ + cap for name, cap in real._by_name.items() if name not in _LLM_CAPS + ] + return CapabilityRegistry(stubs + passthrough) + + +# ----------------------------------------------------- expected shapes +def _expected_spec(key: str) -> WorkflowSpec: + """The shape each scenario's planner recipe asks for, built by hand.""" + if key == "sequence": + return WorkflowSpec( + goal="revenue by region", + steps=[ + StepRef( + kind="step", + id="sql", + capability="nl2sql", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="check", + capability="dry_run", + input=Binding(source="step", step="sql"), + ), + StepRef( + kind="step", + id="rows", + capability="run_query", + input=Binding(source="step", step="check"), + ), + StepRef( + kind="step", + id="sum", + capability="summarize_insight", + input=Binding(source="step", step="rows"), + ), + ], + output=Binding(source="step", step="sum"), + ) + if key == "fanout": + return WorkflowSpec( + goal="profile data quality", + steps=[ + FanOut( + kind="fan_out", + id="profiles", + over=Binding(source="task", path="tables"), + capability="profile_table", + ), + StepRef( + kind="step", + id="report", + capability="quality_report", + input=Binding(source="step", step="profiles"), + ), + ], + output=Binding(source="step", step="report"), + ) + if key == "pipeline": + return WorkflowSpec( + goal="dashboard", + steps=[ + Pipeline( + kind="pipeline", + id="panels", + over=Binding(source="task", path="questions"), + stages=[ + PipelineStage(capability="nl2sql"), + PipelineStage(capability="dry_run"), + ], + ), + StepRef( + kind="step", + id="sum", + capability="summarize_insight", + input=Binding(source="step", step="panels"), + ), + ], + 
output=Binding(source="step", step="sum"), + ) + if key == "branch": + return WorkflowSpec( + goal="route the question", + steps=[ + StepRef( + kind="step", + id="cls", + capability="classify_question", + input=Binding(source="task"), + ), + Branch( + kind="branch", + id="route", + on=Binding(source="step", step="cls", path="category"), + routes=[ + Route( + value="data", + block=[ + StepRef( + kind="step", + id="d_sql", + capability="nl2sql", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="d_check", + capability="dry_run", + input=Binding(source="step", step="d_sql"), + ), + StepRef( + kind="step", + id="d_rows", + capability="run_query", + input=Binding(source="step", step="d_check"), + ), + StepRef( + kind="step", + id="d_sum", + capability="summarize_insight", + input=Binding(source="step", step="d_rows"), + ), + ], + ), + Route( + value="schema", + block=[ + StepRef( + kind="step", + id="s_desc", + capability="describe_schema", + input=Binding(source="task"), + ) + ], + ), + ], + ), + ], + output=Binding(source="step", step="route"), + ) + if key == "loop": + return WorkflowSpec( + goal="sql self-repair", + steps=[ + LoopUntil( + kind="loop_until", + id="repair", + init=Binding(source="task"), + body=[ + StepRef( + kind="step", + id="draft", + capability="draft_or_repair_sql", + input=Binding(source="step", step="repair"), + ), + StepRef( + kind="step", + id="check", + capability="flaky_dry_run", + input=Binding(source="step", step="draft"), + ), + ], + until_capability="sql_ok", + until_input=Binding(source="step", step="check"), + max_iters=3, + ), + ], + output=Binding(source="step", step="repair"), + ) + if key == "adversarial": + return WorkflowSpec( + goal="audit insights", + steps=[ + FanOut( + kind="fan_out", + id="verdicts", + over=Binding(source="task", path="insights"), + capability="skeptic", + ), + StepRef( + kind="step", + id="kept", + capability="keep_verified", + input=Binding(source="step", step="verdicts"), + ), + ], + 
output=Binding(source="step", step="kept"), + ) + if key == "tournament": + return WorkflowSpec( + goal="best chart", + steps=[ + LoopUntil( + kind="loop_until", + id="bracket", + init=Binding(source="task", path="chart_options"), + body=[ + StepRef( + kind="step", + id="pairs", + capability="pair_charts", + input=Binding(source="step", step="bracket"), + ), + FanOut( + kind="fan_out", + id="winners", + over=Binding(source="step", step="pairs"), + capability="judge_chart", + ), + ], + until_capability="single_chart", + until_input=Binding(source="step", step="winners"), + max_iters=3, + ), + ], + output=Binding(source="step", step="bracket"), + ) + raise KeyError(key) + + +async def _run(spec, registry, task_input): + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + holder["out"] = await SpecInterpreter(registry, ctx).execute( + spec, task_input + ) + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name=wf.name, node=wf, session_service=ss) + s = await ss.create_session(app_name=wf.name, user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + return holder["out"] + + +# ----------------------------------------------------- tests +def test_root_agent_importable_and_named(): + assert isinstance(demo.root_agent, Workflow) + assert demo.root_agent.name == "bq_ca_planner" + + +def test_registry_clean_and_typed(): + reg = demo._registry() + for name in _LLM_CAPS + ("dry_run", "run_query", "profile_table"): + assert name in reg + assert reg.open_map_warnings() == [] # enumerated fields only + + +def test_scenario_routing(): + assert demo._scenario_for("What was revenue by region?") == "sequence" + assert demo._scenario_for("Profile data quality please") == "fanout" + assert demo._scenario_for("Build a dashboard for these") == "pipeline" + 
assert demo._scenario_for("what does status Complete mean?") == "branch" + assert demo._scenario_for("the dry run is unreliable, retry") == "loop" + assert demo._scenario_for("audit these insights") == "adversarial" + assert demo._scenario_for("pick the best chart") == "tournament" + assert demo._scenario_for("hello") == "sequence" # default + + +def test_all_seven_shapes_validate_and_lint_clean(): + reg = demo._registry() + for key in demo.SCENARIOS: + warnings = WorkflowSpecValidator(reg).validate(_expected_spec(key)) + lints = [w for w in warnings if w.startswith("plan-quality")] + assert lints == [], f"{key}: {lints}" + + +@pytest.mark.asyncio +async def test_sequence_executes(): + out = await _run( + _expected_spec("sequence"), + _stub_registry(), + demo.SCENARIOS["sequence"]["task"], + ) + assert out == {"insight": "US-West leads revenue."} + + +@pytest.mark.asyncio +async def test_fanout_executes_no_llm_needed(): + # profiling + report are deterministic mocks even in the LIVE registry. + out = await _run( + _expected_spec("fanout"), + demo._registry(), + demo.SCENARIOS["fanout"]["task"], + ) + assert out == {"tables": 4, "worst_table": "users", "max_null_pct": 7.9} + + +@pytest.mark.asyncio +async def test_pipeline_executes_per_question(): + out = await _run( + _expected_spec("pipeline"), + _stub_registry(), + demo.SCENARIOS["pipeline"]["task"], + ) + assert out == {"insight": "US-West leads revenue."} + + +@pytest.mark.asyncio +async def test_branch_routes_schema_question(): + out = await _run( + _expected_spec("branch"), + _stub_registry(), + demo.SCENARIOS["branch"]["task"], # "...what does ... mean?" -> schema + ) + assert "Complete" in out["answer"] + + +@pytest.mark.asyncio +async def test_loop_repairs_sql_exactly_once(): + demo._FLAKY_CALLS["n"] = 0 + out = await _run( + _expected_spec("loop"), + _stub_registry(), + demo.SCENARIOS["loop"]["task"], + ) + assert out["valid"] is True + # odd call fails, even call passes -> exactly one repair iteration. 
+ assert demo._FLAKY_CALLS["n"] == 2 + + +@pytest.mark.asyncio +async def test_adversarial_rejects_implausible_insight(): + out = await _run( + _expected_spec("adversarial"), + _stub_registry(), + demo.SCENARIOS["adversarial"]["task"], + ) + assert len(out["verified"]) == 2 + assert any("1,000,000" in r for r in out["rejected"]) + + +@pytest.mark.asyncio +async def test_tournament_picks_best_chart_no_llm_needed(): + # pairing + judging are deterministic mocks even in the LIVE registry. + out = await _run( + _expected_spec("tournament"), + demo._registry(), + demo.SCENARIOS["tournament"]["task"], + ) + assert out == ["bar"] From b29ea8fb67123fa7ce63cc872120355b1485260a Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:42:53 -0700 Subject: [PATCH 33/64] =?UTF-8?q?fix(ca-demo):=20brace-free=20instruction?= =?UTF-8?q?=20strings=20=E2=80=94=20ADK=20templates=20{identifier}=20in=20?= =?UTF-8?q?instructions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADK instruction templating treats {question}-style tokens as session-state injection and raises KeyError('Context variable not found') on unknown variables. The CA planner instruction embedded JSON-ish capability signatures ({question}, {sql, valid, error}) and the raw task JSON. Rewrote the capability blurb in words and replaced inline task JSON with a key list (the exact JSON already arrives as the planner's input message). 
--- .../bq_ca_planner/agent.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 9def4038220..0eb2e7f8244 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -360,17 +360,23 @@ def _flaky_dry_run(s): # ------------------------------------------------- scenarios _CAPS_BLURB = ( - "nl2sql (item: {question} -> Sql), draft_or_repair_sql (item: {question," - " sql?, error?} -> Sql), summarize_insight (item: rows/stats JSON ->" - " Insight), classify_question (item: {question} -> Category with" - " category 'data'|'schema'), skeptic (item: one insight -> Verdict)," - " dry_run (item: Sql -> {sql, valid, error}), flaky_dry_run (same, may" - " fail transiently), sql_ok (item: dry-run output -> bool), run_query" - " (item: validated sql -> {rows}), profile_table (item: table name ->" - " stats), quality_report (LIST of stats -> report), describe_schema" - " (item: {question} -> {answer}), keep_verified (LIST of Verdicts ->" - " {verified, rejected}), pair_charts (LIST -> list of pairs)," - " judge_chart (item: pair -> winner), single_chart (LIST -> bool)." + # NOTE: instruction strings must stay BRACE-FREE — ADK templates + # "identifier" in instructions as session-state injection + # and raises KeyError on unknown variables. 
+ "nl2sql (item: a question object -> Sql with field sql)," + " draft_or_repair_sql (item: a question plus optional prior sql and" + " error -> Sql), summarize_insight (item: rows or stats JSON -> Insight" + " with field insight), classify_question (item: a question -> Category" + " with field category equal to 'data' or 'schema'), skeptic (item: one" + " insight -> Verdict with fields insight and refuted), dry_run (item:" + " Sql -> object with sql, valid, error), flaky_dry_run (same as dry_run" + " but may fail transiently), sql_ok (item: dry-run output -> bool)," + " run_query (item: validated sql -> object with rows), profile_table" + " (item: a table name -> stats object), quality_report (LIST of stats" + " -> report object), describe_schema (item: a question -> object with" + " answer), keep_verified (LIST of Verdicts -> object with verified and" + " rejected), pair_charts (LIST -> list of pairs), judge_chart (item: a" + " pair -> the winner), single_chart (LIST -> bool)." ) _BINDING_RULES = ( @@ -503,10 +509,12 @@ def _scenario_for(text: str) -> str: def _planner_instruction(sc) -> str: + keys = ", ".join(f"'{k}'" for k in sc["task"]) return ( "Author a WorkflowSpec using ONLY these capabilities: " + _CAPS_BLURB - + f" Task input: {json.dumps(sc['task'])}. " + + " The task input JSON arrives as your input message; its keys:" + f" {keys}. " + sc["recipe"] + _BINDING_RULES ) From 17bc042b3605db13eb2974c19ccb0d835df93ad1 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:46:31 -0700 Subject: [PATCH 34/64] fix(ca-demo): stubs tolerate authored binding shapes (dict / JSON string / raw value) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The plan is model-authored: a binding may hand a stub the whole step output (dict), a dotted path into it (raw string — the live failure: nl2sql -> dry_run bound with path='sql'), or a JSON-encoded payload. 
dry_run, flaky_dry_run, sql_ok, and keep_verified now coerce all three shapes via _sql_of/_field_of/_verdict_of helpers; regression test added (12 CA tests, 67 total). --- .../authored_workflow_ca_demo/README.md | 2 +- .../bq_ca_planner/agent.py | 68 ++++++++++++++++--- .../test_ca_demo_agent.py | 21 ++++++ 3 files changed, 81 insertions(+), 10 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 311fd5bde67..85092baceba 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -60,7 +60,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 11 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 12 ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 0eb2e7f8244..5ca08ecb8b1 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -141,6 +141,45 @@ class Verdict(BaseModel): refuted: bool +def _obj_of(v): + """Accept a dict, a JSON-encoded dict/list string, or return None.""" + if isinstance(v, (dict, list)): + return v + if isinstance(v, str): + try: + parsed = json.loads(v) + if isinstance(parsed, (dict, list)): + return parsed + except (ValueError, TypeError): + pass + return None + + +def _sql_of(v) -> str: + """The SQL text from an Sql dict, a JSON string, or a raw SQL string.""" + obj = _obj_of(v) + if isinstance(obj, dict): + return 
str(obj.get("sql", "")) + return v if isinstance(v, str) else "" + + +def _field_of(v, key, default=None): + obj = _obj_of(v) + if isinstance(obj, dict): + return obj.get(key, default) + return default + + +def _verdict_of(v) -> dict: + obj = _obj_of(v) + if isinstance(obj, dict) and "insight" in obj: + return { + "insight": str(obj["insight"]), + "refuted": bool(obj.get("refuted")), + } + return {"insight": str(v), "refuted": False} + + def _stub(name, fn): def build(): @node(name=name) @@ -240,8 +279,8 @@ def _registry() -> CapabilityRegistry: build=_stub( "dry_run", lambda s: { - "sql": (s or {}).get("sql", ""), - "valid": "select" in str((s or {}).get("sql", "")).lower(), + "sql": _sql_of(s), + "valid": "select" in _sql_of(s).lower(), "error": None, }, ), @@ -256,7 +295,12 @@ def _registry() -> CapabilityRegistry: name="sql_ok", input_kind="item", serialize_input=False, - build=_stub("sql_ok", lambda s: bool((s or {}).get("valid"))), + build=_stub( + "sql_ok", + lambda s: bool( + _field_of(s, "valid", s if s is not None else False) + ), + ), ), Capability( name="run_query", @@ -308,9 +352,15 @@ def _registry() -> CapabilityRegistry: "keep_verified", lambda vs: { "verified": [ - v["insight"] for v in vs if not v.get("refuted") + v["insight"] + for v in map(_verdict_of, vs or []) + if not v["refuted"] + ], + "rejected": [ + v["insight"] + for v in map(_verdict_of, vs or []) + if v["refuted"] ], - "rejected": [v["insight"] for v in vs if v.get("refuted")], }, ), ), @@ -345,14 +395,14 @@ def _flaky_dry_run(s): _FLAKY_CALLS["n"] += 1 if _FLAKY_CALLS["n"] % 2 == 1: # every odd call fails -> 1 repair per run return { - "question": (s or {}).get("question", ""), - "sql": (s or {}).get("sql", ""), + "question": str(_field_of(s, "question", "") or ""), + "sql": _sql_of(s), "valid": False, "error": "Table not found: `thelook.order` (did you mean orders?)", } return { - "question": (s or {}).get("question", ""), - "sql": (s or {}).get("sql", ""), + "question": 
str(_field_of(s, "question", "") or ""), + "sql": _sql_of(s), "valid": True, "error": None, } diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index bba5a5d15e8..27ab2da72fc 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -380,6 +380,27 @@ async def parent(ctx, node_input): # ----------------------------------------------------- tests +def test_stubs_tolerate_authored_binding_shapes(): + # The plan is MODEL-authored: a binding may hand a stub the whole step + # output (dict), a dotted path into it (raw string), or a JSON-encoded + # payload. The live error this pins: nl2sql -> dry_run with path='sql' + # passed a raw SQL string and the stub assumed a dict. + raw_sql = "SELECT region FROM order_items" + assert demo._sql_of({"sql": raw_sql}) == raw_sql + assert demo._sql_of(json.dumps({"sql": raw_sql})) == raw_sql + assert demo._sql_of(raw_sql) == raw_sql + assert demo._field_of({"valid": True}, "valid") is True + assert demo._field_of(json.dumps({"valid": True}), "valid") is True + assert demo._verdict_of(json.dumps({"insight": "x", "refuted": True})) == { + "insight": "x", + "refuted": True, + } + assert demo._verdict_of("just text")["refuted"] is False + demo._FLAKY_CALLS["n"] = 1 # next call is even -> passes + out = demo._flaky_dry_run(raw_sql) # raw string input must not crash + assert out["valid"] is True and out["sql"] == raw_sql + + def test_root_agent_importable_and_named(): assert isinstance(demo.root_agent, Workflow) assert demo.root_agent.name == "bq_ca_planner" From 502f47018efce7be141165abcb3f789e3be0d378 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 14:52:03 -0700 Subject: [PATCH 35/64] =?UTF-8?q?feat(ca-demo):=20live=20question=20as=20t?= 
=?UTF-8?q?ask=20input=20=E2=80=94=20template=20reuse=20on=20replay?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ask-a-question scenario now takes the user's actual message as the task input (other scenarios remain mode selectors with canned inputs). Re-sending with a different question reuses the frozen plan UNCHANGED while the new question flows through it — the RFC's replay-vs-template distinction made visible, labeled in the replay beat. The mock executor returns a different canned row set for year-scale SQL windows so the output visibly tracks the question (quarter vs year). Helpers (_text_of/_task_for/_rows_for) factored and unit-tested; 12 -> 15 CA tests, 70 total. --- .../authored_workflow_ca_demo/README.md | 25 ++++---- .../bq_ca_planner/agent.py | 60 +++++++++++++++++-- .../test_ca_demo_agent.py | 34 +++++++++++ 3 files changed, 103 insertions(+), 16 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 85092baceba..4a32c82f2fe 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -33,15 +33,15 @@ adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: -| # | Send this prompt | Shape authored | CA story | -| --- | ------------------------------------------------------------ | ---------------------------------------------------- | ---------------------------------------------------------------- | -| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → summarize` | the basic ask-a-question flow | -| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in 
parallel, one report | -| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | -| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | -| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | +| # | Send this prompt | Shape authored | CA story | +| --- | ------------------------------------------------------------ | ---------------------------------------------------- | -------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → summarize` | the basic ask-a-question flow — **your actual question is the task input** | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a 
single winner | What to point at as each one streams: @@ -49,6 +49,7 @@ What to point at as each one streams: - **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. - **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. +- **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. - **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. Talking point for scenario 5 (the differentiated one): *the repair loop needs @@ -60,7 +61,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 12 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 15 ``` All seven expected shapes are built by hand, validated + lint-checked against @@ -81,3 +82,7 @@ against the **live** registry (their capabilities are deterministic mocks). repair loop behaves identically on every run and in CI. - Frozen plans are per-scenario (`authored_workflow:ca:`), so all seven replay independently within a session. 
+- Scenario 1 takes your live message as the question; the other six prompts + are mode selectors with canned task inputs (their results don't change + with your wording). All query results are canned either way — quarter vs + year selects between two mock row sets; there is no BigQuery behind it. diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 5ca08ecb8b1..c3f8445060d 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -96,6 +96,16 @@ {"region": "APAC", "revenue": 188777.75}, ] +# A year-scale window returns a different canned set, so the demo output +# visibly TRACKS the input: ask "last quarter" vs "last year" and the rows +# change (still mocks — no BigQuery behind them). +_CANNED_ROWS_YEAR = [ + {"region": "US-West", "revenue": 1648140.00}, + {"region": "US-East", "revenue": 1571980.40}, + {"region": "EMEA", "revenue": 1180510.90}, + {"region": "APAC", "revenue": 760222.15}, +] + _CANNED_PROFILES = { "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, "order_items": { @@ -306,7 +316,7 @@ def _registry() -> CapabilityRegistry: name="run_query", input_kind="item", serialize_input=False, - build=_stub("run_query", lambda s: {"rows": _CANNED_ROWS}), + build=_stub("run_query", lambda s: {"rows": _rows_for(s)}), ), Capability( name="profile_table", @@ -550,6 +560,34 @@ def _scenario_defs(): SCENARIOS = _scenario_defs() +def _rows_for(value) -> list: + """Canned rows; a year-scale window in the SQL selects the year set.""" + return _CANNED_ROWS_YEAR if "year" in _sql_of(value).lower() else _CANNED_ROWS + + +def _text_of(node_input) -> str: + """The user's message text, whatever shape the node input arrives in.""" + if isinstance(node_input, str): + return node_input + for holder in 
(node_input, getattr(node_input, "content", None)): + parts = getattr(holder, "parts", None) + if parts: + return " ".join(p.text for p in parts if getattr(p, "text", None)) + return str(node_input or "") + + +def _task_for(key: str, text: str) -> dict: + """The scenario's task input. The ask-a-question scenario takes the LIVE + user message as the question — so a re-send with a different question is + TEMPLATE REUSE: the frozen plan unchanged, new task input flowing through + it. Other scenarios keep their canned inputs (their prompts are mode + selectors, not questions).""" + task = dict(SCENARIOS[key]["task"]) + if key == "sequence" and text.strip(): + task = {"question": text.strip()} + return task + + def _scenario_for(text: str) -> str: t = (text or "").lower() for key, sc in SCENARIOS.items(): @@ -583,14 +621,17 @@ def _hash(spec: WorkflowSpec) -> str: @node(rerun_on_resume=True) async def plan_and_run(ctx: Context, node_input): reg = _registry() - key = _scenario_for(str(node_input or "")) + text = _text_of(node_input) + key = _scenario_for(text) sc = SCENARIOS[key] + task = _task_for(key, text) state_key = f"authored_workflow:ca:{key}" + task_note = f' — question: "{task["question"]}"' if key == "sequence" else "" yield _msg( f"🗂️ **Scenario: {sc['title']}** — expected shape `{sc['shape']}`," " over mock `thelook_ecommerce`" - f" ({', '.join(TABLES)})." + f" ({', '.join(TABLES)}){task_note}." ) # 1. LOAD-OR-AUTHOR (per-scenario frozen key: each shape replays @@ -600,9 +641,16 @@ async def plan_and_run(ctx: Context, node_input): spec = WorkflowSpec.model_validate(existing) spec_hash = _hash(spec) reused = True + fresh_input = task != sc["task"] yield _msg( f"♻️ **Reusing frozen plan** for `{key}` — hash `{spec_hash}`. The" - " model is NOT re-invoked; the exact prior plan is replayed." 
+ " model is NOT re-invoked; the exact prior plan is replayed" + + ( + " — with your NEW question as the task input (**template" + " reuse**: same plan, new data flowing through it)." + if fresh_input + else "." + ) ) else: reused = False @@ -614,7 +662,7 @@ async def plan_and_run(ctx: Context, node_input): instruction=_planner_instruction(sc), ) raw = await ctx.run_node( - planner, node_input=json.dumps(sc["task"]), run_id=f"plan_{key}" + planner, node_input=json.dumps(task), run_id=f"plan_{key}" ) spec = WorkflowSpec.model_validate(raw) spec_hash = _hash(spec) @@ -645,7 +693,7 @@ async def plan_and_run(ctx: Context, node_input): # 4. EXECUTE on the real engine via the #92 supervisor. t0 = time.perf_counter() interp = SpecInterpreter(reg, ctx) - result = await interp.execute(spec, sc["task"]) + result = await interp.execute(spec, task) elapsed = time.perf_counter() - t0 yield _msg( f"📄 **Result:**\n```json\n{json.dumps(result, indent=1, default=str)}" diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 27ab2da72fc..8c624352bf3 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -401,6 +401,40 @@ def test_stubs_tolerate_authored_binding_shapes(): assert out["valid"] is True and out["sql"] == raw_sql +def test_rows_track_the_sql_window(): + # The mock executor returns a DIFFERENT canned set for a year-scale + # window, so the demo output visibly tracks the question. + q_sql = "SELECT ... WHERE created_at >= INTERVAL 1 QUARTER" + y_sql = "SELECT ... 
WHERE created_at >= INTERVAL 1 YEAR" + assert demo._rows_for(q_sql) == demo._CANNED_ROWS + assert demo._rows_for(y_sql) == demo._CANNED_ROWS_YEAR + assert demo._rows_for({"sql": y_sql}) == demo._CANNED_ROWS_YEAR + + +def test_text_of_extracts_user_message(): + assert demo._text_of("plain text") == "plain text" + content = types.Content( + role="user", parts=[types.Part(text="last year please")] + ) + assert demo._text_of(content) == "last year please" + + class Wrapped: + pass + + w = Wrapped() + w.content = content + assert demo._text_of(w) == "last year please" + + +def test_sequence_takes_live_question_others_stay_canned(): + q = "What was revenue by region last year?" + assert demo._task_for("sequence", q) == {"question": q} + # empty/whitespace falls back to the canned question + assert demo._task_for("sequence", " ") == demo.SCENARIOS["sequence"]["task"] + # mode-selector scenarios keep their canned inputs + assert demo._task_for("fanout", q) == demo.SCENARIOS["fanout"]["task"] + + def test_root_agent_importable_and_named(): assert isinstance(demo.root_agent, Workflow) assert demo.root_agent.name == "bq_ca_planner" From afd30e820c5743aab1fb872bbe1c98f4de0591a1 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 15:29:57 -0700 Subject: [PATCH 36/64] fix(ca-demo): specialized scenario triggers beat the ask-a-question fallback 'give me the best chart for revenue by region' matched the sequence trigger 'revenue by region' before the tournament trigger 'best chart' (first-match in definition order) and replayed the frozen Q&A plan. The sequence scenario is the generic fallback for ANY question, so the router now checks all specialized scenarios first and falls back to sequence only when none match. Overlap cases pinned in the routing test (incl. the advertised prompt 7, which had the same latent collision). 
--- .../bq_ca_planner/agent.py | 9 +++++++++ .../test_ca_demo_agent.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index c3f8445060d..1912d5d9ab9 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -589,8 +589,17 @@ def _task_for(key: str, text: str) -> dict: def _scenario_for(text: str) -> str: + """Specialized scenarios win over the generic ask-a-question fallback. + + 'sequence' is the default for ANY question, so its triggers must never + shadow a specialized intent — e.g. "best chart for revenue by region" + contains both a tournament trigger and a sequence trigger and must route + to the tournament. + """ t = (text or "").lower() for key, sc in SCENARIOS.items(): + if key == "sequence": + continue # fallback only — checked last by construction if any(trigger in t for trigger in sc["triggers"]): return key return "sequence" diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 8c624352bf3..b6cce3ff163 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -456,6 +456,20 @@ def test_scenario_routing(): assert demo._scenario_for("audit these insights") == "adversarial" assert demo._scenario_for("pick the best chart") == "tournament" assert demo._scenario_for("hello") == "sequence" # default + # overlapping triggers: specialized intent must beat the generic fallback + # ("revenue by region" is a sequence trigger, but these aren't questions). 
+ assert ( + demo._scenario_for("Pick the best chart for revenue by region.") + == "tournament" + ) + assert ( + demo._scenario_for("give me the best chart for revenue by region") + == "tournament" + ) + assert ( + demo._scenario_for("Profile data quality for revenue by region") + == "fanout" + ) def test_all_seven_shapes_validate_and_lint_clean(): From 9c6b4a5e5a730a17c9ebc5370d5ff099470c370e Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 15:38:12 -0700 Subject: [PATCH 37/64] =?UTF-8?q?feat(ca-demo):=20render=5Fchart=20capabil?= =?UTF-8?q?ity=20=E2=80=94=20CA-style=20Vega-Lite=20chart=20artifacts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BigQuery Conversational Analytics returns Vega-Lite chart specs; the demo now does too. New deterministic render_chart capability (no plotting deps): input-tolerant (query rows dict, raw rows list, tournament winner list, or a bare chart-type string) -> chart_type + Unicode bar preview (renders in the ADK Web chat) + Vega-Lite spec. Wired into the sequence recipe (chart the query rows) and the tournament recipe (render the data with the winning mark); a 📈 beat surfaces any chart artifact found in interpreter state. 15 -> 16 CA tests, 71 total. 
--- .../authored_workflow_ca_demo/README.md | 23 ++-- .../bq_ca_planner/agent.py | 128 +++++++++++++++--- .../test_ca_demo_agent.py | 38 +++++- 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 4a32c82f2fe..7a2cf0d6361 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -33,15 +33,15 @@ adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: -| # | Send this prompt | Shape authored | CA story | -| --- | ------------------------------------------------------------ | ---------------------------------------------------- | -------------------------------------------------------------------------- | -| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → summarize` | the basic ask-a-question flow — **your actual question is the task input** | -| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | -| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | -| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | -| 7 | `Pick the best chart for revenue by region.` | tournament 
| pairwise chart judging to a single winner | +| # | Send this prompt | Shape authored | CA story | +| --- | ------------------------------------------------------------ | ------------------------------------------------------------------- | -------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → render_chart + summarize` | the basic ask-a-question flow — **your actual question is the task input** | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | What to point at as each one streams: @@ -50,6 +50,7 @@ What to point at as each one streams: - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. - **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. 
- **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. +- **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a Unicode bar preview rendered in the chat plus the **Vega-Lite spec** (what the real CA API returns). In the tournament, the bracket picks the mark and `render_chart` draws the data with it. - **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. Talking point for scenario 5 (the differentiated one): *the repair loop needs @@ -61,14 +62,14 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 15 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 16 ``` All seven expected shapes are built by hand, validated + lint-checked against the demo registry, and **executed end-to-end** with the language capabilities stubbed: the loop repairs exactly once, the branch routes the metadata question away from SQL, the audit rejects the implausible insight, the -tournament converges to `bar`. The fan-out and tournament scenarios execute +tournament converges to `bar` and renders it as a Vega-Lite chart artifact. The fan-out and tournament scenarios execute against the **live** registry (their capabilities are deterministic mocks). 
## Notes diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 1912d5d9ab9..0358d067789 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -190,6 +190,65 @@ def _verdict_of(v) -> dict: return {"insight": str(v), "refuted": False} +_VEGA_MARK = {"bar": "bar", "line": "line", "scatter": "point", "pie": "arc"} + + +def _ascii_bars(rows, width: int = 24) -> str: + """A Unicode bar preview of (label, value) rows — renders in the chat.""" + pts = [] + for r in rows or []: + if not isinstance(r, dict): + continue + label = next((str(v) for v in r.values() if isinstance(v, str)), "?") + num = next( + (float(v) for v in r.values() if isinstance(v, (int, float))), 0.0 + ) + pts.append((label, num)) + if not pts: + return "(no rows)" + mx = max(n for _, n in pts) or 1.0 + lw = max(len(label) for label, _ in pts) + return "\n".join( + f"{label:<{lw}} {'█' * max(1, round(n / mx * width)):<{width}} " + f" {n:>14,.2f}" + for label, n in pts + ) + + +def _render_chart(v) -> dict: + """Build a chart from whatever the authored binding hands over: query + output (dict with rows), raw rows (list of dicts), a tournament winner + (list with one chart-type string), or a bare chart-type string. 
Emits the + Conversational-Analytics-style artifact: a Vega-Lite spec + a text + preview the chat can render.""" + chart_type, rows = "bar", _CANNED_ROWS + obj = _obj_of(v) + if isinstance(obj, dict): + rows = obj.get("rows", rows) + if str(obj.get("chart_type", "")) in _VEGA_MARK: + chart_type = str(obj["chart_type"]) + elif isinstance(obj, list) and obj: + if isinstance(obj[0], dict): + rows = obj + elif str(obj[0]) in _VEGA_MARK: + chart_type = str(obj[0]) + elif isinstance(v, str) and v in _VEGA_MARK: + chart_type = v + return { + "chart_type": chart_type, + "ascii": _ascii_bars(rows), + "vega_lite": { + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "mark": _VEGA_MARK[chart_type], + "data": {"values": rows}, + "encoding": { + "x": {"field": "region", "type": "nominal"}, + "y": {"field": "revenue", "type": "quantitative"}, + }, + }, + } + + def _stub(name, fn): def build(): @node(name=name) @@ -354,6 +413,12 @@ def _registry() -> CapabilityRegistry: lambda q: {"answer": _SCHEMA_NOTES["default"]}, ), ), + Capability( + name="render_chart", + input_kind="item", + serialize_input=False, + build=_stub("render_chart", _render_chart), + ), Capability( name="keep_verified", input_kind="list", @@ -424,19 +489,21 @@ def _flaky_dry_run(s): # "identifier" in instructions as session-state injection # and raises KeyError on unknown variables. 
"nl2sql (item: a question object -> Sql with field sql)," - " draft_or_repair_sql (item: a question plus optional prior sql and" - " error -> Sql), summarize_insight (item: rows or stats JSON -> Insight" - " with field insight), classify_question (item: a question -> Category" - " with field category equal to 'data' or 'schema'), skeptic (item: one" - " insight -> Verdict with fields insight and refuted), dry_run (item:" - " Sql -> object with sql, valid, error), flaky_dry_run (same as dry_run" - " but may fail transiently), sql_ok (item: dry-run output -> bool)," - " run_query (item: validated sql -> object with rows), profile_table" - " (item: a table name -> stats object), quality_report (LIST of stats" - " -> report object), describe_schema (item: a question -> object with" - " answer), keep_verified (LIST of Verdicts -> object with verified and" - " rejected), pair_charts (LIST -> list of pairs), judge_chart (item: a" - " pair -> the winner), single_chart (LIST -> bool)." + " draft_or_repair_sql (item: a question plus optional prior sql and error" + " -> Sql), summarize_insight (item: rows or stats JSON -> Insight with" + " field insight), classify_question (item: a question -> Category with" + " field category equal to 'data' or 'schema'), skeptic (item: one insight" + " -> Verdict with fields insight and refuted), dry_run (item: Sql -> object" + " with sql, valid, error), flaky_dry_run (same as dry_run but may fail" + " transiently), sql_ok (item: dry-run output -> bool), run_query (item:" + " validated sql -> object with rows), profile_table (item: a table name ->" + " stats object), quality_report (LIST of stats -> report object)," + " describe_schema (item: a question -> object with answer), keep_verified" + " (LIST of Verdicts -> object with verified and rejected), render_chart" + " (item: query output with rows, or a chart-type winner -> a chart artifact" + " with chart_type, ascii preview, and a vega_lite spec), pair_charts (LIST" + " -> list of 
pairs), judge_chart (item: a pair -> the winner), single_chart" + " (LIST -> bool)." ) _BINDING_RULES = ( @@ -452,14 +519,16 @@ def _scenario_defs(): return { "sequence": dict( title="Ask a question (sequence)", - shape="step → step → step → step", + shape="step → step → step → render_chart + step", triggers=("revenue by region", "sequence"), task={"question": q_region}, recipe=( "Author, in order: (1) a step running nl2sql on the task;" " (2) a step running dry_run on it; (3) a step running" - " run_query on that; (4) a step running summarize_insight on" - " the rows. Output = the summarize step." + " run_query on that; (4) a step running render_chart on the" + " run_query step's output; (5) a step running" + " summarize_insight on the run_query step's output. Output =" + " the summarize step." ), ), "fanout": dict( @@ -541,7 +610,10 @@ def _scenario_defs(): ), "tournament": dict( title="Pick the best chart (tournament)", - shape="loop_until(init=task.chart_options, body=[pair, fan_out])", + shape=( + "loop_until(init=task.chart_options, body=[pair, fan_out])" + " → render_chart" + ), triggers=("best chart", "tournament"), task={"chart_options": ["pie", "bar", "line", "scatter"]}, recipe=( @@ -551,7 +623,9 @@ def _scenario_defs(): " loop's own id>); (b) a fan_out over (a) running judge_chart" " per pair]; until_capability = single_chart with until_input" " = Binding(source='step', step=); max_iters" - " = 3. Output = the loop." + " = 3. Then (2) a step running render_chart on the loop's" + " output (the winning chart type). Output = the render_chart" + " step." 
), ), } @@ -704,8 +778,24 @@ async def plan_and_run(ctx: Context, node_input): interp = SpecInterpreter(reg, ctx) result = await interp.execute(spec, task) elapsed = time.perf_counter() - t0 + for chart in ( + v + for v in interp.state.values() + if isinstance(v, dict) and "vega_lite" in v + ): + yield _msg( + f"📈 **Chart ({chart['chart_type']})** — the" + " Conversational-Analytics-style artifact (text preview + Vega-Lite" + f" spec):\n```\n{chart['ascii']}\n```\n```json\n" + f"{json.dumps(chart['vega_lite'], indent=1)}\n```" + ) + display = ( + {k: v for k, v in result.items() if k != "vega_lite"} + if isinstance(result, dict) + else result + ) yield _msg( - f"📄 **Result:**\n```json\n{json.dumps(result, indent=1, default=str)}" + f"📄 **Result:**\n```json\n{json.dumps(display, indent=1, default=str)}" f"\n```\n📊 **Cost:** {interp.dispatch_count} capability dispatches in" f" {elapsed:.1f}s + " + ("0 planner calls (frozen replay)." if reused else "1 planner call.") diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index b6cce3ff163..b7e65646b9f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -166,6 +166,12 @@ def _expected_spec(key: str) -> WorkflowSpec: capability="run_query", input=Binding(source="step", step="check"), ), + StepRef( + kind="step", + id="chart", + capability="render_chart", + input=Binding(source="step", step="rows"), + ), StepRef( kind="step", id="sum", @@ -350,8 +356,14 @@ def _expected_spec(key: str) -> WorkflowSpec: until_input=Binding(source="step", step="winners"), max_iters=3, ), + StepRef( + kind="step", + id="viz", + capability="render_chart", + input=Binding(source="step", step="bracket"), + ), ], - output=Binding(source="step", step="bracket"), + output=Binding(source="step", 
step="viz"), ) raise KeyError(key) @@ -401,6 +413,25 @@ def test_stubs_tolerate_authored_binding_shapes(): assert out["valid"] is True and out["sql"] == raw_sql +def test_render_chart_accepts_authored_binding_shapes(): + # query output (dict with rows) -> bar over those rows + ch = demo._render_chart({"rows": demo._CANNED_ROWS_YEAR}) + assert ch["chart_type"] == "bar" + assert "1,648,140.00" in ch["ascii"] + assert ch["vega_lite"]["data"]["values"] == demo._CANNED_ROWS_YEAR + # tournament winner (list with one chart type) -> that mark, canned rows + ch = demo._render_chart(["pie"]) + assert ch["chart_type"] == "pie" + assert ch["vega_lite"]["mark"] == "arc" + # bare chart-type string and raw rows list + assert demo._render_chart("scatter")["vega_lite"]["mark"] == "point" + ch = demo._render_chart(demo._CANNED_ROWS) + assert "US-West" in ch["ascii"] + # ascii preview: one bar line per region, longest bar for the leader + lines = demo._render_chart({"rows": demo._CANNED_ROWS})["ascii"].splitlines() + assert len(lines) == 4 and lines[0].count("█") > lines[-1].count("█") + + def test_rows_track_the_sql_window(): # The mock executor returns a DIFFERENT canned set for a year-scale # window, so the demo output visibly tracks the question. @@ -553,4 +584,7 @@ async def test_tournament_picks_best_chart_no_llm_needed(): demo._registry(), demo.SCENARIOS["tournament"]["task"], ) - assert out == ["bar"] + # bracket converges to bar; the winner is rendered as a chart artifact. 
+ assert out["chart_type"] == "bar" + assert out["vega_lite"]["mark"] == "bar" + assert "US-West" in out["ascii"] From 787e275ba356160942008c95ed567003194a2b02 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 17:08:29 -0700 Subject: [PATCH 38/64] feat(ca-demo): intelligent mock executor (micro-warehouse intent engine) + inline chart images MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two upgrades from live dogfooding: * run_query is now a deterministic micro-warehouse: a synthetic 24-month x 4-region x 4-category fact table + SQL-intent parsing. The executor AGGREGATES the facts per the query's grouping (month/region/category), window (INTERVAL N YEAR/QUARTER/MONTH), filters (country/region/category literals), and measure alias (SUM(...) AS name) — replacing the canned keyword rows whose shape couldn't answer a trend question (live gap: a monthly-trend SQL got region rows back). Honest scope documented: intent execution, not SQL parsing; real BigQuery is the production step. * render_chart infers a LINE mark for date-shaped x labels (explicit tournament winners still win); _chart_png renders the artifact to PNG via matplotlib (optional) and the 📈 beat emits it as an inline image part — ADK Web shows a real chart; falls back to the Unicode preview. 18 -> 21 CA tests (engine windows/trend-alias-filter/total/category, line inference, PNG magic bytes, derived encoding); 76 total. 
--- .../authored_workflow_ca_demo/README.md | 22 +- .../bq_ca_planner/agent.py | 219 +++++++++++++++--- .../test_ca_demo_agent.py | 96 +++++++- 3 files changed, 293 insertions(+), 44 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 7a2cf0d6361..d2f9548d962 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -6,9 +6,16 @@ a user asks data questions in natural language, and the planner **authors a different typed `WorkflowSpec` per scenario** over Conversational-Analytics capabilities — `nl2sql`, `dry_run`, `run_query`, `profile_table`, `skeptic`, chart judging — against a mock `thelook_ecommerce` dataset (the dataset the -CA docs demo against). Query execution / dry-run / profiling are -deterministic mocks (**no BigQuery project needed**); the language steps -(NL2SQL, summaries, classification, skeptics) are live Gemini calls. +CA docs demo against). Query execution runs on a **deterministic +micro-warehouse**: a synthetic 24-month × 4-region × 4-category fact table +plus SQL-intent parsing — the executor *aggregates* the facts per the +query's grouping (month/region/category), time window (`INTERVAL N YEAR/QUARTER/MONTH`), filters (`country = 'United States'`, region/category +literals), and measure alias (`AS total_sales`). No BigQuery project +needed, and answers genuinely track the question (a trend question returns +a real monthly series and charts as a line). Honest scope: it executes the +query's *intent*, not its SQL — a real BigQuery backend is the production +step. The language steps (NL2SQL, summaries, classification, skeptics) are +live Gemini calls. 
Every scenario runs the full #93 machinery: **author → validate → independence lints → freeze (per-scenario key) → execute on the real engine @@ -50,7 +57,7 @@ What to point at as each one streams: - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. - **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. - **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. -- **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a Unicode bar preview rendered in the chat plus the **Vega-Lite spec** (what the real CA API returns). In the tournament, the bracket picks the mark and `render_chart` draws the data with it. +- **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. - **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. Talking point for scenario 5 (the differentiated one): *the repair loop needs @@ -62,7 +69,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 16 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 21 ``` All seven expected shapes are built by hand, validated + lint-checked against @@ -85,5 +92,6 @@ against the **live** registry (their capabilities are deterministic mocks). seven replay independently within a session. - Scenario 1 takes your live message as the question; the other six prompts are mode selectors with canned task inputs (their results don't change - with your wording). All query results are canned either way — quarter vs - year selects between two mock row sets; there is no BigQuery behind it. + with your wording). Query answers come from the deterministic + micro-warehouse above — real aggregation over synthetic facts; there is + no BigQuery behind it. diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 0358d067789..d2f55247072 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -44,7 +44,9 @@ from __future__ import annotations import json +import math import os +import re import sys import time from typing import Literal @@ -96,16 +98,96 @@ {"region": "APAC", "revenue": 188777.75}, ] -# A year-scale window returns a different canned set, so the demo output -# visibly TRACKS the input: ask "last quarter" vs "last year" and the rows -# change (still mocks — no BigQuery behind them). 
-_CANNED_ROWS_YEAR = [ - {"region": "US-West", "revenue": 1648140.00}, - {"region": "US-East", "revenue": 1571980.40}, - {"region": "EMEA", "revenue": 1180510.90}, - {"region": "APAC", "revenue": 760222.15}, +# ------------------------------------------------- micro-warehouse engine +# The "intelligent mock executor": a deterministic synthetic fact table +# (24 months x 4 regions x 4 categories) plus lightweight SQL-INTENT parsing. +# Instead of pattern-matching to a canned answer, run_query AGGREGATES the +# facts according to the query's grouping (month/region/category), window +# (INTERVAL N YEAR/QUARTER/MONTH), filters (country/region/category literals), +# and measure alias (SUM(...) AS name). Honest scope: it executes the query's +# INTENT, not its SQL — a real BigQuery backend is the production step. +_REGION_WEIGHT = {"US-West": 1.00, "US-East": 0.95, "EMEA": 0.72, "APAC": 0.46} +_CATEGORY_WEIGHT = { + "Outerwear": 0.34, + "Jeans": 0.27, + "Activewear": 0.22, + "Accessories": 0.17, +} +_MONTHS = [f"{y}-{m:02d}" for y in (2024, 2025) for m in range(1, 13)] +_BASE_MONTHLY = 142000.0 + + +def _seasonal(i: int) -> float: + # mild growth + yearly seasonality — deterministic, no RNG. 
+ return 1.0 + 0.18 * math.sin(i * math.pi / 6) + 0.012 * i + + +_FACTS = [ + { + "month": month, + "region": region, + "category": category, + "revenue": round(_BASE_MONTHLY * rw * cw * _seasonal(i), 2), + } + for i, month in enumerate(_MONTHS) + for region, rw in _REGION_WEIGHT.items() + for category, cw in _CATEGORY_WEIGHT.items() ] + +def _query_engine(sql_text: str) -> list[dict]: + """Aggregate the synthetic facts according to the SQL's intent.""" + s = (sql_text or "").lower() + # time window: last N months from the warehouse's end (default: a quarter) + m_y = re.search(r"interval\s+(\d+)\s+year", s) + m_q = re.search(r"interval\s+(\d+)\s+quarter", s) + m_m = re.search(r"interval\s+(\d+)\s+month", s) + if m_y: + n = int(m_y.group(1)) * 12 + elif m_q: + n = int(m_q.group(1)) * 3 + elif m_m: + n = int(m_m.group(1)) + elif "year" in s: + n = 12 + else: + n = 3 + months = set(_MONTHS[-min(n, len(_MONTHS)) :]) + facts = [f for f in _FACTS if f["month"] in months] + # filters: country / region literals + if "united states" in s or "'us'" in s: + facts = [f for f in facts if f["region"].startswith("US-")] + for region in _REGION_WEIGHT: + if f"'{region.lower()}'" in s: + facts = [f for f in facts if f["region"] == region] + for category in _CATEGORY_WEIGHT: + if f"'{category.lower()}'" in s: + facts = [f for f in facts if f["category"] == category] + # grouping dimension + if re.search(r"date_trunc|group by\s+month|\bmonth\b", s): + dim = "month" + elif "category" in s or "department" in s: + dim = "category" + elif "region" in s or "country" in s: + dim = "region" + else: + dim = None + # measure name: honor the SQL's alias when present + alias = re.search(r"sum\([^)]*\)\s+as\s+([a-z_][a-z0-9_]*)", s) + measure = alias.group(1) if alias else "revenue" + if dim is None: + return [{measure: round(sum(f["revenue"] for f in facts), 2)}] + agg: dict = {} + for f in facts: + agg[f[dim]] = agg.get(f[dim], 0.0) + f["revenue"] + items = ( + sorted(agg.items()) + if dim == 
"month" + else sorted(agg.items(), key=lambda kv: -kv[1]) + ) + return [{dim: k, measure: round(v, 2)} for k, v in items] + + _CANNED_PROFILES = { "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, "order_items": { @@ -221,34 +303,100 @@ def _render_chart(v) -> dict: (list with one chart-type string), or a bare chart-type string. Emits the Conversational-Analytics-style artifact: a Vega-Lite spec + a text preview the chat can render.""" - chart_type, rows = "bar", _CANNED_ROWS + chart_type, rows, explicit = "bar", _CANNED_ROWS, False obj = _obj_of(v) if isinstance(obj, dict): rows = obj.get("rows", rows) if str(obj.get("chart_type", "")) in _VEGA_MARK: - chart_type = str(obj["chart_type"]) + chart_type, explicit = str(obj["chart_type"]), True elif isinstance(obj, list) and obj: if isinstance(obj[0], dict): rows = obj elif str(obj[0]) in _VEGA_MARK: - chart_type = str(obj[0]) + chart_type, explicit = str(obj[0]), True elif isinstance(v, str) and v in _VEGA_MARK: - chart_type = v + chart_type, explicit = v, True + # date-shaped x labels (a time series) default to a LINE mark unless the + # chart type was chosen explicitly (e.g. by the tournament winner). 
+ if not explicit and any( + isinstance(r, dict) + and any( + isinstance(val, str) and re.match(r"^\d{4}-\d{2}", val) + for val in r.values() + ) + for r in rows or [] + ): + chart_type = "line" + first = rows[0] if rows and isinstance(rows[0], dict) else {} + x_field = next((k for k, v in first.items() if isinstance(v, str)), "label") + y_field = next( + (k for k, v in first.items() if isinstance(v, (int, float))), "value" + ) return { "chart_type": chart_type, + "x_field": x_field, + "y_field": y_field, "ascii": _ascii_bars(rows), "vega_lite": { "$schema": "https://vega.github.io/schema/vega-lite/v5.json", "mark": _VEGA_MARK[chart_type], "data": {"values": rows}, "encoding": { - "x": {"field": "region", "type": "nominal"}, - "y": {"field": "revenue", "type": "quantitative"}, + "x": {"field": x_field, "type": "nominal"}, + "y": {"field": y_field, "type": "quantitative"}, }, }, } +def _chart_png(chart: dict): + """Render the chart artifact to PNG bytes via matplotlib, or None. + + Optional dependency: without matplotlib the demo falls back to the text + preview + Vega-Lite spec (which any Vega editor renders faithfully).""" + try: + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + except ImportError: + return None + import io + + rows = chart["vega_lite"]["data"]["values"] + labels = [ + next((str(v) for v in r.values() if isinstance(v, str)), "?") + for r in rows + ] + values = [ + next((float(v) for v in r.values() if isinstance(v, (int, float))), 0.0) + for r in rows + ] + kind = chart["chart_type"] + fig, ax = plt.subplots(figsize=(6.4, 3.4), dpi=144) + if kind == "pie": + ax.pie(values, labels=labels, autopct="%1.0f%%") + elif kind == "line": + ax.plot(labels, values, marker="o", color="#4285F4") + elif kind == "scatter": + ax.scatter(labels, values, s=80, color="#4285F4") + else: + ax.bar(labels, values, color="#4285F4") + if kind != "pie": + ax.set_ylabel(chart.get("y_field", "value")) + ax.grid(axis="y", alpha=0.3) + 
ax.spines[["top", "right"]].set_visible(False) + ax.set_title( + f"{chart.get('y_field', 'value')} by {chart.get('x_field', 'label')}" + f" ({kind})" + ) + fig.tight_layout() + buf = io.BytesIO() + fig.savefig(buf, format="png") + plt.close(fig) + return buf.getvalue() + + def _stub(name, fn): def build(): @node(name=name) @@ -375,7 +523,9 @@ def _registry() -> CapabilityRegistry: name="run_query", input_kind="item", serialize_input=False, - build=_stub("run_query", lambda s: {"rows": _rows_for(s)}), + build=_stub( + "run_query", lambda s: {"rows": _query_engine(_sql_of(s))} + ), ), Capability( name="profile_table", @@ -634,11 +784,6 @@ def _scenario_defs(): SCENARIOS = _scenario_defs() -def _rows_for(value) -> list: - """Canned rows; a year-scale window in the SQL selects the year set.""" - return _CANNED_ROWS_YEAR if "year" in _sql_of(value).lower() else _CANNED_ROWS - - def _text_of(node_input) -> str: """The user's message text, whatever shape the node input arrives in.""" if isinstance(node_input, str): @@ -783,12 +928,34 @@ async def plan_and_run(ctx: Context, node_input): for v in interp.state.values() if isinstance(v, dict) and "vega_lite" in v ): - yield _msg( - f"📈 **Chart ({chart['chart_type']})** — the" - " Conversational-Analytics-style artifact (text preview + Vega-Lite" - f" spec):\n```\n{chart['ascii']}\n```\n```json\n" - f"{json.dumps(chart['vega_lite'], indent=1)}\n```" - ) + png = _chart_png(chart) + if png is not None: + yield Event( + content=types.Content( + role="model", + parts=[ + types.Part( + text=( + f"📈 **Chart ({chart['chart_type']})** — rendered" + " from the Conversational-Analytics-style" + " Vega-Lite artifact:" + ) + ), + types.Part.from_bytes(data=png, mime_type="image/png"), + ], + ) + ) + yield _msg( + "Vega-Lite spec (the portable artifact behind the image):\n" + f"```json\n{json.dumps(chart['vega_lite'], indent=1)}\n```" + ) + else: + yield _msg( + f"📈 **Chart ({chart['chart_type']})** — text preview + Vega-Lite" + " spec 
(install matplotlib for an inline rendered image):\n```\n" + f"{chart['ascii']}\n```\n```json\n" + f"{json.dumps(chart['vega_lite'], indent=1)}\n```" + ) display = ( {k: v for k, v in result.items() if k != "vega_lite"} if isinstance(result, dict) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index b7e65646b9f..c8bcbae816e 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -415,10 +415,13 @@ def test_stubs_tolerate_authored_binding_shapes(): def test_render_chart_accepts_authored_binding_shapes(): # query output (dict with rows) -> bar over those rows - ch = demo._render_chart({"rows": demo._CANNED_ROWS_YEAR}) + region_rows = demo._query_engine( + "SELECT region, SUM(x) AS revenue ... GROUP BY region INTERVAL 1 YEAR" + ) + ch = demo._render_chart({"rows": region_rows}) assert ch["chart_type"] == "bar" - assert "1,648,140.00" in ch["ascii"] - assert ch["vega_lite"]["data"]["values"] == demo._CANNED_ROWS_YEAR + assert "US-West" in ch["ascii"] + assert ch["vega_lite"]["data"]["values"] == region_rows # tournament winner (list with one chart type) -> that mark, canned rows ch = demo._render_chart(["pie"]) assert ch["chart_type"] == "pie" @@ -432,14 +435,85 @@ def test_render_chart_accepts_authored_binding_shapes(): assert len(lines) == 4 and lines[0].count("█") > lines[-1].count("█") -def test_rows_track_the_sql_window(): - # The mock executor returns a DIFFERENT canned set for a year-scale - # window, so the demo output visibly tracks the question. - q_sql = "SELECT ... WHERE created_at >= INTERVAL 1 QUARTER" - y_sql = "SELECT ... 
WHERE created_at >= INTERVAL 1 YEAR" - assert demo._rows_for(q_sql) == demo._CANNED_ROWS - assert demo._rows_for(y_sql) == demo._CANNED_ROWS_YEAR - assert demo._rows_for({"sql": y_sql}) == demo._CANNED_ROWS_YEAR +def test_render_chart_derives_encoding_fields(): + ch = demo._render_chart({"rows": [{"category": "A", "count": 3}]}) + assert ch["x_field"] == "category" and ch["y_field"] == "count" + enc = ch["vega_lite"]["encoding"] + assert enc["x"]["field"] == "category" and enc["y"]["field"] == "count" + + +def test_chart_png_renders_or_falls_back(): + ch = demo._render_chart({"rows": demo._CANNED_ROWS}) + png = demo._chart_png(ch) + if png is None: + pytest.skip("matplotlib not installed — text fallback path") + assert png[:8] == b"\x89PNG\r\n\x1a\n" # real PNG bytes + assert len(png) > 5000 + # every chart kind renders without error + for kind in ("pie", "line", "scatter"): + assert demo._chart_png(demo._render_chart([kind])) is not None + + +def test_engine_aggregates_by_region_and_window(): + # The "intelligent mock": rows are AGGREGATED from synthetic facts per the + # SQL's intent, not pattern-matched to a canned answer. + q = demo._query_engine( + "SELECT country AS region, SUM(p) AS revenue ... GROUP BY region" + " ... INTERVAL 1 QUARTER" + ) + y = demo._query_engine( + "SELECT country AS region, SUM(p) AS revenue ... GROUP BY region" + " ... INTERVAL 1 YEAR" + ) + assert [r["region"] for r in q] == ["US-West", "US-East", "EMEA", "APAC"] + # a year window strictly contains the quarter window: + assert ( + all(yr["revenue"] > qr["revenue"] for yr, qr in zip(y, q)) and len(y) == 4 + ) + + +def test_engine_monthly_trend_with_alias_and_country_filter(): + # The exact live gap this replaces: a trend question now returns a real + # monthly series, honoring the SQL's measure alias and US filter. + rows = demo._query_engine( + "SELECT DATE_TRUNC(o.created_at, MONTH) AS month, SUM(oi.sale_price)" + " AS total_sales FROM ... 
WHERE country = 'United States' AND" + " created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 YEAR)" + " GROUP BY month ORDER BY month" + ) + assert len(rows) == 24 # 2 years of months + assert list(rows[0]) == ["month", "total_sales"] + assert rows[0]["month"] == "2024-01" and rows[-1]["month"] == "2025-12" + # US-only filter: below the all-regions total for the same window + all_rows = demo._query_engine( + "SELECT month, SUM(x) AS total_sales ... INTERVAL 2 YEAR GROUP BY month" + ) + assert rows[0]["total_sales"] < all_rows[0]["total_sales"] + + +def test_engine_grand_total_and_category_grouping(): + total = demo._query_engine("SELECT SUM(sale_price) ... INTERVAL 2 YEAR") + assert len(total) == 1 and total[0]["revenue"] > 0 + cats = demo._query_engine( + "SELECT category, SUM(x) AS revenue ... GROUP BY category" + ) + assert [r["category"] for r in cats] == [ + "Outerwear", + "Jeans", + "Activewear", + "Accessories", + ] + + +def test_chart_infers_line_for_time_series(): + rows = demo._query_engine( + "SELECT month, SUM(x) AS sales ... GROUP BY month INTERVAL 1 YEAR" + ) + ch = demo._render_chart({"rows": rows}) + assert ch["chart_type"] == "line" # date-shaped x labels -> trend line + assert ch["vega_lite"]["mark"] == "line" + # an explicit winner still wins over the inference: + assert demo._render_chart(["bar"])["chart_type"] == "bar" def test_text_of_extracts_user_message(): From f88aae6f3c119463a2f1f3a274632e5a68d57d9c Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Tue, 9 Jun 2026 21:12:56 -0700 Subject: [PATCH 39/64] fix(ca-demo): engine understands yearly/quarterly grains; trend charts for all time buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live gap #2: 'sales trend for global based on 3 years' authored EXTRACT(YEAR ...) AS year GROUP BY year — the engine only knew monthly grouping, fell through to a single anonymous grand total, and charted one '?' bar. 
Fixes: * Time grains month/quarter/year (week -> month) detected from DATE_TRUNC(..., G), EXTRACT(G FROM ...), AS g aliases, and the GROUP BY clause — scoped to the actual clause (stop at ORDER BY/LIMIT, INTERVAL phrases stripped, so a trailing 'INTERVAL 1 YEAR' window never reads as a yearly grouping). * Monthly facts bucket into the requested grain (quarters as YYYY-Qn, years as YYYY); quarters sum exactly to their year. * Line-mark inference covers monthly, quarterly, and yearly labels (>= 2 points; a single total stays a bar; explicit winners still win). Pinned: the exact live yearly SQL, quarterly buckets, bucket-consistency, label inference. 21 -> 22 CA tests, 77 total. --- .../authored_workflow_ca_demo/README.md | 2 +- .../bq_ca_planner/agent.py | 73 +++++++++++++------ .../test_ca_demo_agent.py | 33 +++++++++ 3 files changed, 85 insertions(+), 23 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index d2f9548d962..a1b6b6f11f1 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -69,7 +69,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 21 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 22 ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index d2f55247072..559a7af44ea 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -163,29 +163,58 @@ def _query_engine(sql_text: str) -> list[dict]: for category in _CATEGORY_WEIGHT: if f"'{category.lower()}'" in s: facts = [f for f in facts if f["category"] == category] - # grouping dimension - if re.search(r"date_trunc|group by\s+month|\bmonth\b", s): - dim = "month" - elif "category" in s or "department" in s: + # measure name: honor the SQL's alias when present + alias = re.search(r"sum\([^)]*\)\s+as\s+([a-z_][a-z0-9_]*)", s) + measure = alias.group(1) if alias else "revenue" + # time grain: DATE_TRUNC(..., G) / EXTRACT(G FROM ...) / AS g / GROUP BY g. + # Scope the GROUP BY check to the actual clause (stop at ORDER BY/LIMIT) + # with INTERVAL phrases stripped — a trailing "INTERVAL 1 YEAR" window + # must not read as a yearly grouping. 
+ gb_match = re.search(r"group by\s+(.*?)(?:\border by\b|\blimit\b|$)", s) + gb_clause = re.sub( + r"interval\s+\d+\s+\w+", "", gb_match.group(1) if gb_match else "" + ) + grain = None + for g in ("month", "week", "quarter", "year"): + if ( + re.search(rf"date_trunc\([^)]*,\s*{g}\s*\)", s) + or re.search(rf"extract\(\s*{g}\s+from", s) + or re.search(rf"\bas\s+{g}\b", s) + or re.search(rf"\b{g}\b", gb_clause) + ): + grain = "month" if g == "week" else g # weekly facts -> monthly grain + break + if grain: + + def bucket(month: str) -> str: + y, mm = month.split("-") + if grain == "month": + return month + if grain == "quarter": + return f"{y}-Q{(int(mm) - 1) // 3 + 1}" + return y # year + + agg: dict = {} + for f in facts: + b = bucket(f["month"]) + agg[b] = agg.get(b, 0.0) + f["revenue"] + return [{grain: k, measure: round(v, 2)} for k, v in sorted(agg.items())] + # categorical dimension + if "category" in s or "department" in s: dim = "category" elif "region" in s or "country" in s: dim = "region" else: dim = None - # measure name: honor the SQL's alias when present - alias = re.search(r"sum\([^)]*\)\s+as\s+([a-z_][a-z0-9_]*)", s) - measure = alias.group(1) if alias else "revenue" if dim is None: return [{measure: round(sum(f["revenue"] for f in facts), 2)}] - agg: dict = {} + agg = {} for f in facts: agg[f[dim]] = agg.get(f[dim], 0.0) + f["revenue"] - items = ( - sorted(agg.items()) - if dim == "month" - else sorted(agg.items(), key=lambda kv: -kv[1]) - ) - return [{dim: k, measure: round(v, 2)} for k, v in items] + return [ + {dim: k, measure: round(v, 2)} + for k, v in sorted(agg.items(), key=lambda kv: -kv[1]) + ] _CANNED_PROFILES = { @@ -316,16 +345,16 @@ def _render_chart(v) -> dict: chart_type, explicit = str(obj[0]), True elif isinstance(v, str) and v in _VEGA_MARK: chart_type, explicit = v, True + # date-shaped x labels (a time series) default to a LINE mark unless the # chart type was chosen explicitly (e.g. by the tournament winner). 
- if not explicit and any( - isinstance(r, dict) - and any( - isinstance(val, str) and re.match(r"^\d{4}-\d{2}", val) - for val in r.values() - ) - for r in rows or [] - ): + def _datelike(r) -> bool: + return isinstance(r, dict) and any( + isinstance(val, str) and re.match(r"^\d{4}(-\d{2}|-q\d|$)", val.lower()) + for val in r.values() + ) + + if not explicit and len(rows or []) >= 2 and all(map(_datelike, rows)): chart_type = "line" first = rows[0] if rows and isinstance(rows[0], dict) else {} x_field = next((k for k, v in first.items() if isinstance(v, str)), "label") diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index c8bcbae816e..34ac01bb541 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -505,6 +505,32 @@ def test_engine_grand_total_and_category_grouping(): ] +def test_engine_yearly_and_quarterly_grains(): + # The exact live gap: EXTRACT(YEAR ...) AS year GROUP BY year produced a + # single anonymous grand total. Yearly and quarterly grains now bucket the + # monthly facts (the warehouse holds 24 months, so a 3-year window caps + # at 2 years of buckets). + yearly = demo._query_engine( + "SELECT EXTRACT(YEAR FROM t1.created_at) AS year, SUM(t2.sale_price)" + " AS total_sales FROM ... WHERE t1.created_at >=" + " TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 3 YEAR) GROUP BY year" + " ORDER BY year" + ) + assert [r["year"] for r in yearly] == ["2024", "2025"] + assert all(r["total_sales"] > 1_000_000 for r in yearly) + quarterly = demo._query_engine( + "SELECT DATE_TRUNC(created_at, QUARTER) AS quarter, SUM(x) AS revenue" + " ... 
INTERVAL 2 YEAR GROUP BY quarter" + ) + assert [r["quarter"] for r in quarterly] == [ + f"{y}-Q{q}" for y in (2024, 2025) for q in (1, 2, 3, 4) + ] + # buckets are consistent: quarters sum to their year. + assert round(sum(r["revenue"] for r in quarterly[:4]), 2) == round( + yearly[0]["total_sales"], 2 + ) + + def test_chart_infers_line_for_time_series(): rows = demo._query_engine( "SELECT month, SUM(x) AS sales ... GROUP BY month INTERVAL 1 YEAR" @@ -512,8 +538,15 @@ def test_chart_infers_line_for_time_series(): ch = demo._render_chart({"rows": rows}) assert ch["chart_type"] == "line" # date-shaped x labels -> trend line assert ch["vega_lite"]["mark"] == "line" + # quarterly and yearly buckets are time series too: + q_rows = [{"quarter": "2024-Q1", "v": 1.0}, {"quarter": "2024-Q2", "v": 2.0}] + assert demo._render_chart({"rows": q_rows})["chart_type"] == "line" + y_rows = [{"year": "2024", "v": 1.0}, {"year": "2025", "v": 2.0}] + assert demo._render_chart({"rows": y_rows})["chart_type"] == "line" # an explicit winner still wins over the inference: assert demo._render_chart(["bar"])["chart_type"] == "bar" + # a single point is not a trend: + assert demo._render_chart({"rows": [{"total": 5.0}]})["chart_type"] == "bar" def test_text_of_extracts_user_message(): From 667c8348fb38e00ff5753278034ad54ed4f088b2 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 10:28:21 -0700 Subject: [PATCH 40/64] feat(ca-demo): REAL BigQuery execution over the public thelook dataset + multi-series charts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces semantic guessing with the real thing, per review feedback: * dry_run -> the actual BigQuery dry-run API (real errors, real bytes-scanned); run_query -> real execution against bigquery-public-data.thelook_ecommerce billed to GOOGLE_CLOUD_PROJECT, with safety rails: maximum_bytes_billed = 2 GB/query, 500-row cap, result cells JSON-ified (Decimal/date/datetime). 
Bare table refs are auto-qualified and backticked. Fallback to the deterministic micro-warehouse without credentials or with CA_DEMO_USE_BIGQUERY=0; every beat carries an engine field (bigquery/mock/mock-fallback) so the data source is never misrepresented. * Multi-series charts: x/series/measure derived from the result shape (time fields by name/value, a second categorical becomes one line per value, measures picked by name preference — an int year column is never mistaken for the measure); Vega-Lite color encoding + matplotlib multi-line PNG with legend; ascii preview field-aware and capped. 22 -> 27 CA tests (qualify, jsonify, no-credentials fallback, multi-series pivot, and a live-gated real-BigQuery round-trip — dry-run bytes, real error, real rows; passing locally). 81 total. --- .../bq_ca_planner/agent.py | 279 ++++++++++++++---- .../test_ca_demo_agent.py | 89 ++++++ 2 files changed, 317 insertions(+), 51 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 559a7af44ea..614aa54d845 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -217,6 +217,132 @@ def bucket(month: str) -> str: ] +# ------------------------------------------------- REAL BigQuery backend +# When credentials allow, dry_run and run_query hit the REAL +# bigquery-public-data.thelook_ecommerce dataset (billed to +# GOOGLE_CLOUD_PROJECT) — real dry-run errors, real bytes-scanned, real +# multi-dimensional results. Safety rails: maximum_bytes_billed caps each +# query, results cap at _MAX_ROWS. Anything that fails falls back to the +# deterministic micro-warehouse above, so CI and credential-less machines +# keep working. CA_DEMO_USE_BIGQUERY=0 forces the mock. 
+_BQ_DATASET = "bigquery-public-data.thelook_ecommerce" +_MAX_BYTES_BILLED = 2 * 1024**3 # 2 GB per query +_MAX_ROWS = 500 +_BQ = { + "client": None, + "disabled": os.environ.get("CA_DEMO_USE_BIGQUERY", "1") != "1", + "error": None, +} + + +def _bq_client(): + if _BQ["disabled"] or _BQ["error"]: + return None + if _BQ["client"] is None: + try: + from google.cloud import bigquery # optional dependency + + _BQ["client"] = bigquery.Client( + project=os.environ.get("GOOGLE_CLOUD_PROJECT") or None + ) + except Exception as e: # no lib / no credentials -> mock warehouse + _BQ["error"] = f"{type(e).__name__}: {e}" + return None + return _BQ["client"] + + +def _qualify_sql(sql: str) -> str: + """Fully qualify bare thelook table refs for real BigQuery.""" + s = (sql or "").replace("`", "") + s = re.sub( + r"(? dict: + sql = _qualify_sql(_sql_of(value)) + client = _bq_client() + if client is None: + return { + "sql": sql, + "valid": "select" in sql.lower(), + "error": None, + "engine": "mock", + } + from google.cloud import bigquery + + try: + job = client.query( + sql, + job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False), + ) + return { + "sql": sql, + "valid": True, + "error": None, + "engine": "bigquery", + "bytes_processed": int(job.total_bytes_processed or 0), + } + except Exception as e: # the REAL BigQuery error feeds the repair story + return { + "sql": sql, + "valid": False, + "error": str(e)[:500], + "engine": "bigquery", + } + + +def _execute_sql(value) -> dict: + sql = _qualify_sql(_sql_of(value)) + client = _bq_client() + if client is not None: + from google.cloud import bigquery + + try: + job = client.query( + sql, + job_config=bigquery.QueryJobConfig( + maximum_bytes_billed=_MAX_BYTES_BILLED + ), + ) + rows = [ + {k: _jsonify_cell(v) for k, v in dict(r).items()} + for r in job.result(max_results=_MAX_ROWS) + ] + return { + "rows": rows, + "engine": "bigquery", + "bytes_processed": int(job.total_bytes_processed or 0), + } + except 
Exception as e: + return { + "rows": _query_engine(sql), + "engine": "mock-fallback", + "note": str(e)[:200], + } + return {"rows": _query_engine(sql), "engine": "mock"} + + _CANNED_PROFILES = { "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, "order_items": { @@ -304,19 +430,30 @@ def _verdict_of(v) -> dict: _VEGA_MARK = {"bar": "bar", "line": "line", "scatter": "point", "pie": "arc"} -def _ascii_bars(rows, width: int = 24) -> str: - """A Unicode bar preview of (label, value) rows — renders in the chat.""" +def _ascii_bars(rows, x=None, y=None, series=None, width: int = 24) -> str: + """A Unicode bar preview of the rows — renders in the chat. Uses the + derived x/y/series fields when given (so an integer `year` column is + never mistaken for the measure); falls back to first-str/first-number.""" pts = [] for r in rows or []: if not isinstance(r, dict): continue - label = next((str(v) for v in r.values() if isinstance(v, str)), "?") - num = next( - (float(v) for v in r.values() if isinstance(v, (int, float))), 0.0 - ) + if x in r: + label = str(r.get(x, "?")) + else: + label = next((str(v) for v in r.values() if isinstance(v, str)), "?") + if series and series in r: + label = f"{r[series]} {label}" + if y in r: + num = float(r.get(y) or 0.0) + else: + num = next( + (float(v) for v in r.values() if isinstance(v, (int, float))), 0.0 + ) pts.append((label, num)) if not pts: return "(no rows)" + pts = pts[:40] # keep the chat readable for wide results mx = max(n for _, n in pts) or 1.0 lw = max(len(label) for label, _ in pts) return "\n".join( @@ -357,23 +494,62 @@ def _datelike(r) -> bool: if not explicit and len(rows or []) >= 2 and all(map(_datelike, rows)): chart_type = "line" first = rows[0] if rows and isinstance(rows[0], dict) else {} - x_field = next((k for k, v in first.items() if isinstance(v, str)), "label") - y_field = next( - (k for k, v in first.items() if isinstance(v, (int, float))), "value" + timeish = ("year", "quarter", "month", 
"week", "date", "day") + str_fields = [k for k, v in first.items() if isinstance(v, str)] + time_fields = [ + k + for k, v in first.items() + if k.lower() in timeish + or (isinstance(v, str) and re.match(r"^\d{4}([-/]\d{2})?", v)) + ] + num_fields = [ + k + for k, v in first.items() + if isinstance(v, (int, float)) + and not isinstance(v, bool) + and k not in time_fields + ] + x_field = ( + time_fields[0] + if time_fields + else (str_fields[0] if str_fields else "label") ) + # a second categorical field becomes the SERIES (one line per value) — + # e.g. GROUP BY region, year -> x=year, one series per region. + series_field = next( + (k for k in str_fields if k != x_field and k not in time_fields), None + ) + if series_field is None and len(time_fields) == 0: + series_field = None + + def _measure_rank(k: str) -> int: + kl = k.lower() + if "revenue" in kl or "sales" in kl: + return 0 + if "total" in kl or "amount" in kl: + return 1 + return 2 + + y_field = sorted(num_fields, key=_measure_rank)[0] if num_fields else "value" + if series_field and not explicit: + chart_type = "line" # comparing series over a dimension -> lines + encoding = { + "x": {"field": x_field, "type": "nominal"}, + "y": {"field": y_field, "type": "quantitative"}, + } + if series_field: + encoding["color"] = {"field": series_field, "type": "nominal"} return { "chart_type": chart_type, "x_field": x_field, "y_field": y_field, - "ascii": _ascii_bars(rows), + "series_field": series_field, + "ascii": _ascii_bars(rows, x=x_field, y=y_field, series=series_field), "vega_lite": { "$schema": "https://vega.github.io/schema/vega-lite/v5.json", "mark": _VEGA_MARK[chart_type], "data": {"values": rows}, - "encoding": { - "x": {"field": x_field, "type": "nominal"}, - "y": {"field": y_field, "type": "quantitative"}, - }, + "encoding": encoding, }, } @@ -393,32 +569,40 @@ def _chart_png(chart: dict): import io rows = chart["vega_lite"]["data"]["values"] - labels = [ - next((str(v) for v in r.values() if 
isinstance(v, str)), "?") - for r in rows - ] - values = [ - next((float(v) for v in r.values() if isinstance(v, (int, float))), 0.0) - for r in rows - ] + x, y = chart.get("x_field", "label"), chart.get("y_field", "value") + series = chart.get("series_field") kind = chart["chart_type"] - fig, ax = plt.subplots(figsize=(6.4, 3.4), dpi=144) - if kind == "pie": - ax.pie(values, labels=labels, autopct="%1.0f%%") - elif kind == "line": - ax.plot(labels, values, marker="o", color="#4285F4") - elif kind == "scatter": - ax.scatter(labels, values, s=80, color="#4285F4") + fig, ax = plt.subplots(figsize=(6.8, 3.6), dpi=144) + if series: + # one line per series value (e.g. per region), x shared + by_series: dict = {} + for r in rows: + by_series.setdefault(str(r.get(series, "?")), []).append( + (str(r.get(x, "?")), float(r.get(y) or 0.0)) + ) + for name, pts in sorted(by_series.items()): + pts.sort() + ax.plot([p[0] for p in pts], [p[1] for p in pts], marker="o", label=name) + ax.legend(fontsize=8) else: - ax.bar(labels, values, color="#4285F4") - if kind != "pie": - ax.set_ylabel(chart.get("y_field", "value")) + labels = [str(r.get(x, "?")) for r in rows] + values = [float(r.get(y) or 0.0) for r in rows] + if kind == "pie": + ax.pie(values, labels=labels, autopct="%1.0f%%") + elif kind == "line": + ax.plot(labels, values, marker="o", color="#4285F4") + elif kind == "scatter": + ax.scatter(labels, values, s=80, color="#4285F4") + else: + ax.bar(labels, values, color="#4285F4") + if kind != "pie" or series: + ax.set_ylabel(y) ax.grid(axis="y", alpha=0.3) ax.spines[["top", "right"]].set_visible(False) - ax.set_title( - f"{chart.get('y_field', 'value')} by {chart.get('x_field', 'label')}" - f" ({kind})" - ) + if len({str(r.get(x, "")) for r in rows}) > 8: + ax.tick_params(axis="x", labelrotation=60, labelsize=7) + title = f"{y} by {x}" + (f" per {series}" if series else "") + f" ({kind})" + ax.set_title(title) fig.tight_layout() buf = io.BytesIO() fig.savefig(buf, format="png") 
@@ -460,8 +644,9 @@ def _registry() -> CapabilityRegistry: "nl2sql", Sql, "Translate the question in the input JSON to one BigQuery" - f" StandardSQL SELECT over thelook_ecommerce: {schema_blurb}." - " Output Sql.", + " StandardSQL SELECT over the public dataset" + " bigquery-public-data.thelook_ecommerce (use fully-qualified" + f" table names): {schema_blurb}. Output Sql.", ), ), Capability( @@ -474,8 +659,9 @@ def _registry() -> CapabilityRegistry: Sql, "Input JSON has a question, and possibly a prior sql + error" " from a failed dry run. Draft (or repair, using the error)" - " one BigQuery StandardSQL SELECT over thelook_ecommerce:" - f" {schema_blurb}. Output Sql.", + " one BigQuery StandardSQL SELECT over the public dataset" + " bigquery-public-data.thelook_ecommerce (fully-qualified" + f" table names): {schema_blurb}. Output Sql.", ), ), Capability( @@ -522,14 +708,7 @@ def _registry() -> CapabilityRegistry: name="dry_run", input_kind="item", serialize_input=False, - build=_stub( - "dry_run", - lambda s: { - "sql": _sql_of(s), - "valid": "select" in _sql_of(s).lower(), - "error": None, - }, - ), + build=_stub("dry_run", _bq_dry_run), ), Capability( name="flaky_dry_run", @@ -552,9 +731,7 @@ def _registry() -> CapabilityRegistry: name="run_query", input_kind="item", serialize_input=False, - build=_stub( - "run_query", lambda s: {"rows": _query_engine(_sql_of(s))} - ), + build=_stub("run_query", _execute_sql), ), Capability( name="profile_table", diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 34ac01bb541..6b5ec766ec3 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -454,6 +454,95 @@ def test_chart_png_renders_or_falls_back(): assert demo._chart_png(demo._render_chart([kind])) is not None +def 
test_qualify_sql_for_real_bigquery(): + q = demo._qualify_sql( + "SELECT * FROM thelook_ecommerce.orders JOIN" + " thelook_ecommerce.order_items USING (order_id)" + ) + assert "`bigquery-public-data.thelook_ecommerce.orders`" in q + assert "`bigquery-public-data.thelook_ecommerce.order_items`" in q + # already-qualified and backticked inputs normalize to the same form + same = demo._qualify_sql( + "SELECT * FROM `bigquery-public-data.thelook_ecommerce.orders`" + ) + assert same.count("`bigquery-public-data.thelook_ecommerce.orders`") == 1 + + +def test_jsonify_cells(): + import datetime + import decimal + + assert demo._jsonify_cell(decimal.Decimal("3.14159")) == 3.14 + assert demo._jsonify_cell(2.71828) == 2.72 + assert demo._jsonify_cell(datetime.date(2024, 1, 31)) == "2024-01-31" + assert demo._jsonify_cell(datetime.datetime(2024, 1, 31, 12, 0)).startswith( + "2024-01-31T12:00" + ) + assert demo._jsonify_cell("x") == "x" and demo._jsonify_cell(7) == 7 + + +def test_dry_run_and_execute_fall_back_without_bigquery(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + d = demo._bq_dry_run({"sql": "SELECT region FROM thelook_ecommerce.orders"}) + assert d["engine"] == "mock" and d["valid"] is True + out = demo._execute_sql( + {"sql": "SELECT region, SUM(x) AS revenue ... GROUP BY region"} + ) + assert out["engine"] == "mock" + assert [r["region"] for r in out["rows"]][0] == "US-West" + + +def test_chart_multiseries_per_region_per_year(): + # The shape the user's real question produces: GROUP BY region, year with + # two measures. x = the time field, one SERIES per region, measure picked + # by name preference (total_sales over total_orders); int year never + # mistaken for the measure. 
+ rows = [ + {"region": r, "year": y, "total_sales": s, "total_orders": o} + for (r, y, s, o) in [ + ("US-West", 2024, 100.0, 10), + ("US-West", 2025, 130.0, 12), + ("EMEA", 2024, 70.0, 8), + ("EMEA", 2025, 90.0, 9), + ] + ] + ch = demo._render_chart({"rows": rows}) + assert ch["x_field"] == "year" + assert ch["series_field"] == "region" + assert ch["y_field"] == "total_sales" + assert ch["chart_type"] == "line" + assert ch["vega_lite"]["encoding"]["color"]["field"] == "region" + assert "US-West" in ch["ascii"] and "130.00" in ch["ascii"] + png = demo._chart_png(ch) + if png is not None: + assert png[:8] == b"\x89PNG\r\n\x1a\n" + + +@pytest.mark.skipif( + not os.environ.get("CA_DEMO_LIVE_BQ"), + reason="live BigQuery round-trip (set CA_DEMO_LIVE_BQ=1 + credentials)", +) +def test_live_bigquery_roundtrip(): + good = demo._bq_dry_run({ + "sql": ( + "SELECT status, COUNT(*) AS n FROM thelook_ecommerce.orders" + " GROUP BY status" + ) + }) + assert good["engine"] == "bigquery" and good["valid"] is True + assert good["bytes_processed"] > 0 + bad = demo._bq_dry_run({"sql": "SELECT nope FROM thelook_ecommerce.orders"}) + assert bad["valid"] is False and bad["error"] # a REAL BigQuery error + out = demo._execute_sql({ + "sql": ( + "SELECT status, COUNT(*) AS n FROM thelook_ecommerce.orders" + " GROUP BY status ORDER BY n DESC LIMIT 3" + ) + }) + assert out["engine"] == "bigquery" and len(out["rows"]) == 3 + assert out["rows"][0]["n"] > 0 + + def test_engine_aggregates_by_region_and_window(): # The "intelligent mock": rows are AGGREGATED from synthetic facts per the # SQL's intent, not pattern-matched to a canned answer. 
From b4466f8e2da02856279ad2052c4cf6803a365412 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 10:28:56 -0700 Subject: [PATCH 41/64] =?UTF-8?q?docs(ca-demo):=20README=20=E2=80=94=20rea?= =?UTF-8?q?l=20BigQuery=20backend,=20engine=20transparency,=20counts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../authored_workflow_ca_demo/README.md | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index a1b6b6f11f1..63bd0dd8edd 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -6,16 +6,19 @@ a user asks data questions in natural language, and the planner **authors a different typed `WorkflowSpec` per scenario** over Conversational-Analytics capabilities — `nl2sql`, `dry_run`, `run_query`, `profile_table`, `skeptic`, chart judging — against a mock `thelook_ecommerce` dataset (the dataset the -CA docs demo against). Query execution runs on a **deterministic -micro-warehouse**: a synthetic 24-month × 4-region × 4-category fact table -plus SQL-intent parsing — the executor *aggregates* the facts per the -query's grouping (month/region/category), time window (`INTERVAL N YEAR/QUARTER/MONTH`), filters (`country = 'United States'`, region/category -literals), and measure alias (`AS total_sales`). No BigQuery project -needed, and answers genuinely track the question (a trend question returns -a real monthly series and charts as a line). Honest scope: it executes the -query's *intent*, not its SQL — a real BigQuery backend is the production -step. The language steps (NL2SQL, summaries, classification, skeptics) are -live Gemini calls. +CA docs demo against). 
**Query execution is REAL BigQuery** when +credentials allow: `dry_run` hits the actual BigQuery dry-run API (real +errors, real bytes-scanned) and `run_query` executes against +`bigquery-public-data.thelook_ecommerce`, billed to your +`GOOGLE_CLOUD_PROJECT` with safety rails (`maximum_bytes_billed` = 2 GB per +query, 500-row result cap). Multi-dimensional questions ("each region's +trend per year") return real grouped results and chart as multi-series +lines. Without credentials (or with `CA_DEMO_USE_BIGQUERY=0`), execution +falls back to a deterministic micro-warehouse (synthetic facts + +SQL-intent aggregation) so CI and credential-less machines keep working — +each dry-run/result beat carries an `engine` field (`bigquery` or `mock`) +so the demo never misrepresents its data source. The language steps +(NL2SQL, summaries, classification, skeptics) are live Gemini calls. Every scenario runs the full #93 machinery: **author → validate → independence lints → freeze (per-scenario key) → execute on the real engine @@ -69,7 +72,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 22 +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 27 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against @@ -92,6 +95,6 @@ against the **live** registry (their capabilities are deterministic mocks). seven replay independently within a session. - Scenario 1 takes your live message as the question; the other six prompts are mode selectors with canned task inputs (their results don't change - with your wording). Query answers come from the deterministic - micro-warehouse above — real aggregation over synthetic facts; there is - no BigQuery behind it. + with your wording). 
Query answers come from real BigQuery when + credentials allow (check the `engine` field in the dry-run/result beats); + otherwise the deterministic micro-warehouse. From 35c035b2ee146eca24a23a58484d97c89cb8e6c4 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 10:51:12 -0700 Subject: [PATCH 42/64] feat(ca-demo): default flow self-repairs from REAL BigQuery dry-run errors; no fabricated fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live finding: nl2sql emitted TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 5 YEAR) — a genuine BigQuery dialect error the real dry-run caught — but (a) the linear sequence plan ignored the verdict and ran the query anyway, and (b) the executor's no-credentials fallback fired on the FAILING query and served mock rows as an answer. * The ask-a-question recipe is now the real CA flow: loop_until( draft_or_repair_sql reading the loop-carried failed dry-run output -> REAL dry_run) -> run_query -> chart + summarize. The model repairs from the actual BigQuery error text. * _execute_sql no longer fabricates: a failing query returns empty rows + the real error (engine: bigquery); the mock engine is reserved for missing credentials only. * Tests: expected sequence shape updated; no-fabrication contract pinned with a raising fake client; plan-executing e2e tests pin the mock engine (no network in unit tests). 27 -> 28 CA tests, 82 total. 
--- .../authored_workflow_ca_demo/README.md | 21 ++++---- .../bq_ca_planner/agent.py | 34 +++++++----- .../test_ca_demo_agent.py | 54 ++++++++++++++----- 3 files changed, 72 insertions(+), 37 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 63bd0dd8edd..5585ca05174 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -43,15 +43,15 @@ adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: -| # | Send this prompt | Shape authored | CA story | -| --- | ------------------------------------------------------------ | ------------------------------------------------------------------- | -------------------------------------------------------------------------- | -| 1 | `What was revenue by region last quarter?` | sequence: `nl2sql → dry_run → run_query → render_chart + summarize` | the basic ask-a-question flow — **your actual question is the task input** | -| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | -| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | -| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | -| 7 | `Pick the best 
chart for revenue by region.` | tournament | pairwise chart judging to a single winner | +| # | Send this prompt | Shape authored | CA story | +| --- | ------------------------------------------------------------ | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... YEAR`) feeds the repair round | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | What to point at as each one streams: @@ -61,6 +61,7 @@ What to point at as each one streams: - **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. 
- **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. - **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. +- **honest failure handling** — a query that still fails after repair returns empty rows + the real error (`engine: bigquery`); the mock warehouse is used ONLY when credentials are absent, never to paper over a failing query. - **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. Talking point for scenario 5 (the differentiated one): *the repair loop needs @@ -72,7 +73,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 27 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 28 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 614aa54d845..36dad69d1a5 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -335,11 +335,10 @@ def _execute_sql(value) -> dict: "bytes_processed": int(job.total_bytes_processed or 0), } except Exception as e: - return { - "rows": _query_engine(sql), - "engine": "mock-fallback", - "note": str(e)[:200], - } + # A failing query must NOT fabricate an answer from the mock — that + # path is only for missing credentials. Return the failure honestly; + # the repair loop upstream exists to prevent reaching here. + return {"rows": [], "engine": "bigquery", "error": str(e)[:300]} return {"rows": _query_engine(sql), "engine": "mock"} @@ -874,17 +873,26 @@ def _scenario_defs(): q_region = "What was revenue by region last quarter?" 
return { "sequence": dict( - title="Ask a question (sequence)", - shape="step → step → step → render_chart + step", + title="Ask a question (draft → REAL dry-run → repair → execute)", + shape=( + "loop_until(draft_or_repair → real dry_run) → run_query →" + " render_chart + step" + ), triggers=("revenue by region", "sequence"), task={"question": q_region}, recipe=( - "Author, in order: (1) a step running nl2sql on the task;" - " (2) a step running dry_run on it; (3) a step running" - " run_query on that; (4) a step running render_chart on the" - " run_query step's output; (5) a step running" - " summarize_insight on the run_query step's output. Output =" - " the summarize step." + "Author, in order: (1) ONE loop_until for SQL drafting with" + " self-repair: init = Binding(source='task'); body = [(a) a" + " step running draft_or_repair_sql whose input is" + " Binding(source='step', step=) — round 0" + " reads the task, later rounds read the failed dry-run output" + " (sql + error); (b) a step running dry_run on (a)];" + " until_capability = sql_ok with until_input =" + " Binding(source='step', step=); max_iters = 3." + " (2) a step running run_query on the loop's output. (3) a" + " step running render_chart on the run_query step's output." + " (4) a step running summarize_insight on the run_query" + " step's output. Output = the summarize step." 
), ), "fanout": dict( diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 6b5ec766ec3..350db68c1c8 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -148,23 +148,33 @@ def _expected_spec(key: str) -> WorkflowSpec: return WorkflowSpec( goal="revenue by region", steps=[ - StepRef( - kind="step", - id="sql", - capability="nl2sql", - input=Binding(source="task"), - ), - StepRef( - kind="step", - id="check", - capability="dry_run", - input=Binding(source="step", step="sql"), + LoopUntil( + kind="loop_until", + id="sqlgen", + init=Binding(source="task"), + body=[ + StepRef( + kind="step", + id="draft", + capability="draft_or_repair_sql", + input=Binding(source="step", step="sqlgen"), + ), + StepRef( + kind="step", + id="check", + capability="dry_run", + input=Binding(source="step", step="draft"), + ), + ], + until_capability="sql_ok", + until_input=Binding(source="step", step="check"), + max_iters=3, ), StepRef( kind="step", id="rows", capability="run_query", - input=Binding(source="step", step="check"), + input=Binding(source="step", step="sqlgen"), ), StepRef( kind="step", @@ -492,6 +502,20 @@ def test_dry_run_and_execute_fall_back_without_bigquery(monkeypatch): assert [r["region"] for r in out["rows"]][0] == "US-West" +def test_failing_query_returns_error_not_fabricated_rows(monkeypatch): + class _Boom: + + def query(self, *a, **k): + raise RuntimeError("400 invalid query") + + monkeypatch.setitem(demo._BQ, "disabled", False) + monkeypatch.setitem(demo._BQ, "error", None) + monkeypatch.setitem(demo._BQ, "client", _Boom()) + out = demo._execute_sql({"sql": "SELECT broken"}) + assert out["engine"] == "bigquery" + assert out["rows"] == [] and "400" in out["error"] # honest failure + + def 
test_chart_multiseries_per_region_per_year(): # The shape the user's real question produces: GROUP BY region, year with # two measures. x = the time field, one SERIES per region, measure picked @@ -708,7 +732,8 @@ def test_all_seven_shapes_validate_and_lint_clean(): @pytest.mark.asyncio -async def test_sequence_executes(): +async def test_sequence_executes(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) # no network in unit tests out = await _run( _expected_spec("sequence"), _stub_registry(), @@ -729,7 +754,8 @@ async def test_fanout_executes_no_llm_needed(): @pytest.mark.asyncio -async def test_pipeline_executes_per_question(): +async def test_pipeline_executes_per_question(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) out = await _run( _expected_spec("pipeline"), _stub_registry(), From d469636a1825c4465469a223c30844c90760370a Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 11:09:39 -0700 Subject: [PATCH 43/64] feat(ca-demo): cross-session workflow reuse via an exported plan store MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Frozen plans now outlive the session. On freeze, the FULL FrozenWorkflowRecord (spec, hashes, planner/registry/capability versions + contract hashes, captured task_input_schema = template promotion) is exported to ca_plan_store/<key>.json — the demo's stand-in for the ArtifactService in the RFC's revised Q1. A new session's reuse order is: session state -> plan store -> author fresh. The store path uses the RFC's DEFENSIVE import: spec_hash recomputed, re-validation against the CURRENT registry, manual-version + contract-hash drift are hard rejections (shown in chat, then fresh authoring — drift never silently replays a stale plan), and a new question is validated against the captured schema (cross-session template reuse). 
Tests: store round-trip + template reuse with a new question, tamper (hash-mismatch) rejection, contract-drift rejection with an unbumped version. 28 -> 30 CA tests, 84 total. --- .gitignore | 2 + .../authored_workflow_ca_demo/README.md | 10 +- .../bq_ca_planner/agent.py | 92 +++++++++++++++++-- .../test_ca_demo_agent.py | 65 +++++++++++++ 4 files changed, 156 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 7bacd1d1d43..194948758a3 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,5 @@ security_audit_plan.json # ADK Web demo session stores (runtime) demo_sessions*.db +ca_demo_sessions*.db +ca_plan_store/ diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 5585ca05174..115b3c7c675 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -58,7 +58,7 @@ What to point at as each one streams: - **🗂️ scenario banner** — the expected shape, named before the model authors it. - **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. -- **🔒 freeze (per-scenario key)** — **re-send any prompt**: same hash, `0 planner calls (frozen replay)`. Seven independent frozen plans in one session. +- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/<key>.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. 
**Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + contract-hash drift fail loudly (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. - **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. - **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. - **honest failure handling** — a query that still fails after repair returns empty rows + the real error (`engine: bigquery`); the mock warehouse is used ONLY when credentials are absent, never to paper over a failing query. @@ -73,7 +73,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 28 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 30 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against @@ -92,8 +92,10 @@ against the **live** registry (their capabilities are deterministic mocks). vocabulary — is the claim here. 
- The `flaky_dry_run` failure is simulated (every odd call fails) so the repair loop behaves identically on every run and in CI. -- Frozen plans are per-scenario (`authored_workflow:ca:`), so all - seven replay independently within a session. +- Frozen plans are per-scenario (`authored_workflow:ca:`) in + session state, AND exported per-scenario to `ca_plan_store/` for + cross-session reuse (delete a file to force fresh authoring; the store is + the demo's stand-in for the ArtifactService in the RFC's revised Q1). - Scenario 1 takes your live message as the question; the other six prompts are mode selectors with canned task inputs (their results don't change with your wording). Query answers come from real BigQuery when diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 36dad69d1a5..4cbb28cafd6 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -43,6 +43,7 @@ from __future__ import annotations +import datetime import json import math import os @@ -71,6 +72,10 @@ ) from authoring import Capability # noqa: E402 from authoring import CapabilityRegistry # noqa: E402 +from authoring import export_plan # noqa: E402 +from authoring import FrozenWorkflowRecord # noqa: E402 +from authoring import import_plan # noqa: E402 +from authoring import PlanImportError # noqa: E402 from authoring import independence_facts # noqa: E402 from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 @@ -1050,6 +1055,42 @@ def _planner_instruction(sc) -> str: ) +# ------------------------------------------------- cross-session plan store +# Frozen plans outlive the session: on freeze, the FULL FrozenWorkflowRecord +# is exported as a portable envelope to disk (a stand-in for the +# ArtifactService 
in production — RFC Q1). A NEW session imports it through +# the RFC's DEFENSIVE import: spec_hash recomputed, re-validated against the +# CURRENT registry, manual-version + contract-hash drift fail loudly, and +# the new task input is validated against the captured task_input_schema +# (template reuse). Drift never silently replays a stale plan — it falls +# back to authoring fresh, with the rejection shown. +_PLAN_STORE = os.path.join(os.getcwd(), "ca_plan_store") + + +def _store_plan(key: str, record: FrozenWorkflowRecord) -> str: + os.makedirs(_PLAN_STORE, exist_ok=True) + path = os.path.join(_PLAN_STORE, f"{key}.json") + with open(path, "w") as f: + json.dump(export_plan(record), f, indent=1) + return path + + +def _load_stored_plan(key: str, registry, task): + """Returns (spec, None) on a valid import, (None, reason) on a rejected + or unreadable envelope, (None, None) when nothing is stored.""" + path = os.path.join(_PLAN_STORE, f"{key}.json") + if not os.path.exists(path): + return None, None + try: + with open(path) as f: + envelope = json.load(f) + return import_plan(envelope, registry, task_input=task), None + except PlanImportError as e: + return None, str(e)[:300] + except Exception as e: # unreadable/corrupt file + return None, f"{type(e).__name__}: {e}" + + def _msg(text: str) -> Event: return Event( content=types.Content(role="model", parts=[types.Part(text=text)]) @@ -1076,20 +1117,39 @@ async def plan_and_run(ctx: Context, node_input): f" ({', '.join(TABLES)}){task_note}." ) - # 1. LOAD-OR-AUTHOR (per-scenario frozen key: each shape replays - # independently — re-send the same prompt to replay without the model). + # 1. LOAD-OR-AUTHOR. Reuse order: this session's state -> the + # CROSS-SESSION plan store (defensive import) -> author fresh. 
+ spec, source = None, None existing = ctx.state.get(state_key) if existing: spec = WorkflowSpec.model_validate(existing) + source = "session state" + else: + spec, reject = _load_stored_plan(key, reg, task) + if spec is not None: + source = "plan store (CROSS-SESSION import)" + ctx.state[state_key] = spec.model_dump() # cache for this session + elif reject: + yield _msg( + f"🛑 **Plan-store import rejected** for `{key}` — {reject}\n" + "Drift never silently replays a stale plan; re-authoring fresh." + ) + if spec is not None: spec_hash = _hash(spec) reused = True fresh_input = task != sc["task"] yield _msg( - f"♻️ **Reusing frozen plan** for `{key}` — hash `{spec_hash}`. The" - " model is NOT re-invoked; the exact prior plan is replayed" + f"♻️ **Reusing frozen plan** for `{key}` from {source} — hash" + f" `{spec_hash}`. The model is NOT re-invoked" + + ( + "; the import recomputed the hash, re-validated against the" + " current registry, and checked contract-hash drift" + if "CROSS-SESSION" in (source or "") + else "" + ) + ( - " — with your NEW question as the task input (**template" - " reuse**: same plan, new data flowing through it)." + " — your NEW question is the task input (**template reuse**:" + " same plan, new data flowing through it)." if fresh_input else "." ) @@ -1124,12 +1184,26 @@ async def plan_and_run(ctx: Context, node_input): + (f"\n⚠️ {lints}" if lints else "") ) - # 3. FREEZE (per scenario). + # 3. FREEZE (per scenario) + EXPORT (cross-session). if not reused: ctx.state[state_key] = spec.model_dump() + record = FrozenWorkflowRecord.freeze( + spec, + planner_model=MODEL, + registry=reg, + created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + task_input=task, + # capture the input schema = TEMPLATE promotion: a new session may + # run this plan on a NEW question, validated against this schema. 
+ task_input_schema={"required": sorted(task)}, + ) + path = _store_plan(key, record) yield _msg( - f"🔒 **Frozen** under `{state_key}` — hash `{spec_hash}`. Re-send" - " this prompt: same plan, zero planner calls." + f"🔒 **Frozen** under `{state_key}` — hash `{spec_hash}`. 📦" + f" Exported the full record to `{os.path.relpath(path)}` —" + " **a NEW session will import and reuse this plan** (defensive" + " import: hash + registry + contract-hash checks, task input" + " validated against the captured schema)." ) # 4. EXECUTE on the real engine via the #92 supervisor. diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 350db68c1c8..948028730c5 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -516,6 +516,71 @@ def query(self, *a, **k): assert out["rows"] == [] and "400" in out["error"] # honest failure +def _freeze_record(key): + return demo.FrozenWorkflowRecord.freeze( + _expected_spec(key), + planner_model="gemini-3.5-flash", + registry=demo._registry(), + created_at="2026-06-10T00:00:00Z", + task_input=demo.SCENARIOS[key]["task"], + task_input_schema={"required": sorted(demo.SCENARIOS[key]["task"])}, + ) + + +def test_cross_session_store_roundtrip_and_template_reuse( + tmp_path, monkeypatch +): + # Session A freezes + exports; "session B" (no session state) imports the + # plan through the defensive path — including with a NEW question, which + # is template reuse validated against the captured task_input_schema. 
+ monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) + demo._store_plan("sequence", _freeze_record("sequence")) + # same canned input -> replay path + spec, reject = demo._load_stored_plan( + "sequence", demo._registry(), demo.SCENARIOS["sequence"]["task"] + ) + assert reject is None and spec is not None + # NEW question -> template path (schema validates the input) + spec, reject = demo._load_stored_plan( + "sequence", + demo._registry(), + {"question": "revenue by category last year?"}, + ) + assert reject is None and spec is not None + assert spec.model_dump() == _expected_spec("sequence").model_dump() + # nothing stored for another key + assert demo._load_stored_plan( + "fanout", demo._registry(), demo.SCENARIOS["fanout"]["task"] + ) == (None, None) + + +def test_cross_session_import_rejects_tamper_and_drift(tmp_path, monkeypatch): + monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) + path = demo._store_plan("fanout", _freeze_record("fanout")) + # tampered spec -> hash mismatch, rejected with a reason + env = json.load(open(path)) + env["spec"]["goal"] = "exfiltrate" + json.dump(env, open(path, "w")) + spec, reject = demo._load_stored_plan( + "fanout", demo._registry(), demo.SCENARIOS["fanout"]["task"] + ) + assert spec is None and "spec_hash mismatch" in reject + # contract drift: same plan, but a capability's schema changed since + demo._store_plan("fanout", _freeze_record("fanout")) + + from pydantic import BaseModel + + class NewReport(BaseModel): + n: int + + drifted = demo._registry() + drifted["profile_table"].output_model = NewReport # version not bumped + spec, reject = demo._load_stored_plan( + "fanout", drifted, demo.SCENARIOS["fanout"]["task"] + ) + assert spec is None and "contract drift" in reject + + def test_chart_multiseries_per_region_per_year(): # The shape the user's real question produces: GROUP BY region, year with # two measures. 
x = the time field, one SERIES per region, measure picked From a203834ad75fc00078f2947131d36907bd23de50 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 11:30:04 -0700 Subject: [PATCH 44/64] =?UTF-8?q?fix(ca-demo):=20review=20findings=20?= =?UTF-8?q?=E2=80=94=20isort=20order,=20question=20preserved=20through=20r?= =?UTF-8?q?epair=20loop,=20declared-output=20contracts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Blocker: isort import order in agent.py (pre-commit now clean locally). 2. High: the repair-round claim is now real — Sql echoes the question (schema field + instruction) and _bq_dry_run preserves it through every branch, so after a FAILED real dry run the loop-carried value holds question + sql + error, not sql + error. Pinned with a fake-client failure test asserting the question survives. 3. Medium: deterministic capabilities (dry_run, run_query, render_chart, profile_table, quality_report, describe_schema, keep_verified, flaky) now declare output models, so their contract hashes cover real output schemas; plan-store wording softened to declared-contract drift. 4. Medium: PR body refreshed (real BigQuery, plan store, counts). 30 -> 31 CA tests, 85 total. --- .../authored_workflow_ca_demo/README.md | 4 +- .../bq_ca_planner/agent.py | 77 ++++++++++++++++++- .../test_ca_demo_agent.py | 24 ++++++ 3 files changed, 99 insertions(+), 6 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 115b3c7c675..4dbcecfa30c 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -58,7 +58,7 @@ What to point at as each one streams: - **🗂️ scenario banner** — the expected shape, named before the model authors it. 
- **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. -- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. **Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + contract-hash drift fail loudly (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. +- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. **Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + declared-contract drift (input kind + declared output schema — the CA capabilities declare output models so the hash has teeth) fail loudly (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. - **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). 
Same plan, new data — the RFC's replay-vs-template distinction, live. - **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. - **honest failure handling** — a query that still fails after repair returns empty rows + the real error (`engine: bigquery`); the mock warehouse is used ONLY when credentials are absent, never to paper over a failing query. @@ -73,7 +73,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 30 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 31 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 4cbb28cafd6..3497e4ae306 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -75,8 +75,8 @@ from authoring import export_plan # noqa: E402 from authoring import FrozenWorkflowRecord # noqa: E402 from authoring import import_plan # noqa: E402 -from authoring import PlanImportError # noqa: E402 from authoring import independence_facts # noqa: E402 +from authoring import PlanImportError # noqa: E402 from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 from authoring import WorkflowSpec # noqa: E402 @@ 
-286,10 +286,15 @@ def _jsonify_cell(v): def _bq_dry_run(value) -> dict: sql = _qualify_sql(_sql_of(value)) + # Preserve the user's question through the dry run: after a FAILURE this + # output becomes the loop-carried value, and the repair round needs full + # context (question + sql + error), not just sql + error. + question = str(_field_of(value, "question", "") or "") client = _bq_client() if client is None: return { "sql": sql, + "question": question, "valid": "select" in sql.lower(), "error": None, "engine": "mock", @@ -303,6 +308,7 @@ def _bq_dry_run(value) -> dict: ) return { "sql": sql, + "question": question, "valid": True, "error": None, "engine": "bigquery", @@ -311,6 +317,7 @@ def _bq_dry_run(value) -> dict: except Exception as e: # the REAL BigQuery error feeds the repair story return { "sql": sql, + "question": question, "valid": False, "error": str(e)[:500], "engine": "bigquery", @@ -377,6 +384,56 @@ def _execute_sql(value) -> dict: # ------------------------------------------------- typed outputs (LLM caps) class Sql(BaseModel): sql: str + # Echoed by the SQL-drafting capabilities so the loop-carried value still + # holds the user's question after a FAILED dry run — the repair round + # repairs with full context (question + sql + real error), not sql+error. 
+ question: str = "" + + +class DryRunResult(BaseModel): + sql: str + valid: bool + error: str | None = None + engine: str = "mock" + question: str = "" + bytes_processed: int = 0 + + +class QueryResult(BaseModel): + rows: list[dict] + engine: str = "mock" + bytes_processed: int = 0 + error: str | None = None + + +class ChartArtifact(BaseModel): + chart_type: str + x_field: str + y_field: str + series_field: str | None = None + ascii: str + vega_lite: dict + + +class TableProfile(BaseModel): + table: str + row_count: int + null_pct: float + + +class QualityReport(BaseModel): + tables: int + worst_table: str + max_null_pct: float + + +class SchemaAnswer(BaseModel): + answer: str + + +class VerifiedInsights(BaseModel): + verified: list[str] + rejected: list[str] class Insight(BaseModel): @@ -650,7 +707,8 @@ def _registry() -> CapabilityRegistry: "Translate the question in the input JSON to one BigQuery" " StandardSQL SELECT over the public dataset" " bigquery-public-data.thelook_ecommerce (use fully-qualified" - f" table names): {schema_blurb}. Output Sql.", + f" table names): {schema_blurb}. Output Sql, echoing the" + " question field.", ), ), Capability( @@ -665,7 +723,8 @@ def _registry() -> CapabilityRegistry: " from a failed dry run. Draft (or repair, using the error)" " one BigQuery StandardSQL SELECT over the public dataset" " bigquery-public-data.thelook_ecommerce (fully-qualified" - f" table names): {schema_blurb}. Output Sql.", + f" table names): {schema_blurb}. 
Output Sql, echoing the" + " question field.", ), ), Capability( @@ -711,12 +770,14 @@ def _registry() -> CapabilityRegistry: Capability( name="dry_run", input_kind="item", + output_model=DryRunResult, serialize_input=False, build=_stub("dry_run", _bq_dry_run), ), Capability( name="flaky_dry_run", input_kind="item", + output_model=DryRunResult, serialize_input=False, build=_stub("flaky_dry_run", lambda s: _flaky_dry_run(s)), ), @@ -734,11 +795,13 @@ def _registry() -> CapabilityRegistry: Capability( name="run_query", input_kind="item", + output_model=QueryResult, serialize_input=False, build=_stub("run_query", _execute_sql), ), Capability( name="profile_table", + output_model=TableProfile, input_kind="item", serialize_input=False, max_fan_out=20, @@ -751,6 +814,7 @@ def _registry() -> CapabilityRegistry: ), Capability( name="quality_report", + output_model=QualityReport, input_kind="list", serialize_input=False, build=_stub( @@ -766,6 +830,7 @@ def _registry() -> CapabilityRegistry: ), Capability( name="describe_schema", + output_model=SchemaAnswer, input_kind="item", serialize_input=False, build=_stub( @@ -776,11 +841,13 @@ def _registry() -> CapabilityRegistry: Capability( name="render_chart", input_kind="item", + output_model=ChartArtifact, serialize_input=False, build=_stub("render_chart", _render_chart), ), Capability( name="keep_verified", + output_model=VerifiedInsights, input_kind="list", serialize_input=False, build=_stub( @@ -1060,7 +1127,9 @@ def _planner_instruction(sc) -> str: # is exported as a portable envelope to disk (a stand-in for the # ArtifactService in production — RFC Q1). 
A NEW session imports it through # the RFC's DEFENSIVE import: spec_hash recomputed, re-validated against the -# CURRENT registry, manual-version + contract-hash drift fail loudly, and +# CURRENT registry, manual-version + DECLARED-contract drift (input kind + +# declared output schema; capabilities without a declared output model rely +# on manual versions) fail loudly, and # the new task input is validated against the captured task_input_schema # (template reuse). Drift never silently replays a stale plan — it falls # back to authoring fresh, with the rejection shown. diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 948028730c5..85bd3dc9b0a 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -581,6 +581,30 @@ class NewReport(BaseModel): assert spec is None and "contract drift" in reject +def test_dry_run_preserves_question_for_repair_rounds(monkeypatch): + # Review finding: after a FAILED dry run, the loop-carried value must + # still hold the user's question — otherwise the repair round repairs + # from sql+error with no goal context. Mock branch: + monkeypatch.setitem(demo._BQ, "disabled", True) + out = demo._bq_dry_run({"sql": "SELECT 1", "question": "trend by year?"}) + assert out["question"] == "trend by year?" + + # Real-branch FAILURE (the path that feeds the repair round): + class _Boom: + + def query(self, *a, **k): + raise RuntimeError("400 TIMESTAMP_SUB does not support YEAR") + + monkeypatch.setitem(demo._BQ, "disabled", False) + monkeypatch.setitem(demo._BQ, "error", None) + monkeypatch.setitem(demo._BQ, "client", _Boom()) + out = demo._bq_dry_run({"sql": "SELECT broken", "question": "trend?"}) + assert out["valid"] is False and "TIMESTAMP_SUB" in out["error"] + assert out["question"] == "trend?" 
# full repair context preserved + # and the Sql schema itself carries the echo field: + assert "question" in demo.Sql.model_fields + + def test_chart_multiseries_per_region_per_year(): # The shape the user's real question produces: GROUP BY region, year with # two measures. x = the time field, one SERIES per region, measure picked From 298f9263325499b237fe3296d8d034a1de89187d Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 13:13:33 -0700 Subject: [PATCH 45/64] =?UTF-8?q?docs(ca-demo):=20precise=20contract-hash?= =?UTF-8?q?=20scope=20=E2=80=94=20primitive=20helpers=20rely=20on=20manual?= =?UTF-8?q?=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review precision item: sql_ok / pair_charts / judge_chart / single_chart return bare bool/str/list values (wrapping them in declared models would change runtime shapes their consumers depend on), so their contract hashes cover input_kind only. README and PR wording now scope the claim to the typed object-output capabilities; test-count wording fixed (31 collected = 30 CI-safe + 1 live-gated). --- .../samples/workflows/authored_workflow_ca_demo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 4dbcecfa30c..cc127a09e3f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -58,7 +58,7 @@ What to point at as each one streams: - **🗂️ scenario banner** — the expected shape, named before the model authors it. - **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. - **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. 
-- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. **Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + declared-contract drift (input kind + declared output schema — the CA capabilities declare output models so the hash has teeth) fail loudly (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. +- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. **Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + declared-contract drift fail loudly (input kind + declared output schema; the typed object-output capabilities declare output models so the hash has teeth — primitive helpers like `sql_ok`/`judge_chart` return bare bool/str/list values and rely on manual versions) (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. - **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. 
- **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. - **honest failure handling** — a query that still fails after repair returns empty rows + the real error (`engine: bigquery`); the mock warehouse is used ONLY when credentials are absent, never to paper over a failing query. From 7272008ed1e73d5b7d36460606953938fccaeef5 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 16:25:42 -0700 Subject: [PATCH 46/64] =?UTF-8?q?feat(ca-demo):=20conversational=20intent?= =?UTF-8?q?=20gate=20=E2=80=94=20meta/chat=20turns=20answer=20directly,=20?= =?UTF-8?q?no=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live finding: 'tell what kinds of workflow you can issue?' fell through the trigger router to the data scenario; the frozen plan replayed and NL2SQL turned a question about the AGENT into a query about order statuses. The root agent now has a front door — the RFC's no-plan escape hatch (DESIGN §12) implemented: untriggered messages are intent-classified (data | meta | chat). Meta/chat turns get a direct conversational reply (a catalogue of the seven workflow shapes, built from SCENARIOS so it never drifts) at the cost of one intent call — no planner, no queries. Data questions and trigger prompts proceed unchanged. Gate instruction and catalogue are template-safe (brace-free), pinned by test alongside the routing split (triggered bypasses gate; the exact live meta-question gates). 31 -> 32 CA tests, 86 total. 
--- .../authored_workflow_ca_demo/README.md | 12 ++- .../bq_ca_planner/agent.py | 73 ++++++++++++++++++- .../test_ca_demo_agent.py | 27 +++++++ 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index cc127a09e3f..cda648af18c 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -40,6 +40,16 @@ export SPIKE_GEMINI_MODEL=gemini-3.5-flash adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 ``` +**Talk to it first** — the agent has a conversational gate (the RFC's +"no-plan escape hatch"): untriggered messages are intent-classified, and +meta/chit-chat turns get a direct answer instead of a workflow. Try: + +```text +What kinds of workflow can you issue? +``` + +→ a plain-language catalogue of the seven shapes with example prompts — `0 planner calls, 0 queries`. Data questions proceed to the machinery below. + Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: @@ -73,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 31 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 32 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 3497e4ae306..20530fa9227 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -449,6 +449,13 @@ class Verdict(BaseModel): refuted: bool +class Intent(BaseModel): + """The conversational gate's verdict for untriggered messages.""" + + intent: Literal["data", "meta", "chat"] + reply: str = "" + + def _obj_of(v): """Accept a dict, a JSON-encoded dict/list string, or return None.""" if isinstance(v, (dict, list)): @@ -1093,6 +1100,56 @@ def _task_for(key: str, text: str) -> dict: return task +def _matched_scenario(text: str): + """The scenario whose trigger the message hits, or None (gate decides).""" + t = (text or "").lower() + for key, sc in SCENARIOS.items(): + if key == "sequence": + continue + if any(trigger in t for trigger in sc["triggers"]): + return key + return None + + +def _describe_workflows() -> str: + """A brace-free catalogue of the workflow kinds, built from SCENARIOS so + it never drifts from the actual demo.""" + lines = [] + for sc in SCENARIOS.values(): + shape = sc["shape"].replace("{", "(").replace("}", ")") + lines.append(f"* {sc['title']} — shape: {shape}") + return "\n".join(lines) + + +def _intent_agent() -> Agent: + # The conversational gate: small questions should not pay orchestration + # overhead (the RFC's no-plan escape hatch). 
NOTE: instruction must stay + # brace-free (ADK templates curly identifiers as state injection). + return Agent( + name="intent_gate", + model=MODEL, + output_schema=Intent, + generate_content_config=DET, + instruction=( + "You are the front door of a BigQuery Conversational Analytics" + " demo agent. It answers questions over the public" + " bigquery-public-data.thelook_ecommerce dataset (orders," + " order_items, products, users) by AUTHORING typed workflows:\n" + + _describe_workflows() + + "\nClassify the user's message. If it is a question answerable" + " from the e-commerce data (metrics, trends, segments, SQL-able" + " asks), output intent='data' with an empty reply. If it asks" + " what you can do, which workflows you can issue, how to use" + " you, or about your design, output intent='meta' and write a" + " genuinely helpful reply: list the workflow kinds above, one" + " example prompt each, and mention that plans are validated," + " frozen, replayable across sessions, and run on real BigQuery." + " Otherwise output intent='chat' with a brief friendly reply" + " that points at what you can do. Reply in plain markdown." + ), + ) + + def _scenario_for(text: str) -> str: """Specialized scenarios win over the generic ask-a-question fallback. @@ -1174,7 +1231,21 @@ def _hash(spec: WorkflowSpec) -> str: async def plan_and_run(ctx: Context, node_input): reg = _registry() text = _text_of(node_input) - key = _scenario_for(text) + key = _matched_scenario(text) + if key is None: + # Conversational gate: only untriggered messages pay this one call — + # meta/chat turns get a direct answer and never issue a workflow. 
+ raw = await ctx.run_node(_intent_agent(), node_input=text, run_id="intent") + verdict = Intent.model_validate(raw) + if verdict.intent != "data": + yield _msg(verdict.reply or "Ask me a question about the data!") + yield _msg( + "💬 _Conversational turn — no workflow issued (1 intent call," + " 0 planner calls, 0 queries)._" + ) + yield Event(output={"scenario": "conversation", "intent": verdict.intent}) + return + key = "sequence" sc = SCENARIOS[key] task = _task_for(key, text) state_key = f"authored_workflow:ca:{key}" diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 85bd3dc9b0a..a3de1b85eca 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -787,6 +787,33 @@ def test_registry_clean_and_typed(): assert reg.open_map_warnings() == [] # enumerated fields only +def test_conversational_gate_routing(): + # Triggered messages bypass the gate (mode selectors stay deterministic); + # untriggered messages go through the intent gate first — including the + # live failure this pins: a meta-question must NOT become a data query. 
+ assert demo._matched_scenario("profile data quality please") == "fanout" + assert demo._matched_scenario("pick the best chart for revenue") == ( + "tournament" + ) + assert demo._matched_scenario("What was revenue by region?") is None + assert ( + demo._matched_scenario("tell what kinds of workflow you can issue?") + is None + ) + # the catalogue lists all seven scenarios and is template-safe + cat = demo._describe_workflows() + for sc in demo.SCENARIOS.values(): + assert sc["title"] in cat + import re as _re + + assert not _re.findall(r"\{[A-Za-z_][A-Za-z0-9_]*\}", cat) + assert not _re.findall( + r"\{[A-Za-z_][A-Za-z0-9_]*\}", demo._intent_agent().instruction + ) + # the gate's schema: data -> proceed; meta/chat -> direct reply + assert set(demo.Intent.model_fields) == {"intent", "reply"} + + def test_scenario_routing(): assert demo._scenario_for("What was revenue by region?") == "sequence" assert demo._scenario_for("Profile data quality please") == "fanout" From a3ad5dd339cd32b324cfa85fac50a13d5302331c Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Wed, 10 Jun 2026 16:42:17 -0700 Subject: [PATCH 47/64] test(ca-demo): no-LLM end-to-end escape-path test; sync counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review items: (1) pin the FULL no-workflow escape path — with a stubbed intent gate returning meta, the root agent's run yields the gate reply and the conversation output and returns BEFORE plan-store import, session replay, authoring, or execution (empty plan store asserted untouched; no authored/reused/validation beats in the transcript). (2) PR body counts synced (33 collected = 32 CI-safe + 1 live-gated; 88 total collected). 
--- .../authored_workflow_ca_demo/README.md | 2 +- .../test_ca_demo_agent.py | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index cda648af18c..ee7edfb8150 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 32 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 33 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index a3de1b85eca..1e28280cf0f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -814,6 +814,51 @@ def test_conversational_gate_routing(): assert set(demo.Intent.model_fields) == {"intent", "reply"} +@pytest.mark.asyncio +async def test_meta_question_escapes_without_any_workflow( + tmp_path, monkeypatch +): + """The full no-workflow escape path, no LLM: a meta-question returns the + gate's reply BEFORE plan-store import, session replay, authoring, or + execution — the exact live failure mode, locked end-to-end.""" + + def stub_gate(): + @node(name="intent_gate") + async def n(ctx, node_input): + yield Event(output={"intent": "meta", "reply": "Seven workflow shapes."}) + + return n + + monkeypatch.setattr(demo, "_intent_agent", 
stub_gate) + stub_reg = _stub_registry() # build BEFORE patching (it reads _registry) + monkeypatch.setattr(demo, "_registry", lambda: stub_reg) + monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) # empty store + ss = InMemorySessionService() + session = await ss.create_session(app_name="demo", user_id="u") + runner = Runner(app_name="demo", node=demo.root_agent, session_service=ss) + texts, final = [], None + async for ev in runner.run_async( + user_id="u", + session_id=session.id, + new_message=types.Content( + parts=[types.Part(text="tell what kinds of workflow you can issue?")], + role="user", + ), + ): + if isinstance(ev, Event) and ev.content and ev.content.parts: + texts += [p.text for p in ev.content.parts if p.text] + if isinstance(ev, Event) and isinstance(ev.output, dict): + final = ev.output + joined = "\n".join(texts) + assert "Seven workflow shapes." in joined # the gate's reply reached chat + assert final == {"scenario": "conversation", "intent": "meta"} + # and NOTHING workflow-shaped happened: + assert "hash" not in (final or {}) + for marker in ("Authored plan", "Reusing frozen plan", "Validation passed"): + assert marker not in joined + assert not list(tmp_path.iterdir()) # nothing was frozen/exported + + def test_scenario_routing(): assert demo._scenario_for("What was revenue by region?") == "sequence" assert demo._scenario_for("Profile data quality please") == "fanout" From 039c8b991a00297b47950ddac9be4a041ccbaf8b Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Thu, 11 Jun 2026 07:30:38 -0700 Subject: [PATCH 48/64] =?UTF-8?q?feat(ca-demo):=20audit=20scenario=20takes?= =?UTF-8?q?=20LIVE=20insights=20=E2=80=94=20inline,=20last-generated,=20th?= =?UTF-8?q?en=20canned?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live finding: 'audit this insight ' selected the adversarial mode but audited the CANNED demo insights — the user's claim was discarded (mode selectors kept canned task inputs). 
The audit scenario now resolves its insights in order: (1) inlined in the message ('audit this insight: X', ';'/newline-separated lists, typo-tolerant filler), (2) the session's last generated insight ('audit that insight' after a question — each sequence result is remembered in state), (3) the canned demo set as final fallback. The banner states which source is being audited. Frozen plan unchanged — new insights are template reuse through the same fan_out(skeptic) plan. 33 -> 34 CA tests, 88 total. --- .../authored_workflow_ca_demo/README.md | 20 +++--- .../bq_ca_planner/agent.py | 65 ++++++++++++++++--- .../test_ca_demo_agent.py | 26 ++++++++ 3 files changed, 93 insertions(+), 18 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index ee7edfb8150..7b75952b3c8 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -53,15 +53,15 @@ What kinds of workflow can you issue? Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: -| # | Send this prompt | Shape authored | CA story | -| --- | ------------------------------------------------------------ | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... 
YEAR`) feeds the repair round | -| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | -| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | -| 6 | `Audit these insights — verify each one independently.` | adversarial verification | independent skeptics per insight; the $1M AOV claim gets refuted | -| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | +| # | Send this prompt | Shape authored | CA story | +| --- | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... 
YEAR`) feeds the repair round | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit this insight: ` (or just `audit that insight` after a question) | adversarial verification | **audits YOUR insights** — inlined in the message, or the session's last generated insight; the canned $1M-AOV set is only the final fallback | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | What to point at as each one streams: @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 33 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 34 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 20530fa9227..e6d344222d6 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -1088,15 +1088,47 @@ def _text_of(node_input) -> str: return str(node_input or "") -def _task_for(key: str, text: str) -> dict: - """The scenario's task input. The ask-a-question scenario takes the LIVE - user message as the question — so a re-send with a different question is - TEMPLATE REUSE: the frozen plan unchanged, new task input flowing through - it. 
Other scenarios keep their canned inputs (their prompts are mode - selectors, not questions).""" +def _extract_insights(text: str): + """Insights inlined in an audit ask ('audit this insight: X' / lists + split on ';' or newlines), or None when the message is trigger-only.""" + t = (text or "").strip() + tl = t.lower() + for trig in ("verify insights", "audit", "verify"): + i = tl.find(trig) + if i < 0: + continue + rest = t[i + len(trig) :] + rest = re.sub( + r"^[\s:,\-—]*((these|this|the|my)\s+)?(insights?|claims?|ingisht\w*)?[\s:,\-—]*", + "", + rest, + flags=re.I, + ) + rest = rest.strip().strip('"').strip() + if len(rest) >= 12: + parts = [s.strip() for s in re.split(r"[;\n]+", rest) if s.strip()] + return parts or None + return None + return None + + +def _task_for(key: str, text: str, last_insight: str | None = None) -> dict: + """The scenario's task input. LIVE inputs where they make sense: + + * sequence: the user's message IS the question; + * adversarial: insights inlined in the message are audited; with none + inlined, the session's LAST generated insight ('audit that'); only + then the canned demo set. 
+ Other scenarios keep canned inputs (their prompts are mode selectors).""" task = dict(SCENARIOS[key]["task"]) if key == "sequence" and text.strip(): task = {"question": text.strip()} + if key == "adversarial": + inline = _extract_insights(text) + if inline: + task = {"insights": inline} + elif last_insight: + task = {"insights": [last_insight]} return task @@ -1247,10 +1279,24 @@ async def plan_and_run(ctx: Context, node_input): return key = "sequence" sc = SCENARIOS[key] - task = _task_for(key, text) + task = _task_for( + key, + text, + last_insight=ctx.state.get("authored_workflow:ca:last_insight"), + ) state_key = f"authored_workflow:ca:{key}" - task_note = f' — question: "{task["question"]}"' if key == "sequence" else "" + if key == "sequence": + task_note = f' — question: "{task["question"]}"' + elif key == "adversarial": + src_note = ( + "canned demo set" + if task == sc["task"] + else "YOUR insights (live input)" + ) + task_note = f" — auditing {src_note}: {task['insights']}" + else: + task_note = "" yield _msg( f"🗂️ **Scenario: {sc['title']}** — expected shape `{sc['shape']}`," " over mock `thelook_ecommerce`" @@ -1395,6 +1441,9 @@ async def plan_and_run(ctx: Context, node_input): f" {elapsed:.1f}s + " + ("0 planner calls (frozen replay)." 
if reused else "1 planner call.") ) + if isinstance(result, dict) and isinstance(result.get("insight"), str): + # remembered so a later 'audit that insight' audits THIS, not canned data + ctx.state["authored_workflow:ca:last_insight"] = result["insight"] yield Event( output={ "scenario": key, diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 1e28280cf0f..7058c7d573a 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -787,6 +787,32 @@ def test_registry_clean_and_typed(): assert reg.open_map_warnings() == [] # enumerated fields only +def test_audit_takes_live_insights_not_canned(): + # The live failure this pins: 'audit this insight <X>' must audit X, not + # the canned demo set. (Typo'd filler like 'ingisht' is tolerated.) + claim = ( + "China and the United States lead global sales, with most markets" + " peaking in 2025" + ) + task = demo._task_for("adversarial", f"audit this ingisht {claim}") + assert task == {"insights": [claim]} + # multiple insights split on ';' + task = demo._task_for("adversarial", "verify insights: A is true; B is up") + assert task == {"insights": ["A is true", "B is up"]} + # trigger-only message + a remembered last insight -> audit THAT + task = demo._task_for( + "adversarial", "audit these insights", last_insight=claim + ) + assert task == {"insights": [claim]} + # trigger-only, nothing remembered -> canned demo set (final fallback) + task = demo._task_for("adversarial", "audit these insights") + assert task == demo.SCENARIOS["adversarial"]["task"] + # other scenarios unaffected + assert demo._task_for("fanout", f"audit {claim}") == ( + demo.SCENARIOS["fanout"]["task"] + ) + + def test_conversational_gate_routing(): # Triggered messages bypass the gate (mode selectors stay
deterministic); # untriggered messages go through the intent gate first — including the From 530b222f03f14ca3b1b175354e815489099d3fbc Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Thu, 11 Jun 2026 07:50:32 -0700 Subject: [PATCH 49/64] feat(ca-demo): skeptic verdicts show their reasoning in a rendered audit beat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live finding: the skeptic step was a raw event blob with a bare (insight, refuted) pair — no reasoning, easy to miss, and with one insight only one skeptic runs so there was 'nothing to see'. * Verdict gains a required-by-instruction reason field — the skeptic must show its work (what it checked, why the claim stands or falls, caveats like partial years). * New audit beat renders every verdict found in interpreter state as one line: REFUTED / upheld — insight — reason. * _verdict_of carries the reason (tolerant of JSON-string verdicts); insight extraction strips trailing punctuation from user phrasing. 34 -> 35 CA tests, 89 total. --- .../authored_workflow_ca_demo/README.md | 2 +- .../bq_ca_planner/agent.py | 36 +++++++++++++++++-- .../test_ca_demo_agent.py | 31 ++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 7b75952b3c8..330b591ad18 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 34 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 35 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index e6d344222d6..6f1bad36ca8 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -447,6 +447,7 @@ class Category(BaseModel): class Verdict(BaseModel): insight: str refuted: bool + reason: str = "" # the skeptic must SHOW ITS WORK — one-sentence judgment class Intent(BaseModel): @@ -491,8 +492,28 @@ def _verdict_of(v) -> dict: return { "insight": str(obj["insight"]), "refuted": bool(obj.get("refuted")), + "reason": str(obj.get("reason", "") or ""), } - return {"insight": str(v), "refuted": False} + return {"insight": str(v), "refuted": False, "reason": ""} + + +def _verdict_lines(state: dict): + """Render every skeptic verdict found in interpreter state — one line per + insight, with the skeptic's stated reason — or [] when no audit ran.""" + lines = [] + for value in state.values(): + if not (isinstance(value, list) and value): + continue + verdicts = [_verdict_of(item) for item in value] + if not all( + _obj_of(item) and "refuted" in (_obj_of(item) or {}) for item in value + ): + continue + for v in verdicts: + mark = "❌ REFUTED" if v["refuted"] else "✅ upheld" + reason = f" — {v['reason']}" if v["reason"] else "" + lines.append(f"{mark} — \"{v['insight']}\"{reason}") + return lines _VEGA_MARK = {"bar": "bar", "line": "line", "scatter": "point", "pie": "arc"} @@ -770,7 +791,9 @@ 
def _registry() -> CapabilityRegistry: "You are an adversarial data reviewer. Input: one insight" " about an e-commerce dataset (avg order ~ $60-90, 100k" " users). Try to REFUTE it; refuted=true if implausible." - " Echo the insight. Output Verdict.", + " Echo the insight, and ALWAYS give reason: one sentence" + " explaining what you checked and why it stands or falls" + " (note caveats like partial years). Output Verdict.", ), ), # ---- deterministic mocks (no BigQuery needed) ---- @@ -1104,7 +1127,7 @@ def _extract_insights(text: str): rest, flags=re.I, ) - rest = rest.strip().strip('"').strip() + rest = rest.strip().strip('"').rstrip("?!.").strip() if len(rest) >= 12: parts = [s.strip() for s in re.split(r"[;\n]+", rest) if s.strip()] return parts or None @@ -1397,6 +1420,13 @@ async def plan_and_run(ctx: Context, node_input): interp = SpecInterpreter(reg, ctx) result = await interp.execute(spec, task) elapsed = time.perf_counter() - t0 + verdict_lines = _verdict_lines(interp.state) + if verdict_lines: + rendered = "\n".join(f" - {line}" for line in verdict_lines) + yield _msg( + "🕵️ **Skeptic verdicts** (one independent skeptic per insight —" + f" provably isolated from whatever produced it):\n{rendered}" + ) for chart in ( v for v in interp.state.values() diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 7058c7d573a..894bf9e5446 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -416,6 +416,7 @@ def test_stubs_tolerate_authored_binding_shapes(): assert demo._verdict_of(json.dumps({"insight": "x", "refuted": True})) == { "insight": "x", "refuted": True, + "reason": "", } assert demo._verdict_of("just text")["refuted"] is False demo._FLAKY_CALLS["n"] = 1 # next call is even -> passes @@ -813,6 +814,36 @@ def 
test_audit_takes_live_insights_not_canned(): ) +def test_skeptic_verdicts_render_with_reasons(): + # The audit beat the user could not see: every verdict in interpreter + # state renders as one line WITH the skeptic's stated reason. + state = { + "sqlgen": {"sql": "..."}, # non-verdict values are ignored + "verdicts": [ + { + "insight": "AOV is $1,000,000.", + "refuted": True, + "reason": "Implausible: dataset AOV is roughly $60-90.", + }, + json.dumps({ + "insight": "Sales peaked in 2025.", + "refuted": False, + "reason": "Consistent with the yearly totals; 2026 is partial.", + }), + ], + } + lines = demo._verdict_lines(state) + assert len(lines) == 2 + assert lines[0].startswith("❌ REFUTED") and "$60-90" in lines[0] + assert lines[1].startswith("✅ upheld") and "2026 is partial" in lines[1] + assert demo._verdict_lines({"x": {"rows": []}}) == [] # no audit -> no beat + # the Verdict schema demands the reasoning field + assert "reason" in demo.Verdict.model_fields + # extraction drops the trailing '?' user phrasing drags in + task = demo._task_for("adversarial", "audit this insight sales doubled YoY?") + assert task == {"insights": ["sales doubled YoY"]} + + def test_conversational_gate_routing(): # Triggered messages bypass the gate (mode selectors stay deterministic); # untriggered messages go through the intent gate first — including the From c496b1ba88380ac79f459bee10cb4cdef181ce72 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Thu, 11 Jun 2026 07:54:17 -0700 Subject: [PATCH 50/64] =?UTF-8?q?test(ca-demo):=20runtime=20isolation=20pr?= =?UTF-8?q?oof=20=E2=80=94=20skeptics=20see=20ONLY=20their=20own=20insight?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User question: does the skeptic fan-out really dispatch isolated agents, or do they read the conversation history? Settled empirically: a spy on the model layer (before_model_callback short-circuits, no network) captures each fanned-out skeptic's actual LLM request. 
Asserts: one real dispatch per insight; each request contains exactly its own insight — not the sibling's insight, not a planted prior chat beat, not the user's turn message. ADK's workflow wrapper runs plain agents in single_turn mode with per-dispatch isolation scoping, so the model call gets node_input only. This is the runtime half of the independence story (the binding lints are the static half). 35 -> 36 CA tests, 90 total. --- .../authored_workflow_ca_demo/README.md | 2 +- .../test_ca_demo_agent.py | 92 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 330b591ad18..494ede072c5 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 35 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 36 (one live-gated: CA_DEMO_LIVE_BQ=1) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 894bf9e5446..dda47c8e459 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -814,6 +814,98 @@ def test_audit_takes_live_insights_not_canned(): ) +@pytest.mark.asyncio +async def test_skeptics_are_runtime_isolated_not_reading_history(): + """Empirical isolation proof (no network): a spy on the model layer 
+ captures each fanned-out skeptic's ACTUAL LLM request. Each request must + contain exactly its own insight — not the sibling's insight, not prior + chat beats, not the user's turn message. This is the runtime half of the + independence story; the binding lints are the static half.""" + from google.adk import Agent + from google.adk.models.llm_response import LlmResponse + + captured = [] + + def spy(callback_context=None, llm_request=None, **kw): + captured.append( + " ".join( + p.text + for c in llm_request.contents or [] + for p in c.parts or [] + if p.text + ) + ) + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text=json.dumps( + {"insight": "echo", "refuted": False, "reason": "spy"} + ) + ) + ], + ) + ) + + def skeptic_build(): + return Agent( + name="skeptic", + model="gemini-2.5-flash", # never called — spy short-circuits + output_schema=demo.Verdict, + instruction="Refute or uphold. Output Verdict.", + before_model_callback=spy, + ) + + reg = CapabilityRegistry([ + Capability( + name="skeptic", + input_kind="item", + serialize_input=True, + build=skeptic_build, + ), + demo._registry()._by_name["keep_verified"], + ]) + spec = _expected_spec("adversarial") + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + yield Event( + content=types.Content( + role="model", + parts=[types.Part(text="SECRET-PRIOR-BEAT plan authored")], + ) + ) + holder["out"] = await SpecInterpreter(reg, ctx).execute( + spec, + {"insights": ["INSIGHT-ALPHA is true", "INSIGHT-BETA is false"]}, + ) + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name="t", node=wf, session_service=ss) + s = await ss.create_session(app_name="t", user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content( + parts=[types.Part(text="USER-TURN-MESSAGE audit stuff")], + role="user", + ), + ): + pass + + 
assert len(captured) == 2 # one REAL dispatch per insight + assert "INSIGHT-ALPHA" in captured[0] and "INSIGHT-BETA" in captured[1] + for request_text in captured: + assert "SECRET-PRIOR-BEAT" not in request_text # no chat history + assert "USER-TURN-MESSAGE" not in request_text # no user turn + assert "INSIGHT-BETA" not in captured[0] # no sibling leakage + assert "INSIGHT-ALPHA" not in captured[1] + + def test_skeptic_verdicts_render_with_reasons(): # The audit beat the user could not see: every verdict in interpreter # state renders as one line WITH the skeptic's stated reason. From 82c824433e8eea6041d2d77a54ed417c9e9dcce1 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Thu, 11 Jun 2026 23:41:40 -0700 Subject: [PATCH 51/64] feat(ca-demo)+fix(workflow): data-grounded skeptics + per-dispatch isolation (tool-loop fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data-grounded skeptic (a real BigQuery verification tool on the adversarial reviewer) exposed a REAL isolation bug, fixed at the source: * fix(src/workflow): prepare_llm_agent_context dropped the dispatch's isolation_scope — Context only inherits it via parent_ctx, which is not passed — so the single_turn input event was appended UNSCOPED. A fanned-out agent making MULTIPLE model calls (tool loop) rebuilds its contents per call and picked up the LATEST sibling's input instead of its own: parallel tool-using skeptics all answered the last claim (single-call agents were correct only by timing). The agent context now carries the scope; verified offline (spy-driven simulated tool loop) and live (per-claim verdicts with real queried evidence). * spike(#92 supervisor): every dispatch now runs in its own sub-branch and its own isolation scope (parent_scope::run_id) — per-dispatch context independence is structural for multi-call children. 
* feat(ca-demo): the skeptic is DATA-GROUNDED — output_schema + a real query_thelook tool (capped, engine-honest); instruction demands queried evidence in the verdict reason; capability version bumped to 2 so stored plans drift-reject instead of silently reusing the plausibility-only skeptic. Live verification: the $1M-AOV claim is REFUTED with the actual computed ~$86 AOV; the Shipped-status claim upheld with real counts. * tests: tool-loop sibling-isolation regression (deterministic spy FC, no network; gated on the patched wrapper so stale local installs skip); single-call isolation assertions made version-tolerant; grounded-skeptic config + tool pins. CA suite 38 collected; full suite 92 under the patched tree, 91+skip under a stale ADK. --- .../authored_workflow_ca_demo/README.md | 20 +-- .../bq_ca_planner/agent.py | 60 ++++++-- .../test_ca_demo_agent.py | 141 +++++++++++++++++- .../dynamic_supervisor_spike/supervisor.py | 22 ++- 4 files changed, 215 insertions(+), 28 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 494ede072c5..bf6a5164592 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -53,15 +53,15 @@ What kinds of workflow can you issue? 
Open the UI, pick `bq_ca_planner`, and send the prompts below — **one scenario per prompt**, each authoring a different coordination shape: -| # | Send this prompt | Shape authored | CA story | -| --- | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... YEAR`) feeds the repair round | -| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | -| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | -| 6 | `Audit this insight: ` (or just `audit that insight` after a question) | adversarial verification | **audits YOUR insights** — inlined in the message, or the session's last generated insight; the canned $1M-AOV set is only the final fallback | -| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | +| # | Send this prompt | Shape authored | CA story | +| --- | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... YEAR`) feeds the repair round | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 6 | `Audit this insight: ` (or just `audit that insight` after a question) | adversarial verification | **audits YOUR insights with DATA-GROUNDED skeptics** — each runs real BigQuery checks via its `query_thelook` tool and cites the numbers (the $1M-AOV claim is refuted with the actual ~$86 AOV); insights from your message, the session's last insight, or the canned fallback | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | What to point at as each one streams: @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. 
Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 36 (one live-gated: CA_DEMO_LIVE_BQ=1) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 38 (one live-gated; one gated on the patched ADK wrapper) ``` All seven expected shapes are built by hand, validated + lint-checked against diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 6f1bad36ca8..9c0645f7050 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -354,6 +354,20 @@ def _execute_sql(value) -> dict: return {"rows": _query_engine(sql), "engine": "mock"} +def query_thelook(sql: str) -> dict: + """Run ONE read-only StandardSQL SELECT against the public dataset + bigquery-public-data.thelook_ecommerce to check a claim. Use small + aggregate queries (GROUP BY / COUNT / SUM); results are capped. Returns + rows, the executing engine, and the real error when the SQL is invalid. + """ + out = _execute_sql({"sql": sql}) + return { + "rows": out.get("rows", [])[:50], + "engine": out.get("engine"), + "error": out.get("error"), + } + + _CANNED_PROFILES = { "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, "order_items": { @@ -785,15 +799,30 @@ def _registry() -> CapabilityRegistry: input_kind="item", output_model=Verdict, serialize_input=True, - build=_llm( - "skeptic", - Verdict, - "You are an adversarial data reviewer. Input: one insight" - " about an e-commerce dataset (avg order ~ $60-90, 100k" - " users). Try to REFUTE it; refuted=true if implausible." - " Echo the insight, and ALWAYS give reason: one sentence" - " explaining what you checked and why it stands or falls" - " (note caveats like partial years). 
Output Verdict.", + # v2: the skeptic became DATA-GROUNDED (a real query tool) — a + # semantic contract change, so stored plans drift-reject and + # re-author rather than silently reusing the plausibility-only + # skeptic. ADK supports output_schema + tools together: tools in + # the thought loop, structure enforced on the final output. + version="2", + build=lambda: Agent( + name="skeptic", + model=MODEL, + output_schema=Verdict, + generate_content_config=DET, + tools=[query_thelook], + instruction=( + "You are an adversarial DATA reviewer with a real" + " BigQuery tool. Input: one insight/claim about the" + " public dataset bigquery-public-data.thelook_ecommerce" + f" ({schema_blurb}). Do NOT judge from priors: VERIFY the" + " claim by running 1-3 small aggregate SELECTs with the" + " query_thelook tool and compare the actual numbers to" + " the claim. Then output Verdict: echo the claim as" + " insight; refuted=true only if the data contradicts it;" + " reason = one sentence citing the numbers you queried" + " (note caveats like partial years)." 
+ ), ), ), # ---- deterministic mocks (no BigQuery needed) ---- @@ -949,12 +978,13 @@ def _flaky_dry_run(s): " draft_or_repair_sql (item: a question plus optional prior sql and error" " -> Sql), summarize_insight (item: rows or stats JSON -> Insight with" " field insight), classify_question (item: a question -> Category with" - " field category equal to 'data' or 'schema'), skeptic (item: one insight" - " -> Verdict with fields insight and refuted), dry_run (item: Sql -> object" - " with sql, valid, error), flaky_dry_run (same as dry_run but may fail" - " transiently), sql_ok (item: dry-run output -> bool), run_query (item:" - " validated sql -> object with rows), profile_table (item: a table name ->" - " stats object), quality_report (LIST of stats -> report object)," + " field category equal to 'data' or 'schema'), skeptic (item: one —" + " data-grounded: it runs real verification queries via its query_thelook" + " tool; insight -> Verdict with fields insight and refuted), dry_run (item:" + " Sql -> object with sql, valid, error), flaky_dry_run (same as dry_run but" + " may fail transiently), sql_ok (item: dry-run output -> bool), run_query" + " (item: validated sql -> object with rows), profile_table (item: a table" + " name -> stats object), quality_report (LIST of stats -> report object)," " describe_schema (item: a question -> object with answer), keep_verified" " (LIST of Verdicts -> object with verified and rejected), render_chart" " (item: query output with rows, or a chart-type winner -> a chart artifact" diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index dda47c8e459..b00e3e8ef0f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -901,9 +901,148 @@ async def parent(ctx, node_input): assert 
"INSIGHT-ALPHA" in captured[0] and "INSIGHT-BETA" in captured[1] for request_text in captured: assert "SECRET-PRIOR-BEAT" not in request_text # no chat history - assert "USER-TURN-MESSAGE" not in request_text # no user turn assert "INSIGHT-BETA" not in captured[0] # no sibling leakage assert "INSIGHT-ALPHA" not in captured[1] + # NOTE: the session's user-turn message MAY appear (ADK includes unscoped + # events by design); the isolation guarantees are: own input present, + # sibling inputs and other beats never. + + +def _wrapper_carries_isolation_scope() -> bool: + import inspect + + from google.adk.workflow import _llm_agent_wrapper as wrapper + + return "agent_ctx.isolation_scope" in inspect.getsource( + wrapper.prepare_llm_agent_context + ) + + +@pytest.mark.skipif( + not _wrapper_carries_isolation_scope(), + reason=( + "installed ADK lacks the isolation_scope carry fix in" + " prepare_llm_agent_context (run with PYTHONPATH=src)" + ), +) +@pytest.mark.asyncio +async def test_tool_loop_siblings_do_not_swap_inputs(): + """Regression for the contamination the data-grounded skeptic exposed: + a fanned-out single_turn agent that makes MULTIPLE model calls (tool + loop) must rebuild its context from ITS OWN input on every call. Before + the fix (supervisor per-dispatch scope + wrapper scope carry), call 2 + rebuilt from the LATEST sibling's input — every skeptic answered the + last claim. 
The spy simulates the tool loop deterministically.""" + from google.adk import Agent + from google.adk.models.llm_response import LlmResponse + + captured = [] + + def make_spy(): + state = {"n": 0} + + def spy(callback_context=None, llm_request=None, **kw): + state["n"] += 1 + texts = " ".join( + p.text + for c in llm_request.contents or [] + for p in c.parts or [] + if p.text + ) + captured.append((id(state), state["n"], texts)) + if state["n"] == 1: + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + function_call=types.FunctionCall( + name="query_thelook", args={"sql": "SELECT 1"} + ) + ) + ], + ) + ) + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text=json.dumps( + {"insight": "z", "refuted": False, "reason": "spy"} + ) + ) + ], + ) + ) + + return spy + + def skeptic_build(): + return Agent( + name="skeptic", + model="gemini-2.5-flash", # never called — spy short-circuits + output_schema=demo.Verdict, + tools=[demo.query_thelook], + instruction="Check the claim. 
Output Verdict.", + before_model_callback=make_spy(), + ) + + reg = CapabilityRegistry([ + Capability( + name="skeptic", + input_kind="item", + serialize_input=True, + build=skeptic_build, + ), + demo._registry()._by_name["keep_verified"], + ]) + spec = _expected_spec("adversarial") + monkeypatch_bq = demo._BQ["disabled"] + demo._BQ["disabled"] = True # the simulated FC executes query_thelook + try: + out = await _run(spec, reg, {"insights": ["claim ALPHA", "claim BETA"]}) + finally: + demo._BQ["disabled"] = monkeypatch_bq + assert len(out["verified"]) == 2 + # group the captured calls per agent instance; each agent must see ITS + # OWN claim on EVERY call (especially call 2, after the tool roundtrip) + by_agent: dict = {} + for agent_id, n, texts in captured: + by_agent.setdefault(agent_id, []).append(texts) + assert len(by_agent) == 2 + claims = [] + for calls in by_agent.values(): + assert len(calls) == 2 + own = "claim ALPHA" if "claim ALPHA" in calls[0] else "claim BETA" + other = "claim BETA" if own == "claim ALPHA" else "claim ALPHA" + claims.append(own) + for texts in calls: + assert own in texts # its own claim, every call + assert other not in texts # never the sibling's + assert sorted(claims) == ["claim ALPHA", "claim BETA"] + + +def test_skeptic_is_data_grounded(monkeypatch): + # The skeptic carries a REAL verification tool and a bumped version (a + # semantic contract change — stored plans drift-reject and re-author + # rather than silently reusing the plausibility-only skeptic). 
+ cap = demo._registry()["skeptic"] + assert cap.version == "2" + agent_obj = cap.build() + tool_names = [ + getattr(t, "__name__", getattr(t, "name", "")) for t in agent_obj.tools + ] + assert "query_thelook" in tool_names + assert agent_obj.output_schema is demo.Verdict # tools + schema together + assert "VERIFY the claim" in agent_obj.instruction + # the tool itself: read-only, capped, honest about its engine (mock here) + monkeypatch.setitem(demo._BQ, "disabled", True) + out = demo.query_thelook( + "SELECT region, SUM(x) AS revenue ... GROUP BY region INTERVAL 1 YEAR" + ) + assert out["engine"] == "mock" and len(out["rows"]) <= 50 + assert out["rows"][0]["region"] == "US-West" def test_skeptic_verdicts_render_with_reasons(): diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py index 51a03d6ea05..0c899b01202 100644 --- a/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py +++ b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py @@ -86,13 +86,31 @@ def __init__(self, ctx, *, gate: int | None = None) -> None: async def dispatch( self, child, *, node_input: Any = None, run_id: str | None = None ) -> Any: - """One leaf dispatch. The gate is held ONLY for the child execution.""" + """One leaf dispatch. The gate is held ONLY for the child execution. + + Each dispatch runs in its OWN sub-branch AND its own isolation scope + (parent_scope::run_id). Parallel siblings share an author name, and a + single_turn LLM child making MULTIPLE model calls (tool loops) rebuilds + its context per call by scanning for the latest input event in its + isolation scope — with the parent's shared scope, sibling inputs landing + in between get picked up instead (observed: fanned-out tool-using + skeptics all answering the LAST sibling's claim). 
The wrapper stamps the + child's input event and its FC/FR trail with ctx.isolation_scope, so a + per-dispatch scope makes context independence structural for multi-call + children rather than an artifact of single-call timing. + """ async with self.gate: self._in_flight += 1 self.peak_in_flight = max(self.peak_in_flight, self._in_flight) + parent_scope = getattr(self.ctx, "isolation_scope", None) + scope = f"{parent_scope}::{run_id}" if run_id else parent_scope try: return await self.ctx.run_node( - child, node_input=node_input, run_id=run_id + child, + node_input=node_input, + run_id=run_id, + use_sub_branch=True, + override_isolation_scope=scope, ) finally: self._in_flight -= 1 From c4943c537f4067913ed7b511dcd570b897dfb919 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:05:54 -0700 Subject: [PATCH 52/64] =?UTF-8?q?demo(ca):=20plan=5Finspector.py=20?= =?UTF-8?q?=E2=80=94=20render=20the=20frozen-plan=20store=20as=20an=20anno?= =?UTF-8?q?tated=20HTML=20page?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reads every FrozenWorkflowRecord envelope in ca_plan_store/ and writes a self-contained plan_inspector.html: the plan's dataflow as a diagram and every envelope field annotated with the guarantee it delivers — spec_hash (tamper evidence), planner_model/created_at (authoring provenance), registry + capability versions and derived contract hashes (drift detection), task_input_schema/digest (cross-session template reuse), validation (recorded, never trusted). The on-camera artifact for the "plans as durable, auditable data" story. 
--- .../plan_inspector.py | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py new file mode 100644 index 00000000000..2c9549c21ec --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -0,0 +1,281 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Frozen Plan Inspector — renders the plan store as a self-contained HTML page. + +Reads every ``FrozenWorkflowRecord`` envelope in ``ca_plan_store/`` and writes +``ca_plan_store/plan_inspector.html``: the plan's dataflow as a diagram, and +every envelope field annotated with the guarantee it delivers (auditability, +tamper evidence, version/contract drift detection, cross-session template +reuse). 
Run from the repo root after a demo session has frozen some plans: + + python contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py + open ca_plan_store/plan_inspector.html +""" + +from __future__ import annotations + +import html +import json +import os +import sys + +STORE = os.path.join(os.getcwd(), "ca_plan_store") + +CSS = """ +:root { --ink:#1a1c1e; --mut:#5f6368; --line:#dadce0; --blue:#1a73e8; + --green:#188038; --amber:#b06000; --purple:#7627bb; --red:#c5221f; + --bg:#f8f9fa; --card:#ffffff; } +* { box-sizing: border-box; } +body { font: 14px/1.55 -apple-system, 'Segoe UI', Roboto, sans-serif; + color: var(--ink); background: var(--bg); margin: 0; padding: 32px; } +h1 { font-size: 24px; margin: 0 0 4px; } +h2 { font-size: 18px; margin: 36px 0 10px; } +.sub { color: var(--mut); margin-bottom: 24px; } +.benefits { display: flex; gap: 12px; flex-wrap: wrap; margin: 18px 0 8px; } +.benefit { flex: 1 1 220px; background: var(--card); border: 1px solid var(--line); + border-radius: 10px; padding: 14px 16px; } +.benefit b { display: block; margin-bottom: 4px; } +.b-audit b { color: var(--blue); } .b-ver b { color: var(--purple); } +.b-cons b { color: var(--green); } .b-safe b { color: var(--amber); } +.card { background: var(--card); border: 1px solid var(--line); border-radius: 12px; + padding: 20px 22px; margin: 14px 0; } +.tag { display: inline-block; font-size: 11px; font-weight: 600; border-radius: 99px; + padding: 1px 9px; margin-left: 8px; vertical-align: 2px; } +.t-audit { background:#e8f0fe; color: var(--blue); } +.t-ver { background:#f3e8fd; color: var(--purple); } +.t-cons { background:#e6f4ea; color: var(--green); } +.t-safe { background:#fef7e0; color: var(--amber); } +.kv { margin: 6px 0; padding: 8px 10px; border-left: 3px solid var(--line); + background: var(--bg); border-radius: 0 6px 6px 0; } +.kv code { font: 12px/1.5 ui-monospace, Menlo, monospace; word-break: break-all; } +.kv .why { color: var(--mut); font-size: 12.5px; 
margin-top: 2px; } +.flow { display: flex; align-items: center; gap: 10px; flex-wrap: wrap; margin: 14px 0; } +.node { border: 1.5px solid var(--blue); border-radius: 8px; padding: 7px 12px; + background: #e8f0fe; font: 12px ui-monospace, Menlo, monospace; } +.node small { display: block; color: var(--mut); font-size: 10.5px; } +.loopbox { border: 1.5px dashed var(--purple); border-radius: 10px; padding: 10px; + display: flex; gap: 10px; align-items: center; } +.loopbox .lbl { color: var(--purple); font-size: 11px; font-weight: 700; } +.fanbox { border: 1.5px dashed var(--green); border-radius: 10px; padding: 10px; } +.fanbox .lbl { color: var(--green); font-size: 11px; font-weight: 700; } +.arrow { color: var(--mut); font-size: 18px; } +pre { background: #202124; color: #e8eaed; border-radius: 10px; padding: 14px; + overflow: auto; font: 11.5px/1.5 ui-monospace, Menlo, monospace; max-height: 340px; } +.story { border-left: 4px solid var(--green); background: #e6f4ea55; padding: 10px 14px; + border-radius: 0 8px 8px 0; margin: 10px 0; } +.bad { border-left-color: var(--red); background: #fce8e655; } +.bad b { color: var(--red); } +details summary { cursor: pointer; color: var(--blue); font-weight: 600; margin: 8px 0; } +""" + + +def _node(step) -> str: + kind = step.get("kind") + if kind == "step": + binding = step.get("input", {}) + src = ( + "task input" + if binding.get("source") == "task" + else f"← {binding.get('step')}" + ) + return ( + f'
{html.escape(step["id"])}' + f"{html.escape(step['capability'])} ·" + f" {html.escape(src)}
" + ) + if kind == "fan_out": + over = step.get("over", {}) + src = over.get("path") or over.get("step") or "task" + inner = ( + f'
{html.escape(step["id"])}' + f"{html.escape(step['capability'])} × each of" + f" {html.escape(str(src))}
" + ) + return ( + '
FAN-OUT (parallel,' + f" isolated){inner}
" + ) + if kind == "pipeline": + stages = " ".join( + f'
{html.escape(s["capability"])}
' + for s in step.get("stages", []) + ) + return ( + f'
PIPELINE (per item,' + f" barrier-free)
{stages}
" + ) + if kind == "loop_until": + body = " ".join( + _node(s) for s in step.get("body", []) + ) + return ( + f'
LOOP until' + f" {html.escape(step.get('until_capability', '?'))} (max" + f" {step.get('max_iters')}){body}
" + ) + if kind == "branch": + return f'
branch: {html.escape(step["id"])}
' + return f'
{html.escape(str(kind))}
' + + +def _flow(spec) -> str: + steps = " ".join( + _node(s) for s in spec.get("steps", []) + ) + return f'
{steps}
' + + +def _kv(label, value, why, tag, tag_label) -> str: + return ( + f'
{html.escape(label)}' + f'{tag_label}
' + f"{html.escape(value)}" + f'
{html.escape(why)}
' + ) + + +def _plan_card(name: str, env: dict) -> str: + spec = env.get("spec", {}) + caps = ", ".join(sorted(env.get("capability_versions", {}))) + ch = env.get("capability_contract_hashes", {}) + ch_short = {k: v[:12] for k, v in ch.items()} + parts = [ + f'

{html.escape(name)} — ' + f"“{html.escape(spec.get('goal', ''))}”

", + ( + "The plan, as data — every box is a pre-approved capability;" + " every arrow is a typed binding the validator checked:" + ), + _flow(spec), + _kv( + "spec_hash", + env.get("spec_hash", ""), + "Tamper evidence: every import recomputes sha256 over the spec and" + " rejects on mismatch. Change one character of the plan and it" + " will not load.", + "t-audit", + "AUDITABLE", + ), + _kv( + "planner_model · created_at", + f"{env.get('planner_model')} · {env.get('created_at')}", + "Provenance: which model authored this plan and when — the audit" + " trail starts at authoring, not at execution.", + "t-audit", + "AUDITABLE", + ), + _kv( + "registry_version · capability_versions", + f"registry v{env.get('registry_version')} · " + + json.dumps(env.get("capability_versions", {})), + "Versioning: the exact capability versions this plan was approved" + " against. The skeptic at v2 means a v1-era audit plan is REJECTED" + " on import and re-authored — semantics changed, so the plan must" + " too.", + "t-ver", + "VERSIONED", + ), + _kv( + "capability_contract_hashes", + json.dumps(ch_short), + "Drift detection without developer discipline: derived sha256 over" + " each capability's declared contract. A schema change nobody" + " version-bumped still refuses to load.", + "t-ver", + "VERSIONED", + ), + _kv( + "task_input_schema · task_input_digest", + f"{json.dumps(env.get('task_input_schema'))} · " + + str(env.get("task_input_digest", ""))[:16], + "Consistency across sessions: a NEW session imports this plan and" + " runs a NEW question through it (validated against the captured" + " schema) — same steps, same checks, same shape of answer. Zero" + " planner calls.", + "t-cons", + "CONSISTENT", + ), + _kv( + "validation", + json.dumps(env.get("validation", {})), + "Recorded but NEVER trusted: import re-validates against the" + " current registry. Lint waivers, if any, are recorded here too —" + " suppression is auditable.", + "t-safe", + "SAFE", + ), + ( + f"
Full envelope JSON ({caps})" + f"
{html.escape(json.dumps(env, indent=2))}
" + ), + "
", + ] + return "\n".join(parts) + + +def main() -> str: + envs = {} + for fn in sorted(os.listdir(STORE)): + if fn.endswith(".json"): + with open(os.path.join(STORE, fn)) as f: + envs[fn[:-5]] = json.load(f) + if not envs: + print("plan store is empty — run a demo session first", file=sys.stderr) + raise SystemExit(1) + + cards = "\n".join(_plan_card(k, v) for k, v in envs.items()) + page = f""" +Frozen Plan Inspector — RFC #93 +

Frozen Plan Inspector

+
The live contents of ca_plan_store/ — each file is a +FrozenWorkflowRecord: a model-authored workflow, frozen as a durable, portable artifact. +This page explains what each field buys you.
+ +
+
🔍 AuditableThe plan is data you can read, diff in a PR, +and hand to a reviewer — who authored it, when, with which model, and exactly what runs in +what order with what inputs. Turn-by-turn agent chatter leaves no such artifact.
+
🏷️ VersionedRegistry + per-capability versions and derived +contract hashes are sealed in. Capability changed since freezing? The plan refuses to load — +loudly — instead of silently running stale semantics.
+
♻️ ConsistentAuthoring is the only nondeterministic step, +and it happens once. Every session that imports this plan executes the same steps with the +same safety checks — only the data changes. Answers stop depending on the model's mood.
+
🛡️ SafeClosed capability vocabulary, typed bindings, +plan-quality lints, fail-closed defensive import. No model-written code is ever stored or +executed.
+
+ +
The consistency story in one line: Session A authored these plans +(1 planner call each). Session B — tomorrow, another user, another machine — imports them, +re-validates them, and runs new questions through them with 0 planner calls: the +same governed pipeline every time.
+
The tamper story in one line: edit one character of any +spec below and the next import fails with spec_hash mismatch; +change a capability's schema and it fails with contract drift. Drift never +replays silently.
+ +{cards} +""" + + out = os.path.join(STORE, "plan_inspector.html") + with open(out, "w") as f: + f.write(page) + print(out) + return out + + +if __name__ == "__main__": + main() From d58f4ef2f5e147dc6e1455a619f4af3cf461af6b Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:18:34 -0700 Subject: [PATCH 53/64] fix(ca-demo): scenario banner states the ACTUAL data backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The banner still said 'over mock thelook_ecommerce' — stale from before the real-BigQuery upgrade and misleading on camera. It now reports LIVE bigquery-public-data.thelook_ecommerce when credentials allow, or the mock warehouse fallback, matching the engine field on every result. --- .../authored_workflow_ca_demo/bq_ca_planner/agent.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 9c0645f7050..767ef747fbb 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -1350,9 +1350,14 @@ async def plan_and_run(ctx: Context, node_input): task_note = f" — auditing {src_note}: {task['insights']}" else: task_note = "" + data_note = ( + "LIVE `bigquery-public-data.thelook_ecommerce`" + if _bq_client() is not None + else "mock `thelook_ecommerce` warehouse (no BigQuery credentials)" + ) yield _msg( f"🗂️ **Scenario: {sc['title']}** — expected shape `{sc['shape']}`," - " over mock `thelook_ecommerce`" + f" over {data_note}" f" ({', '.join(TABLES)}){task_note}." 
) From 076ae11361277c650df59b68ebdf23990872d97a Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:23:27 -0700 Subject: [PATCH 54/64] feat(ca-demo): all 7 real dataset tables + REAL profiling from __TABLES__ metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live finding: the demo listed only 4 of thelook_ecommerce's 7 tables, and profile_table still returned CANNED values chosen to look plausible — exactly the kind of fabrication the engine-honesty rule exists to prevent. * TABLES now covers the full real dataset (adds events, inventory_items, distribution_centers) — also widens nl2sql coverage to event/inventory questions. * profile_table queries the real __TABLES__ metadata (row_count, size_mb; cheap metadata reads) with engine=bigquery, falling back to clearly labeled canned values without credentials. Live-verified: events 2.43M rows / 368MB, distribution_centers 10 rows. * TableProfile/QualityReport contracts updated (row_count + size_mb; report = totals + largest table) — a DELIBERATE contract change: stored fanout plans drift-reject and re-author rather than running stale semantics. Fanout e2e pins all seven tables via the mock engine. --- .../bq_ca_planner/agent.py | 96 +++++++++++++++---- .../test_ca_demo_agent.py | 15 ++- 2 files changed, 89 insertions(+), 22 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 767ef747fbb..540d8286b75 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -91,9 +91,18 @@ # are deterministic mocks so the demo needs no BigQuery project. 
TABLES = { "orders": "order_id, user_id, status, created_at, num_of_item", - "order_items": "id, order_id, product_id, sale_price, status", + "order_items": "id, order_id, product_id, sale_price, status, created_at", "products": "id, name, category, brand, retail_price, department", "users": "id, email, age, country, traffic_source, created_at", + "events": ( + "id, user_id, session_id, created_at, city, browser," + " traffic_source, uri, event_type" + ), + "inventory_items": ( + "id, product_id, created_at, sold_at, cost, product_category," + " product_brand, product_distribution_center_id" + ), + "distribution_centers": "id, name, latitude, longitude", } _CANNED_ROWS = [ @@ -368,17 +377,60 @@ def query_thelook(sql: str) -> dict: } +# Mock fallback profiles (used WITHOUT credentials; clearly labeled via the +# engine field — with credentials, profiling queries the real __TABLES__). _CANNED_PROFILES = { - "orders": {"table": "orders", "row_count": 125210, "null_pct": 0.2}, + "orders": {"table": "orders", "row_count": 125000, "size_mb": 11.0}, "order_items": { "table": "order_items", - "row_count": 181830, - "null_pct": 0.0, + "row_count": 182000, + "size_mb": 24.0, + }, + "products": {"table": "products", "row_count": 29120, "size_mb": 4.8}, + "users": {"table": "users", "row_count": 100000, "size_mb": 27.0}, + "events": {"table": "events", "row_count": 2400000, "size_mb": 740.0}, + "inventory_items": { + "table": "inventory_items", + "row_count": 490000, + "size_mb": 138.0, + }, + "distribution_centers": { + "table": "distribution_centers", + "row_count": 10, + "size_mb": 0.1, }, - "products": {"table": "products", "row_count": 29120, "null_pct": 3.4}, - "users": {"table": "users", "row_count": 100000, "null_pct": 7.9}, } + +def _profile_table(value) -> dict: + """REAL table profile from BigQuery __TABLES__ metadata (row count, size) + when credentials allow; the canned fallback otherwise — engine-labeled.""" + name = str(value).strip().strip("`'\"") + if 
_bq_client() is not None and re.fullmatch(r"[A-Za-z_][\w]*", name): + out = _execute_sql({ + "sql": ( + "SELECT table_id, row_count, size_bytes FROM" + " `bigquery-public-data.thelook_ecommerce.__TABLES__` WHERE" + f" table_id = '{name}'" + ) + }) + rows = out.get("rows") or [] + if rows: + return { + "table": rows[0]["table_id"], + "row_count": int(rows[0]["row_count"]), + "size_mb": round(float(rows[0]["size_bytes"]) / 1048576, 1), + "engine": "bigquery", + } + prof = dict( + _CANNED_PROFILES.get( + name, {"table": name, "row_count": 0, "size_mb": 0.0} + ) + ) + prof["engine"] = "mock" + return prof + + _SCHEMA_NOTES = { "default": ( "orders.status takes Complete / Shipped / Processing / Cancelled /" @@ -432,13 +484,15 @@ class ChartArtifact(BaseModel): class TableProfile(BaseModel): table: str row_count: int - null_pct: float + size_mb: float + engine: str = "mock" class QualityReport(BaseModel): tables: int - worst_table: str - max_null_pct: float + total_rows: int + largest_table: str + total_size_mb: float class SchemaAnswer(BaseModel): @@ -864,12 +918,7 @@ def _registry() -> CapabilityRegistry: input_kind="item", serialize_input=False, max_fan_out=20, - build=_stub( - "profile_table", - lambda t: _CANNED_PROFILES.get( - str(t), {"table": str(t), "row_count": 0, "null_pct": 0.0} - ), - ), + build=_stub("profile_table", _profile_table), ), Capability( name="quality_report", @@ -880,10 +929,19 @@ def _registry() -> CapabilityRegistry: "quality_report", lambda profiles: { "tables": len(profiles), - "worst_table": max(profiles, key=lambda p: p["null_pct"])[ - "table" - ], - "max_null_pct": max(p["null_pct"] for p in profiles), + "total_rows": sum( + int(p.get("row_count", 0)) for p in profiles + ), + "largest_table": ( + max(profiles, key=lambda p: p.get("row_count", 0))[ + "table" + ] + if profiles + else "" + ), + "total_size_mb": round( + sum(float(p.get("size_mb", 0)) for p in profiles), 1 + ), }, ), ), diff --git 
a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index b00e3e8ef0f..1c835a6316a 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -1192,14 +1192,23 @@ async def test_sequence_executes(monkeypatch): @pytest.mark.asyncio -async def test_fanout_executes_no_llm_needed(): - # profiling + report are deterministic mocks even in the LIVE registry. +async def test_fanout_executes_no_llm_needed(monkeypatch): + # All SEVEN real dataset tables, profiled in parallel. The mock engine is + # pinned here (no network in unit tests); with credentials the profiler + # queries the real __TABLES__ metadata and labels engine=bigquery. + monkeypatch.setitem(demo._BQ, "disabled", True) out = await _run( _expected_spec("fanout"), demo._registry(), demo.SCENARIOS["fanout"]["task"], ) - assert out == {"tables": 4, "worst_table": "users", "max_null_pct": 7.9} + profiles = demo._CANNED_PROFILES.values() + assert out == { + "tables": 7, + "total_rows": sum(p["row_count"] for p in profiles), + "largest_table": "events", + "total_size_mb": round(sum(p["size_mb"] for p in profiles), 1), + } @pytest.mark.asyncio From 04b626230074f22acadacb82b3165aed69b39d4c Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:32:29 -0700 Subject: [PATCH 55/64] =?UTF-8?q?feat(ca-demo):=20everything=20real=20?= =?UTF-8?q?=E2=80=94=20no=20simulated=20data=20anywhere=20in=20the=20live?= =?UTF-8?q?=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User ask: make the whole demo run on the real thelook_ecommerce dataset. 
Remaining fakery removed: * describe_schema v2: a data-grounded agent (query_thelook tool) that answers metadata questions by querying the REAL data (DISTINCT values, counts) — live-verified answer cites shipped_at/delivered_at semantics it discovered itself. The canned _SCHEMA_NOTES sentence is gone. * The repair scenario repairs a REALLY broken query: the task carries SQL referencing thelook_ecommerce.order (not orders); the REAL dry-run rejects it, the repair step fixes it from the actual BigQuery error, and run_query returns real rows (live-verified: China $364K...). The simulated flaky_dry_run capability and _FLAKY_CALLS are deleted — transient-failure simulation now lives only in the CI test stub. * render_chart: a bare tournament winner now charts REAL revenue-by-region rows fetched from BigQuery (canned rows remain only inside the no-credential mock engine). * Chart/loop/branch/tolerance tests pin the mock engine explicitly (no network in unit tests); the branch e2e stubs the now-LLM describe_schema. The only non-real component left BY DESIGN is the no-credential fallback warehouse, always labeled via the engine field. 38 CA tests collected; 91 green locally (92 under the patched ADK tree). 
--- .../authored_workflow_ca_demo/README.md | 11 +- .../bq_ca_planner/agent.py | 121 +++++++++--------- .../test_ca_demo_agent.py | 104 +++++++++++---- 3 files changed, 149 insertions(+), 87 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index bf6a5164592..47dde1d44d0 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -58,8 +58,8 @@ scenario per prompt**, each authoring a different coordination shape: | 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... YEAR`) feeds the repair round | | 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | | 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | -| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL entirely | -| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | draft → failed dry run → repair using the error | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL planning — answered by a data-grounded agent that queries the REAL data (DISTINCT values, counts) | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | a REALLY broken query (`thelook_ecommerce.order`) is checked by the REAL dry-run, repaired from the actual BigQuery error, then executed | | 6 | `Audit this insight: ` (or just 
`audit that insight` after a question) | adversarial verification | **audits YOUR insights with DATA-GROUNDED skeptics** — each runs real BigQuery checks via its `query_thelook` tool and cites the numbers (the $1M-AOV claim is refuted with the actual ~$86 AOV); insights from your message, the session's last insight, or the canned fallback | | 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | @@ -83,7 +83,7 @@ replayable — a turn-by-turn agent retry never is.* ## 2. Correctness proof (no LLM, no BigQuery) ```bash -pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 38 (one live-gated; one gated on the patched ADK wrapper) +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 38 collected (one live-gated; one gated on the patched ADK wrapper) ``` All seven expected shapes are built by hand, validated + lint-checked against @@ -100,8 +100,9 @@ against the **live** registry (their capabilities are deterministic mocks). free-decomposition evidence is the spike's demand gate and the main demo's free-authoring beat. The *variety* — seven shapes from one closed vocabulary — is the claim here. -- The `flaky_dry_run` failure is simulated (every odd call fails) so the - repair loop behaves identically on every run and in CI. +- Nothing in the live path is simulated anymore: the repair scenario + checks a really-broken query against the real dry-run; transient-failure + simulation now lives only in the CI test stubs. 
- Frozen plans are per-scenario (`authored_workflow:ca:`) in session state, AND exported per-scenario to `ca_plan_store/` for cross-session reuse (delete a file to force fresh authoring; the store is diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 540d8286b75..df5468ef0f5 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -431,19 +431,6 @@ def _profile_table(value) -> dict: return prof -_SCHEMA_NOTES = { - "default": ( - "orders.status takes Complete / Shipped / Processing / Cancelled /" - " Returned; 'Complete' means the order was delivered and the return" - " window has closed." - ) -} - -# Simulated transient dry-run failure (repair-loop scenario): every ODD call -# fails, so EVERY run of the loop shows exactly one repair iteration — -# deterministic on camera and in CI, and replay behaves identically. -_FLAKY_CALLS = {"n": 0} - _JUDGE_RANK = {"bar": 0, "line": 1, "scatter": 2, "pie": 3} @@ -626,7 +613,7 @@ def _render_chart(v) -> dict: (list with one chart-type string), or a bare chart-type string. Emits the Conversational-Analytics-style artifact: a Vega-Lite spec + a text preview the chat can render.""" - chart_type, rows, explicit = "bar", _CANNED_ROWS, False + chart_type, rows, explicit = "bar", None, False obj = _obj_of(v) if isinstance(obj, dict): rows = obj.get("rows", rows) @@ -650,6 +637,21 @@ def _datelike(r) -> bool: if not explicit and len(rows or []) >= 2 and all(map(_datelike, rows)): chart_type = "line" + if rows is None: + # No rows handed over (e.g. the tournament passes only the winning + # chart type): chart REAL revenue-by-region data, not canned values. 
+ rows = ( + _execute_sql({ + "sql": ( + "SELECT u.country AS region, SUM(oi.sale_price) AS revenue" + " FROM thelook_ecommerce.order_items AS oi JOIN" + " thelook_ecommerce.orders AS o ON oi.order_id = o.order_id" + " JOIN thelook_ecommerce.users AS u ON o.user_id = u.id" + " GROUP BY region ORDER BY revenue DESC LIMIT 8" + ) + }).get("rows") + or _CANNED_ROWS + ) first = rows[0] if rows and isinstance(rows[0], dict) else {} timeish = ("year", "quarter", "month", "week", "date", "day") str_fields = [k for k, v in first.items() if isinstance(v, str)] @@ -816,7 +818,9 @@ def _registry() -> CapabilityRegistry: "draft_or_repair_sql", Sql, "Input JSON has a question, and possibly a prior sql + error" - " from a failed dry run. Draft (or repair, using the error)" + " from a failed dry run. If there is an error, REPAIR the sql" + " using it; if the sql is valid (no error), return it" + " unchanged. Otherwise draft" " one BigQuery StandardSQL SELECT over the public dataset" " bigquery-public-data.thelook_ecommerce (fully-qualified" f" table names): {schema_blurb}. Output Sql, echoing the" @@ -887,13 +891,6 @@ def _registry() -> CapabilityRegistry: serialize_input=False, build=_stub("dry_run", _bq_dry_run), ), - Capability( - name="flaky_dry_run", - input_kind="item", - output_model=DryRunResult, - serialize_input=False, - build=_stub("flaky_dry_run", lambda s: _flaky_dry_run(s)), - ), Capability( name="sql_ok", input_kind="item", @@ -949,10 +946,25 @@ def _registry() -> CapabilityRegistry: name="describe_schema", output_model=SchemaAnswer, input_kind="item", - serialize_input=False, - build=_stub( - "describe_schema", - lambda q: {"answer": _SCHEMA_NOTES["default"]}, + serialize_input=True, + # v2: answers metadata questions from the REAL dataset (it queries + # DISTINCT values / counts) instead of a canned sentence. 
+ version="2", + build=lambda: Agent( + name="describe_schema", + model=MODEL, + output_schema=SchemaAnswer, + generate_content_config=DET, + tools=[query_thelook], + instruction=( + "Answer metadata/meaning questions about the public" + " dataset bigquery-public-data.thelook_ecommerce" + f" ({schema_blurb}). QUERY the real data with the" + " query_thelook tool (e.g. SELECT DISTINCT values, small" + " counts) rather than answering from priors. Output" + " SchemaAnswer: a concise answer grounded in the queried" + " values." + ), ), ), Capability( @@ -1010,23 +1022,6 @@ def _registry() -> CapabilityRegistry: ]) -def _flaky_dry_run(s): - _FLAKY_CALLS["n"] += 1 - if _FLAKY_CALLS["n"] % 2 == 1: # every odd call fails -> 1 repair per run - return { - "question": str(_field_of(s, "question", "") or ""), - "sql": _sql_of(s), - "valid": False, - "error": "Table not found: `thelook.order` (did you mean orders?)", - } - return { - "question": str(_field_of(s, "question", "") or ""), - "sql": _sql_of(s), - "valid": True, - "error": None, - } - - # ------------------------------------------------- scenarios _CAPS_BLURB = ( # NOTE: instruction strings must stay BRACE-FREE — ADK templates @@ -1039,8 +1034,8 @@ def _flaky_dry_run(s): " field category equal to 'data' or 'schema'), skeptic (item: one —" " data-grounded: it runs real verification queries via its query_thelook" " tool; insight -> Verdict with fields insight and refuted), dry_run (item:" - " Sql -> object with sql, valid, error), flaky_dry_run (same as dry_run but" - " may fail transiently), sql_ok (item: dry-run output -> bool), run_query" + " Sql or a task with sql -> object with sql, valid, error — the REAL" + " BigQuery dry-run), sql_ok (item: dry-run output -> bool), run_query" " (item: validated sql -> object with rows), profile_table (item: a table" " name -> stats object), quality_report (LIST of stats -> report object)," " describe_schema (item: a question -> object with answer), keep_verified" @@ -1130,19 
+1125,31 @@ def _scenario_defs(): ), ), "loop": dict( - title="SQL self-repair (loop_until + loop-carried state)", - shape="loop_until(init=task, body=[draft_or_repair, flaky_dry_run])", - triggers=("repair", "unreliable", "retry"), - task={"question": q_region}, + title="SQL self-repair from a REAL broken query (loop_until)", + shape="loop_until(REAL dry_run → repair) → run_query", + triggers=("repair", "unreliable", "retry", "broken"), + task={ + "question": q_region, + "sql": ( # 'order' instead of 'orders' -> a REAL not-found error + "SELECT u.country AS region, SUM(oi.sale_price) AS revenue" + " FROM thelook_ecommerce.order_items AS oi JOIN" + " thelook_ecommerce.order AS o ON oi.order_id =" + " o.order_id JOIN thelook_ecommerce.users AS u ON" + " o.user_id = u.id GROUP BY region ORDER BY revenue DESC" + ), + }, recipe=( - "Author ONE loop_until: init = Binding(source='task'); body =" - " [(a) a step running draft_or_repair_sql whose input is" - " Binding(source='step', step=) — it reads" - " the loop-carried value: the task on round 0, the failed" - " dry-run output (sql + error) afterwards; (b) a step running" - " flaky_dry_run on (a)]; until_capability = sql_ok with" - " until_input = Binding(source='step', step=);" - " max_iters = 3. Output = the loop." + "Author, in order: (1) ONE loop_until: init =" + " Binding(source='task'); body = [(a) a step running dry_run" + " whose input is Binding(source='step', step=) — round 0 checks the task's sql, later rounds check" + " the repaired sql; (b) a step running draft_or_repair_sql" + " on (a) — it reads question + sql + the REAL BigQuery error" + " and outputs a fixed Sql (if there is no error, return the" + " sql unchanged)]; until_capability = sql_ok with until_input" + " = Binding(source='step', step=); max_iters =" + " 3. (2) a step running run_query on the loop's output." + " Output = the run_query step." 
), ), "adversarial": dict( diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 1c835a6316a..8b57a25457b 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -62,6 +62,7 @@ "summarize_insight", "classify_question", "skeptic", + "describe_schema", # v2: data-grounded (query tool) — stubbed in tests ) @@ -134,6 +135,20 @@ def _stub_registry() -> CapabilityRegistry: lambda v: {"insight": str(v), "refuted": "1,000,000" in str(v)}, ), ), + Capability( + name="describe_schema", + input_kind="item", + serialize_input=False, + build=_stub( + "describe_schema", + lambda q: { + "answer": ( + "orders.status values include Complete, Shipped," + " Processing, Cancelled, Returned." + ) + }, + ), + ), ] passthrough = [ cap for name, cap in real._by_name.items() if name not in _LLM_CAPS @@ -294,7 +309,7 @@ def _expected_spec(key: str) -> WorkflowSpec: ) if key == "loop": return WorkflowSpec( - goal="sql self-repair", + goal="sql self-repair from a real broken query", steps=[ LoopUntil( kind="loop_until", @@ -303,23 +318,29 @@ def _expected_spec(key: str) -> WorkflowSpec: body=[ StepRef( kind="step", - id="draft", - capability="draft_or_repair_sql", + id="check", + capability="dry_run", input=Binding(source="step", step="repair"), ), StepRef( kind="step", - id="check", - capability="flaky_dry_run", - input=Binding(source="step", step="draft"), + id="fix", + capability="draft_or_repair_sql", + input=Binding(source="step", step="check"), ), ], until_capability="sql_ok", until_input=Binding(source="step", step="check"), max_iters=3, ), + StepRef( + kind="step", + id="rows", + capability="run_query", + input=Binding(source="step", step="repair"), + ), ], - output=Binding(source="step", step="repair"), + output=Binding(source="step", 
step="rows"), ) if key == "adversarial": return WorkflowSpec( @@ -402,11 +423,12 @@ async def parent(ctx, node_input): # ----------------------------------------------------- tests -def test_stubs_tolerate_authored_binding_shapes(): +def test_stubs_tolerate_authored_binding_shapes(monkeypatch): # The plan is MODEL-authored: a binding may hand a stub the whole step # output (dict), a dotted path into it (raw string), or a JSON-encoded # payload. The live error this pins: nl2sql -> dry_run with path='sql' # passed a raw SQL string and the stub assumed a dict. + monkeypatch.setitem(demo._BQ, "disabled", True) raw_sql = "SELECT region FROM order_items" assert demo._sql_of({"sql": raw_sql}) == raw_sql assert demo._sql_of(json.dumps({"sql": raw_sql})) == raw_sql @@ -419,12 +441,13 @@ def test_stubs_tolerate_authored_binding_shapes(): "reason": "", } assert demo._verdict_of("just text")["refuted"] is False - demo._FLAKY_CALLS["n"] = 1 # next call is even -> passes - out = demo._flaky_dry_run(raw_sql) # raw string input must not crash + # dry-run (mock branch) tolerates a raw SQL string input too + out = demo._bq_dry_run(raw_sql) assert out["valid"] is True and out["sql"] == raw_sql -def test_render_chart_accepts_authored_binding_shapes(): +def test_render_chart_accepts_authored_binding_shapes(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) # query output (dict with rows) -> bar over those rows region_rows = demo._query_engine( "SELECT region, SUM(x) AS revenue ... 
GROUP BY region INTERVAL 1 YEAR" @@ -446,14 +469,16 @@ def test_render_chart_accepts_authored_binding_shapes(): assert len(lines) == 4 and lines[0].count("█") > lines[-1].count("█") -def test_render_chart_derives_encoding_fields(): +def test_render_chart_derives_encoding_fields(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) ch = demo._render_chart({"rows": [{"category": "A", "count": 3}]}) assert ch["x_field"] == "category" and ch["y_field"] == "count" enc = ch["vega_lite"]["encoding"] assert enc["x"]["field"] == "category" and enc["y"]["field"] == "count" -def test_chart_png_renders_or_falls_back(): +def test_chart_png_renders_or_falls_back(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) ch = demo._render_chart({"rows": demo._CANNED_ROWS}) png = demo._chart_png(ch) if png is None: @@ -606,7 +631,8 @@ def query(self, *a, **k): assert "question" in demo.Sql.model_fields -def test_chart_multiseries_per_region_per_year(): +def test_chart_multiseries_per_region_per_year(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) # The shape the user's real question produces: GROUP BY region, year with # two measures. x = the time field, one SERIES per region, measure picked # by name preference (total_sales over total_orders); int year never @@ -734,7 +760,8 @@ def test_engine_yearly_and_quarterly_grains(): ) -def test_chart_infers_line_for_time_series(): +def test_chart_infers_line_for_time_series(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) rows = demo._query_engine( "SELECT month, SUM(x) AS sales ... 
GROUP BY month INTERVAL 1 YEAR" ) @@ -1233,16 +1260,42 @@ async def test_branch_routes_schema_question(): @pytest.mark.asyncio -async def test_loop_repairs_sql_exactly_once(): - demo._FLAKY_CALLS["n"] = 0 - out = await _run( - _expected_spec("loop"), - _stub_registry(), - demo.SCENARIOS["loop"]["task"], +async def test_loop_repairs_sql_exactly_once(monkeypatch): + # The demo uses the REAL dry-run on a really-broken query; in CI the + # failure is simulated by a stateful stub (fails the first check only). + monkeypatch.setitem(demo._BQ, "disabled", True) + calls = {"n": 0} + + def checking(s): + calls["n"] += 1 + if calls["n"] == 1: + return { + "question": str(demo._field_of(s, "question", "") or ""), + "sql": demo._sql_of(s), + "valid": False, + "error": "Not found: Table thelook_ecommerce.order", + } + return { + "question": str(demo._field_of(s, "question", "") or ""), + "sql": demo._sql_of(s), + "valid": True, + "error": None, + } + + reg = _stub_registry() + caps = [c for n, c in reg._by_name.items() if n != "dry_run"] + caps.append( + Capability( + name="dry_run", + input_kind="item", + serialize_input=False, + build=_stub("dry_run", checking), + ) ) - assert out["valid"] is True - # odd call fails, even call passes -> exactly one repair iteration. - assert demo._FLAKY_CALLS["n"] == 2 + reg = CapabilityRegistry(caps) + out = await _run(_expected_spec("loop"), reg, demo.SCENARIOS["loop"]["task"]) + assert calls["n"] == 2 # exactly one repair round + assert out["engine"] == "mock" and out["rows"] # query ran after repair @pytest.mark.asyncio @@ -1257,7 +1310,8 @@ async def test_adversarial_rejects_implausible_insight(): @pytest.mark.asyncio -async def test_tournament_picks_best_chart_no_llm_needed(): +async def test_tournament_picks_best_chart_no_llm_needed(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) # pairing + judging are deterministic mocks even in the LIVE registry. 
out = await _run( _expected_spec("tournament"), From 3786a51378747a76474f6cfbe61cec72ba3631c0 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:36:17 -0700 Subject: [PATCH 56/64] feat(ca-demo): profiling discovers the LIVE table list (all 8, incl. the stray) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The console's Agents Hub shows 8 tables; the demo's hand-written catalogue listed 7. The 8th is real: 'thelook_ecommerce-table', an empty stray placeholder (0 rows, dashed name) in the public dataset. The profiling fan-out now discovers the table list live from __TABLES__ (cached per process; curated-catalogue fallback without credentials), so it always matches what the console shows — and surfacing the empty stray is itself an honest data-quality finding. Profile-name validation now permits dashed table names. 39 CA tests collected. --- .../bq_ca_planner/agent.py | 30 ++++++++++++++++++- .../test_ca_demo_agent.py | 24 ++++++++++++--- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index df5468ef0f5..31194d4d679 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -402,11 +402,37 @@ def query_thelook(sql: str) -> dict: } +_TABLE_LIST_CACHE: dict = {} + + +def _live_table_list() -> list: + """The dataset's ACTUAL tables from __TABLES__ (cached per process), + falling back to the curated catalogue without credentials. Includes + whatever really exists — e.g. 
the empty stray 'thelook_ecommerce-table' + placeholder — so profiling never drifts from what the console shows.""" + if "tables" in _TABLE_LIST_CACHE: + return _TABLE_LIST_CACHE["tables"] + tables = list(TABLES) + if _bq_client() is not None: + out = _execute_sql({ + "sql": ( + "SELECT table_id FROM" + " `bigquery-public-data.thelook_ecommerce.__TABLES__` ORDER BY" + " table_id" + ) + }) + live = [r["table_id"] for r in out.get("rows") or []] + if live: + tables = live + _TABLE_LIST_CACHE["tables"] = tables + return tables + + def _profile_table(value) -> dict: """REAL table profile from BigQuery __TABLES__ metadata (row count, size) when credentials allow; the canned fallback otherwise — engine-labeled.""" name = str(value).strip().strip("`'\"") - if _bq_client() is not None and re.fullmatch(r"[A-Za-z_][\w]*", name): + if _bq_client() is not None and re.fullmatch(r"[A-Za-z_][\w-]*", name): out = _execute_sql({ "sql": ( "SELECT table_id, row_count, size_bytes FROM" @@ -1241,6 +1267,8 @@ def _task_for(key: str, text: str, last_insight: str | None = None) -> dict: task = dict(SCENARIOS[key]["task"]) if key == "sequence" and text.strip(): task = {"question": text.strip()} + if key == "fanout": + task = {"tables": _live_table_list()} # whatever REALLY exists if key == "adversarial": inline = _extract_insights(text) if inline: diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 8b57a25457b..f7e85bd8e09 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -794,13 +794,29 @@ class Wrapped: assert demo._text_of(w) == "last year please" +def test_fanout_profiles_the_live_table_list(monkeypatch): + # Profiling fans out over whatever REALLY exists in the dataset (live + # __TABLES__, cached); without credentials it falls back 
to the curated + # catalogue. The live list legitimately includes the empty stray + # 'thelook_ecommerce-table' the console shows. + monkeypatch.setitem(demo._BQ, "disabled", True) + monkeypatch.setattr(demo, "_TABLE_LIST_CACHE", {}) + assert demo._task_for("fanout", "profile data quality") == { + "tables": list(demo.TABLES) + } + # cache short-circuits repeated metadata queries + monkeypatch.setattr(demo, "_TABLE_LIST_CACHE", {"tables": ["a", "b"]}) + assert demo._task_for("fanout", "x") == {"tables": ["a", "b"]} + + def test_sequence_takes_live_question_others_stay_canned(): q = "What was revenue by region last year?" assert demo._task_for("sequence", q) == {"question": q} # empty/whitespace falls back to the canned question assert demo._task_for("sequence", " ") == demo.SCENARIOS["sequence"]["task"] - # mode-selector scenarios keep their canned inputs - assert demo._task_for("fanout", q) == demo.SCENARIOS["fanout"]["task"] + # mode-selector scenarios keep canned/derived inputs (fanout discovers + # the live table list — see test_fanout_profiles_the_live_table_list) + assert demo._task_for("pipeline", q) == demo.SCENARIOS["pipeline"]["task"] def test_root_agent_importable_and_named(): @@ -836,8 +852,8 @@ def test_audit_takes_live_insights_not_canned(): task = demo._task_for("adversarial", "audit these insights") assert task == demo.SCENARIOS["adversarial"]["task"] # other scenarios unaffected - assert demo._task_for("fanout", f"audit {claim}") == ( - demo.SCENARIOS["fanout"]["task"] + assert demo._task_for("pipeline", f"audit {claim}") == ( + demo.SCENARIOS["pipeline"]["task"] ) From c41e6df8c460f7b688f1e747463a4b8f21cb3620 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 00:41:28 -0700 Subject: [PATCH 57/64] fix(ca-demo): live table discovery excludes empty strays (row_count > 0) Matches the production CA agent's 7-table scope: the 0-row 'thelook_ecommerce-table' placeholder is excluded from profiling fan-outs. 
--- .../authored_workflow_ca_demo/bq_ca_planner/agent.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 31194d4d679..41884c5da77 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -406,10 +406,10 @@ def query_thelook(sql: str) -> dict: def _live_table_list() -> list: - """The dataset's ACTUAL tables from __TABLES__ (cached per process), - falling back to the curated catalogue without credentials. Includes - whatever really exists — e.g. the empty stray 'thelook_ecommerce-table' - placeholder — so profiling never drifts from what the console shows.""" + """The dataset's ACTUAL non-empty tables from __TABLES__ (cached per + process), falling back to the curated catalogue without credentials. + Empty strays (e.g. 
the 0-row 'thelook_ecommerce-table' placeholder) are + excluded — matching the production CA agent's 7-table scope.""" if "tables" in _TABLE_LIST_CACHE: return _TABLE_LIST_CACHE["tables"] tables = list(TABLES) @@ -417,8 +417,8 @@ def _live_table_list() -> list: out = _execute_sql({ "sql": ( "SELECT table_id FROM" - " `bigquery-public-data.thelook_ecommerce.__TABLES__` ORDER BY" - " table_id" + " `bigquery-public-data.thelook_ecommerce.__TABLES__` WHERE" + " row_count > 0 ORDER BY table_id" ) }) live = [r["table_id"] for r in out.get("rows") or []] From 3a4730a71b06cb6fed84ddc42880a1d7e3984135 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 01:03:38 -0700 Subject: [PATCH 58/64] feat(ca-demo): dashboard pipeline EXECUTES every panel (draft/dry-run/run/chart) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found via the production-CA head-to-head: the shipped pipeline scenario only translated+validated panels; the production agent executed them. Our vocabulary already expresses the full shape — pipeline stages draft_or_repair_sql -> REAL dry_run -> run_query -> render_chart, per panel, barrier-free. Live result: 3 real panels (12-row revenue line, top-5 categories bar, 60-row multi-series users-by-traffic-source line) in 10.0s / 12 dispatches against the real dataset. Panel questions updated to concrete 2025 asks; e2e pins 3 executed chart artifacts. 
--- .../bq_ca_planner/agent.py | 32 +++++++++++++------ .../test_ca_demo_agent.py | 19 +++++------ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 41884c5da77..6c9e8eaf805 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -1118,22 +1118,34 @@ def _scenario_defs(): ), ), "pipeline": dict( - title="Build a dashboard (pipeline)", - shape="pipeline(nl2sql → dry_run) → step", + title="Build a dashboard (pipeline — executes every panel)", + shape=( + "pipeline(draft → REAL dry_run → run_query → render_chart)" + " per panel, barrier-free" + ), triggers=("dashboard",), task={ "questions": [ - {"question": "Top 5 product categories by revenue?"}, - {"question": "Monthly active users by traffic source?"}, - {"question": "Return rate by department?"}, + {"question": "Monthly total revenue for 2025"}, + { + "question": ( + "Top 5 product categories by total revenue in 2025" + ) + }, + { + "question": ( + "New users per month in 2025 broken down by" + " traffic source" + ) + }, ] }, recipe=( - "Author: (1) a pipeline over task.questions with two stages," - " nl2sql then dry_run, so each dashboard question is" - " translated and validated per item; (2) a step running" - " summarize_insight on the pipeline output. Output = the" - " summarize step." + "Author: ONE pipeline over task.questions with FOUR stages —" + " draft_or_repair_sql, then dry_run, then run_query, then" + " render_chart — so each dashboard panel is translated," + " validated, EXECUTED, and charted per item, barrier-free." + " Output = the pipeline." 
), ), "branch": dict( diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index f7e85bd8e09..51feb0bec2f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -234,18 +234,14 @@ def _expected_spec(key: str) -> WorkflowSpec: id="panels", over=Binding(source="task", path="questions"), stages=[ - PipelineStage(capability="nl2sql"), + PipelineStage(capability="draft_or_repair_sql"), PipelineStage(capability="dry_run"), + PipelineStage(capability="run_query"), + PipelineStage(capability="render_chart"), ], ), - StepRef( - kind="step", - id="sum", - capability="summarize_insight", - input=Binding(source="step", step="panels"), - ), ], - output=Binding(source="step", step="sum"), + output=Binding(source="step", step="panels"), ) if key == "branch": return WorkflowSpec( @@ -1256,13 +1252,18 @@ async def test_fanout_executes_no_llm_needed(monkeypatch): @pytest.mark.asyncio async def test_pipeline_executes_per_question(monkeypatch): + # Every panel is translated, validated, EXECUTED, and charted — the shape + # the CA head-to-head proved out (3 real panels in ~10s, barrier-free). 
monkeypatch.setitem(demo._BQ, "disabled", True) out = await _run( _expected_spec("pipeline"), _stub_registry(), demo.SCENARIOS["pipeline"]["task"], ) - assert out == {"insight": "US-West leads revenue."} + assert len(out) == 3 + for panel in out: + assert "vega_lite" in panel and panel["chart_type"] in ("bar", "line") + assert panel["vega_lite"]["data"]["values"] # executed rows, per panel @pytest.mark.asyncio From 83a9636216f756010984f2ed44c27c81da234496 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 08:07:49 -0700 Subject: [PATCH 59/64] feat(ca-demo): SQL freezing (numeric determinism) + human-feedback revisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CA head-to-head showed our determinism was plan-level only — the drafting LLM still varied date windows run-to-run. SQL freezing extends the freeze one level deeper, and human feedback makes the artifact governable: * After a question's SQL passes the REAL dry-run, it is frozen to ca_plan_store/sql/.json (sql, sha256, engine, bytes, validated_at, revisions[]). Re-asking the exact question replays via a STATIC replay plan (dry_run -> run_query -> chart -> summarize; constant hash, no drafting step): the dry-run re-validates (warehouse-drift detection) and the numbers are deterministic. Live-verified: identical $644,971.22 across runs with the drafting LLM skipped. * 'revise: ' revises the frozen SQL for the session's last question (draft capability is now feedback-aware), MUST pass the real dry-run before replacing the artifact, and records the feedback + previous SQL in the revisions history — an auditable trail of who changed the query and why. Live-verified: 'exclude Cancelled/Returned' -> validated revision -> re-frozen -> executed (China $644K -> $480K). * Drift safety: a frozen SQL that no longer validates falls back to fresh authoring with the rejection shown. 
Tests: store roundtrip + normalized digests + revision history, static replay plan (constant hash, lint-clean, no drafting capability), e2e replay with 4 dispatches (no draft), revise-trigger routing. 95 total. --- .../authored_workflow_ca_demo/README.md | 18 ++ .../bq_ca_planner/agent.py | 258 +++++++++++++++++- .../test_ca_demo_agent.py | 77 ++++++ 3 files changed, 345 insertions(+), 8 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md index 47dde1d44d0..b7c2c341a3a 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/README.md +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -93,6 +93,24 @@ question away from SQL, the audit rejects the implausible insight, the tournament converges to `bar` and renders it as a Vega-Lite chart artifact. The fan-out and tournament scenarios execute against the **live** registry (their capabilities are deterministic mocks). +## SQL freezing + human-feedback revision + +Plan freezing pins the *process*; **SQL freezing pins the numbers**. After a +question's SQL passes the real dry-run, it's frozen to +`ca_plan_store/sql/.json`. Re-ask the exact question (any +session): the drafting LLM is **skipped**, the frozen SQL re-validates +(doubling as warehouse-drift detection) and replays — live-verified +identical results run-to-run. Then govern it with feedback: + +```text +revise: exclude orders with status Cancelled or Returned +``` + +→ the SQL is revised to follow the feedback, must pass the REAL dry-run +before it replaces the frozen artifact, and the feedback itself is recorded +in the artifact's `revisions` history — who changed the query and why, +auditable. A failed revision leaves the frozen SQL untouched. 
+ ## Notes - Honesty: like the security-audit demo, scenario recipes are diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 6c9e8eaf805..1cf8c20e9f7 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -70,6 +70,7 @@ "authored_workflow_spike", ), ) +from authoring import Binding # noqa: E402 from authoring import Capability # noqa: E402 from authoring import CapabilityRegistry # noqa: E402 from authoring import export_plan # noqa: E402 @@ -79,6 +80,7 @@ from authoring import PlanImportError # noqa: E402 from authoring import sha256_hex # noqa: E402 from authoring import SpecInterpreter # noqa: E402 +from authoring import StepRef # noqa: E402 from authoring import WorkflowSpec # noqa: E402 from authoring import WorkflowSpecValidator # noqa: E402 @@ -844,9 +846,11 @@ def _registry() -> CapabilityRegistry: "draft_or_repair_sql", Sql, "Input JSON has a question, and possibly a prior sql + error" - " from a failed dry run. If there is an error, REPAIR the sql" - " using it; if the sql is valid (no error), return it" - " unchanged. Otherwise draft" + " from a failed dry run, and possibly human feedback. If" + " there is feedback, REVISE the sql to follow it exactly. If" + " there is an error, REPAIR the sql using it; if the sql is" + " valid (no error, no feedback), return it unchanged." + " Otherwise draft" " one BigQuery StandardSQL SELECT over the public dataset" " bigquery-public-data.thelook_ecommerce (fully-qualified" f" table names): {schema_blurb}. Output Sql, echoing the" @@ -1190,6 +1194,22 @@ def _scenario_defs(): " Output = the run_query step." 
), ), + "revise": dict( + title="Revise the frozen SQL from human feedback", + shape=( + "feedback → draft_or_repair (REAL dry-run) → re-freeze → execute" + ), + triggers=( + "revise", + "update the sql", + "update the query", + "change the query", + "instead of", + "redefine", + ), + task={}, + recipe="", + ), "adversarial": dict( title="Audit insights (adversarial verification)", shape="fan_out(skeptic) → step(keep_verified)", @@ -1391,6 +1411,113 @@ def _store_plan(key: str, record: FrozenWorkflowRecord) -> str: return path +_SQL_STORE = os.path.join(_PLAN_STORE, "sql") + + +def _now_iso() -> str: + return datetime.datetime.now(datetime.timezone.utc).isoformat() + + +def _q_digest(question: str) -> str: + return sha256_hex(re.sub(r"\s+", " ", (question or "").strip().lower()))[:16] + + +def _load_frozen_sql(question: str): + """The dry-run-validated SQL frozen for this exact question, or None.""" + if not question: + return None + path = os.path.join(_SQL_STORE, f"{_q_digest(question)}.json") + if not os.path.exists(path): + return None + try: + with open(path) as f: + return json.load(f) + except Exception: + return None + + +def _freeze_sql( + question: str, + sql: str, + *, + engine: str = "bigquery", + bytes_processed: int = 0, + feedback: str | None = None, + previous: dict | None = None, +) -> str: + """Freeze (or revise) the validated SQL for a question. + + SQL freezing extends plan freezing one level deeper: the drafting LLM is + the remaining nondeterministic step in a frozen plan, so caching its + dry-run-validated output makes replays NUMERICALLY deterministic. A + human-feedback revision appends to `revisions` — the feedback itself + becomes part of the auditable artifact (who changed the query and why). 
+ """ + os.makedirs(_SQL_STORE, exist_ok=True) + path = os.path.join(_SQL_STORE, f"{_q_digest(question)}.json") + rec = ( + dict(previous) + if previous + else { + "question": (question or "").strip(), + "revisions": [], + } + ) + if previous is not None and feedback is not None: + rec.setdefault("revisions", []).append({ + "feedback": feedback, + "previous_sql": previous.get("sql"), + "revised_at": _now_iso(), + }) + rec.update({ + "sql": sql, + "sql_hash": sha256_hex(sql), + "engine": engine, + "bytes_processed": int(bytes_processed or 0), + "validated_at": _now_iso(), + }) + with open(path, "w") as f: + json.dump(rec, f, indent=1) + return path + + +def _frozen_sql_spec() -> WorkflowSpec: + """The static replay plan for a frozen SQL: re-validate (REAL dry-run, + which doubles as warehouse-drift detection) -> execute -> chart -> + summarize. No drafting LLM anywhere — numerically deterministic given an + unchanged dataset.""" + return WorkflowSpec( + goal="execute a frozen, human-auditable SQL", + steps=[ + StepRef( + kind="step", + id="check", + capability="dry_run", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="rows", + capability="run_query", + input=Binding(source="step", step="check"), + ), + StepRef( + kind="step", + id="chart", + capability="render_chart", + input=Binding(source="step", step="rows"), + ), + StepRef( + kind="step", + id="sum", + capability="summarize_insight", + input=Binding(source="step", step="rows"), + ), + ], + output=Binding(source="step", step="sum"), + ) + + def _load_stored_plan(key: str, registry, task): """Returns (spec, None) on a valid import, (None, reason) on a rejected or unreadable envelope, (None, None) when nothing is stored.""" @@ -1466,24 +1593,112 @@ async def plan_and_run(ctx: Context, node_input): f" ({', '.join(TABLES)}){task_note}." 
) + used_frozen_sql = False + if key == "revise": + # HUMAN FEEDBACK on the frozen SQL of the session's last question: + # revise (LLM, feedback-aware) -> re-validate (REAL dry-run) -> + # re-freeze with the feedback recorded in the revision history -> + # execute the revised query. + last_q = ctx.state.get("authored_workflow:ca:last_question") + rec = _load_frozen_sql(last_q) if last_q else None + if rec is None: + yield _msg( + "🛠️ Nothing to revise yet — ask a data question first; its" + " validated SQL gets frozen, and then your feedback can amend it." + ) + yield Event(output={"scenario": "revise", "revised": False}) + return + yield _msg( + f'🛠️ **Revising the frozen SQL** for: "{last_q}" (revision' + f" #{len(rec.get('revisions', [])) + 1})\nYour feedback:" + f" _{text.strip()}_\nCurrent SQL:\n```sql\n{rec['sql']}\n```" + ) + raw = await ctx.run_node( + reg["draft_or_repair_sql"].build(), + node_input=json.dumps( + {"question": last_q, "sql": rec["sql"], "feedback": text.strip()} + ), + run_id="revise_sql", + ) + new_sql = _sql_of(raw) + check = _bq_dry_run({"question": last_q, "sql": new_sql}) + if not check.get("valid"): + yield _msg( + "🛑 The revised SQL failed the REAL dry-run —" + f" `{str(check.get('error'))[:200]}`. The frozen SQL is" + " UNCHANGED (a revision must validate before it replaces the" + " frozen artifact)." + ) + yield Event(output={"scenario": "revise", "revised": False}) + return + _freeze_sql( + last_q, + check["sql"], + engine=str(check.get("engine", "bigquery")), + bytes_processed=check.get("bytes_processed", 0), + feedback=text.strip(), + previous=rec, + ) + yield _msg( + "🧊 **Re-frozen** — the feedback is now part of the artifact's" + " revision history (auditable: who changed the query and why)." 
+ f"\nRevised SQL:\n```sql\n{check['sql']}\n```\nRunning it:" + ) + spec = _frozen_sql_spec() + spec_hash = _hash(spec) + task = {"question": last_q, "sql": check["sql"]} + used_frozen_sql = True + reused = True + + if not used_frozen_sql and key == "sequence": + rec = _load_frozen_sql(task.get("question", "")) + if rec is not None: + pre = _bq_dry_run({"question": task["question"], "sql": rec["sql"]}) + if pre.get("valid"): + spec = _frozen_sql_spec() + spec_hash = _hash(spec) + task = {"question": task["question"], "sql": rec["sql"]} + used_frozen_sql = True + reused = True + yield _msg( + "🧊 **Frozen SQL replay** — this exact question was answered" + f" before; its validated SQL (hash `{rec['sql_hash'][:12]}`," + f" validated {rec['validated_at'][:19]}," + f" {len(rec.get('revisions', []))} human revision(s)) is reused" + " — the drafting LLM is SKIPPED, so the numbers are" + " deterministic given an unchanged dataset. The real dry-run" + " just re-validated it (warehouse-drift check)." + ) + else: + yield _msg( + "🧊 The frozen SQL for this question no longer validates" + f" (warehouse drift): `{str(pre.get('error'))[:160]}` —" + " re-authoring fresh." + ) + + if used_frozen_sql: + pass # spec/task pinned above; skip plan-store/session reuse + else: + spec, source = None, None # 1. LOAD-OR-AUTHOR. Reuse order: this session's state -> the # CROSS-SESSION plan store (defensive import) -> author fresh. 
- spec, source = None, None - existing = ctx.state.get(state_key) + existing = None if used_frozen_sql else ctx.state.get(state_key) if existing: spec = WorkflowSpec.model_validate(existing) source = "session state" - else: + elif not used_frozen_sql: spec, reject = _load_stored_plan(key, reg, task) if spec is not None: source = "plan store (CROSS-SESSION import)" ctx.state[state_key] = spec.model_dump() # cache for this session - elif reject: + elif reject: # noqa: F821 yield _msg( f"🛑 **Plan-store import rejected** for `{key}` — {reject}\n" "Drift never silently replays a stale plan; re-authoring fresh." ) - if spec is not None: + if used_frozen_sql: + pass # beats already emitted; spec/task/reused pinned + elif spec is not None: spec_hash = _hash(spec) reused = True fresh_input = task != sc["task"] @@ -1614,6 +1829,33 @@ async def plan_and_run(ctx: Context, node_input): if isinstance(result, dict) and isinstance(result.get("insight"), str): # remembered so a later 'audit that insight' audits THIS, not canned data ctx.state["authored_workflow:ca:last_insight"] = result["insight"] + if key in ("sequence", "revise") and task.get("question"): + ctx.state["authored_workflow:ca:last_question"] = task["question"] + if key == "sequence" and not used_frozen_sql: + # SQL FREEZING: the drafting LLM was the last nondeterministic step in + # a frozen plan — freeze its dry-run-validated output so replays of + # this exact question are numerically deterministic (and feedback can + # amend it auditably). 
+ checked = next( + ( + v + for v in interp.state.values() + if isinstance(v, dict) and v.get("valid") is True and v.get("sql") + ), + None, + ) + if checked: + _freeze_sql( + task["question"], + checked["sql"], + engine=str(checked.get("engine", "bigquery")), + bytes_processed=checked.get("bytes_processed", 0), + ) + yield _msg( + "🧊 **SQL frozen** for this question — re-ask it (any session)" + " and the validated SQL replays with the drafting LLM skipped;" + " say `revise: ` to amend it with an audit trail." + ) yield Event( output={ "scenario": key, diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 51feb0bec2f..6d3fe560c02 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -1114,6 +1114,79 @@ def test_skeptic_verdicts_render_with_reasons(): assert task == {"insights": ["sales doubled YoY"]} +def test_sql_freezing_roundtrip_and_revision_history(tmp_path, monkeypatch): + # SQL freezing: the validated SQL for a question is a durable artifact; + # replays skip the drafting LLM (numeric determinism). Human feedback + # revisions append to an auditable history. + monkeypatch.setattr(demo, "_SQL_STORE", str(tmp_path)) + q = " What was REVENUE by region last quarter? 
" + demo._freeze_sql(q, "SELECT 1", engine="bigquery", bytes_processed=42) + # normalized question text hits the same record + rec = demo._load_frozen_sql("what was revenue by region last quarter?") + assert rec is not None and rec["sql"] == "SELECT 1" + assert rec["revisions"] == [] and rec["bytes_processed"] == 42 + # a feedback revision replaces the SQL and RECORDS the feedback + old sql + demo._freeze_sql( + q, + "SELECT 2", + feedback="use calendar quarters, not trailing 90 days", + previous=rec, + ) + rec2 = demo._load_frozen_sql(q) + assert rec2["sql"] == "SELECT 2" + assert len(rec2["revisions"]) == 1 + assert rec2["revisions"][0]["previous_sql"] == "SELECT 1" + assert "calendar quarters" in rec2["revisions"][0]["feedback"] + # unknown question -> None + assert demo._load_frozen_sql("never asked") is None + + +def test_frozen_sql_replay_plan_is_static_and_clean(): + # The replay plan is STATIC (constant hash — no authoring) and contains + # no drafting step: dry_run -> run_query -> chart -> summarize. + spec = demo._frozen_sql_spec() + caps = [s.capability for s in spec.steps] + assert caps == ["dry_run", "run_query", "render_chart", "summarize_insight"] + assert "draft_or_repair_sql" not in caps and "nl2sql" not in caps + warnings = WorkflowSpecValidator(demo._registry()).validate(spec) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + h1 = demo.sha256_hex(spec.model_dump(mode="json")) + h2 = demo.sha256_hex(demo._frozen_sql_spec().model_dump(mode="json")) + assert h1 == h2 # deterministic replay plan + + +@pytest.mark.asyncio +async def test_frozen_sql_replay_skips_the_drafting_llm(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + out_holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + interp = SpecInterpreter(_stub_registry(), ctx) + out_holder["out"] = await interp.execute( + demo._frozen_sql_spec(), + { + "question": "revenue by region?", + "sql": "SELECT region, SUM(x) AS revenue ... 
GROUP BY region", + }, + ) + out_holder["n"] = interp.dispatch_count + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name="t", node=wf, session_service=ss) + s = await ss.create_session(app_name="t", user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + assert out_holder["n"] == 4 # check, run, chart, summarize — NO draft + assert out_holder["out"] == {"insight": "US-West leads revenue."} + + def test_conversational_gate_routing(): # Triggered messages bypass the gate (mode selectors stay deterministic); # untriggered messages go through the intent gate first — including the @@ -1195,6 +1268,8 @@ def test_scenario_routing(): assert demo._scenario_for("audit these insights") == "adversarial" assert demo._scenario_for("pick the best chart") == "tournament" assert demo._scenario_for("hello") == "sequence" # default + assert demo._scenario_for("revise: use calendar quarters") == "revise" + assert demo._scenario_for("update the sql to exclude returns") == "revise" # overlapping triggers: specialized intent must beat the generic fallback # ("revenue by region" is a sequence trigger, but these aren't questions). 
assert ( @@ -1214,6 +1289,8 @@ def test_scenario_routing(): def test_all_seven_shapes_validate_and_lint_clean(): reg = demo._registry() for key in demo.SCENARIOS: + if key == "revise": # custom feedback flow, not an authored shape + continue warnings = WorkflowSpecValidator(reg).validate(_expected_spec(key)) lints = [w for w in warnings if w.startswith("plan-quality")] assert lints == [], f"{key}: {lints}" From 768c1d452cde22a91daced23dd7ec0687960337a Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 08:26:20 -0700 Subject: [PATCH 60/64] demo(ca): plan inspector renders the frozen-SQL store with revision history MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 🧊 cards show the pinned statement (replays skip the drafting LLM), sql_hash/validated_at/engine, and every human-feedback revision with the preserved previous SQL — the numeric-determinism and governance benefits made visible alongside the plan envelopes. --- .../plan_inspector.py | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py index 2c9549c21ec..2caf86f3934 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -225,6 +225,45 @@ def _plan_card(name: str, env: dict) -> str: return "\n".join(parts) +def _sql_card(rec: dict) -> str: + revs = rec.get("revisions", []) + rev_rows = "".join( + f'
revision #{i + 1}' + f'HUMAN FEEDBACK
' + f"{html.escape(r.get('feedback', ''))}" + '
revised' + f' {html.escape(str(r.get("revised_at", ""))[:19])}' + f" — previous SQL preserved in the artifact:
" + "{html.escape(r.get('previous_sql') or '')}
" + for i, r in enumerate(revs) + ) + return ( + '

🧊 Frozen SQL — ' + f"“{html.escape(rec.get('question', ''))}”

" + "The numbers, pinned — replays of this exact question skip the" + " drafting LLM and run THIS statement (re-validated by a real dry-run" + " first, which doubles as warehouse-drift detection):" + f"
{html.escape(rec.get('sql', ''))}
" + + _kv( + "sql_hash · validated_at · engine", + f"{rec.get('sql_hash', '')[:16]} ·" + f" {str(rec.get('validated_at', ''))[:19]} ·" + f" {rec.get('engine', '')}", + "Numeric determinism: identical SQL means identical results on an" + " unchanged dataset — live-verified to the cent across runs and" + " sessions.", + "t-cons", + "CONSISTENT", + ) + + ( + rev_rows + or '
revisions
none yet
say “revise: <feedback>” in the demo — the change must pass the real dry-run before it lands, and the feedback is recorded here forever.
' + ) + + "
" + ) + + def main() -> str: envs = {} for fn in sorted(os.listdir(STORE)): @@ -234,8 +273,15 @@ def main() -> str: if not envs: print("plan store is empty — run a demo session first", file=sys.stderr) raise SystemExit(1) + sql_dir = os.path.join(STORE, "sql") + sql_cards = "" + if os.path.isdir(sql_dir): + for fn in sorted(os.listdir(sql_dir)): + if fn.endswith(".json"): + with open(os.path.join(sql_dir, fn)) as f: + sql_cards += _sql_card(json.load(f)) - cards = "\n".join(_plan_card(k, v) for k, v in envs.items()) + cards = sql_cards + "\n".join(_plan_card(k, v) for k, v in envs.items()) page = f""" Frozen Plan Inspector — RFC #93

Frozen Plan Inspector

@@ -256,6 +302,10 @@ def main() -> str:
🛡️ SafeClosed capability vocabulary, typed bindings, plan-quality lints, fail-closed defensive import. No model-written code is ever stored or executed.
+
🧊 Numerically deterministicSQL freezing extends the +freeze one level deeper: a question's dry-run-validated SQL becomes part of the artifact, +so replays skip the drafting LLM entirely — same numbers, to the cent, across runs and +sessions. Human feedback amends it through validation, with every revision recorded.
The consistency story in one line: Session A authored these plans From fd315c08da3f9a6afca4cdc3c4b17bc507243511 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 08:30:10 -0700 Subject: [PATCH 61/64] demo(ca): plan inspector renders the LIVE session flow as a timeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit plan_inspector.py now takes an optional session id (plus app/user) and reads the actual session from the running ADK server: each user turn is rendered as a timeline card classified by the mechanism that answered it (frozen-SQL replay / human revision / authored fresh / frozen-plan reuse / conversation), with the sql hash, applied revision count, and the resulting insight — so the page tells the story of the demo run on screen, with the artifacts those turns created and reused directly below. python .../plan_inspector.py [app] [user] --- .../plan_inspector.py | 111 +++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py index 2caf86f3934..8778f8af643 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -78,6 +78,16 @@ .bad { border-left-color: var(--red); background: #fce8e655; } .bad b { color: var(--red); } details summary { cursor: pointer; color: var(--blue); font-weight: 600; margin: 8px 0; } +.turn { display: flex; gap: 14px; margin: 10px 0; } +.turn .num { flex: 0 0 30px; height: 30px; border-radius: 50%; background: var(--blue); + color: #fff; font-weight: 700; display: flex; align-items: center; + justify-content: center; } +.turn .body { flex: 1; background: var(--card); border: 1px solid var(--line); + border-radius: 10px; padding: 10px 14px; } +.turn .ask { font-weight: 600; } +.turn .mech { 
font-size: 12px; margin: 4px 0; } +.turn .insight { color: var(--mut); font-size: 12.5px; border-left: 3px solid var(--line); + padding-left: 8px; margin-top: 6px; } """ @@ -264,6 +274,92 @@ def _sql_card(rec: dict) -> str: ) +def _fetch_session(app: str, user: str, session_id: str, port: int = 8001): + import urllib.request + + url = f"http://127.0.0.1:{port}/apps/{app}/users/{user}/sessions/{session_id}" + try: + with urllib.request.urlopen(url, timeout=5) as r: + return json.loads(r.read()) + except Exception: + return None + + +def _session_timeline(session: dict) -> str: + """Render the session's ACTUAL flow: one card per user turn, classified by + the mechanism that answered it, linked to the artifact it touched.""" + import re + + turns, cur = [], None + for e in session.get("events", []): + content = e.get("content") or {} + texts = [ + p.get("text", "") for p in content.get("parts") or [] if p.get("text") + ] + blob = " ".join(texts) + if e.get("author") == "user" and blob.strip(): + cur = {"ask": blob.strip(), "beats": []} + turns.append(cur) + elif cur is not None and blob: + cur["beats"].append(blob) + + cards = [] + for i, t in enumerate(turns, 1): + beats = " ".join(t["beats"]) + if "Frozen SQL replay" in beats: + mech = ( + '🧊 FROZEN-SQL REPLAY drafting LLM' + " skipped — deterministic numbers" + ) + elif "Revising the frozen SQL" in beats: + mech = ( + '🛠 HUMAN REVISION feedback' + " validated by a REAL dry-run, recorded in the artifact, then" + " executed" + ) + elif "Authored plan" in beats: + mech = '📝 AUTHORED FRESH 1 planner call' + elif "Reusing frozen plan" in beats: + mech = ( + '♻️ FROZEN-PLAN REUSE 0 planner calls' + ) + elif "Conversational turn" in beats: + mech = ( + '💬 CONVERSATION no workflow issued' + ) + else: + mech = 'WORKFLOW' + hash_m = re.search(r"validated SQL \(hash `([0-9a-f]+)`", beats) + rev_m = re.search(r"(\d+) human revision", beats) + extra = "" + if hash_m: + extra += f" · sql `{hash_m.group(1)}`" + if rev_m: + extra 
+= f" · {rev_m.group(1)} revision(s) applied" + ins_m = re.search(r'"insight": "([^"]+)', beats) + insight = ( + f'
{html.escape(ins_m.group(1)[:220])}
' + if ins_m + else "" + ) + cards.append( + f'
{i}
' + f'
“{html.escape(t["ask"][:160])}”
' + f'
{mech}{html.escape(extra)}
{insight}' + "
" + ) + if not cards: + return "" + return ( + '

▶️ This session, as it' + " actually ran

Read straight from the live ADK" + " session — each turn classified by the mechanism that answered it." + " The artifacts those turns created and reused are below.
" + + "".join(cards) + + "
" + ) + + def main() -> str: envs = {} for fn in sorted(os.listdir(STORE)): @@ -281,7 +377,20 @@ def main() -> str: with open(os.path.join(sql_dir, fn)) as f: sql_cards += _sql_card(json.load(f)) - cards = sql_cards + "\n".join(_plan_card(k, v) for k, v in envs.items()) + timeline = "" + if len(sys.argv) > 1: + sid = sys.argv[1] + app = sys.argv[2] if len(sys.argv) > 2 else "bq_ca_planner" + user = sys.argv[3] if len(sys.argv) > 3 else "user" + session = _fetch_session(app, user, sid) + if session: + timeline = _session_timeline(session) + + cards = ( + timeline + + sql_cards + + "\n".join(_plan_card(k, v) for k, v in envs.items()) + ) page = f""" Frozen Plan Inspector — RFC #93

Frozen Plan Inspector

From 0e034a1aa558a7aee8753d2699c065d5512fd0b5 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 10:17:42 -0700 Subject: [PATCH 62/64] =?UTF-8?q?demo(ca):=20inspector=20reframed=20to=20s?= =?UTF-8?q?ell=20the=20RFC=20=E2=80=94=20frozen=20workflows=20with=20froze?= =?UTF-8?q?n=20middle=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page now pitches RFC #93 with the demo as evidence, not the demo itself. The model-authored typed plan is the centerpiece; the validated intermediate results its steps produce (the dry-run-checked SQL, in this instance) render INSIDE the workflow card, attached to the step that produced them — the drafting loop carries a 'result frozen, SKIPPED on replay' badge. Session timeline beats use RFC vocabulary (model authored the workflow / frozen-workflow replay / step-result replay / human-governed revision), and a closing card generalizes the mechanism beyond SQL (retrieved schemas, verified claims, chart specs) into the RFC's freezing tiers v1 / v1.1 / v1.2 / v2. --- .../plan_inspector.py | 352 ++++++++++-------- 1 file changed, 187 insertions(+), 165 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py index 8778f8af643..b4942374ab4 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -12,16 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Frozen Plan Inspector — renders the plan store as a self-contained HTML page. +"""RFC #93 evidence page — frozen workflows with frozen middle results. 
-Reads every ``FrozenWorkflowRecord`` envelope in ``ca_plan_store/`` and writes -``ca_plan_store/plan_inspector.html``: the plan's dataflow as a diagram, and -every envelope field annotated with the guarantee it delivers (auditability, -tamper evidence, version/contract drift detection, cross-session template -reuse). Run from the repo root after a demo session has frozen some plans: +Renders the plan store as a self-contained HTML pitch for the RFC: the +model-authored typed plan is the centerpiece; validated INTERMEDIATE step +results (here: the dry-run-checked SQL the drafting loop produced) freeze +onto the step that produced them, so replays skip that step's LLM entirely; +human feedback amends the frozen artifact through validation, with every +revision recorded. SQL is the demonstrated instance — the mechanism is the +RFC's general step-result freezing tier. - python contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py + python plan_inspector.py [session-id] [app] [user] open ca_plan_store/plan_inspector.html + +With a session id, the page opens with that live session's actual flow, +each turn classified by the RFC mechanism that answered it. 
""" from __future__ import annotations @@ -29,6 +34,7 @@ import html import json import os +import re import sys STORE = os.path.join(os.getcwd(), "ca_plan_store") @@ -36,19 +42,22 @@ CSS = """ :root { --ink:#1a1c1e; --mut:#5f6368; --line:#dadce0; --blue:#1a73e8; --green:#188038; --amber:#b06000; --purple:#7627bb; --red:#c5221f; - --bg:#f8f9fa; --card:#ffffff; } + --ice:#0277bd; --bg:#f8f9fa; --card:#ffffff; } * { box-sizing: border-box; } body { font: 14px/1.55 -apple-system, 'Segoe UI', Roboto, sans-serif; color: var(--ink); background: var(--bg); margin: 0; padding: 32px; } h1 { font-size: 24px; margin: 0 0 4px; } h2 { font-size: 18px; margin: 36px 0 10px; } -.sub { color: var(--mut); margin-bottom: 24px; } -.benefits { display: flex; gap: 12px; flex-wrap: wrap; margin: 18px 0 8px; } -.benefit { flex: 1 1 220px; background: var(--card); border: 1px solid var(--line); - border-radius: 10px; padding: 14px 16px; } -.benefit b { display: block; margin-bottom: 4px; } -.b-audit b { color: var(--blue); } .b-ver b { color: var(--purple); } -.b-cons b { color: var(--green); } .b-safe b { color: var(--amber); } +h3 { font-size: 14px; margin: 18px 0 6px; } +.sub { color: var(--mut); margin-bottom: 18px; } +.pitch { border-left: 4px solid var(--blue); background: #e8f0fe66; + padding: 12px 16px; border-radius: 0 10px 10px 0; margin: 14px 0; } +.claims { display: flex; gap: 12px; flex-wrap: wrap; margin: 18px 0 8px; } +.claim { flex: 1 1 210px; background: var(--card); border: 1px solid var(--line); + border-radius: 10px; padding: 13px 15px; } +.claim b { display: block; margin-bottom: 4px; } +.c1 b { color: var(--blue); } .c2 b { color: var(--purple); } +.c3 b { color: var(--ice); } .c4 b { color: var(--green); } .c5 b { color: var(--amber); } .card { background: var(--card); border: 1px solid var(--line); border-radius: 12px; padding: 20px 22px; margin: 14px 0; } .tag { display: inline-block; font-size: 11px; font-weight: 600; border-radius: 99px; @@ -57,6 +66,7 @@ .t-ver 
{ background:#f3e8fd; color: var(--purple); } .t-cons { background:#e6f4ea; color: var(--green); } .t-safe { background:#fef7e0; color: var(--amber); } +.t-ice { background:#e1f5fe; color: var(--ice); } .kv { margin: 6px 0; padding: 8px 10px; border-left: 3px solid var(--line); background: var(--bg); border-radius: 0 6px 6px 0; } .kv code { font: 12px/1.5 ui-monospace, Menlo, monospace; word-break: break-all; } @@ -65,18 +75,20 @@ .node { border: 1.5px solid var(--blue); border-radius: 8px; padding: 7px 12px; background: #e8f0fe; font: 12px ui-monospace, Menlo, monospace; } .node small { display: block; color: var(--mut); font-size: 10.5px; } -.loopbox { border: 1.5px dashed var(--purple); border-radius: 10px; padding: 10px; - display: flex; gap: 10px; align-items: center; } +.loopbox { border: 1.5px dashed var(--purple); border-radius: 10px; + padding: 14px 10px 10px; display: flex; gap: 10px; + align-items: center; position: relative; } .loopbox .lbl { color: var(--purple); font-size: 11px; font-weight: 700; } +.loopbox.iced { border-color: var(--ice); background: #e1f5fe33; } +.loopbox.iced::after { content: "❄️ result frozen — SKIPPED on replay"; + position: absolute; top: -11px; right: 10px; font-size: 10px; + background: var(--ice); color: #fff; border-radius: 99px; + padding: 1px 8px; } .fanbox { border: 1.5px dashed var(--green); border-radius: 10px; padding: 10px; } .fanbox .lbl { color: var(--green); font-size: 11px; font-weight: 700; } .arrow { color: var(--mut); font-size: 18px; } pre { background: #202124; color: #e8eaed; border-radius: 10px; padding: 14px; - overflow: auto; font: 11.5px/1.5 ui-monospace, Menlo, monospace; max-height: 340px; } -.story { border-left: 4px solid var(--green); background: #e6f4ea55; padding: 10px 14px; - border-radius: 0 8px 8px 0; margin: 10px 0; } -.bad { border-left-color: var(--red); background: #fce8e655; } -.bad b { color: var(--red); } + overflow: auto; font: 11.5px/1.5 ui-monospace, Menlo, monospace; max-height: 
300px; } details summary { cursor: pointer; color: var(--blue); font-weight: 600; margin: 8px 0; } .turn { display: flex; gap: 14px; margin: 10px 0; } .turn .num { flex: 0 0 30px; height: 30px; border-radius: 50%; background: var(--blue); @@ -88,10 +100,17 @@ .turn .mech { font-size: 12px; margin: 4px 0; } .turn .insight { color: var(--mut); font-size: 12.5px; border-left: 3px solid var(--line); padding-left: 8px; margin-top: 6px; } +.midresult { border: 1.5px solid var(--ice); border-radius: 10px; + background: #e1f5fe44; padding: 12px 14px; margin: 10px 0; } +.midresult .q { font-weight: 600; } +.rev { border-left: 3px solid var(--blue); background: var(--bg); padding: 8px 10px; + border-radius: 0 6px 6px 0; margin: 6px 0; font-size: 12.5px; } """ +_SQL_PRODUCER_HINTS = ("draft", "sqlgen", "sql", "loop") + -def _node(step) -> str: +def _node(step, frozen_step_ids=()) -> str: kind = step.get("kind") if kind == "step": binding = step.get("input", {}) @@ -128,10 +147,11 @@ def _node(step) -> str: ) if kind == "loop_until": body = " ".join( - _node(s) for s in step.get("body", []) + _node(s, frozen_step_ids) for s in step.get("body", []) ) + iced = " iced" if step.get("id") in frozen_step_ids else "" return ( - f'
LOOP until' + f'
LOOP until' f" {html.escape(step.get('until_capability', '?'))} (max" f" {step.get('max_iters')}){body}
" ) @@ -140,9 +160,9 @@ def _node(step) -> str: return f'
{html.escape(str(kind))}
' -def _flow(spec) -> str: +def _flow(spec, frozen_step_ids=()) -> str: steps = " ".join( - _node(s) for s in spec.get("steps", []) + _node(s, frozen_step_ids) for s in spec.get("steps", []) ) return f'
{steps}
' @@ -156,53 +176,97 @@ def _kv(label, value, why, tag, tag_label) -> str: ) -def _plan_card(name: str, env: dict) -> str: +def _sql_producing_step_ids(spec) -> list: + """The plan steps whose validated results the store freezes (the drafting + loop in the ask-a-question plan).""" + ids = [] + for s in spec.get("steps", []): + sid = str(s.get("id", "")).lower() + if s.get("kind") == "loop_until" and any( + h in sid for h in _SQL_PRODUCER_HINTS + ): + ids.append(s.get("id")) + return ids + + +def _mid_result(rec: dict) -> str: + """A frozen middle result, attached to the plan that produced it.""" + revs = rec.get("revisions", []) + rev_html = "".join( + f'
revision #{i + 1} — human feedback:' + f" {html.escape(r.get('feedback', ''))}" + f"
previous artifact (preserved)" + f"
{html.escape(r.get('previous_sql') or '')}
" + for i, r in enumerate(revs) + ) + return ( + '
❄️ Frozen middle result —' + f" question: “{html.escape(rec.get('question', ''))}”
" + f"
artifact hash" + f" {rec.get('sql_hash', '')[:16]} · validated" + f" {str(rec.get('validated_at', ''))[:19]} ·" + f" engine {html.escape(str(rec.get('engine', '')))} ·" + f" {len(revs)} human revision(s)
" + f"
the validated artifact (SQL, in this instance)" + f"
{html.escape(rec.get('sql', ''))}
" + + (rev_html or "") + + "
" + ) + + +def _plan_card(name: str, env: dict, mid_results=()) -> str: spec = env.get("spec", {}) - caps = ", ".join(sorted(env.get("capability_versions", {}))) - ch = env.get("capability_contract_hashes", {}) - ch_short = {k: v[:12] for k, v in ch.items()} + frozen_ids = _sql_producing_step_ids(spec) if mid_results else [] parts = [ - f'

{html.escape(name)} — ' - f"“{html.escape(spec.get('goal', ''))}”

", + f'

Frozen workflow:' + f" {html.escape(name)} — “{html.escape(spec.get('goal', ''))}”

", ( - "The plan, as data — every box is a pre-approved capability;" - " every arrow is a typed binding the validator checked:" + "Authored by the model ONCE, as typed data — every box a" + " pre-approved capability, every arrow a typed binding the" + " validator checked. The plan replays across sessions with zero" + " planner calls:" ), - _flow(spec), + _flow(spec, frozen_ids), + ] + if mid_results: + parts.append( + "

❄️ Frozen middle results of this workflow

" + "
The RFC's step-result" + " freezing tier: the ❄️ step's validated output is frozen WITH the" + " plan. On replay the step's LLM is skipped — the run is" + " numerically deterministic — and the artifact re-validates on" + " load (drift detection). Human feedback amends it THROUGH" + " validation, every revision recorded:
" + ) + parts.extend(_mid_result(r) for r in mid_results) + parts += [ _kv( "spec_hash", env.get("spec_hash", ""), - "Tamper evidence: every import recomputes sha256 over the spec and" - " rejects on mismatch. Change one character of the plan and it" - " will not load.", + "Tamper evidence: every import recomputes sha256 over the spec" + " and rejects on mismatch.", "t-audit", "AUDITABLE", ), _kv( "planner_model · created_at", f"{env.get('planner_model')} · {env.get('created_at')}", - "Provenance: which model authored this plan and when — the audit" - " trail starts at authoring, not at execution.", + "Authoring provenance: which model wrote this orchestration and" + " when.", "t-audit", "AUDITABLE", ), _kv( - "registry_version · capability_versions", + "registry + capability versions · contract hashes", f"registry v{env.get('registry_version')} · " - + json.dumps(env.get("capability_versions", {})), - "Versioning: the exact capability versions this plan was approved" - " against. The skeptic at v2 means a v1-era audit plan is REJECTED" - " on import and re-authored — semantics changed, so the plan must" - " too.", - "t-ver", - "VERSIONED", - ), - _kv( - "capability_contract_hashes", - json.dumps(ch_short), - "Drift detection without developer discipline: derived sha256 over" - " each capability's declared contract. 
A schema change nobody" - " version-bumped still refuses to load.", + + json.dumps(env.get("capability_versions", {})) + + " · " + + json.dumps({ + k: v[:10] + for k, v in (env.get("capability_contract_hashes") or {}).items() + }), + "Drift detection: a capability whose contract changed since" + " freezing makes the plan refuse to load — loudly.", "t-ver", "VERSIONED", ), @@ -210,24 +274,13 @@ def _plan_card(name: str, env: dict) -> str: "task_input_schema · task_input_digest", f"{json.dumps(env.get('task_input_schema'))} · " + str(env.get("task_input_digest", ""))[:16], - "Consistency across sessions: a NEW session imports this plan and" - " runs a NEW question through it (validated against the captured" - " schema) — same steps, same checks, same shape of answer. Zero" - " planner calls.", + "Template reuse: a new session validates ITS question against" + " the captured schema and runs the same governed pipeline.", "t-cons", "CONSISTENT", ), - _kv( - "validation", - json.dumps(env.get("validation", {})), - "Recorded but NEVER trusted: import re-validates against the" - " current registry. Lint waivers, if any, are recorded here too —" - " suppression is auditable.", - "t-safe", - "SAFE", - ), ( - f"
Full envelope JSON ({caps})" + "
full frozen record (envelope JSON)" f"
{html.escape(json.dumps(env, indent=2))}
" ), "
", @@ -235,45 +288,6 @@ def _plan_card(name: str, env: dict) -> str: return "\n".join(parts) -def _sql_card(rec: dict) -> str: - revs = rec.get("revisions", []) - rev_rows = "".join( - f'
revision #{i + 1}' - f'HUMAN FEEDBACK
' - f"{html.escape(r.get('feedback', ''))}" - '
revised' - f' {html.escape(str(r.get("revised_at", ""))[:19])}' - f" — previous SQL preserved in the artifact:
" - "{html.escape(r.get('previous_sql') or '')}
" - for i, r in enumerate(revs) - ) - return ( - '

🧊 Frozen SQL — ' - f"“{html.escape(rec.get('question', ''))}”

" - "The numbers, pinned — replays of this exact question skip the" - " drafting LLM and run THIS statement (re-validated by a real dry-run" - " first, which doubles as warehouse-drift detection):" - f"
{html.escape(rec.get('sql', ''))}
" - + _kv( - "sql_hash · validated_at · engine", - f"{rec.get('sql_hash', '')[:16]} ·" - f" {str(rec.get('validated_at', ''))[:19]} ·" - f" {rec.get('engine', '')}", - "Numeric determinism: identical SQL means identical results on an" - " unchanged dataset — live-verified to the cent across runs and" - " sessions.", - "t-cons", - "CONSISTENT", - ) - + ( - rev_rows - or '
revisions
none yet
say “revise: <feedback>” in the demo — the change must pass the real dry-run before it lands, and the feedback is recorded here forever.
' - ) - + "
" - ) - - def _fetch_session(app: str, user: str, session_id: str, port: int = 8001): import urllib.request @@ -286,10 +300,6 @@ def _fetch_session(app: str, user: str, session_id: str, port: int = 8001): def _session_timeline(session: dict) -> str: - """Render the session's ACTUAL flow: one card per user turn, classified by - the mechanism that answered it, linked to the artifact it touched.""" - import re - turns, cur = [], None for e in session.get("events", []): content = e.get("content") or {} @@ -308,24 +318,30 @@ def _session_timeline(session: dict) -> str: beats = " ".join(t["beats"]) if "Frozen SQL replay" in beats: mech = ( - '🧊 FROZEN-SQL REPLAY drafting LLM' - " skipped — deterministic numbers" + '❄️ STEP-RESULT REPLAY the' + " workflow ran with its drafting step SKIPPED — the frozen" + " middle result reused; numbers deterministic" ) elif "Revising the frozen SQL" in beats: mech = ( - '🛠 HUMAN REVISION feedback' - " validated by a REAL dry-run, recorded in the artifact, then" - " executed" + '🛠 HUMAN-GOVERNED REVISION' + " feedback applied to the frozen middle result THROUGH a real" + " dry-run, recorded in the artifact, then executed" ) elif "Authored plan" in beats: - mech = '📝 AUTHORED FRESH 1 planner call' + mech = ( + '📝 MODEL AUTHORED THE WORKFLOW' + " once — typed plan, validated, frozen (1 planner call)" + ) elif "Reusing frozen plan" in beats: mech = ( - '♻️ FROZEN-PLAN REUSE 0 planner calls' + '♻️ FROZEN-WORKFLOW REPLAY 0' + " planner calls — new data through the same governed pipeline" ) elif "Conversational turn" in beats: mech = ( - '💬 CONVERSATION no workflow issued' + '💬 CONVERSATION intent gate —' + " no workflow issued" ) else: mech = 'WORKFLOW' @@ -333,7 +349,7 @@ def _session_timeline(session: dict) -> str: rev_m = re.search(r"(\d+) human revision", beats) extra = "" if hash_m: - extra += f" · sql `{hash_m.group(1)}`" + extra += f" · artifact `{hash_m.group(1)}`" if rev_m: extra += f" · {rev_m.group(1)} revision(s) applied" ins_m = 
re.search(r'"insight": "([^"]+)', beats) @@ -351,10 +367,11 @@ def _session_timeline(session: dict) -> str: if not cards: return "" return ( - '

▶️ This session, as it' - " actually ran

Read straight from the live ADK" - " session — each turn classified by the mechanism that answered it." - " The artifacts those turns created and reused are below.
" + '

▶️ The mechanism, live —' + " this session as it actually ran

Read straight" + " from the running ADK session. Watch the arc: the workflow answers" + " → a human amends its frozen middle result → the SAME workflow" + " replays carrying the revision.
" + "".join(cards) + "
" ) @@ -370,12 +387,12 @@ def main() -> str: print("plan store is empty — run a demo session first", file=sys.stderr) raise SystemExit(1) sql_dir = os.path.join(STORE, "sql") - sql_cards = "" + mid_results = [] if os.path.isdir(sql_dir): for fn in sorted(os.listdir(sql_dir)): if fn.endswith(".json"): with open(os.path.join(sql_dir, fn)) as f: - sql_cards += _sql_card(json.load(f)) + mid_results.append(json.load(f)) timeline = "" if len(sys.argv) > 1: @@ -386,47 +403,52 @@ def main() -> str: if session: timeline = _session_timeline(session) - cards = ( - timeline - + sql_cards - + "\n".join(_plan_card(k, v) for k, v in envs.items()) - ) + # middle results attach to the workflow that produced them (sequence). + cards = timeline + for name, env in envs.items(): + attach = mid_results if name == "sequence" else () + cards += _plan_card(name, env, attach) + page = f""" -Frozen Plan Inspector — RFC #93 -

Frozen Plan Inspector

-
The live contents of ca_plan_store/ — each file is a -FrozenWorkflowRecord: a model-authored workflow, frozen as a durable, portable artifact. -This page explains what each field buys you.
- -
-
🔍 AuditableThe plan is data you can read, diff in a PR, -and hand to a reviewer — who authored it, when, with which model, and exactly what runs in -what order with what inputs. Turn-by-turn agent chatter leaves no such artifact.
-
🏷️ VersionedRegistry + per-capability versions and derived -contract hashes are sealed in. Capability changed since freezing? The plan refuses to load — -loudly — instead of silently running stale semantics.
-
♻️ ConsistentAuthoring is the only nondeterministic step, -and it happens once. Every session that imports this plan executes the same steps with the -same safety checks — only the data changes. Answers stop depending on the model's mood.
-
🛡️ SafeClosed capability vocabulary, typed bindings, -plan-quality lints, fail-closed defensive import. No model-written code is ever stored or -executed.
-
🧊 Numerically deterministicSQL freezing extends the -freeze one level deeper: a question's dry-run-validated SQL becomes part of the artifact, -so replays skip the drafting LLM entirely — same numbers, to the cent, across runs and -sessions. Human feedback amends it through validation, with every revision recorded.
+RFC #93 — Frozen Workflows with Frozen Middle Results + +

RFC #93: Reproducible Model-Authored Workflows

+
Demonstrated live on BigQuery Conversational Analytics over +bigquery-public-data.thelook_ecommerce — every artifact on this page is real, +read from the running demo's plan store.
+ +
The thesis: a model should author orchestration once, as +typed data — then the workflow, and the validated middle results its steps produce, +freeze into durable artifacts. Replays skip the nondeterministic steps entirely; humans +amend the artifacts through validation, never by re-prompting; and everything is +auditable, versioned, and drift-checked. A chat agent gives you answers. This gives you a +governed analytics asset.
+ +
+
📝 Authored onceThe model emits a typed plan over a closed +capability vocabulary — no code, no sandbox. Validated, lint-checked, frozen, exported.
+
🏷️ Versioned & drift-checkedRegistry, capability versions, +and derived contract hashes seal in. Changed semantics → the plan refuses to load.
+
❄️ Middle results freeze tooThe step that drafts SQL is the last +nondeterministic step — so its dry-run-validated output freezes WITH the plan. Replays skip +it: same numbers, to the cent, across sessions.
+
🛠 Human-governedFeedback amends a frozen middle result through +real validation; the feedback and the previous artifact are preserved in the record — a +reviewed change, not a re-roll.
+
🔍 Auditable end to endWho authored the plan, what it runs, +which artifact answered, who revised it and why — all readable, diffable data.
-
The consistency story in one line: Session A authored these plans -(1 planner call each). Session B — tomorrow, another user, another machine — imports them, -re-validates them, and runs new questions through them with 0 planner calls: the -same governed pipeline every time.
-
The tamper story in one line: edit one character of any -spec below and the next import fails with spec_hash mismatch; -change a capability's schema and it fails with contract drift. Drift never -replays silently.
- {cards} + +

Why this matters beyond SQL

+What froze here is a SQL statement — but the mechanism is general: any step's validated +output can freeze the same way. A retrieved schema, a verified claim set, a chart +specification, an extraction template — each one a middle result that today is re-rolled by +an LLM on every run. The RFC's freezing tiers turn them into governed artifacts: +v1 the frozen plan (process determinism) · v1.1 the exported envelope +(portability + audit) · v1.2 frozen step results (numeric determinism + human +governance) · v2 templates (approved reuse against new inputs).
""" out = os.path.join(STORE, "plan_inspector.html") From 95926104581b411e17f6c83b2e54c7ca8fa565a8 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 10:20:59 -0700 Subject: [PATCH 63/64] demo(ca): inspector scopes middle results to the rendered session's flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alignment check against a live session showed the page mixed THIS session's frozen middle result with artifacts from other sessions (the store is global by design). With a session id, middle results now filter to questions actually asked (or revised) in that session; artifacts from other sessions collapse into a labeled details card — the page mirrors the demo run on screen while still showing the cross-session store. --- .../plan_inspector.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py index b4942374ab4..095c66a385f 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -299,6 +299,24 @@ def _fetch_session(app: str, user: str, session_id: str, port: int = 8001): return None +def _session_questions(session: dict) -> set: + """Lower-cased user-turn texts plus revise-target questions — used to + scope middle results to THIS session's actual flow.""" + qs = set() + for e in session.get("events", []): + content = e.get("content") or {} + texts = [ + p.get("text", "") for p in content.get("parts") or [] if p.get("text") + ] + blob = " ".join(texts) + if e.get("author") == "user" and blob.strip(): + qs.add(re.sub(r"\s+", " ", blob.strip().lower())) + m = re.search(r'Revising the frozen SQL\*?\*? 
for: \\?"([^"]+)"', blob) + if m: + qs.add(re.sub(r"\s+", " ", m.group(1).strip().lower())) + return qs + + def _session_timeline(session: dict) -> str: turns, cur = [], None for e in session.get("events", []): @@ -394,7 +412,7 @@ def main() -> str: with open(os.path.join(sql_dir, fn)) as f: mid_results.append(json.load(f)) - timeline = "" + timeline, session_qs = "", None if len(sys.argv) > 1: sid = sys.argv[1] app = sys.argv[2] if len(sys.argv) > 2 else "bq_ca_planner" @@ -402,12 +420,36 @@ def main() -> str: session = _fetch_session(app, user, sid) if session: timeline = _session_timeline(session) + session_qs = _session_questions(session) + + # Middle results attach to the workflow that produced them (sequence). + # With a session given, scope them to THAT session's flow; artifacts from + # other sessions collapse into a separate card (the store is global by + # design — that's the cross-session point — but the page should mirror + # the demo run on screen). + if session_qs is not None: + in_session = [ + r + for r in mid_results + if re.sub(r"\s+", " ", r.get("question", "").strip().lower()) + in session_qs + ] + other = [r for r in mid_results if r not in in_session] + else: + in_session, other = mid_results, [] - # middle results attach to the workflow that produced them (sequence). cards = timeline for name, env in envs.items(): - attach = mid_results if name == "sequence" else () + attach = in_session if name == "sequence" else () cards += _plan_card(name, env, attach) + if other: + cards += ( + '
❄️ Frozen middle results from' + f" OTHER sessions ({len(other)}) — the store is cross-session by" + " design" + + "".join(_mid_result(r) for r in other) + + "
" + ) page = f""" RFC #93 — Frozen Workflows with Frozen Middle Results From b41323ac1327bf1eafda96c2de940c7b7606e0d0 Mon Sep 17 00:00:00 2001 From: Haiyuan Cao Date: Fri, 12 Jun 2026 10:24:10 -0700 Subject: [PATCH 64/64] feat(ca-demo): frozen middle results record their workflow lineage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User question exposed the gap: middle results were stored BESIDE the frozen workflow (linked only by the inspector's heuristic), not structurally attached to it. The artifact now records plan_hash and produced_by_step at freeze time; revisions inherit the lineage. Design kept deliberate: the plan envelope stays a question-agnostic TEMPLATE (embedding per-question instances would bloat and conflate it) — middle results are per-task-input instances that REFERENCE their plan. The inspector attaches artifacts to workflow cards by recorded lineage (heuristic fallback for pre-lineage records) and renders the lineage on each card. Lineage pinned by tests incl. revision inheritance. --- .../bq_ca_planner/agent.py | 20 +++++-- .../plan_inspector.py | 56 ++++++++++++++----- .../test_ca_demo_agent.py | 16 ++++++ 3 files changed, 74 insertions(+), 18 deletions(-) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py index 1cf8c20e9f7..e3c39039041 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -1444,6 +1444,8 @@ def _freeze_sql( bytes_processed: int = 0, feedback: str | None = None, previous: dict | None = None, + plan_hash: str | None = None, + produced_by_step: str | None = None, ) -> str: """Freeze (or revise) the validated SQL for a question. 
@@ -1476,6 +1478,14 @@ def _freeze_sql( "bytes_processed": int(bytes_processed or 0), "validated_at": _now_iso(), }) + # WORKFLOW LINEAGE: the middle result is an instance of a specific plan's + # step for a specific question — record which plan (hash) and which step + # produced it, so the artifact is structurally attached to the frozen + # workflow, not just stored beside it. Revisions inherit the lineage. + if plan_hash: + rec["plan_hash"] = plan_hash + if produced_by_step: + rec["produced_by_step"] = produced_by_step with open(path, "w") as f: json.dump(rec, f, indent=1) return path @@ -1836,13 +1846,13 @@ async def plan_and_run(ctx: Context, node_input): # a frozen plan — freeze its dry-run-validated output so replays of # this exact question are numerically deterministic (and feedback can # amend it auditably). - checked = next( + checked_step, checked = next( ( - v - for v in interp.state.values() + (k, v) + for k, v in interp.state.items() if isinstance(v, dict) and v.get("valid") is True and v.get("sql") ), - None, + (None, None), ) if checked: _freeze_sql( @@ -1850,6 +1860,8 @@ async def plan_and_run(ctx: Context, node_input): checked["sql"], engine=str(checked.get("engine", "bigquery")), bytes_processed=checked.get("bytes_processed", 0), + plan_hash=spec_hash, + produced_by_step=checked_step, ) yield _msg( "🧊 **SQL frozen** for this question — re-ask it (any session)" diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py index 095c66a385f..1a8d6905cab 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -176,17 +176,31 @@ def _kv(label, value, why, tag, tag_label) -> str: ) -def _sql_producing_step_ids(spec) -> list: - """The plan steps whose validated results the store freezes (the drafting - loop in the ask-a-question 
plan).""" - ids = [] +def _sql_producing_step_ids(spec, mid_results=()) -> list: + """The plan steps whose validated results froze. Prefer the lineage the + artifacts RECORDED (produced_by_step); fall back to the drafting-loop + heuristic for records frozen before lineage existed.""" + recorded = { + r.get("produced_by_step") + for r in mid_results + if r.get("produced_by_step") + } + step_ids = {s.get("id") for s in spec.get("steps", [])} + # lineage may point INSIDE a loop body — surface the loop in that case for s in spec.get("steps", []): - sid = str(s.get("id", "")).lower() - if s.get("kind") == "loop_until" and any( - h in sid for h in _SQL_PRODUCER_HINTS - ): - ids.append(s.get("id")) - return ids + if s.get("kind") == "loop_until": + body_ids = {b.get("id") for b in s.get("body", [])} + if recorded & body_ids: + recorded.add(s.get("id")) + ids = [i for i in recorded if i in step_ids] + if ids: + return ids + return [ + s.get("id") + for s in spec.get("steps", []) + if s.get("kind") == "loop_until" + and any(h in str(s.get("id", "")).lower() for h in _SQL_PRODUCER_HINTS) + ] def _mid_result(rec: dict) -> str: @@ -206,8 +220,16 @@ def _mid_result(rec: dict) -> str: f" {rec.get('sql_hash', '')[:16]} · validated" f" {str(rec.get('validated_at', ''))[:19]} ·" f" engine {html.escape(str(rec.get('engine', '')))} ·" - f" {len(revs)} human revision(s)
" - f"
the validated artifact (SQL, in this instance)" + f" {len(revs)} human revision(s)" + + ( + " · lineage: plan" + f" {html.escape(str(rec['plan_hash']))}, step" + f" {html.escape(str(rec.get('produced_by_step', '?')))}" + if rec.get("plan_hash") + else "" + ) + + "
" + "
the validated artifact (SQL, in this instance)" f"
{html.escape(rec.get('sql', ''))}
" + (rev_html or "") + "
" @@ -216,7 +238,7 @@ def _mid_result(rec: dict) -> str: def _plan_card(name: str, env: dict, mid_results=()) -> str: spec = env.get("spec", {}) - frozen_ids = _sql_producing_step_ids(spec) if mid_results else [] + frozen_ids = _sql_producing_step_ids(spec, mid_results) if mid_results else [] parts = [ f'

Frozen workflow:' f" {html.escape(name)} — “{html.escape(spec.get('goal', ''))}”

", @@ -440,7 +462,13 @@ def main() -> str: cards = timeline for name, env in envs.items(): - attach = in_session if name == "sequence" else () + short = str(env.get("spec_hash", ""))[:12] + attach = [ + r + for r in in_session + if r.get("plan_hash") == short + or (not r.get("plan_hash") and name == "sequence") + ] cards += _plan_card(name, env, attach) if other: cards += ( diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py index 6d3fe560c02..27d79ccfb99 100644 --- a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -1139,6 +1139,22 @@ def test_sql_freezing_roundtrip_and_revision_history(tmp_path, monkeypatch): assert "calendar quarters" in rec2["revisions"][0]["feedback"] # unknown question -> None assert demo._load_frozen_sql("never asked") is None + # WORKFLOW LINEAGE: the middle result records which plan + step produced + # it, and revisions inherit the lineage (the artifact is structurally + # attached to the frozen workflow, not just stored beside it). + demo._freeze_sql( + "lineage q", + "SELECT 1", + plan_hash="abc123def456", + produced_by_step="sqlgen", + ) + rec3 = demo._load_frozen_sql("lineage q") + assert rec3["plan_hash"] == "abc123def456" + assert rec3["produced_by_step"] == "sqlgen" + demo._freeze_sql("lineage q", "SELECT 2", feedback="tweak it", previous=rec3) + rec4 = demo._load_frozen_sql("lineage q") + assert rec4["plan_hash"] == "abc123def456" # lineage survives revisions + assert len(rec4["revisions"]) == 1 def test_frozen_sql_replay_plan_is_static_and_clean():